Merge tag 'pull-vfio-20240723-1' of https://github.com/legoater/qemu into staging
vfio queue:
* IOMMUFD Dirty Tracking support
* Fix for a possible SEGV in IOMMU type1 container
* Dropped initialization of host IOMMU device with mdev devices
# -----BEGIN PGP SIGNATURE-----
#
# iQIzBAABCAAdFiEEoPZlSPBIlev+awtgUaNDx8/77KEFAmafyVUACgkQUaNDx8/7
# 7KGebRAAzEYxvstDxSPNF+1xx937TKbRpiKYtspTfEgu4Ht50MwO2ZqnVWzTBSwa
# qcjhDf2avMBpBvkp4O9fR7nXR0HRN2KvYrBSThZ3Qpqu4KjxCAGcHI5uYmgfizYh
# BBLrw3eWME5Ry220TinQF5KFl50vGq7Z/mku5N5Tgj2qfTfCXYK1Kc19SyAga49n
# LSokTIjZAGJa4vxrE7THawaEUjFRjfCJey64JUs/TPJaGr4R1snJcWgETww6juUE
# 9OSw/xl0AoQhaN/ZTRC1qCsBLUI2MVPsC+x+vqVK62HlTjCx+uDRVQ8KzfDzjCeH
# gaLkMjxJSuJZMpm4UU7DBzDGEGcEBCGeNyFt37BSqqPPpX55CcFhj++d8vqTiwpF
# YzmTNd/znxcZTw6OJN9sQZohh+NeS86CVZ3x31HD3dXifhRf17jbh7NoIyi+0ZCb
# N+mytOH5BXsD+ddwbk+yMaxXV43Fgz7ThG5tB1tjhhNtLZHDA5ezFvGZ5F/FJrqE
# xAbjOhz5MC+RcOVNSzQJCULNqFpfE6Gqeys6btEDm/ltf4LpAe6W1HYuv8BJc19T
# UsqGK2yKAuQX8GErYxJ1zqZCttVrgpsmXFYTC5iGbxC84mvsF0Iti96IdXz9gfzN
# Vlb2OxoefcOwVqIhbkvTZW0ZwYGGDDPAYhLMfr5lSuRqj123OOo=
# =cViP
# -----END PGP SIGNATURE-----
# gpg: Signature made Wed 24 Jul 2024 01:16:37 AM AEST
# gpg: using RSA key A0F66548F04895EBFE6B0B6051A343C7CFFBECA1
# gpg: Good signature from "Cédric Le Goater <clg@kaod.org>" [undefined]
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: A0F6 6548 F048 95EB FE6B 0B60 51A3 43C7 CFFB ECA1
* tag 'pull-vfio-20240723-1' of https://github.com/legoater/qemu:
vfio/common: Allow disabling device dirty page tracking
vfio/migration: Don't block migration device dirty tracking is unsupported
vfio/iommufd: Implement VFIOIOMMUClass::query_dirty_bitmap support
vfio/iommufd: Implement VFIOIOMMUClass::set_dirty_tracking support
vfio/iommufd: Probe and request hwpt dirty tracking capability
vfio/{iommufd, container}: Invoke HostIOMMUDevice::realize() during attach_device()
vfio/iommufd: Add hw_caps field to HostIOMMUDeviceCaps
vfio/{iommufd,container}: Remove caps::aw_bits
vfio/iommufd: Introduce auto domain creation
vfio/ccw: Don't initialize HOST_IOMMU_DEVICE with mdev
vfio/ap: Don't initialize HOST_IOMMU_DEVICE with mdev
vfio/iommufd: Return errno in iommufd_cdev_attach_ioas_hwpt()
backends/iommufd: Extend iommufd_backend_get_device_info() to fetch HW capabilities
vfio/iommufd: Don't initialize nor set a HOST_IOMMU_DEVICE with mdev
vfio/pci: Extract mdev check into an helper
hw/vfio/container: Fix SIGSEV on vfio_container_instance_finalize()
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
diff --git a/MAINTAINERS b/MAINTAINERS
index d5ff6c2..0645cbf 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -140,6 +140,7 @@
F: target/i386/*.[ch]
F: target/i386/Kconfig
F: target/i386/meson.build
+F: tools/i386/
Guest CPU cores (TCG)
---------------------
@@ -2009,6 +2010,7 @@
F: qapi/pci.json
F: docs/pci*
F: docs/specs/*pci*
+F: docs/system/sriov.rst
PCIE DOE
M: Huai-Cheng Kuo <hchkuo@avery-design.com.tw>
@@ -2208,6 +2210,7 @@
vhost
M: Michael S. Tsirkin <mst@redhat.com>
+R: Stefano Garzarella <sgarzare@redhat.com>
S: Supported
F: hw/*/*vhost*
F: docs/interop/vhost-user.json
@@ -3398,6 +3401,12 @@
F: docs/specs/tpm.rst
T: git https://github.com/stefanberger/qemu-tpm.git tpm-next
+SPDM
+M: Alistair Francis <alistair.francis@wdc.com>
+S: Maintained
+F: backends/spdm-socket.c
+F: include/sysemu/spdm-socket.h
+
Checkpatch
S: Odd Fixes
F: scripts/checkpatch.pl
@@ -3657,6 +3666,7 @@
VT-d Emulation
M: Michael S. Tsirkin <mst@redhat.com>
R: Jason Wang <jasowang@redhat.com>
+R: Yi Liu <yi.l.liu@intel.com>
S: Supported
F: hw/i386/intel_iommu.c
F: hw/i386/intel_iommu_internal.h
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 64bf47a..67b7736 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -340,14 +340,71 @@
return ret;
}
+/*
+ * Park the KVM vCPU that backs @cpu.
+ *
+ * A new KVMParkedVcpu entry recording the arch vCPU id and the KVM fd of
+ * @cpu is allocated and inserted at the head of kvm_state's parked list.
+ */
+void kvm_park_vcpu(CPUState *cpu)
+{
+    struct KVMParkedVcpu *parked;
+
+    trace_kvm_park_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
+
+    parked = g_malloc0(sizeof(*parked));
+    parked->kvm_fd = cpu->kvm_fd;
+    parked->vcpu_id = kvm_arch_vcpu_id(cpu);
+
+    QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, parked, node);
+}
+
+/*
+ * Take the parked vCPU with id @vcpu_id off @s's parked list.
+ *
+ * If a matching entry exists it is unlinked and freed, and its KVM fd is
+ * returned. Otherwise -ENOENT is returned.
+ */
+int kvm_unpark_vcpu(KVMState *s, unsigned long vcpu_id)
+{
+    struct KVMParkedVcpu *cpu;
+    int kvm_fd = -ENOENT;
+
+    QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
+        if (cpu->vcpu_id == vcpu_id) {
+            QLIST_REMOVE(cpu, node);
+            kvm_fd = cpu->kvm_fd;
+            g_free(cpu);
+            /*
+             * Stop iterating: QLIST_FOREACH would otherwise fetch the next
+             * pointer out of the entry that has just been freed.
+             */
+            break;
+        }
+    }
+
+    /* Any non-negative value is a valid fd; only -ENOENT means "not found". */
+    trace_kvm_unpark_vcpu(vcpu_id,
+                          kvm_fd >= 0 ? "unparked" : "!found parked");
+
+    return kvm_fd;
+}
+
+/*
+ * Obtain a KVM vCPU fd for @cpu and initialise the KVM-related CPUState
+ * fields.
+ *
+ * A parked vCPU with the same arch id is reused when one exists; otherwise
+ * a new vCPU is requested with the KVM_CREATE_VCPU ioctl.
+ *
+ * Returns 0 on success, or the negative value returned by the ioctl.
+ */
+int kvm_create_vcpu(CPUState *cpu)
+{
+    KVMState *s = kvm_state;
+    unsigned long vcpu_id = kvm_arch_vcpu_id(cpu);
+    int fd;
+
+    /* first try to pick up an already existing, parked KVM vCPU */
+    fd = kvm_unpark_vcpu(s, vcpu_id);
+    if (fd < 0) {
+        /* nothing parked under this id: ask KVM for a new vCPU */
+        fd = kvm_vm_ioctl(s, KVM_CREATE_VCPU, vcpu_id);
+    }
+    if (fd < 0) {
+        error_report("KVM_CREATE_VCPU IOCTL failed for vCPU %lu", vcpu_id);
+        return fd;
+    }
+
+    cpu->kvm_fd = fd;
+    cpu->kvm_state = s;
+    cpu->vcpu_dirty = true;
+    cpu->dirty_pages = 0;
+    cpu->throttle_us_per_full = 0;
+
+    trace_kvm_create_vcpu(cpu->cpu_index, vcpu_id, fd);
+
+    return 0;
+}
+
static int do_kvm_destroy_vcpu(CPUState *cpu)
{
KVMState *s = kvm_state;
long mmap_size;
- struct KVMParkedVcpu *vcpu = NULL;
int ret = 0;
- trace_kvm_destroy_vcpu();
+ trace_kvm_destroy_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
ret = kvm_arch_destroy_vcpu(cpu);
if (ret < 0) {
@@ -373,10 +430,7 @@
}
}
- vcpu = g_malloc0(sizeof(*vcpu));
- vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
- vcpu->kvm_fd = cpu->kvm_fd;
- QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
+ kvm_park_vcpu(cpu);
err:
return ret;
}
@@ -389,24 +443,6 @@
}
}
-static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
-{
- struct KVMParkedVcpu *cpu;
-
- QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
- if (cpu->vcpu_id == vcpu_id) {
- int kvm_fd;
-
- QLIST_REMOVE(cpu, node);
- kvm_fd = cpu->kvm_fd;
- g_free(cpu);
- return kvm_fd;
- }
- }
-
- return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
-}
-
int kvm_init_vcpu(CPUState *cpu, Error **errp)
{
KVMState *s = kvm_state;
@@ -415,19 +451,14 @@
trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
- ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
+ ret = kvm_create_vcpu(cpu);
if (ret < 0) {
- error_setg_errno(errp, -ret, "kvm_init_vcpu: kvm_get_vcpu failed (%lu)",
+ error_setg_errno(errp, -ret,
+ "kvm_init_vcpu: kvm_create_vcpu failed (%lu)",
kvm_arch_vcpu_id(cpu));
goto err;
}
- cpu->kvm_fd = ret;
- cpu->kvm_state = s;
- cpu->vcpu_dirty = true;
- cpu->dirty_pages = 0;
- cpu->throttle_us_per_full = 0;
-
mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
if (mmap_size < 0) {
ret = mmap_size;
@@ -3745,6 +3776,21 @@
s->device = g_strdup(value);
}
+/* Setter for the "rapl" bool property: stores @value in msr_energy.enable. */
+static void kvm_set_kvm_rapl(Object *obj, bool value, Error **errp)
+{
+    KVMState *kvm = KVM_STATE(obj);
+
+    kvm->msr_energy.enable = value;
+}
+
+/*
+ * Setter for the "rapl-helper-socket" string property: releases the
+ * previously stored path and keeps a private copy of @str.
+ */
+static void kvm_set_kvm_rapl_socket_path(Object *obj,
+                                         const char *str,
+                                         Error **errp)
+{
+    KVMState *kvm = KVM_STATE(obj);
+
+    g_free(kvm->msr_energy.socket_path);
+    kvm->msr_energy.socket_path = g_strdup(str);
+}
+
static void kvm_accel_instance_init(Object *obj)
{
KVMState *s = KVM_STATE(obj);
@@ -3764,6 +3810,7 @@
s->xen_gnttab_max_frames = 64;
s->xen_evtchn_max_pirq = 256;
s->device = NULL;
+ s->msr_energy.enable = false;
}
/**
@@ -3808,6 +3855,17 @@
object_class_property_set_description(oc, "device",
"Path to the device node to use (default: /dev/kvm)");
+ object_class_property_add_bool(oc, "rapl",
+ NULL,
+ kvm_set_kvm_rapl);
+ object_class_property_set_description(oc, "rapl",
+ "Allow energy related MSRs for RAPL interface in Guest");
+
+ object_class_property_add_str(oc, "rapl-helper-socket", NULL,
+ kvm_set_kvm_rapl_socket_path);
+ object_class_property_set_description(oc, "rapl-helper-socket",
+ "Socket Path for comminucating with the Virtual MSR helper daemon");
+
kvm_arch_accel_class_init(oc);
}
diff --git a/accel/kvm/kvm-cpus.h b/accel/kvm/kvm-cpus.h
index ca40add..171b22fd 100644
--- a/accel/kvm/kvm-cpus.h
+++ b/accel/kvm/kvm-cpus.h
@@ -22,5 +22,4 @@
int kvm_insert_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len);
int kvm_remove_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len);
void kvm_remove_all_breakpoints(CPUState *cpu);
-
#endif /* KVM_CPUS_H */
diff --git a/accel/kvm/trace-events b/accel/kvm/trace-events
index 681ccb6..37626c1 100644
--- a/accel/kvm/trace-events
+++ b/accel/kvm/trace-events
@@ -9,6 +9,10 @@
kvm_failed_reg_get(uint64_t id, const char *msg) "Warning: Unable to retrieve ONEREG %" PRIu64 " from KVM: %s"
kvm_failed_reg_set(uint64_t id, const char *msg) "Warning: Unable to set ONEREG %" PRIu64 " to KVM: %s"
kvm_init_vcpu(int cpu_index, unsigned long arch_cpu_id) "index: %d id: %lu"
+kvm_create_vcpu(int cpu_index, unsigned long arch_cpu_id, int kvm_fd) "index: %d, id: %lu, kvm fd: %d"
+kvm_destroy_vcpu(int cpu_index, unsigned long arch_cpu_id) "index: %d id: %lu"
+kvm_park_vcpu(int cpu_index, unsigned long arch_cpu_id) "index: %d id: %lu"
+kvm_unpark_vcpu(unsigned long arch_cpu_id, const char *msg) "id: %lu %s"
kvm_irqchip_commit_routes(void) ""
kvm_irqchip_add_msi_route(char *name, int vector, int virq) "dev %s vector %d virq %d"
kvm_irqchip_update_msi_route(int virq) "Updating MSI route virq=%d"
@@ -25,7 +29,6 @@
kvm_dirty_ring_reap(uint64_t count, int64_t t) "reaped %"PRIu64" pages (took %"PRIi64" us)"
kvm_dirty_ring_reaper_kick(const char *reason) "%s"
kvm_dirty_ring_flush(int finished) "%d"
-kvm_destroy_vcpu(void) ""
kvm_failed_get_vcpu_mmap_size(void) ""
kvm_cpu_exec(void) ""
kvm_interrupt_exit_request(void) ""
diff --git a/backends/Kconfig b/backends/Kconfig
index 2cb23f6..d3dbe19 100644
--- a/backends/Kconfig
+++ b/backends/Kconfig
@@ -3,3 +3,7 @@
config IOMMUFD
bool
depends on VFIO
+
+config SPDM_SOCKET
+ bool
+ default y
diff --git a/backends/meson.build b/backends/meson.build
index 749b491..da714b9 100644
--- a/backends/meson.build
+++ b/backends/meson.build
@@ -33,4 +33,6 @@
system_ss.add(when: gio, if_true: files('dbus-vmstate.c'))
system_ss.add(when: 'CONFIG_SGX', if_true: files('hostmem-epc.c'))
+system_ss.add(when: 'CONFIG_SPDM_SOCKET', if_true: files('spdm-socket.c'))
+
subdir('tpm')
diff --git a/backends/spdm-socket.c b/backends/spdm-socket.c
new file mode 100644
index 0000000..d0663d6
--- /dev/null
+++ b/backends/spdm-socket.c
@@ -0,0 +1,216 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/*
+ * QEMU SPDM socket support
+ *
+ * This is based on:
+ * https://github.com/DMTF/spdm-emu/blob/07c0a838bcc1c6207c656ac75885c0603e344b6f/spdm_emu/spdm_emu_common/command.c
+ * but has been re-written to match QEMU style
+ *
+ * Copyright (c) 2021, DMTF. All rights reserved.
+ * Copyright (c) 2023. Western Digital Corporation or its affiliates.
+ */
+
+#include "qemu/osdep.h"
+#include "sysemu/spdm-socket.h"
+#include "qapi/error.h"
+
+/*
+ * Receive exactly @number_of_bytes bytes from @socket into @buffer.
+ *
+ * recv() is called repeatedly until the whole amount has arrived. Returns
+ * false as soon as recv() reports an error or returns 0, true otherwise.
+ */
+static bool read_bytes(const int socket, uint8_t *buffer,
+                       size_t number_of_bytes)
+{
+    ssize_t total = 0;
+
+    while (total < number_of_bytes) {
+        ssize_t chunk = recv(socket, buffer + total,
+                             number_of_bytes - total, 0);
+
+        if (chunk <= 0) {
+            return false;
+        }
+        total += chunk;
+    }
+    return true;
+}
+
+/*
+ * Read one 32-bit word from @socket and store it in *@data, converted from
+ * network to host byte order. Returns false if the read fails.
+ */
+static bool read_data32(const int socket, uint32_t *data)
+{
+    if (!read_bytes(socket, (uint8_t *)data, sizeof(uint32_t))) {
+        return false;
+    }
+
+    *data = ntohl(*data);
+    return true;
+}
+
+/*
+ * Read a length-prefixed payload from @socket.
+ *
+ * A 32-bit length is read first; payloads longer than @max_buffer_length
+ * are rejected before any payload byte is read. The length is stored in
+ * *@bytes_received (when non-NULL) and that many bytes are then read into
+ * @buffer. A zero-length payload is a success.
+ */
+static bool read_multiple_bytes(const int socket, uint8_t *buffer,
+                                uint32_t *bytes_received,
+                                uint32_t max_buffer_length)
+{
+    uint32_t payload_len;
+
+    if (!read_data32(socket, &payload_len)) {
+        return false;
+    }
+    if (payload_len > max_buffer_length) {
+        return false;
+    }
+
+    if (bytes_received) {
+        *bytes_received = payload_len;
+    }
+
+    return payload_len == 0 || read_bytes(socket, buffer, payload_len);
+}
+
+/*
+ * Receive one message from @socket: a command word, a transport type word
+ * and a length-prefixed payload.
+ *
+ * On entry *@bytes_to_receive is the capacity of @receive_buffer; on
+ * success it is updated to the payload length and *@command holds the
+ * received command word. The received transport type only overwrites the
+ * local copy of @transport_type and is not reported back to the caller.
+ */
+static bool receive_platform_data(const int socket,
+                                  uint32_t transport_type,
+                                  uint32_t *command,
+                                  uint8_t *receive_buffer,
+                                  uint32_t *bytes_to_receive)
+{
+    uint32_t response;
+    uint32_t payload_len = 0;
+
+    if (!read_data32(socket, &response)) {
+        return false;
+    }
+    *command = response;
+
+    if (!read_data32(socket, &transport_type)) {
+        return false;
+    }
+
+    if (!read_multiple_bytes(socket, receive_buffer, &payload_len,
+                             *bytes_to_receive)) {
+        return false;
+    }
+    *bytes_to_receive = payload_len;
+
+    return true;
+}
+
+/*
+ * Send exactly @number_of_bytes bytes from @buffer over @socket, calling
+ * send() repeatedly until everything is written. Returns false when send()
+ * returns -1, true otherwise.
+ */
+static bool write_bytes(const int socket, const uint8_t *buffer,
+                        uint32_t number_of_bytes)
+{
+    ssize_t total = 0;
+
+    while (total < number_of_bytes) {
+        ssize_t chunk = send(socket, buffer + total,
+                             number_of_bytes - total, 0);
+
+        if (chunk == -1) {
+            return false;
+        }
+        total += chunk;
+    }
+    return true;
+}
+
+/* Send @data over @socket as one 32-bit word in network byte order. */
+static bool write_data32(const int socket, uint32_t data)
+{
+    uint32_t wire = htonl(data);
+
+    return write_bytes(socket, (uint8_t *)&wire, sizeof(uint32_t));
+}
+
+/*
+ * Send a length-prefixed payload: the 32-bit value @bytes_to_send followed
+ * by that many bytes from @buffer. The payload is not sent if writing the
+ * length fails.
+ */
+static bool write_multiple_bytes(const int socket, const uint8_t *buffer,
+                                 uint32_t bytes_to_send)
+{
+    if (!write_data32(socket, bytes_to_send)) {
+        return false;
+    }
+
+    return write_bytes(socket, buffer, bytes_to_send);
+}
+
+/*
+ * Send one message over @socket: the @command word, the @transport_type
+ * word, then @send_buffer as a length-prefixed payload of @bytes_to_send
+ * bytes. Sending stops at the first part that fails.
+ */
+static bool send_platform_data(const int socket,
+                               uint32_t transport_type, uint32_t command,
+                               const uint8_t *send_buffer, size_t bytes_to_send)
+{
+    return write_data32(socket, command) &&
+           write_data32(socket, transport_type) &&
+           write_multiple_bytes(socket, send_buffer, bytes_to_send);
+}
+
+/*
+ * Open a TCP connection to the IPv4 loopback address on @port.
+ *
+ * @port: TCP port number, in host byte order
+ * @errp: filled in when the socket cannot be created or connected
+ *
+ * Returns the connected socket descriptor, or -1 on failure.
+ */
+int spdm_socket_connect(uint16_t port, Error **errp)
+{
+    struct sockaddr_in server_addr;
+    int fd;
+
+    fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+    if (fd < 0) {
+        error_setg(errp, "cannot create socket: %s", strerror(errno));
+        return -1;
+    }
+
+    memset((char *)&server_addr, 0, sizeof(server_addr));
+    server_addr.sin_family = AF_INET;
+    server_addr.sin_port = htons(port);
+    server_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+
+    if (connect(fd, (struct sockaddr *)&server_addr,
+                sizeof(server_addr)) >= 0) {
+        return fd;
+    }
+
+    /* report the failure before close() has a chance to change errno */
+    error_setg(errp, "cannot connect: %s", strerror(errno));
+    close(fd);
+    return -1;
+}
+
+/*
+ * Send one request over @socket and wait for the matching response.
+ *
+ * @req/@req_len: request payload, sent with SPDM_SOCKET_COMMAND_NORMAL
+ * @rsp/@rsp_len: response buffer and its capacity in bytes
+ *
+ * Returns the number of response bytes stored in @rsp, or 0 when sending
+ * or receiving fails or the peer answers with command word 0.
+ */
+uint32_t spdm_socket_rsp(const int socket, uint32_t transport_type,
+                         void *req, uint32_t req_len,
+                         void *rsp, uint32_t rsp_len)
+{
+    uint32_t command;
+
+    if (!send_platform_data(socket, transport_type,
+                            SPDM_SOCKET_COMMAND_NORMAL,
+                            req, req_len)) {
+        return 0;
+    }
+
+    if (!receive_platform_data(socket, transport_type, &command,
+                               (uint8_t *)rsp, &rsp_len)) {
+        return 0;
+    }
+
+    /*
+     * The command word is supplied by the remote peer, so an unexpected
+     * value is reported as a failed exchange rather than asserted on.
+     */
+    if (command == 0) {
+        return 0;
+    }
+
+    return rsp_len;
+}
+
+/*
+ * Send an SPDM_SOCKET_COMMAND_SHUTDOWN message with an empty payload over
+ * @socket. The result of the send is ignored, and this function does not
+ * call close() on the descriptor.
+ */
+void spdm_socket_close(const int socket, uint32_t transport_type)
+{
+    send_platform_data(socket, transport_type, SPDM_SOCKET_COMMAND_SHUTDOWN,
+                       NULL, 0);
+}
diff --git a/contrib/systemd/qemu-vmsr-helper.service b/contrib/systemd/qemu-vmsr-helper.service
new file mode 100644
index 0000000..8fd397b
--- /dev/null
+++ b/contrib/systemd/qemu-vmsr-helper.service
@@ -0,0 +1,15 @@
+[Unit]
+Description=Virtual RAPL MSR Daemon for QEMU
+
+[Service]
+WorkingDirectory=/tmp
+Type=simple
+ExecStart=/usr/bin/qemu-vmsr-helper
+PrivateTmp=yes
+ProtectSystem=strict
+ReadWritePaths=/var/run
+RestrictAddressFamilies=AF_UNIX
+Restart=always
+RestartSec=0
+
+[Install]
diff --git a/contrib/systemd/qemu-vmsr-helper.socket b/contrib/systemd/qemu-vmsr-helper.socket
new file mode 100644
index 0000000..183e830
--- /dev/null
+++ b/contrib/systemd/qemu-vmsr-helper.socket
@@ -0,0 +1,9 @@
+[Unit]
+Description=Virtual RAPL MSR helper for QEMU
+
+[Socket]
+ListenStream=/run/qemu-vmsr-helper.sock
+SocketMode=0600
+
+[Install]
+WantedBy=multi-user.target
diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c
index 9492146..6cc18a1 100644
--- a/contrib/vhost-user-blk/vhost-user-blk.c
+++ b/contrib/vhost-user-blk/vhost-user-blk.c
@@ -196,7 +196,7 @@
VubDev *vdev_blk = req->vdev_blk;
desc = buf;
uint64_t range[2] = { le64_to_cpu(desc->sector) << 9,
- le32_to_cpu(desc->num_sectors) << 9 };
+ (uint64_t)le32_to_cpu(desc->num_sectors) << 9 };
if (type == VIRTIO_BLK_T_DISCARD) {
if (ioctl(vdev_blk->blk_fd, BLKDISCARD, range) == 0) {
g_free(buf);
diff --git a/docs/interop/qemu-ga.rst b/docs/interop/qemu-ga.rst
index 72fb75a..9c73808 100644
--- a/docs/interop/qemu-ga.rst
+++ b/docs/interop/qemu-ga.rst
@@ -28,11 +28,30 @@
option wins, but the lists accumulate (see below for configuration
file format).
+If an allowed RPCs list is defined in the configuration, then all
+RPCs will be blocked by default, except for the allowed list.
+
+If a blocked RPCs list is defined in the configuration, then all
+RPCs will be allowed by default, except for the blocked list.
+
+If both allowed and blocked RPCs lists are defined in the configuration,
+then all RPCs will be blocked by default, then the allowed list will
+be applied, followed by the blocked list.
+
+While filesystems are frozen, all except for a designated safe set
+of RPCs will be blocked, regardless of what the general configuration
+declares.
+
Options
-------
.. program:: qemu-ga
+.. option:: -c, --config=PATH
+
+ Configuration file path (the default is |CONFDIR|\ ``/qemu-ga.conf``,
+ unless overridden by the QGA_CONF environment variable)
+
.. option:: -m, --method=METHOD
Transport method: one of ``unix-listen``, ``virtio-serial``, or
@@ -131,6 +150,7 @@
statedir string
verbose boolean
block-rpcs string list
+allow-rpcs string list
============= ===========
See also
diff --git a/docs/specs/acpi_hw_reduced_hotplug.rst b/docs/specs/acpi_hw_reduced_hotplug.rst
index 0bd3f93..3acd6fc 100644
--- a/docs/specs/acpi_hw_reduced_hotplug.rst
+++ b/docs/specs/acpi_hw_reduced_hotplug.rst
@@ -64,7 +64,8 @@
0: Memory hotplug event
1: System power down event
2: NVDIMM hotplug event
- 3-31: Reserved
+ 3: CPU hotplug event
+ 4-31: Reserved
**write_access:**
diff --git a/docs/specs/index.rst b/docs/specs/index.rst
index 1484e3e..be899b4 100644
--- a/docs/specs/index.rst
+++ b/docs/specs/index.rst
@@ -29,7 +29,9 @@
edu
ivshmem-spec
pvpanic
+ spdm
standard-vga
virt-ctlr
vmcoreinfo
vmgenid
+ rapl-msr
diff --git a/docs/specs/rapl-msr.rst b/docs/specs/rapl-msr.rst
new file mode 100644
index 0000000..1202ee8
--- /dev/null
+++ b/docs/specs/rapl-msr.rst
@@ -0,0 +1,155 @@
+================
+RAPL MSR support
+================
+
+The RAPL interface (Running Average Power Limit) is advertising the accumulated
+energy consumption of various power domains (e.g. CPU packages, DRAM, etc.).
+
+The consumption is reported via MSRs (model specific registers) like
+MSR_PKG_ENERGY_STATUS for the CPU package power domain. These MSRs are 64-bit
+registers that represent the accumulated energy consumption in micro-joules.
+
+Thanks to the MSR Filtering patch [#a]_ not all MSRs are handled by KVM. Some
+of them can now be handled by the userspace (QEMU). It uses a mechanism called
+"MSR filtering" where a list of MSRs is given at init time of a VM to KVM so
+that a callback is put in place. The design of this patch uses only this
+mechanism for handling the MSRs between guest/host.
+
+At the moment the following MSRs are involved:
+
+.. code:: C
+
+ #define MSR_RAPL_POWER_UNIT 0x00000606
+ #define MSR_PKG_POWER_LIMIT 0x00000610
+ #define MSR_PKG_ENERGY_STATUS 0x00000611
+ #define MSR_PKG_POWER_INFO 0x00000614
+
+The ``*_POWER_UNIT``, ``*_POWER_LIMIT`` and ``*_POWER_INFO`` MSRs are part of
+the RAPL spec. They specify the power limit of the package, provide the range
+of its parameters (min power, max power, ...) and give the multiplier applied
+to the energy counter to calculate the power. Those MSRs are populated once at
+the beginning by reading the host CPU MSRs and are given back to the guest 1:1
+when requested.
+
+The MSR_PKG_ENERGY_STATUS is a counter; it represents the total amount of
+energy consumed since the last time the register was cleared. If you multiply
+it with the UNIT provided above you'll get the energy in micro-joules. This
+counter is always increasing, and it increases more or less quickly depending
+on the consumption of the package. This counter is supposed to overflow at
+some point.
+
+Each core belonging to the same Package reading the MSR_PKG_ENERGY_STATUS (i.e.
+"rdmsr 0x611") will retrieve the same value. The value represents the energy
+for the whole package. Whichever core reads it will get the same value, and a
+core that belongs to PKG-0 will not be able to get the value of PKG-1 and
+vice-versa.
+
+High level implementation
+-------------------------
+
+In order to update the value of the virtual MSR, a QEMU thread is created.
+The thread is basically just an infinite loop that does:
+
+1. Snapshot of the time metrics of all QEMU threads (Time spent scheduled in
+ Userspace and System)
+
+2. Snapshot of the actual MSR_PKG_ENERGY_STATUS counter of all packages where
+ the QEMU threads are running on.
+
+3. Sleep for 1 second - During this pause the vcpu and other non-vcpu threads
+ will do what they have to do and so the energy counter will increase.
+
+4. Repeat 2. and 3. and calculate the delta of every metrics representing the
+ time spent scheduled for each QEMU thread *and* the energy spent by the
+ packages during the pause.
+
+5. Filter the vcpu threads and the non-vcpu threads.
+
+6. Retrieve the topology of the Virtual Machine. This helps identify which
+ vCPU is running on which virtual package.
+
+7. The total energy spent by the non-vcpu threads is divided by the number
+ of vcpu threads so that each vcpu thread will get an equal part of the
+ energy spent by the QEMU workers.
+
+8. Calculate the ratio of energy spent per vcpu threads.
+
+9. Calculate the energy for each virtual package.
+
+10. The virtual MSRs are updated for each virtual package. Each vCPU that
+ belongs to the same package will return the same value when accessing the
+ MSR.
+
+11. Loop back to 1.
+
+Ratio calculation
+-----------------
+
+In Linux, a process has an execution time associated with it. The scheduler is
+dividing the time in clock ticks. The number of clock ticks per second can be
+found by the sysconf system call. A typical value of clock ticks per second is
+100. So a core can run a process at the maximum of 100 ticks per second. If a
+package has 4 cores, 400 ticks maximum can be scheduled on all the cores
+of the package for a period of 1 second.
+
+The /proc/[pid]/stat [#b]_ is a procfs file that gives the execution time of
+a process, with [pid] being the process ID. It gives the number of ticks the
+process has been scheduled in userspace (utime) and kernel space (stime).
+
+By reading those metrics for a thread, one can calculate the ratio of time the
+package has spent executing the thread.
+
+Example:
+
+A 4 cores package can schedule a maximum of 400 ticks per second with 100 ticks
+per second per core. If a thread was scheduled for 100 ticks between a second
+on this package, that means my thread has been scheduled for 1/4 of the whole
+package. With that, the calculation of the energy spent by the thread on this
+package during this whole second is 1/4 of the total energy spent by the
+package.
+
+Usage
+-----
+
+Currently this feature is only working on an Intel CPU that has the RAPL driver
+mounted and available in the sysfs. If not, QEMU fails at start-up.
+
+This feature is activated with -accel
+kvm,rapl=true,rapl-helper-socket=/path/sock.sock
+
+It is important that the socket path is the same as the one
+:program:`qemu-vmsr-helper` is listening to.
+
+qemu-vmsr-helper
+----------------
+
+The qemu-vmsr-helper is working very much like the qemu-pr-helper. Instead of
+making persistent reservations, qemu-vmsr-helper is here to overcome
+CVE-2020-8694, which removed user access to the RAPL MSR attributes.
+
+A socket communication is established between QEMU processes that have the
+RAPL MSR support activated and the qemu-vmsr-helper. A systemd service and
+socket activation are provided in
+contrib/systemd/qemu-vmsr-helper.(service/socket).
+
+The systemd socket uses 600, like contrib/systemd/qemu-pr-helper.socket. The
+socket can be passed via SCM_RIGHTS by libvirt, or its permissions can be
+changed (e.g. 660 and root:kvm for a Debian system for example). Libvirt could
+also start a separate helper if needed. All in all, the policy is left to the
+user.
+
+See the qemu-pr-helper documentation or manpage for further details.
+
+Current Limitations
+-------------------
+
+- Works only on Intel host CPUs because AMD CPUs are using different MSR
+ addresses.
+
+- Only the Package Power-Plane (MSR_PKG_ENERGY_STATUS) is reported at the
+ moment.
+
+References
+----------
+
+.. [#a] https://patchwork.kernel.org/project/kvm/patch/20200916202951.23760-7-graf@amazon.com/
+.. [#b] https://man7.org/linux/man-pages/man5/proc.5.html
diff --git a/docs/specs/spdm.rst b/docs/specs/spdm.rst
new file mode 100644
index 0000000..f7de080
--- /dev/null
+++ b/docs/specs/spdm.rst
@@ -0,0 +1,134 @@
+======================================================
+QEMU Security Protocols and Data Models (SPDM) Support
+======================================================
+
+SPDM enables authentication, attestation and key exchange to assist in
+providing infrastructure security enablement. It's a standard published
+by the `DMTF`_.
+
+QEMU supports connecting to a SPDM responder implementation. This allows an
+external application to emulate the SPDM responder logic for an SPDM device.
+
+Setting up a SPDM server
+========================
+
+When using QEMU with SPDM devices QEMU will connect to a server which
+implements the SPDM functionality.
+
+SPDM-Utils
+----------
+
+You can use `SPDM Utils`_ to emulate a responder. This is the simplest method.
+
+SPDM-Utils is a Linux application to manage, test and develop devices
+supporting DMTF Security Protocol and Data Model (SPDM). It is written in Rust
+and utilises libspdm.
+
+To use SPDM-Utils you will need to do the following steps. Details are included
+in the SPDM-Utils README.
+
+ 1. `Build libspdm`_
+ 2. `Build SPDM Utils`_
+ 3. `Run it as a server`_
+
+spdm-emu
+--------
+
+You can use `spdm emu`_ to model the
+SPDM responder.
+
+.. code-block:: shell
+
+ $ cd spdm-emu
+ $ git submodule init; git submodule update --recursive
+ $ mkdir build; cd build
+ $ cmake -DARCH=x64 -DTOOLCHAIN=GCC -DTARGET=Debug -DCRYPTO=openssl ..
+ $ make -j32
+ $ make copy_sample_key # Build certificates, required for SPDM authentication.
+
+It is worth noting that the certificates should be in compliance with
+PCIe r6.1 sec 6.31.3. This means you will need to add the following to
+openssl.cnf
+
+.. code-block::
+
+ subjectAltName = otherName:2.23.147;UTF8:Vendor=1b36:Device=0010:CC=010802:REV=02:SSVID=1af4:SSID=1100
+ 2.23.147 = ASN1:OID:2.23.147
+
+and then manually regenerate some certificates with:
+
+.. code-block:: shell
+
+ $ openssl req -nodes -newkey ec:param.pem -keyout end_responder.key \
+ -out end_responder.req -sha384 -batch \
+ -subj "/CN=DMTF libspdm ECP384 responder cert"
+
+ $ openssl x509 -req -in end_responder.req -out end_responder.cert \
+ -CA inter.cert -CAkey inter.key -sha384 -days 3650 -set_serial 3 \
+ -extensions v3_end -extfile ../openssl.cnf
+
+ $ openssl asn1parse -in end_responder.cert -out end_responder.cert.der
+
+ $ cat ca.cert.der inter.cert.der end_responder.cert.der > bundle_responder.certchain.der
+
+You can use SPDM-Utils instead as it will generate the correct certificates
+automatically.
+
+The responder can then be launched with
+
+.. code-block:: shell
+
+ $ cd bin
+ $ ./spdm_responder_emu --trans PCI_DOE
+
+Connecting an SPDM NVMe device
+==============================
+
+Once a SPDM server is running we can start QEMU and connect to the server.
+
+For an NVMe device first let's setup a block we can use
+
+.. code-block:: shell
+
+ $ cd qemu-spdm/linux/image
+ $ dd if=/dev/zero of=blknvme bs=1M count=2096 # 2GB NVMe Drive
+
+Then you can add this to your QEMU command line:
+
+.. code-block:: shell
+
+ -drive file=blknvme,if=none,id=mynvme,format=raw \
+ -device nvme,drive=mynvme,serial=deadbeef,spdm_port=2323
+
+At which point QEMU will try to connect to the SPDM server.
+
+Note that if using x86-64 you will want to use the q35 machine instead
+of the default. So the entire QEMU command might look like this
+
+.. code-block:: shell
+
+ qemu-system-x86_64 -M q35 \
+ --kernel bzImage \
+ -drive file=rootfs.ext2,if=virtio,format=raw \
+ -append "root=/dev/vda console=ttyS0" \
+ -net none -nographic \
+ -drive file=blknvme,if=none,id=mynvme,format=raw \
+ -device nvme,drive=mynvme,serial=deadbeef,spdm_port=2323
+
+.. _DMTF:
+ https://www.dmtf.org/standards/SPDM
+
+.. _SPDM Utils:
+ https://github.com/westerndigitalcorporation/spdm-utils
+
+.. _spdm emu:
+ https://github.com/dmtf/spdm-emu
+
+.. _Build libspdm:
+ https://github.com/westerndigitalcorporation/spdm-utils?tab=readme-ov-file#build-libspdm
+
+.. _Build SPDM Utils:
+ https://github.com/westerndigitalcorporation/spdm-utils?tab=readme-ov-file#build-the-binary
+
+.. _Run it as a server:
+ https://github.com/westerndigitalcorporation/spdm-utils#qemu-spdm-device-emulation
diff --git a/docs/system/index.rst b/docs/system/index.rst
index c21065e..718e9d3 100644
--- a/docs/system/index.rst
+++ b/docs/system/index.rst
@@ -39,3 +39,4 @@
multi-process
confidential-guest-support
vm-templating
+ sriov
diff --git a/docs/system/sriov.rst b/docs/system/sriov.rst
new file mode 100644
index 0000000..a851a66
--- /dev/null
+++ b/docs/system/sriov.rst
@@ -0,0 +1,36 @@
+.. SPDX-License-Identifier: GPL-2.0-or-later
+
+Composable SR-IOV device
+========================
+
+SR-IOV (Single Root I/O Virtualization) is an optional extended capability of a
+PCI Express device. It allows a single physical function (PF) to appear as
+multiple virtual functions (VFs) for the main purpose of eliminating software
+overhead in I/O from virtual machines.
+
+There are devices with predefined SR-IOV configurations, but it is also possible
+to compose an SR-IOV device yourself. Composing an SR-IOV device is currently
+only supported by virtio-net-pci.
+
+Users can configure an SR-IOV-capable virtio-net device by adding
+virtio-net-pci functions to a bus. Below is a command line example:
+
+.. code-block:: shell
+
+ -netdev user,id=n -netdev user,id=o
+ -netdev user,id=p -netdev user,id=q
+ -device pcie-root-port,id=b
+ -device virtio-net-pci,bus=b,addr=0x0.0x3,netdev=q,sriov-pf=f
+ -device virtio-net-pci,bus=b,addr=0x0.0x2,netdev=p,sriov-pf=f
+ -device virtio-net-pci,bus=b,addr=0x0.0x1,netdev=o,sriov-pf=f
+ -device virtio-net-pci,bus=b,addr=0x0.0x0,netdev=n,id=f
+
+The VFs specify the paired PF with the ``sriov-pf`` property. The PF must be
+added after all VFs. It is the user's responsibility to ensure that VFs have
+function numbers larger than that of the PF, and that the function numbers
+have a consistent stride.
+
+You may also need to perform additional steps to activate the SR-IOV feature on
+your guest. For Linux, refer to [1]_.
+
+.. [1] https://docs.kernel.org/PCI/pci-iov-howto.html
diff --git a/docs/tools/index.rst b/docs/tools/index.rst
index 8e65ce0..33ad438 100644
--- a/docs/tools/index.rst
+++ b/docs/tools/index.rst
@@ -16,3 +16,4 @@
qemu-pr-helper
qemu-trace-stap
virtfs-proxy-helper
+ qemu-vmsr-helper
diff --git a/docs/tools/qemu-vmsr-helper.rst b/docs/tools/qemu-vmsr-helper.rst
new file mode 100644
index 0000000..6ec87b4
--- /dev/null
+++ b/docs/tools/qemu-vmsr-helper.rst
@@ -0,0 +1,89 @@
+==================================
+QEMU virtual RAPL MSR helper
+==================================
+
+Synopsis
+--------
+
+**qemu-vmsr-helper** [*OPTION*]
+
+Description
+-----------
+
+Implements the virtual RAPL MSR helper for QEMU.
+
+Accessing the RAPL (Running Average Power Limit) MSR enables the RAPL powercap
+driver to advertise and monitor the power consumption or accumulated energy
+consumption of different power domains, such as CPU packages, DRAM, and other
+components when available.
+
+However, those registers are accessible only with privileged access
+(CAP_SYS_RAWIO). QEMU can use an external helper to access those privileged
+registers.
+
+:program:`qemu-vmsr-helper` is that external helper; it creates a listener
+socket which will accept incoming connections for communication with QEMU.
+
+If you want to run VMs in a setup like this, this helper should be started as a
+system service, and you should read the QEMU manual section on "RAPL MSR
+support" to find out how to configure QEMU to connect to the socket created by
+:program:`qemu-vmsr-helper`.
+
+After connecting to the socket, :program:`qemu-vmsr-helper` can
+optionally drop root privileges, except for those capabilities that
+are needed for its operation.
+
+:program:`qemu-vmsr-helper` can also use the systemd socket activation
+protocol. In this case, the systemd socket unit should specify a
+Unix stream socket, like this::
+
+ [Socket]
+ ListenStream=/var/run/qemu-vmsr-helper.sock
+
+Options
+-------
+
+.. program:: qemu-vmsr-helper
+
+.. option:: -d, --daemon
+
+ run in the background (and create a PID file)
+
+.. option:: -q, --quiet
+
+ decrease verbosity
+
+.. option:: -v, --verbose
+
+ increase verbosity
+
+.. option:: -f, --pidfile=PATH
+
+ PID file when running as a daemon. By default the PID file
+ is created in the system runtime state directory, for example
+ :file:`/var/run/qemu-vmsr-helper.pid`.
+
+.. option:: -k, --socket=PATH
+
+ path to the socket. By default the socket is created in
+ the system runtime state directory, for example
+ :file:`/var/run/qemu-vmsr-helper.sock`.
+
+.. option:: -T, --trace [[enable=]PATTERN][,events=FILE][,file=FILE]
+
+ .. include:: ../qemu-option-trace.rst.inc
+
+.. option:: -u, --user=USER
+
+ user to drop privileges to
+
+.. option:: -g, --group=GROUP
+
+ group to drop privileges to
+
+.. option:: -h, --help
+
+ Display a help message and exit.
+
+.. option:: -V, --version
+
+ Display version information and exit.
diff --git a/gdbstub/gdbstub.c b/gdbstub/gdbstub.c
index b7be8e5..d08568c 100644
--- a/gdbstub/gdbstub.c
+++ b/gdbstub/gdbstub.c
@@ -618,6 +618,19 @@
}
}
+/*
+ * Drop every gdb register description attached to @cpu and reset the
+ * associated register counters.
+ */
+void gdb_unregister_coprocessor_all(CPUState *cpu)
+{
+    GArray *regs = cpu->gdb_regs;
+
+    /* Detach the table from the CPU and zero the bookkeeping first. */
+    cpu->gdb_num_g_regs = 0;
+    cpu->gdb_num_regs = 0;
+    cpu->gdb_regs = NULL;
+
+    /*
+     * Freeing the array together with its element storage is enough:
+     * GDBRegisterState::xml is a static const string, so there is no
+     * per-entry data that would need to be released separately.
+     */
+    g_array_free(regs, true);
+}
+
static void gdb_process_breakpoint_remove_all(GDBProcess *p)
{
CPUState *cpu = gdb_get_first_cpu_in_process(p);
diff --git a/hw/acpi/acpi-cpu-hotplug-stub.c b/hw/acpi/acpi-cpu-hotplug-stub.c
index 3fc4b14..c6c61bb 100644
--- a/hw/acpi/acpi-cpu-hotplug-stub.c
+++ b/hw/acpi/acpi-cpu-hotplug-stub.c
@@ -19,6 +19,12 @@
return;
}
+/* No-op stub of cpu_hotplug_hw_init(); every argument is ignored. */
+void cpu_hotplug_hw_init(MemoryRegion *as, Object *owner,
+                         CPUHotplugState *state, hwaddr base_addr)
+{
+    /* Intentionally empty. */
+}
+
void acpi_cpu_ospm_status(CPUHotplugState *cpu_st, ACPIOSTInfoList ***list)
{
return;
diff --git a/hw/acpi/cpu.c b/hw/acpi/cpu.c
index 2d81c1e..5cb60ca 100644
--- a/hw/acpi/cpu.c
+++ b/hw/acpi/cpu.c
@@ -7,7 +7,6 @@
#include "trace.h"
#include "sysemu/numa.h"
-#define ACPI_CPU_HOTPLUG_REG_LEN 12
#define ACPI_CPU_SELECTOR_OFFSET_WR 0
#define ACPI_CPU_FLAGS_OFFSET_RW 4
#define ACPI_CPU_CMD_OFFSET_WR 5
@@ -339,9 +338,10 @@
#define CPU_FW_EJECT_EVENT "CEJF"
void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts,
- build_madt_cpu_fn build_madt_cpu, hwaddr io_base,
+ build_madt_cpu_fn build_madt_cpu, hwaddr base_addr,
const char *res_root,
- const char *event_handler_method)
+ const char *event_handler_method,
+ AmlRegionSpace rs)
{
Aml *ifctx;
Aml *field;
@@ -365,14 +365,22 @@
aml_name_decl("_UID", aml_string("CPU Hotplug resources")));
aml_append(cpu_ctrl_dev, aml_mutex(CPU_LOCK, 0));
+ assert((rs == AML_SYSTEM_IO) || (rs == AML_SYSTEM_MEMORY));
+
crs = aml_resource_template();
- aml_append(crs, aml_io(AML_DECODE16, io_base, io_base, 1,
+ if (rs == AML_SYSTEM_IO) {
+ aml_append(crs, aml_io(AML_DECODE16, base_addr, base_addr, 1,
ACPI_CPU_HOTPLUG_REG_LEN));
+ } else if (rs == AML_SYSTEM_MEMORY) {
+ aml_append(crs, aml_memory32_fixed(base_addr,
+ ACPI_CPU_HOTPLUG_REG_LEN, AML_READ_WRITE));
+ }
+
aml_append(cpu_ctrl_dev, aml_name_decl("_CRS", crs));
/* declare CPU hotplug MMIO region with related access fields */
aml_append(cpu_ctrl_dev,
- aml_operation_region("PRST", AML_SYSTEM_IO, aml_int(io_base),
+ aml_operation_region("PRST", rs, aml_int(base_addr),
ACPI_CPU_HOTPLUG_REG_LEN));
field = aml_field("PRST", AML_BYTE_ACC, AML_NOLOCK,
diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c
index 2d6e91b..15b4c3e 100644
--- a/hw/acpi/generic_event_device.c
+++ b/hw/acpi/generic_event_device.c
@@ -25,6 +25,7 @@
ACPI_GED_MEM_HOTPLUG_EVT,
ACPI_GED_PWR_DOWN_EVT,
ACPI_GED_NVDIMM_HOTPLUG_EVT,
+ ACPI_GED_CPU_HOTPLUG_EVT,
};
/*
@@ -107,6 +108,9 @@
aml_append(if_ctx, aml_call0(MEMORY_DEVICES_CONTAINER "."
MEMORY_SLOT_SCAN_METHOD));
break;
+ case ACPI_GED_CPU_HOTPLUG_EVT:
+ aml_append(if_ctx, aml_call0(AML_GED_EVT_CPU_SCAN_METHOD));
+ break;
case ACPI_GED_PWR_DOWN_EVT:
aml_append(if_ctx,
aml_notify(aml_name(ACPI_POWER_BUTTON_DEVICE),
@@ -234,6 +238,8 @@
} else {
acpi_memory_plug_cb(hotplug_dev, &s->memhp_state, dev, errp);
}
+ } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
+ acpi_cpu_plug_cb(hotplug_dev, &s->cpuhp_state, dev, errp);
} else {
error_setg(errp, "virt: device plug request for unsupported device"
" type: %s", object_get_typename(OBJECT(dev)));
@@ -248,6 +254,8 @@
if ((object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) &&
!(object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)))) {
acpi_memory_unplug_request_cb(hotplug_dev, &s->memhp_state, dev, errp);
+ } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
+ acpi_cpu_unplug_request_cb(hotplug_dev, &s->cpuhp_state, dev, errp);
} else {
error_setg(errp, "acpi: device unplug request for unsupported device"
" type: %s", object_get_typename(OBJECT(dev)));
@@ -261,6 +269,8 @@
if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
acpi_memory_unplug_cb(&s->memhp_state, dev, errp);
+ } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
+ acpi_cpu_unplug_cb(&s->cpuhp_state, dev, errp);
} else {
error_setg(errp, "acpi: device unplug for unsupported device"
" type: %s", object_get_typename(OBJECT(dev)));
@@ -272,6 +282,7 @@
AcpiGedState *s = ACPI_GED(adev);
acpi_memory_ospm_status(&s->memhp_state, list);
+ acpi_cpu_ospm_status(&s->cpuhp_state, list);
}
static void acpi_ged_send_event(AcpiDeviceIf *adev, AcpiEventStatusBits ev)
@@ -286,6 +297,8 @@
sel = ACPI_GED_PWR_DOWN_EVT;
} else if (ev & ACPI_NVDIMM_HOTPLUG_STATUS) {
sel = ACPI_GED_NVDIMM_HOTPLUG_EVT;
+ } else if (ev & ACPI_CPU_HOTPLUG_STATUS) {
+ sel = ACPI_GED_CPU_HOTPLUG_EVT;
} else {
/* Unknown event. Return without generating interrupt. */
warn_report("GED: Unsupported event %d. No irq injected", ev);
@@ -371,6 +384,42 @@
}
};
+/*
+ * Realize callback of the GED device.
+ *
+ * Walks ged_supported_events and, for every entry that is also set in the
+ * device's ged_event_bitmap, performs the event-specific setup (currently
+ * only CPU hotplug needs any).  If bits remain in the bitmap that match no
+ * supported event, an error is reported and QEMU aborts.
+ */
+static void acpi_ged_realize(DeviceState *dev, Error **errp)
+{
+    SysBusDevice *sbd = SYS_BUS_DEVICE(dev);
+    AcpiGedState *s = ACPI_GED(dev);
+    /* Number of requested events that have not been matched yet. */
+    uint32_t unmatched = ctpop32(s->ged_event_bitmap);
+    int idx = 0;
+
+    while (idx < ARRAY_SIZE(ged_supported_events) && unmatched) {
+        uint32_t event = s->ged_event_bitmap & ged_supported_events[idx++];
+
+        if (!event) {
+            continue;
+        }
+
+        if (event == ACPI_GED_CPU_HOTPLUG_EVT) {
+            /* initialize CPU Hotplug related regions */
+            memory_region_init(&s->container_cpuhp, OBJECT(dev),
+                               "cpuhp container",
+                               ACPI_CPU_HOTPLUG_REG_LEN);
+            sysbus_init_mmio(sbd, &s->container_cpuhp);
+            cpu_hotplug_hw_init(&s->container_cpuhp, OBJECT(dev),
+                                &s->cpuhp_state, 0);
+        }
+
+        unmatched--;
+    }
+
+    if (unmatched) {
+        error_report("Unsupported events specified");
+        abort();
+    }
+}
+
static void acpi_ged_initfn(Object *obj)
{
DeviceState *dev = DEVICE(obj);
@@ -411,6 +460,7 @@
dc->desc = "ACPI Generic Event Device";
device_class_set_props(dc, acpi_ged_properties);
dc->vmsd = &vmstate_acpi_ged;
+ dc->realize = acpi_ged_realize;
hc->plug = acpi_ged_device_plug_cb;
hc->unplug_request = acpi_ged_unplug_request_cb;
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index b0c68d6..719e83e 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -3308,6 +3308,7 @@
static void virt_machine_9_0_options(MachineClass *mc)
{
virt_machine_9_1_options(mc);
+ /*
+ * Override smbios_memory_device_size for the 9.0 machine type after
+ * inheriting the 9.1 options.
+ * NOTE(review): presumably keeps the pre-9.1 SMBIOS layout - confirm.
+ */
+ mc->smbios_memory_device_size = 16 * GiB;
compat_props_add(mc->compat_props, hw_compat_9_0, hw_compat_9_0_len);
}
DEFINE_VIRT_MACHINE(9, 0)
diff --git a/hw/audio/virtio-snd.c b/hw/audio/virtio-snd.c
index 5993f4f..e5196aa 100644
--- a/hw/audio/virtio-snd.c
+++ b/hw/audio/virtio-snd.c
@@ -282,11 +282,13 @@
error_report("Number of channels is not supported.");
return cpu_to_le32(VIRTIO_SND_S_NOT_SUPP);
}
- if (!(supported_formats & BIT(params->format))) {
+ /*
+ * params->format is a bit number. Bound it by the bit width of
+ * supported_formats before it is used as a shift count.
+ */
+ if (params->format >= sizeof(supported_formats) * BITS_PER_BYTE ||
+ !(supported_formats & BIT(params->format))) {
error_report("Stream format is not supported.");
return cpu_to_le32(VIRTIO_SND_S_NOT_SUPP);
}
- if (!(supported_rates & BIT(params->rate))) {
+ /* params->rate is a bit number as well; apply the same bound. */
+ if (params->rate >= sizeof(supported_rates) * BITS_PER_BYTE ||
+ !(supported_rates & BIT(params->rate))) {
error_report("Stream rate is not supported.");
return cpu_to_le32(VIRTIO_SND_S_NOT_SUPP);
}
@@ -1261,7 +1263,7 @@
{
VirtIOSoundPCMStream *stream = data;
VirtIOSoundPCMBuffer *buffer;
- size_t size;
+ size_t size, max_size;
WITH_QEMU_LOCK_GUARD(&stream->queue_mutex) {
while (!QSIMPLEQ_EMPTY(&stream->queue)) {
@@ -1275,7 +1277,12 @@
continue;
}
+ max_size = iov_size(buffer->elem->in_sg, buffer->elem->in_num);
for (;;) {
+ if (buffer->size >= max_size) {
+ return_rx_buffer(stream, buffer);
+ break;
+ }
size = AUD_read(stream->voice.in,
buffer->data + buffer->size,
MIN(available, (stream->params.period_bytes -
diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index fdbc30b..5b7f46b 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -51,6 +51,7 @@
VIRTIO_F_RING_PACKED,
VIRTIO_F_IOMMU_PLATFORM,
VIRTIO_F_RING_RESET,
+ VIRTIO_F_IN_ORDER,
VIRTIO_F_NOTIFICATION_DATA,
VHOST_INVALID_FEATURE_BIT
};
diff --git a/hw/core/cpu-common.c b/hw/core/cpu-common.c
index d2e3e45..7982ecd 100644
--- a/hw/core/cpu-common.c
+++ b/hw/core/cpu-common.c
@@ -282,7 +282,10 @@
}
#endif
free_queued_cpu_work(cpu);
- g_array_free(cpu->gdb_regs, TRUE);
+ /* If cleanup didn't happen in context to gdb_unregister_coprocessor_all */
+ if (cpu->gdb_regs) {
+ g_array_free(cpu->gdb_regs, TRUE);
+ }
qemu_lockcnt_destroy(&cpu->in_ioctl_lock);
qemu_mutex_destroy(&cpu->work_mutex);
qemu_cond_destroy(cpu->halt_cond);
diff --git a/hw/core/machine.c b/hw/core/machine.c
index 8a878f8..27dcda0 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -1005,6 +1005,12 @@
/* Default 128 MB as guest ram size */
mc->default_ram_size = 128 * MiB;
mc->rom_file_has_mr = true;
+ /*
+ * SMBIOS 3.1.0 7.18.5 Memory Device — Extended Size
+ * use max possible value that could be encoded into
+ * 'Extended Size' field (2047Tb).
+ */
+ mc->smbios_memory_device_size = 2047 * TiB;
/* numa node memory size aligned on 8MB by default.
* On Linux, each node's border has to be 8MB aligned
diff --git a/hw/cxl/cxl-events.c b/hw/cxl/cxl-events.c
index d397718..12dee2e 100644
--- a/hw/cxl/cxl-events.c
+++ b/hw/cxl/cxl-events.c
@@ -139,6 +139,19 @@
return cxl_event_count(log) == 1;
}
+/* Remove every pending record from each of the device's event logs. */
+void cxl_discard_all_event_records(CXLDeviceState *cxlds)
+{
+    CXLEventLogType type;
+
+    for (type = 0; type < CXL_EVENT_TYPE_MAX; type++) {
+        CXLEventLog *log = &cxlds->event_logs[type];
+
+        /* Pop from the head until this log reports empty. */
+        for (;;) {
+            if (cxl_event_empty(log)) {
+                break;
+            }
+            cxl_event_delete_head(cxlds, type, log);
+        }
+    }
+}
+
CXLRetCode cxl_event_get_records(CXLDeviceState *cxlds, CXLGetEventPayload *pl,
uint8_t log_type, int max_recs,
size_t *len)
diff --git a/hw/cxl/cxl-host.c b/hw/cxl/cxl-host.c
index c5f5fcf..e9f2543 100644
--- a/hw/cxl/cxl-host.c
+++ b/hw/cxl/cxl-host.c
@@ -315,7 +315,8 @@
static void machine_get_cfmw(Object *obj, Visitor *v, const char *name,
void *opaque, Error **errp)
{
- CXLFixedMemoryWindowOptionsList **list = opaque;
+ CXLState *state = opaque;
+ CXLFixedMemoryWindowOptionsList **list = &state->cfmw_list;
visit_type_CXLFixedMemoryWindowOptionsList(v, name, list, errp);
}
diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
index 74eeb6f..b752920 100644
--- a/hw/cxl/cxl-mailbox-utils.c
+++ b/hw/cxl/cxl-mailbox-utils.c
@@ -12,6 +12,7 @@
#include "hw/pci/msix.h"
#include "hw/cxl/cxl.h"
#include "hw/cxl/cxl_events.h"
+#include "hw/cxl/cxl_mailbox.h"
#include "hw/pci/pci.h"
#include "hw/pci-bridge/cxl_upstream_port.h"
#include "qemu/cutils.h"
@@ -62,12 +63,18 @@
#define SET_INTERRUPT_POLICY 0x3
FIRMWARE_UPDATE = 0x02,
#define GET_INFO 0x0
+ #define TRANSFER 0x1
+ #define ACTIVATE 0x2
TIMESTAMP = 0x03,
#define GET 0x0
#define SET 0x1
LOGS = 0x04,
#define GET_SUPPORTED 0x0
#define GET_LOG 0x1
+ FEATURES = 0x05,
+ #define GET_SUPPORTED 0x0
+ #define GET_FEATURE 0x1
+ #define SET_FEATURE 0x2
IDENTIFY = 0x40,
#define MEMORY_DEVICE 0x0
CCLS = 0x41,
@@ -83,6 +90,9 @@
#define GET_POISON_LIST 0x0
#define INJECT_POISON 0x1
#define CLEAR_POISON 0x2
+ #define GET_SCAN_MEDIA_CAPABILITIES 0x3
+ #define SCAN_MEDIA 0x4
+ #define GET_SCAN_MEDIA_RESULTS 0x5
DCD_CONFIG = 0x48,
#define GET_DC_CONFIG 0x0
#define GET_DYN_CAP_EXT_LIST 0x1
@@ -235,7 +245,6 @@
log_type = payload_in[0];
pl = (CXLGetEventPayload *)payload_out;
- memset(pl, 0, sizeof(*pl));
max_recs = (cxlds->payload_size - CXL_EVENT_PAYLOAD_HDR_SIZE) /
CXL_EVENT_RECORD_SIZE;
@@ -273,7 +282,6 @@
CXLEventLog *log;
policy = (CXLEventInterruptPolicy *)payload_out;
- memset(policy, 0, sizeof(*policy));
log = &cxlds->event_logs[CXL_EVENT_TYPE_INFO];
if (log->irq_enabled) {
@@ -372,7 +380,6 @@
QEMU_BUILD_BUG_ON(sizeof(*is_identify) != 18);
is_identify = (void *)payload_out;
- memset(is_identify, 0, sizeof(*is_identify));
is_identify->pcie_vid = class->vendor_id;
is_identify->pcie_did = class->device_id;
if (object_dynamic_cast(OBJECT(cci->d), TYPE_CXL_USP)) {
@@ -606,7 +613,6 @@
QEMU_BUILD_BUG_ON(sizeof(*bg_op_status) != 8);
bg_op_status = (void *)payload_out;
- memset(bg_op_status, 0, sizeof(*bg_op_status));
bg_op_status->status = cci->bg.complete_pct << 1;
if (cci->bg.runtime > 0) {
bg_op_status->status |= 1U << 0;
@@ -618,6 +624,9 @@
return CXL_MBOX_SUCCESS;
}
+/* Number of firmware slots reported by Get FW Info. */
+#define CXL_FW_SLOTS 2
+/* Upper bound on offset + length accepted by Transfer FW. */
+#define CXL_FW_SIZE 0x02000000 /* 32 MiB */
+
/* CXL r3.1 Section 8.2.9.3.1: Get FW Info (Opcode 0200h) */
static CXLRetCode cmd_firmware_update_get_info(const struct cxl_cmd *cmd,
uint8_t *payload_in,
@@ -647,17 +656,193 @@
}
fw_info = (void *)payload_out;
- memset(fw_info, 0, sizeof(*fw_info));
- fw_info->slots_supported = 2;
- fw_info->slot_info = BIT(0) | BIT(3);
- fw_info->caps = 0;
- pstrcpy(fw_info->fw_rev1, sizeof(fw_info->fw_rev1), "BWFW VERSION 0");
+ fw_info->slots_supported = CXL_FW_SLOTS;
+ fw_info->slot_info = (cci->fw.active_slot & 0x7) |
+ ((cci->fw.staged_slot & 0x7) << 3);
+ fw_info->caps = BIT(0); /* online update supported */
+
+ if (cci->fw.slot[0]) {
+ pstrcpy(fw_info->fw_rev1, sizeof(fw_info->fw_rev1), "BWFW VERSION 0");
+ }
+ if (cci->fw.slot[1]) {
+ pstrcpy(fw_info->fw_rev2, sizeof(fw_info->fw_rev2), "BWFW VERSION 1");
+ }
*len_out = sizeof(*fw_info);
return CXL_MBOX_SUCCESS;
}
+/* CXL r3.1 section 8.2.9.3.2: Transfer FW (Opcode 0201h) */
+/* The request's offset field is multiplied by this to get a byte offset. */
+#define CXL_FW_XFER_ALIGNMENT 128
+
+/* Values of the "action" byte of a Transfer FW request. */
+#define CXL_FW_XFER_ACTION_FULL 0x0
+#define CXL_FW_XFER_ACTION_INIT 0x1
+#define CXL_FW_XFER_ACTION_CONTINUE 0x2
+#define CXL_FW_XFER_ACTION_END 0x3
+#define CXL_FW_XFER_ACTION_ABORT 0x4
+
+/*
+ * Handle one Transfer FW request (CXL r3.1 section 8.2.9.3.2).
+ *
+ * @payload_in holds a 0x80-byte header followed by the firmware data and
+ * @len is the total input length.  No firmware bytes are stored; only the
+ * transfer state machine in cci->fw is maintained.
+ *
+ * Returns CXL_MBOX_BG_STARTED when the request is accepted (completion is
+ * finished by the background operation), CXL_MBOX_SUCCESS for an ABORT,
+ * or an error code when the request is malformed or out of sequence.
+ */
+static CXLRetCode cmd_firmware_update_transfer(const struct cxl_cmd *cmd,
+                                               uint8_t *payload_in,
+                                               size_t len,
+                                               uint8_t *payload_out,
+                                               size_t *len_out,
+                                               CXLCCI *cci)
+{
+    struct {
+        uint8_t action;
+        uint8_t slot;
+        uint8_t rsvd1[2];
+        uint32_t offset;
+        uint8_t rsvd2[0x78];
+        uint8_t data[];
+    } QEMU_PACKED *fw_transfer = (void *)payload_in;
+    size_t offset, length;
+
+    QEMU_BUILD_BUG_ON(sizeof(*fw_transfer) != 0x80);
+
+    /*
+     * The header must be fully present before any of its fields are read,
+     * and so that the data length computed below cannot wrap around.
+     */
+    if (len < sizeof(*fw_transfer)) {
+        return CXL_MBOX_INVALID_PAYLOAD_LENGTH;
+    }
+
+    if (fw_transfer->action == CXL_FW_XFER_ACTION_ABORT) {
+        /*
+         * At this point there aren't any on-going transfers
+         * running in the bg - this is serialized before this
+         * call altogether. Just mark the state machine and
+         * disregard any other input.
+         */
+        cci->fw.transferring = false;
+        return CXL_MBOX_SUCCESS;
+    }
+
+    /*
+     * Bound the offset field before scaling it: the multiplication would
+     * otherwise be done in 32 bits and could wrap to a small value that
+     * slips past the size check.
+     */
+    if (fw_transfer->offset > CXL_FW_SIZE / CXL_FW_XFER_ALIGNMENT) {
+        return CXL_MBOX_INVALID_INPUT;
+    }
+    offset = (size_t)fw_transfer->offset * CXL_FW_XFER_ALIGNMENT;
+    length = len - sizeof(*fw_transfer);
+    /* offset <= CXL_FW_SIZE here, so the subtraction cannot underflow */
+    if (length > CXL_FW_SIZE - offset) {
+        return CXL_MBOX_INVALID_INPUT;
+    }
+
+    if (cci->fw.transferring) {
+        if (fw_transfer->action == CXL_FW_XFER_ACTION_FULL ||
+            fw_transfer->action == CXL_FW_XFER_ACTION_INIT) {
+            return CXL_MBOX_FW_XFER_IN_PROGRESS;
+        }
+        /*
+         * Abort partitioned package transfer if over 30 secs
+         * between parts. As opposed to the explicit ABORT action,
+         * semantically treat this condition as an error - as
+         * if a part action were passed without a previous INIT.
+         */
+        if (difftime(time(NULL), cci->fw.last_partxfer) > 30.0) {
+            cci->fw.transferring = false;
+            return CXL_MBOX_INVALID_INPUT;
+        }
+    } else if (fw_transfer->action == CXL_FW_XFER_ACTION_CONTINUE ||
+               fw_transfer->action == CXL_FW_XFER_ACTION_END) {
+        /* a part other than INIT arrived without a transfer in progress */
+        return CXL_MBOX_INVALID_INPUT;
+    }
+
+    /* allow back-to-back retransmission */
+    if ((offset != cci->fw.prev_offset || length != cci->fw.prev_len) &&
+        (fw_transfer->action == CXL_FW_XFER_ACTION_CONTINUE ||
+         fw_transfer->action == CXL_FW_XFER_ACTION_END)) {
+        /* verify no overlaps */
+        if (offset < cci->fw.prev_offset + cci->fw.prev_len) {
+            return CXL_MBOX_FW_XFER_OUT_OF_ORDER;
+        }
+    }
+
+    switch (fw_transfer->action) {
+    case CXL_FW_XFER_ACTION_FULL: /* ignores offset */
+    case CXL_FW_XFER_ACTION_END:
+        if (fw_transfer->slot == 0 ||
+            fw_transfer->slot == cci->fw.active_slot ||
+            fw_transfer->slot > CXL_FW_SLOTS) {
+            return CXL_MBOX_FW_INVALID_SLOT;
+        }
+
+        /* mark the slot used upon bg completion */
+        break;
+    case CXL_FW_XFER_ACTION_INIT:
+        if (offset != 0) {
+            return CXL_MBOX_INVALID_INPUT;
+        }
+
+        cci->fw.transferring = true;
+        cci->fw.prev_offset = offset;
+        cci->fw.prev_len = length;
+        break;
+    case CXL_FW_XFER_ACTION_CONTINUE:
+        cci->fw.prev_offset = offset;
+        cci->fw.prev_len = length;
+        break;
+    default:
+        return CXL_MBOX_INVALID_INPUT;
+    }
+
+    /* a full-package transfer is given a longer background runtime */
+    if (fw_transfer->action == CXL_FW_XFER_ACTION_FULL) {
+        cci->bg.runtime = 10 * 1000UL;
+    } else {
+        cci->bg.runtime = 2 * 1000UL;
+    }
+    /* keep relevant context for bg completion */
+    cci->fw.curr_action = fw_transfer->action;
+    cci->fw.curr_slot = fw_transfer->slot;
+    *len_out = 0;
+
+    return CXL_MBOX_BG_STARTED;
+}
+
+/*
+ * Apply the effect of the Transfer FW action recorded in cci->fw:
+ * FULL and END mark the target slot as populated and end the transfer,
+ * INIT and CONTINUE refresh the timestamp of the last received part.
+ * Any other recorded action is ignored.
+ */
+static void __do_firmware_xfer(CXLCCI *cci)
+{
+    if (cci->fw.curr_action == CXL_FW_XFER_ACTION_FULL ||
+        cci->fw.curr_action == CXL_FW_XFER_ACTION_END) {
+        cci->fw.transferring = false;
+        /* slot numbers are 1-based, the slot[] array is 0-based */
+        cci->fw.slot[cci->fw.curr_slot - 1] = true;
+        return;
+    }
+
+    if (cci->fw.curr_action == CXL_FW_XFER_ACTION_INIT ||
+        cci->fw.curr_action == CXL_FW_XFER_ACTION_CONTINUE) {
+        time(&cci->fw.last_partxfer);
+    }
+}
+
+/*
+ * CXL r3.1 section 8.2.9.3.3: Activate FW (Opcode 0202h)
+ *
+ * Action 0 makes the requested slot the active one; action 1 records it
+ * as the staged slot.  The slot must be in range, must differ from the
+ * currently active slot and must already hold a transferred package.
+ */
+static CXLRetCode cmd_firmware_update_activate(const struct cxl_cmd *cmd,
+                                               uint8_t *payload_in,
+                                               size_t len,
+                                               uint8_t *payload_out,
+                                               size_t *len_out,
+                                               CXLCCI *cci)
+{
+    struct {
+        uint8_t action;
+        uint8_t slot;
+    } QEMU_PACKED *req = (void *)payload_in;
+    uint8_t slot = req->slot;
+    QEMU_BUILD_BUG_ON(sizeof(*req) != 0x2);
+
+    /*
+     * The range tests come first so that the slot[] lookup at the end of
+     * the condition is only evaluated for a valid 1-based slot number.
+     */
+    if (slot == 0 || slot > CXL_FW_SLOTS ||
+        slot == cci->fw.active_slot ||
+        !cci->fw.slot[slot - 1]) {
+        return CXL_MBOX_FW_INVALID_SLOT;
+    }
+
+    if (req->action == 0) {
+        /* online */
+        cci->fw.active_slot = slot;
+    } else if (req->action == 1) {
+        /* reset */
+        cci->fw.staged_slot = slot;
+    } else {
+        return CXL_MBOX_INVALID_INPUT;
+    }
+
+    return CXL_MBOX_SUCCESS;
+}
+
/* CXL r3.1 Section 8.2.9.4.1: Get Timestamp (Opcode 0300h) */
static CXLRetCode cmd_timestamp_get(const struct cxl_cmd *cmd,
uint8_t *payload_in,
@@ -768,6 +953,388 @@
return CXL_MBOX_SUCCESS;
}
+/* CXL r3.1 section 8.2.9.6: Features */
+/*
+ * Get Supported Features output payload
+ * CXL r3.1 section 8.2.9.6.1 Table 8-96
+ */
+typedef struct CXLSupportedFeatureHeader {
+ uint16_t entries; /* number of feature entries returned in this reply */
+ uint16_t nsuppfeats_dev; /* total number of features the device supports */
+ uint32_t reserved;
+} QEMU_PACKED CXLSupportedFeatureHeader;
+
+/*
+ * Get Supported Features Supported Feature Entry
+ * CXL r3.1 section 8.2.9.6.1 Table 8-97
+ */
+typedef struct CXLSupportedFeatureEntry {
+ QemuUUID uuid; /* identifies the feature */
+ uint16_t feat_index; /* index of the feature in the supported list */
+ uint16_t get_feat_size; /* size in bytes of the Get Feature data */
+ uint16_t set_feat_size; /* size in bytes of the Set Feature data */
+ uint32_t attr_flags; /* CXL_FEAT_ENTRY_ATTR_FLAG_* bits */
+ uint8_t get_feat_version;
+ uint8_t set_feat_version;
+ uint16_t set_feat_effects; /* CXL_FEAT_ENTRY_SFE_* bits */
+ uint8_t rsvd[18];
+} QEMU_PACKED CXLSupportedFeatureEntry;
+
+/*
+ * Get Supported Features Supported Feature Entry
+ * CXL rev 3.1 section 8.2.9.6.1 Table 8-97
+ */
+/* Supported Feature Entry : attribute flags */
+#define CXL_FEAT_ENTRY_ATTR_FLAG_CHANGABLE BIT(0)
+#define CXL_FEAT_ENTRY_ATTR_FLAG_DEEPEST_RESET_PERSISTENCE_MASK GENMASK(3, 1)
+#define CXL_FEAT_ENTRY_ATTR_FLAG_PERSIST_ACROSS_FIRMWARE_UPDATE BIT(4)
+#define CXL_FEAT_ENTRY_ATTR_FLAG_SUPPORT_DEFAULT_SELECTION BIT(5)
+#define CXL_FEAT_ENTRY_ATTR_FLAG_SUPPORT_SAVED_SELECTION BIT(6)
+
+/* Supported Feature Entry : set feature effects */
+#define CXL_FEAT_ENTRY_SFE_CONFIG_CHANGE_COLD_RESET BIT(0)
+#define CXL_FEAT_ENTRY_SFE_IMMEDIATE_CONFIG_CHANGE BIT(1)
+#define CXL_FEAT_ENTRY_SFE_IMMEDIATE_DATA_CHANGE BIT(2)
+#define CXL_FEAT_ENTRY_SFE_IMMEDIATE_POLICY_CHANGE BIT(3)
+#define CXL_FEAT_ENTRY_SFE_IMMEDIATE_LOG_CHANGE BIT(4)
+#define CXL_FEAT_ENTRY_SFE_SECURITY_STATE_CHANGE BIT(5)
+#define CXL_FEAT_ENTRY_SFE_BACKGROUND_OPERATION BIT(6)
+#define CXL_FEAT_ENTRY_SFE_SUPPORT_SECONDARY_MAILBOX BIT(7)
+#define CXL_FEAT_ENTRY_SFE_SUPPORT_ABORT_BACKGROUND_OPERATION BIT(8)
+#define CXL_FEAT_ENTRY_SFE_CEL_VALID BIT(9)
+#define CXL_FEAT_ENTRY_SFE_CONFIG_CHANGE_CONV_RESET BIT(10)
+#define CXL_FEAT_ENTRY_SFE_CONFIG_CHANGE_CXL_RESET BIT(11)
+
+/* Indices of the features reported by Get Supported Features. */
+enum CXL_SUPPORTED_FEATURES_LIST {
+ CXL_FEATURE_PATROL_SCRUB = 0, /* device patrol scrub control */
+ CXL_FEATURE_ECS, /* DDR5 Error Check Scrub (ECS) control */
+ CXL_FEATURE_MAX /* number of features above; not a feature itself */
+};
+
+/* Get Feature CXL 3.1 Spec 8.2.9.6.2 */
+/*
+ * Get Feature input payload
+ * CXL r3.1 section 8.2.9.6.2 Table 8-99
+ */
+/* Get Feature : Payload in selection */
+/*
+ * Which value of a feature a Get Feature request asks for.
+ * cmd_features_get_feature() accepts only CXL_GET_FEATURE_SEL_CURRENT_VALUE
+ * and answers every other selection with CXL_MBOX_UNSUPPORTED.
+ */
+enum CXL_GET_FEATURE_SELECTION {
+ CXL_GET_FEATURE_SEL_CURRENT_VALUE,
+ CXL_GET_FEATURE_SEL_DEFAULT_VALUE,
+ CXL_GET_FEATURE_SEL_SAVED_VALUE,
+ CXL_GET_FEATURE_SEL_MAX
+};
+
+/* Set Feature CXL 3.1 Spec 8.2.9.6.3 */
+/*
+ * Set Feature input payload
+ * CXL r3.1 section 8.2.9.6.3 Table 8-101
+ */
+/* Fixed header that precedes the feature data in a Set Feature request. */
+typedef struct CXLSetFeatureInHeader {
+ QemuUUID uuid; /* feature being written */
+ uint32_t flags; /* bits 2:0 data transfer action, bit 3 saved across reset */
+ uint16_t offset; /* byte offset in the feature data this payload starts at */
+ uint8_t version; /* must equal the feature's *_SET_FEATURE_VERSION */
+ uint8_t rsvd[9];
+} QEMU_PACKED QEMU_ALIGNED(16) CXLSetFeatureInHeader;
+
+/* Set Feature : Payload in flags */
+/* Mask selecting the data transfer action in CXLSetFeatureInHeader.flags */
+#define CXL_SET_FEATURE_FLAG_DATA_TRANSFER_MASK 0x7
+/* Data transfer actions carried in the masked bits of the flags field. */
+enum CXL_SET_FEATURE_FLAG_DATA_TRANSFER {
+ CXL_SET_FEATURE_FLAG_FULL_DATA_TRANSFER,
+ CXL_SET_FEATURE_FLAG_INITIATE_DATA_TRANSFER,
+ CXL_SET_FEATURE_FLAG_CONTINUE_DATA_TRANSFER,
+ CXL_SET_FEATURE_FLAG_FINISH_DATA_TRANSFER,
+ CXL_SET_FEATURE_FLAG_ABORT_DATA_TRANSFER,
+ CXL_SET_FEATURE_FLAG_DATA_TRANSFER_MAX
+};
+/* Flag bit recorded as data_saved_across_reset by Set Feature */
+#define CXL_SET_FEAT_DATA_SAVED_ACROSS_RESET BIT(3)
+
+/* CXL r3.1 section 8.2.9.9.11.1: Device Patrol Scrub Control Feature */
+static const QemuUUID patrol_scrub_uuid = {
+ .data = UUID(0x96dad7d6, 0xfde8, 0x482b, 0xa7, 0x33,
+ 0x75, 0x77, 0x4e, 0x06, 0xdb, 0x8a)
+};
+
+typedef struct CXLMemPatrolScrubSetFeature {
+ CXLSetFeatureInHeader hdr;
+ CXLMemPatrolScrubWriteAttrs feat_data;
+} QEMU_PACKED QEMU_ALIGNED(16) CXLMemPatrolScrubSetFeature;
+
+/*
+ * CXL r3.1 section 8.2.9.9.11.2:
+ * DDR5 Error Check Scrub (ECS) Control Feature
+ */
+static const QemuUUID ecs_uuid = {
+ .data = UUID(0xe5b13f22, 0x2328, 0x4a14, 0xb8, 0xba,
+ 0xb9, 0x69, 0x1e, 0x89, 0x33, 0x86)
+};
+
+typedef struct CXLMemECSSetFeature {
+ CXLSetFeatureInHeader hdr;
+ CXLMemECSWriteAttrs feat_data[];
+} QEMU_PACKED QEMU_ALIGNED(16) CXLMemECSSetFeature;
+
+/*
+ * CXL r3.1 section 8.2.9.6.1: Get Supported Features (Opcode 0500h)
+ *
+ * The request carries the number of output bytes the caller can take
+ * (count) and the index of the first feature to report (start_index).
+ * The reply is a CXLSupportedFeatureHeader followed by as many
+ * CXLSupportedFeatureEntry records as fit in count and exist from
+ * start_index onwards.
+ *
+ * Returns CXL_MBOX_UNSUPPORTED when the CCI does not belong to a type 3
+ * device and CXL_MBOX_INVALID_INPUT when count cannot hold the header or
+ * start_index is out of range.
+ */
+static CXLRetCode cmd_features_get_supported(const struct cxl_cmd *cmd,
+                                             uint8_t *payload_in,
+                                             size_t len_in,
+                                             uint8_t *payload_out,
+                                             size_t *len_out,
+                                             CXLCCI *cci)
+{
+    struct {
+        uint32_t count;
+        uint16_t start_index;
+        uint16_t reserved;
+    } QEMU_PACKED QEMU_ALIGNED(16) * get_feats_in = (void *)payload_in;
+
+    struct {
+        CXLSupportedFeatureHeader hdr;
+        CXLSupportedFeatureEntry feat_entries[];
+    } QEMU_PACKED QEMU_ALIGNED(16) * get_feats_out = (void *)payload_out;
+    uint16_t index, req_entries;
+    uint16_t entry;
+    size_t fitting_entries;
+
+    if (!object_dynamic_cast(OBJECT(cci->d), TYPE_CXL_TYPE3)) {
+        return CXL_MBOX_UNSUPPORTED;
+    }
+    if (get_feats_in->count < sizeof(CXLSupportedFeatureHeader) ||
+        get_feats_in->start_index >= CXL_FEATURE_MAX) {
+        return CXL_MBOX_INVALID_INPUT;
+    }
+
+    /*
+     * Work out how many entries fit in a wide type and clamp to what the
+     * device has before narrowing.  count is 32 bits wide, so narrowing
+     * first could truncate a large request down to zero entries.
+     */
+    fitting_entries = (get_feats_in->count -
+                       sizeof(CXLSupportedFeatureHeader)) /
+                      sizeof(CXLSupportedFeatureEntry);
+    req_entries = MIN(fitting_entries,
+                      (size_t)(CXL_FEATURE_MAX - get_feats_in->start_index));
+
+    for (entry = 0, index = get_feats_in->start_index;
+         entry < req_entries; index++) {
+        switch (index) {
+        case CXL_FEATURE_PATROL_SCRUB:
+            /* Fill supported feature entry for device patrol scrub control */
+            get_feats_out->feat_entries[entry++] =
+                (struct CXLSupportedFeatureEntry) {
+                .uuid = patrol_scrub_uuid,
+                .feat_index = index,
+                .get_feat_size = sizeof(CXLMemPatrolScrubReadAttrs),
+                .set_feat_size = sizeof(CXLMemPatrolScrubWriteAttrs),
+                .attr_flags = CXL_FEAT_ENTRY_ATTR_FLAG_CHANGABLE,
+                .get_feat_version = CXL_MEMDEV_PS_GET_FEATURE_VERSION,
+                .set_feat_version = CXL_MEMDEV_PS_SET_FEATURE_VERSION,
+                .set_feat_effects = CXL_FEAT_ENTRY_SFE_IMMEDIATE_CONFIG_CHANGE |
+                                    CXL_FEAT_ENTRY_SFE_CEL_VALID,
+            };
+            break;
+        case CXL_FEATURE_ECS:
+            /* Fill supported feature entry for device DDR5 ECS control */
+            get_feats_out->feat_entries[entry++] =
+                (struct CXLSupportedFeatureEntry) {
+                .uuid = ecs_uuid,
+                .feat_index = index,
+                .get_feat_size = CXL_ECS_NUM_MEDIA_FRUS *
+                                 sizeof(CXLMemECSReadAttrs),
+                .set_feat_size = CXL_ECS_NUM_MEDIA_FRUS *
+                                 sizeof(CXLMemECSWriteAttrs),
+                .attr_flags = CXL_FEAT_ENTRY_ATTR_FLAG_CHANGABLE,
+                .get_feat_version = CXL_ECS_GET_FEATURE_VERSION,
+                .set_feat_version = CXL_ECS_SET_FEATURE_VERSION,
+                .set_feat_effects = CXL_FEAT_ENTRY_SFE_IMMEDIATE_CONFIG_CHANGE |
+                                    CXL_FEAT_ENTRY_SFE_CEL_VALID,
+            };
+            break;
+        default:
+            /* req_entries was clamped, so index stays below CXL_FEATURE_MAX */
+            __builtin_unreachable();
+        }
+    }
+    get_feats_out->hdr.nsuppfeats_dev = CXL_FEATURE_MAX;
+    get_feats_out->hdr.entries = req_entries;
+    *len_out = sizeof(CXLSupportedFeatureHeader) +
+               req_entries * sizeof(CXLSupportedFeatureEntry);
+
+    return CXL_MBOX_SUCCESS;
+}
+
+/*
+ * CXL r3.1 section 8.2.9.6.2: Get Feature (Opcode 0501h)
+ *
+ * Copies up to count bytes of the current value of the feature named by
+ * the request's UUID, starting at the requested offset, to @payload_out
+ * and reports the number of bytes copied through @len_out.
+ *
+ * Only the patrol scrub and ECS features, and only the "current value"
+ * selection, are supported.  A feature whose UUID matches the one
+ * recorded in set_feat_info is reported as having a transfer in progress.
+ */
+static CXLRetCode cmd_features_get_feature(const struct cxl_cmd *cmd,
+                                           uint8_t *payload_in,
+                                           size_t len_in,
+                                           uint8_t *payload_out,
+                                           size_t *len_out,
+                                           CXLCCI *cci)
+{
+    struct {
+        QemuUUID uuid;
+        uint16_t offset;
+        uint16_t count;
+        uint8_t selection;
+    } QEMU_PACKED QEMU_ALIGNED(16) * req = (void *)payload_in;
+    const uint8_t *feat_data;
+    size_t feat_size;
+    uint16_t bytes_to_copy;
+    CXLType3Dev *ct3d;
+
+    if (!object_dynamic_cast(OBJECT(cci->d), TYPE_CXL_TYPE3)) {
+        return CXL_MBOX_UNSUPPORTED;
+    }
+    ct3d = CXL_TYPE3(cci->d);
+
+    if (qemu_uuid_is_equal(&req->uuid, &ct3d->set_feat_info.uuid)) {
+        return CXL_MBOX_FEATURE_TRANSFER_IN_PROGRESS;
+    }
+    if (req->selection != CXL_GET_FEATURE_SEL_CURRENT_VALUE) {
+        return CXL_MBOX_UNSUPPORTED;
+    }
+    if (req->offset + req->count > cci->payload_max) {
+        return CXL_MBOX_INVALID_INPUT;
+    }
+
+    /* Locate the attribute block backing the requested feature. */
+    if (qemu_uuid_is_equal(&req->uuid, &patrol_scrub_uuid)) {
+        feat_data = (uint8_t *)&ct3d->patrol_scrub_attrs;
+        feat_size = sizeof(CXLMemPatrolScrubReadAttrs);
+    } else if (qemu_uuid_is_equal(&req->uuid, &ecs_uuid)) {
+        feat_data = (uint8_t *)&ct3d->ecs_attrs;
+        feat_size = CXL_ECS_NUM_MEDIA_FRUS * sizeof(CXLMemECSReadAttrs);
+    } else {
+        return CXL_MBOX_UNSUPPORTED;
+    }
+
+    /* The read must start inside the feature data ... */
+    if (req->offset >= feat_size) {
+        return CXL_MBOX_INVALID_INPUT;
+    }
+    /* ... and is cut short at its end or at the requested byte count. */
+    bytes_to_copy = feat_size - req->offset;
+    bytes_to_copy = MIN(bytes_to_copy, req->count);
+    memcpy(payload_out, feat_data + req->offset, bytes_to_copy);
+
+    *len_out = bytes_to_copy;
+
+    return CXL_MBOX_SUCCESS;
+}
+
+/*
+ * CXL r3.1 section 8.2.9.6.3: Set Feature (Opcode 0502h)
+ *
+ * @payload_in holds a CXLSetFeatureInHeader followed by feature data and
+ * @len_in is the total input length.  The data is written into the
+ * per-feature staging buffer at hdr->offset.  On a FULL or FINISH
+ * transfer the staged values are applied to the live attributes; on FULL,
+ * FINISH or ABORT the staging buffer and the transfer state are reset.
+ *
+ * The request is validated completely before any device state is
+ * modified, so a rejected request leaves no transfer marked in progress.
+ *
+ * Returns:
+ *  CXL_MBOX_UNSUPPORTED - not a type 3 device, unknown feature UUID or
+ *                         unexpected feature version
+ *  CXL_MBOX_INVALID_PAYLOAD_LENGTH - header truncated, or the data would
+ *                         extend past the end of the feature
+ *  CXL_MBOX_FEATURE_TRANSFER_IN_PROGRESS - another feature's transfer is
+ *                         still open
+ */
+static CXLRetCode cmd_features_set_feature(const struct cxl_cmd *cmd,
+                                           uint8_t *payload_in,
+                                           size_t len_in,
+                                           uint8_t *payload_out,
+                                           size_t *len_out,
+                                           CXLCCI *cci)
+{
+    CXLSetFeatureInHeader *hdr = (void *)payload_in;
+    CXLMemPatrolScrubSetFeature *ps_set_feature;
+    CXLMemECSSetFeature *ecs_set_feature;
+    CXLSetFeatureInfo *set_feat_info;
+    uint8_t data_transfer_flag;
+    bool is_patrol_scrub;
+    const void *feat_data;
+    uint8_t *staging;
+    size_t staging_size;
+    size_t bytes_to_copy;
+    CXLType3Dev *ct3d;
+    uint16_t count;
+
+    if (!object_dynamic_cast(OBJECT(cci->d), TYPE_CXL_TYPE3)) {
+        return CXL_MBOX_UNSUPPORTED;
+    }
+    /* The header must be complete before any of its fields are read. */
+    if (len_in < sizeof(*hdr)) {
+        return CXL_MBOX_INVALID_PAYLOAD_LENGTH;
+    }
+    ct3d = CXL_TYPE3(cci->d);
+    set_feat_info = &ct3d->set_feat_info;
+
+    if (!qemu_uuid_is_null(&set_feat_info->uuid) &&
+        !qemu_uuid_is_equal(&hdr->uuid, &set_feat_info->uuid)) {
+        return CXL_MBOX_FEATURE_TRANSFER_IN_PROGRESS;
+    }
+
+    /* Resolve the target feature and check its version. */
+    if (qemu_uuid_is_equal(&hdr->uuid, &patrol_scrub_uuid)) {
+        if (hdr->version != CXL_MEMDEV_PS_SET_FEATURE_VERSION) {
+            return CXL_MBOX_UNSUPPORTED;
+        }
+        is_patrol_scrub = true;
+        ps_set_feature = (void *)payload_in;
+        feat_data = &ps_set_feature->feat_data;
+        staging = (uint8_t *)&ct3d->patrol_scrub_wr_attrs;
+        staging_size = sizeof(CXLMemPatrolScrubWriteAttrs);
+    } else if (qemu_uuid_is_equal(&hdr->uuid, &ecs_uuid)) {
+        if (hdr->version != CXL_ECS_SET_FEATURE_VERSION) {
+            return CXL_MBOX_UNSUPPORTED;
+        }
+        is_patrol_scrub = false;
+        ecs_set_feature = (void *)payload_in;
+        feat_data = ecs_set_feature->feat_data;
+        staging = (uint8_t *)ct3d->ecs_wr_attrs;
+        staging_size = CXL_ECS_NUM_MEDIA_FRUS * sizeof(CXLMemECSWriteAttrs);
+    } else {
+        return CXL_MBOX_UNSUPPORTED;
+    }
+
+    /*
+     * Reject writes that would run past the end of the staging buffer.
+     * Written as two comparisons so that neither side can overflow.
+     */
+    bytes_to_copy = len_in - sizeof(*hdr);
+    if (hdr->offset > staging_size ||
+        bytes_to_copy > staging_size - hdr->offset) {
+        return CXL_MBOX_INVALID_PAYLOAD_LENGTH;
+    }
+
+    /* The request is valid: record the transfer state. */
+    set_feat_info->data_saved_across_reset =
+        !!(hdr->flags & CXL_SET_FEAT_DATA_SAVED_ACROSS_RESET);
+
+    data_transfer_flag =
+        hdr->flags & CXL_SET_FEATURE_FLAG_DATA_TRANSFER_MASK;
+    if (data_transfer_flag == CXL_SET_FEATURE_FLAG_INITIATE_DATA_TRANSFER) {
+        set_feat_info->uuid = hdr->uuid;
+        set_feat_info->data_size = 0;
+    }
+    set_feat_info->data_transfer_flag = data_transfer_flag;
+    set_feat_info->data_offset = hdr->offset;
+
+    memcpy(staging + hdr->offset, feat_data, bytes_to_copy);
+    set_feat_info->data_size += bytes_to_copy;
+
+    /* A complete data set is now staged: apply it to the live attributes. */
+    if (data_transfer_flag == CXL_SET_FEATURE_FLAG_FULL_DATA_TRANSFER ||
+        data_transfer_flag == CXL_SET_FEATURE_FLAG_FINISH_DATA_TRANSFER) {
+        if (is_patrol_scrub) {
+            /* only the low byte of scrub_cycle and bit 0 of the flags */
+            ct3d->patrol_scrub_attrs.scrub_cycle &= ~0xFF;
+            ct3d->patrol_scrub_attrs.scrub_cycle |=
+                ct3d->patrol_scrub_wr_attrs.scrub_cycle_hr & 0xFF;
+            ct3d->patrol_scrub_attrs.scrub_flags &= ~0x1;
+            ct3d->patrol_scrub_attrs.scrub_flags |=
+                ct3d->patrol_scrub_wr_attrs.scrub_flags & 0x1;
+        } else {
+            for (count = 0; count < CXL_ECS_NUM_MEDIA_FRUS; count++) {
+                ct3d->ecs_attrs[count].ecs_log_cap =
+                    ct3d->ecs_wr_attrs[count].ecs_log_cap;
+                ct3d->ecs_attrs[count].ecs_config =
+                    ct3d->ecs_wr_attrs[count].ecs_config & 0x1F;
+            }
+        }
+    }
+
+    /* The transfer is over: forget it and wipe the staging buffer. */
+    if (data_transfer_flag == CXL_SET_FEATURE_FLAG_FULL_DATA_TRANSFER ||
+        data_transfer_flag == CXL_SET_FEATURE_FLAG_FINISH_DATA_TRANSFER ||
+        data_transfer_flag == CXL_SET_FEATURE_FLAG_ABORT_DATA_TRANSFER) {
+        memset(&set_feat_info->uuid, 0, sizeof(QemuUUID));
+        /*
+         * Clear the whole buffer rather than data_size bytes: data_size
+         * accumulates over every part received and may exceed the buffer.
+         */
+        memset(staging, 0, staging_size);
+        set_feat_info->data_transfer_flag = 0;
+        set_feat_info->data_saved_across_reset = false;
+        set_feat_info->data_offset = 0;
+        set_feat_info->data_size = 0;
+    }
+
+    return CXL_MBOX_SUCCESS;
+}
+
/* CXL r3.1 Section 8.2.9.9.1.1: Identify Memory Device (Opcode 4000h) */
static CXLRetCode cmd_identify_memory_device(const struct cxl_cmd *cmd,
uint8_t *payload_in,
@@ -805,7 +1372,6 @@
}
id = (void *)payload_out;
- memset(id, 0, sizeof(*id));
snprintf(id->fw_revision, 0x10, "BWFW VERSION %02d", 0);
@@ -953,6 +1519,7 @@
memset(lsa, 0, memory_region_size(mr));
}
}
+ cxl_discard_all_event_records(&ct3d->cxl_dstate);
}
/*
@@ -1095,7 +1662,6 @@
out_pl_len = sizeof(*out) + record_count * sizeof(out->records[0]);
assert(out_pl_len <= CXL_MAILBOX_MAX_PAYLOAD_SIZE);
- memset(out, 0, out_pl_len);
QLIST_FOREACH(ent, poison_list, node) {
uint64_t start, stop;
@@ -1117,6 +1683,10 @@
out->flags = (1 << 1);
stq_le_p(&out->overflow_timestamp, ct3d->poison_list_overflow_ts);
}
+ if (scan_media_running(cci)) {
+ out->flags |= (1 << 2);
+ }
+
stw_le_p(&out->count, record_count);
*len_out = out_pl_len;
return CXL_MBOX_SUCCESS;
@@ -1146,6 +1716,16 @@
return CXL_MBOX_SUCCESS;
}
}
+ /*
+ * Freeze the list if there is an on-going scan media operation.
+ */
+ if (scan_media_running(cci)) {
+ /*
+ * XXX: Spec is ambiguous - is this case considered
+ * a successful return despite not adding to the list?
+ */
+ goto success;
+ }
if (ct3d->poison_list_cnt == CXL_POISON_LIST_LIMIT) {
return CXL_MBOX_INJECT_POISON_LIMIT;
@@ -1161,6 +1741,7 @@
*/
QLIST_INSERT_HEAD(poison_list, p, node);
ct3d->poison_list_cnt++;
+success:
*len_out = 0;
return CXL_MBOX_SUCCESS;
@@ -1200,6 +1781,17 @@
}
}
+ /*
+ * Freeze the list if there is an on-going scan media operation.
+ */
+ if (scan_media_running(cci)) {
+ /*
+ * XXX: Spec is ambiguous - is this case considered
+ * a successful return despite not removing from the list?
+ */
+ goto success;
+ }
+
QLIST_FOREACH(ent, poison_list, node) {
/*
* Test for contained in entry. Simpler than general case
@@ -1210,7 +1802,7 @@
}
}
if (!ent) {
- return CXL_MBOX_SUCCESS;
+ goto success;
}
QLIST_REMOVE(ent, node);
@@ -1247,12 +1839,262 @@
}
/* Any fragments have been added, free original entry */
g_free(ent);
+success:
*len_out = 0;
return CXL_MBOX_SUCCESS;
}
/*
+ * CXL r3.1 section 8.2.9.9.4.4: Get Scan Media Capabilities
+ */
+static CXLRetCode
+cmd_media_get_scan_media_capabilities(const struct cxl_cmd *cmd,
+                                      uint8_t *payload_in,
+                                      size_t len_in,
+                                      uint8_t *payload_out,
+                                      size_t *len_out,
+                                      CXLCCI *cci)
+{
+    /* Input payload: start address and length (in cache lines) to query */
+    struct get_scan_media_capabilities_pl {
+        uint64_t pa;
+        uint64_t length;
+    } QEMU_PACKED;
+
+    /* Output payload: estimated duration of a scan over that range */
+    struct get_scan_media_capabilities_out_pl {
+        uint32_t estimated_runtime_ms;
+    };
+
+    CXLType3Dev *ct3d = CXL_TYPE3(cci->d);
+    CXLDeviceState *cxl_dstate = &ct3d->cxl_dstate;
+    struct get_scan_media_capabilities_pl *in = (void *)payload_in;
+    struct get_scan_media_capabilities_out_pl *out = (void *)payload_out;
+    uint64_t query_start;
+    uint64_t query_lines;
+    uint64_t query_length;
+
+    query_start = ldq_le_p(&in->pa);
+    /* 64 byte alignment required */
+    if (query_start & 0x3f) {
+        return CXL_MBOX_INVALID_INPUT;
+    }
+
+    /*
+     * Both values come straight from the command payload, so neither
+     * "lines * CXL_CACHE_LINE_SIZE" nor "start + length" may be computed
+     * before it is known that they cannot wrap around 64 bits. Compare
+     * the number of cache lines against the room left after query_start
+     * instead; for non-wrapping inputs this is the same test as
+     * "query_start + query_length > static_mem_size".
+     */
+    query_lines = ldq_le_p(&in->length);
+    if (query_start > cxl_dstate->static_mem_size ||
+        query_lines > (cxl_dstate->static_mem_size - query_start) /
+                      CXL_CACHE_LINE_SIZE) {
+        return CXL_MBOX_INVALID_PA;
+    }
+    query_length = query_lines * CXL_CACHE_LINE_SIZE;
+
+    /*
+     * Just use 400 nanosecond access/read latency + 100 ns for
+     * the cost of updating the poison list. For small enough
+     * chunks return at least 1 ms.
+     */
+    stl_le_p(&out->estimated_runtime_ms,
+             MAX(1, query_length * (0.0005L / 64)));
+
+    *len_out = sizeof(*out);
+    return CXL_MBOX_SUCCESS;
+}
+
+static void __do_scan_media(CXLType3Dev *ct3d)
+{
+    unsigned int nr_results = 0;
+    CXLPoison *res;
+
+    /* Count the entries currently held in the scan media results list */
+    QLIST_FOREACH(res, &ct3d->scan_media_results, node) {
+        ++nr_results;
+    }
+
+    /*
+     * Only scan media may clear the overflow, and only when the poison
+     * list holds exactly as many entries as the results list.
+     */
+    if (ct3d->poison_list_overflowed) {
+        if (ct3d->poison_list_cnt == nr_results) {
+            cxl_clear_poison_list_overflowed(ct3d);
+        }
+    }
+
+    /* Record that scan media has run since last conventional reset */
+    ct3d->scan_media_hasrun = true;
+}
+
+/*
+ * CXL r3.1 section 8.2.9.9.4.5: Scan Media
+ */
+static CXLRetCode cmd_media_scan_media(const struct cxl_cmd *cmd,
+                                       uint8_t *payload_in,
+                                       size_t len_in,
+                                       uint8_t *payload_out,
+                                       size_t *len_out,
+                                       CXLCCI *cci)
+{
+    /* Input payload: start address, length in cache lines, and flags */
+    struct scan_media_pl {
+        uint64_t pa;
+        uint64_t length;
+        uint8_t flags;
+    } QEMU_PACKED;
+
+    struct scan_media_pl *in = (void *)payload_in;
+    CXLType3Dev *ct3d = CXL_TYPE3(cci->d);
+    CXLDeviceState *cxl_dstate = &ct3d->cxl_dstate;
+    uint64_t query_start;
+    uint64_t query_lines;
+    uint64_t query_length;
+    CXLPoison *ent, *next;
+
+    query_start = ldq_le_p(&in->pa);
+    /* 64 byte alignment required */
+    if (query_start & 0x3f) {
+        return CXL_MBOX_INVALID_INPUT;
+    }
+
+    /*
+     * Both values come straight from the command payload, so neither
+     * "lines * CXL_CACHE_LINE_SIZE" nor "start + length" may be computed
+     * before it is known that they cannot wrap around 64 bits. Compare
+     * the number of cache lines against the room left after query_start
+     * instead; for non-wrapping inputs this is the same test as
+     * "query_start + query_length > static_mem_size". The list filtering
+     * below relies on query_start + query_length not wrapping.
+     */
+    query_lines = ldq_le_p(&in->length);
+    if (query_start > cxl_dstate->static_mem_size ||
+        query_lines > (cxl_dstate->static_mem_size - query_start) /
+                      CXL_CACHE_LINE_SIZE) {
+        return CXL_MBOX_INVALID_PA;
+    }
+    query_length = query_lines * CXL_CACHE_LINE_SIZE;
+
+    /*
+     * NOTE(review): the range was already limited to static_mem_size
+     * above, so this test only fires when dc.total_capacity is 0 and the
+     * range ends exactly at static_mem_size. Kept unchanged; confirm the
+     * intended handling of dynamic capacity ranges.
+     */
+    if (ct3d->dc.num_regions && query_start + query_length >=
+        cxl_dstate->static_mem_size + ct3d->dc.total_capacity) {
+        return CXL_MBOX_INVALID_PA;
+    }
+
+    if (in->flags == 0) { /* TODO */
+        qemu_log_mask(LOG_UNIMP,
+                      "Scan Media Event Log is unsupported\n");
+    }
+
+    /* any previous results are discarded upon a new Scan Media */
+    QLIST_FOREACH_SAFE(ent, &ct3d->scan_media_results, node, next) {
+        QLIST_REMOVE(ent, node);
+        g_free(ent);
+    }
+
+    /* kill the poison list - it will be recreated */
+    if (ct3d->poison_list_overflowed) {
+        QLIST_FOREACH_SAFE(ent, &ct3d->poison_list, node, next) {
+            QLIST_REMOVE(ent, node);
+            g_free(ent);
+            ct3d->poison_list_cnt--;
+        }
+    }
+
+    /*
+     * Scan the backup list and move corresponding entries
+     * into the results list, updating the poison list
+     * when possible.
+     */
+    QLIST_FOREACH_SAFE(ent, &ct3d->poison_list_bkp, node, next) {
+        CXLPoison *res;
+
+        /* Skip backup entries that do not overlap the queried range */
+        if (ent->start >= query_start + query_length ||
+            ent->start + ent->length <= query_start) {
+            continue;
+        }
+
+        /*
+         * If a Get Poison List cmd comes in while this
+         * scan is being done, it will see the new complete
+         * list, while setting the respective flag.
+         */
+        if (ct3d->poison_list_cnt < CXL_POISON_LIST_LIMIT) {
+            CXLPoison *p = g_new0(CXLPoison, 1);
+
+            p->start = ent->start;
+            p->length = ent->length;
+            p->type = ent->type;
+            QLIST_INSERT_HEAD(&ct3d->poison_list, p, node);
+            ct3d->poison_list_cnt++;
+        }
+
+        /* Every overlapping entry is reported in the results list */
+        res = g_new0(CXLPoison, 1);
+        res->start = ent->start;
+        res->length = ent->length;
+        res->type = ent->type;
+        QLIST_INSERT_HEAD(&ct3d->scan_media_results, res, node);
+
+        /* The backup entry has been consumed */
+        QLIST_REMOVE(ent, node);
+        g_free(ent);
+    }
+
+    /* Same runtime estimate as Get Scan Media Capabilities, at least 1 */
+    cci->bg.runtime = MAX(1, query_length * (0.0005L / 64));
+    *len_out = 0;
+
+    return CXL_MBOX_BG_STARTED;
+}
+
+/*
+ * CXL r3.1 section 8.2.9.9.4.6: Get Scan Media Results
+ */
+static CXLRetCode cmd_media_get_scan_media_results(const struct cxl_cmd *cmd,
+                                                   uint8_t *payload_in,
+                                                   size_t len_in,
+                                                   uint8_t *payload_out,
+                                                   size_t *len_out,
+                                                   CXLCCI *cci)
+{
+    /* Output payload: fixed header followed by the media error records */
+    struct get_scan_media_results_out_pl {
+        uint64_t dpa_restart;
+        uint64_t length;
+        uint8_t flags;
+        uint8_t rsvd1;
+        uint16_t count;
+        uint8_t rsvd2[0xc];
+        struct {
+            uint64_t addr;
+            uint32_t length;
+            uint32_t resv;
+        } QEMU_PACKED records[];
+    } QEMU_PACKED;
+
+    CXLType3Dev *ct3d = CXL_TYPE3(cci->d);
+    CXLPoisonList *results = &ct3d->scan_media_results;
+    struct get_scan_media_results_out_pl *out = (void *)payload_out;
+    uint16_t nr_total = 0;
+    uint16_t nr_fit = 0;
+    uint16_t idx = 0;
+    uint16_t payload_len;
+    CXLPoison *ent, *next;
+
+    /* Nothing to report until a Scan Media command has completed */
+    if (!ct3d->scan_media_hasrun) {
+        return CXL_MBOX_UNSUPPORTED;
+    }
+
+    /*
+     * Calculate limits, all entries are within the same address range of the
+     * last scan media call. nr_fit stops growing once the payload built so
+     * far is no longer below the mailbox limit; nr_total counts everything.
+     */
+    QLIST_FOREACH(ent, results, node) {
+        if (sizeof(*out) + nr_fit * sizeof(out->records[0]) <
+            CXL_MAILBOX_MAX_PAYLOAD_SIZE) {
+            nr_fit++;
+        }
+        nr_total++;
+    }
+
+    payload_len = sizeof(*out) + nr_fit * sizeof(out->records[0]);
+    assert(payload_len <= CXL_MAILBOX_MAX_PAYLOAD_SIZE);
+
+    memset(out, 0, payload_len);
+    QLIST_FOREACH_SAFE(ent, results, node, next) {
+        uint64_t base, span;
+
+        if (idx == nr_fit) {
+            break;
+        }
+
+        /* Address rounded down to 64 bytes; low bits carry the type */
+        base = ROUND_DOWN(ent->start, 64ull);
+        span = (uint64_t)ent->length;
+        stq_le_p(&out->records[idx].addr, base | (ent->type & 0x7));
+        stl_le_p(&out->records[idx].length, span / CXL_CACHE_LINE_SIZE);
+        idx++;
+
+        /* consume the returning entry */
+        QLIST_REMOVE(ent, node);
+        g_free(ent);
+    }
+
+    stw_le_p(&out->count, nr_fit);
+    if (nr_total > nr_fit) {
+        out->flags = (1 << 0); /* More Media Error Records */
+    }
+
+    *len_out = payload_len;
+    return CXL_MBOX_SUCCESS;
+}
+
+/*
* CXL r3.1 section 8.2.9.9.9.1: Get Dynamic Capacity Configuration
* (Opcode: 4800h)
*/
@@ -1822,40 +2664,51 @@
return CXL_MBOX_SUCCESS;
}
-#define IMMEDIATE_CONFIG_CHANGE (1 << 1)
-#define IMMEDIATE_DATA_CHANGE (1 << 2)
-#define IMMEDIATE_POLICY_CHANGE (1 << 3)
-#define IMMEDIATE_LOG_CHANGE (1 << 4)
-#define SECURITY_STATE_CHANGE (1 << 5)
-#define BACKGROUND_OPERATION (1 << 6)
-
static const struct cxl_cmd cxl_cmd_set[256][256] = {
[EVENTS][GET_RECORDS] = { "EVENTS_GET_RECORDS",
cmd_events_get_records, 1, 0 },
[EVENTS][CLEAR_RECORDS] = { "EVENTS_CLEAR_RECORDS",
- cmd_events_clear_records, ~0, IMMEDIATE_LOG_CHANGE },
+ cmd_events_clear_records, ~0, CXL_MBOX_IMMEDIATE_LOG_CHANGE },
[EVENTS][GET_INTERRUPT_POLICY] = { "EVENTS_GET_INTERRUPT_POLICY",
cmd_events_get_interrupt_policy, 0, 0 },
[EVENTS][SET_INTERRUPT_POLICY] = { "EVENTS_SET_INTERRUPT_POLICY",
cmd_events_set_interrupt_policy,
- ~0, IMMEDIATE_CONFIG_CHANGE },
+ ~0, CXL_MBOX_IMMEDIATE_CONFIG_CHANGE },
[FIRMWARE_UPDATE][GET_INFO] = { "FIRMWARE_UPDATE_GET_INFO",
cmd_firmware_update_get_info, 0, 0 },
+ [FIRMWARE_UPDATE][TRANSFER] = { "FIRMWARE_UPDATE_TRANSFER",
+ cmd_firmware_update_transfer, ~0, CXL_MBOX_BACKGROUND_OPERATION },
+ [FIRMWARE_UPDATE][ACTIVATE] = { "FIRMWARE_UPDATE_ACTIVATE",
+ cmd_firmware_update_activate, 2, CXL_MBOX_BACKGROUND_OPERATION },
[TIMESTAMP][GET] = { "TIMESTAMP_GET", cmd_timestamp_get, 0, 0 },
[TIMESTAMP][SET] = { "TIMESTAMP_SET", cmd_timestamp_set,
- 8, IMMEDIATE_POLICY_CHANGE },
+ 8, CXL_MBOX_IMMEDIATE_POLICY_CHANGE },
[LOGS][GET_SUPPORTED] = { "LOGS_GET_SUPPORTED", cmd_logs_get_supported,
0, 0 },
[LOGS][GET_LOG] = { "LOGS_GET_LOG", cmd_logs_get_log, 0x18, 0 },
+ [FEATURES][GET_SUPPORTED] = { "FEATURES_GET_SUPPORTED",
+ cmd_features_get_supported, 0x8, 0 },
+ [FEATURES][GET_FEATURE] = { "FEATURES_GET_FEATURE",
+ cmd_features_get_feature, 0x15, 0 },
+ [FEATURES][SET_FEATURE] = { "FEATURES_SET_FEATURE",
+ cmd_features_set_feature,
+ ~0,
+ (CXL_MBOX_IMMEDIATE_CONFIG_CHANGE |
+ CXL_MBOX_IMMEDIATE_DATA_CHANGE |
+ CXL_MBOX_IMMEDIATE_POLICY_CHANGE |
+ CXL_MBOX_IMMEDIATE_LOG_CHANGE |
+ CXL_MBOX_SECURITY_STATE_CHANGE)},
[IDENTIFY][MEMORY_DEVICE] = { "IDENTIFY_MEMORY_DEVICE",
cmd_identify_memory_device, 0, 0 },
[CCLS][GET_PARTITION_INFO] = { "CCLS_GET_PARTITION_INFO",
cmd_ccls_get_partition_info, 0, 0 },
[CCLS][GET_LSA] = { "CCLS_GET_LSA", cmd_ccls_get_lsa, 8, 0 },
[CCLS][SET_LSA] = { "CCLS_SET_LSA", cmd_ccls_set_lsa,
- ~0, IMMEDIATE_CONFIG_CHANGE | IMMEDIATE_DATA_CHANGE },
+ ~0, CXL_MBOX_IMMEDIATE_CONFIG_CHANGE | CXL_MBOX_IMMEDIATE_DATA_CHANGE },
[SANITIZE][OVERWRITE] = { "SANITIZE_OVERWRITE", cmd_sanitize_overwrite, 0,
- IMMEDIATE_DATA_CHANGE | SECURITY_STATE_CHANGE | BACKGROUND_OPERATION },
+ (CXL_MBOX_IMMEDIATE_DATA_CHANGE |
+ CXL_MBOX_SECURITY_STATE_CHANGE |
+ CXL_MBOX_BACKGROUND_OPERATION)},
[PERSISTENT_MEM][GET_SECURITY_STATE] = { "GET_SECURITY_STATE",
cmd_get_security_state, 0, 0 },
[MEDIA_AND_POISON][GET_POISON_LIST] = { "MEDIA_AND_POISON_GET_POISON_LIST",
@@ -1864,6 +2717,14 @@
cmd_media_inject_poison, 8, 0 },
[MEDIA_AND_POISON][CLEAR_POISON] = { "MEDIA_AND_POISON_CLEAR_POISON",
cmd_media_clear_poison, 72, 0 },
+ [MEDIA_AND_POISON][GET_SCAN_MEDIA_CAPABILITIES] = {
+ "MEDIA_AND_POISON_GET_SCAN_MEDIA_CAPABILITIES",
+ cmd_media_get_scan_media_capabilities, 16, 0 },
+ [MEDIA_AND_POISON][SCAN_MEDIA] = { "MEDIA_AND_POISON_SCAN_MEDIA",
+ cmd_media_scan_media, 17, CXL_MBOX_BACKGROUND_OPERATION },
+ [MEDIA_AND_POISON][GET_SCAN_MEDIA_RESULTS] = {
+ "MEDIA_AND_POISON_GET_SCAN_MEDIA_RESULTS",
+ cmd_media_get_scan_media_results, 0, 0 },
};
static const struct cxl_cmd cxl_cmd_set_dcd[256][256] = {
@@ -1874,10 +2735,10 @@
8, 0 },
[DCD_CONFIG][ADD_DYN_CAP_RSP] = {
"DCD_ADD_DYNAMIC_CAPACITY_RESPONSE", cmd_dcd_add_dyn_cap_rsp,
- ~0, IMMEDIATE_DATA_CHANGE },
+ ~0, CXL_MBOX_IMMEDIATE_DATA_CHANGE },
[DCD_CONFIG][RELEASE_DYN_CAP] = {
"DCD_RELEASE_DYNAMIC_CAPACITY", cmd_dcd_release_dyn_cap,
- ~0, IMMEDIATE_DATA_CHANGE },
+ ~0, CXL_MBOX_IMMEDIATE_DATA_CHANGE },
};
static const struct cxl_cmd cxl_cmd_set_sw[256][256] = {
@@ -1885,8 +2746,8 @@
[INFOSTAT][BACKGROUND_OPERATION_STATUS] = { "BACKGROUND_OPERATION_STATUS",
cmd_infostat_bg_op_sts, 0, 0 },
[TIMESTAMP][GET] = { "TIMESTAMP_GET", cmd_timestamp_get, 0, 0 },
- [TIMESTAMP][SET] = { "TIMESTAMP_SET", cmd_timestamp_set, 0,
- IMMEDIATE_POLICY_CHANGE },
+ [TIMESTAMP][SET] = { "TIMESTAMP_SET", cmd_timestamp_set, 8,
+ CXL_MBOX_IMMEDIATE_POLICY_CHANGE },
[LOGS][GET_SUPPORTED] = { "LOGS_GET_SUPPORTED", cmd_logs_get_supported, 0,
0 },
[LOGS][GET_LOG] = { "LOGS_GET_LOG", cmd_logs_get_log, 0x18, 0 },
@@ -1913,6 +2774,7 @@
int ret;
const struct cxl_cmd *cxl_cmd;
opcode_handler h;
+ CXLDeviceState *cxl_dstate;
*len_out = 0;
cxl_cmd = &cci->cxl_cmd_set[set][cmd];
@@ -1928,28 +2790,34 @@
}
/* Only one bg command at a time */
- if ((cxl_cmd->effect & BACKGROUND_OPERATION) &&
+ if ((cxl_cmd->effect & CXL_MBOX_BACKGROUND_OPERATION) &&
cci->bg.runtime > 0) {
return CXL_MBOX_BUSY;
}
- /* forbid any selected commands while overwriting */
- if (sanitize_running(cci)) {
- if (h == cmd_events_get_records ||
- h == cmd_ccls_get_partition_info ||
- h == cmd_ccls_set_lsa ||
- h == cmd_ccls_get_lsa ||
- h == cmd_logs_get_log ||
- h == cmd_media_get_poison_list ||
- h == cmd_media_inject_poison ||
- h == cmd_media_clear_poison ||
- h == cmd_sanitize_overwrite) {
- return CXL_MBOX_MEDIA_DISABLED;
+ /* forbid any selected commands while the media is disabled */
+ if (object_dynamic_cast(OBJECT(cci->d), TYPE_CXL_TYPE3)) {
+ cxl_dstate = &CXL_TYPE3(cci->d)->cxl_dstate;
+
+ if (cxl_dev_media_disabled(cxl_dstate)) {
+ if (h == cmd_events_get_records ||
+ h == cmd_ccls_get_partition_info ||
+ h == cmd_ccls_set_lsa ||
+ h == cmd_ccls_get_lsa ||
+ h == cmd_logs_get_log ||
+ h == cmd_media_get_poison_list ||
+ h == cmd_media_inject_poison ||
+ h == cmd_media_clear_poison ||
+ h == cmd_sanitize_overwrite ||
+ h == cmd_firmware_update_transfer ||
+ h == cmd_firmware_update_activate) {
+ return CXL_MBOX_MEDIA_DISABLED;
+ }
}
}
ret = (*h)(cxl_cmd, pl_in, len_in, pl_out, len_out, cci);
- if ((cxl_cmd->effect & BACKGROUND_OPERATION) &&
+ if ((cxl_cmd->effect & CXL_MBOX_BACKGROUND_OPERATION) &&
ret == CXL_MBOX_BG_STARTED) {
*bg_started = true;
} else {
@@ -1987,6 +2855,9 @@
cci->bg.complete_pct = 100;
cci->bg.ret_code = ret;
switch (cci->bg.opcode) {
+ case 0x0201: /* fw transfer */
+ __do_firmware_xfer(cci);
+ break;
case 0x4400: /* sanitize */
{
CXLType3Dev *ct3d = CXL_TYPE3(cci->d);
@@ -1995,8 +2866,13 @@
cxl_dev_enable_media(&ct3d->cxl_dstate);
}
break;
- case 0x4304: /* TODO: scan media */
+ case 0x4304: /* scan media */
+ {
+ CXLType3Dev *ct3d = CXL_TYPE3(cci->d);
+
+ __do_scan_media(ct3d);
break;
+ }
default:
__builtin_unreachable();
break;
@@ -2053,6 +2929,10 @@
cci->bg.runtime = 0;
cci->bg.timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
bg_timercb, cci);
+
+ memset(&cci->fw, 0, sizeof(cci->fw));
+ cci->fw.active_slot = 1;
+ cci->fw.slot[cci->fw.active_slot - 1] = true;
}
static void cxl_copy_cci_commands(CXLCCI *cci, const struct cxl_cmd (*cxl_cmds)[256])
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index f4e366f..5d4bd2b 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -1536,7 +1536,8 @@
.fw_unplugs_cpu = pm->smi_on_cpu_unplug,
};
build_cpus_aml(dsdt, machine, opts, pc_madt_cpu_entry,
- pm->cpu_hp_io_base, "\\_SB.PCI0", "\\_GPE._E02");
+ pm->cpu_hp_io_base, "\\_SB.PCI0", "\\_GPE._E02",
+ AML_SYSTEM_IO);
}
if (pcms->memhp_io_base && nr_mem) {
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 37c21a0a..be0cb39 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -358,7 +358,7 @@
{
struct vtd_iotlb_key key;
VTDIOTLBEntry *entry;
- int level;
+ unsigned level;
for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; level++) {
key.gfn = vtd_get_iotlb_gfn(addr, level);
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index f8cf99b..5f32c36 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -264,10 +264,10 @@
#define VTD_FRCD_FR(val) (((val) & 0xffULL) << 32)
#define VTD_FRCD_SID_MASK 0xffffULL
#define VTD_FRCD_SID(val) ((val) & VTD_FRCD_SID_MASK)
+#define VTD_FRCD_PV(val) (((val) & 0xffffULL) << 40)
+#define VTD_FRCD_PP(val) (((val) & 0x1ULL) << 31)
/* For the low 64-bit of 128-bit */
#define VTD_FRCD_FI(val) ((val) & ~0xfffULL)
-#define VTD_FRCD_PV(val) (((val) & 0xffffULL) << 40)
-#define VTD_FRCD_PP(val) (((val) & 0x1) << 31)
#define VTD_FRCD_IR_IDX(val) (((val) & 0xffffULL) << 48)
/* DMA Remapping Fault Conditions */
@@ -436,7 +436,7 @@
uint16_t domain_id;
uint32_t pasid;
uint64_t addr;
- uint8_t mask;
+ uint64_t mask;
};
typedef struct VTDIOTLBPageInvInfo VTDIOTLBPageInvInfo;
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 9445b07..d9e6924 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -495,6 +495,7 @@
pc_i440fx_machine_9_1_options(m);
m->alias = NULL;
m->is_default = false;
+ m->smbios_memory_device_size = 16 * GiB;
compat_props_add(m->compat_props, hw_compat_9_0, hw_compat_9_0_len);
compat_props_add(m->compat_props, pc_compat_9_0, pc_compat_9_0_len);
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 71d3c6d..9d108b1 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -374,6 +374,7 @@
PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
pc_q35_machine_9_1_options(m);
m->alias = NULL;
+ m->smbios_memory_device_size = 16 * GiB;
compat_props_add(m->compat_props, hw_compat_9_0, hw_compat_9_0_len);
compat_props_add(m->compat_props, pc_compat_9_0, pc_compat_9_0_len);
pcmc->isa_bios_alias = false;
diff --git a/hw/i386/sgx.c b/hw/i386/sgx.c
index a14a84b..849472a 100644
--- a/hw/i386/sgx.c
+++ b/hw/i386/sgx.c
@@ -268,10 +268,12 @@
bool sgx_epc_get_section(int section_nr, uint64_t *addr, uint64_t *size)
{
- PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
+ PCMachineState *pcms =
+ (PCMachineState *)object_dynamic_cast(qdev_get_machine(),
+ TYPE_PC_MACHINE);
SGXEPCDevice *epc;
- if (pcms->sgx_epc.size == 0 || pcms->sgx_epc.nr_sections <= section_nr) {
+ if (!pcms || pcms->sgx_epc.size == 0 || pcms->sgx_epc.nr_sections <= section_nr) {
return true;
}
diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index 35ac598..d648192 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -737,6 +737,11 @@
error_setg(errp, "volatile memdev must have backing device");
return false;
}
+ if (host_memory_backend_is_mapped(ct3d->hostvmem)) {
+ error_setg(errp, "memory backend %s can't be used multiple times.",
+ object_get_canonical_path_component(OBJECT(ct3d->hostvmem)));
+ return false;
+ }
memory_region_set_nonvolatile(vmr, false);
memory_region_set_enabled(vmr, true);
host_memory_backend_set_mapped(ct3d->hostvmem, true);
@@ -760,6 +765,11 @@
error_setg(errp, "persistent memdev must have backing device");
return false;
}
+ if (host_memory_backend_is_mapped(ct3d->hostpmem)) {
+ error_setg(errp, "memory backend %s can't be used multiple times.",
+ object_get_canonical_path_component(OBJECT(ct3d->hostpmem)));
+ return false;
+ }
memory_region_set_nonvolatile(pmr, true);
memory_region_set_enabled(pmr, true);
host_memory_backend_set_mapped(ct3d->hostpmem, true);
@@ -790,6 +800,11 @@
return false;
}
+ if (host_memory_backend_is_mapped(ct3d->dc.host_dc)) {
+ error_setg(errp, "memory backend %s can't be used multiple times.",
+ object_get_canonical_path_component(OBJECT(ct3d->dc.host_dc)));
+ return false;
+ }
/*
* Set DC regions as volatile for now, non-volatile support can
* be added in the future if needed.
@@ -829,6 +844,7 @@
uint8_t *pci_conf = pci_dev->config;
unsigned short msix_num = 6;
int i, rc;
+ uint16_t count;
QTAILQ_INIT(&ct3d->error_list);
@@ -893,6 +909,28 @@
}
cxl_event_init(&ct3d->cxl_dstate, 2);
+ /* Set default value for patrol scrub attributes */
+ ct3d->patrol_scrub_attrs.scrub_cycle_cap =
+ CXL_MEMDEV_PS_SCRUB_CYCLE_CHANGE_CAP_DEFAULT |
+ CXL_MEMDEV_PS_SCRUB_REALTIME_REPORT_CAP_DEFAULT;
+ ct3d->patrol_scrub_attrs.scrub_cycle =
+ CXL_MEMDEV_PS_CUR_SCRUB_CYCLE_DEFAULT |
+ (CXL_MEMDEV_PS_MIN_SCRUB_CYCLE_DEFAULT << 8);
+ ct3d->patrol_scrub_attrs.scrub_flags = CXL_MEMDEV_PS_ENABLE_DEFAULT;
+
+ /* Set default value for DDR5 ECS read attributes */
+ for (count = 0; count < CXL_ECS_NUM_MEDIA_FRUS; count++) {
+ ct3d->ecs_attrs[count].ecs_log_cap =
+ CXL_ECS_LOG_ENTRY_TYPE_DEFAULT;
+ ct3d->ecs_attrs[count].ecs_cap =
+ CXL_ECS_REALTIME_REPORT_CAP_DEFAULT;
+ ct3d->ecs_attrs[count].ecs_config =
+ CXL_ECS_THRESHOLD_COUNT_DEFAULT |
+ (CXL_ECS_MODE_DEFAULT << 3);
+ /* Reserved */
+ ct3d->ecs_attrs[count].ecs_flags = 0;
+ }
+
return;
err_release_cdat:
@@ -1127,7 +1165,7 @@
return MEMTX_ERROR;
}
- if (sanitize_running(&ct3d->cci)) {
+ if (cxl_dev_media_disabled(&ct3d->cxl_dstate)) {
qemu_guest_getrandom_nofail(data, size);
return MEMTX_OK;
}
@@ -1149,7 +1187,7 @@
return MEMTX_ERROR;
}
- if (sanitize_running(&ct3d->cci)) {
+ if (cxl_dev_media_disabled(&ct3d->cxl_dstate)) {
return MEMTX_OK;
}
@@ -1304,6 +1342,12 @@
cxl_device_get_timestamp(&ct3d->cxl_dstate);
}
+void cxl_clear_poison_list_overflowed(CXLType3Dev *ct3d)
+{
+    /* Reset both halves of the overflow state: timestamp and flag */
+    ct3d->poison_list_overflow_ts = 0;
+    ct3d->poison_list_overflowed = false;
+}
+
void qmp_cxl_inject_poison(const char *path, uint64_t start, uint64_t length,
Error **errp)
{
@@ -1340,19 +1384,21 @@
}
}
- if (ct3d->poison_list_cnt == CXL_POISON_LIST_LIMIT) {
- cxl_set_poison_list_overflowed(ct3d);
- return;
- }
-
p = g_new0(CXLPoison, 1);
p->length = length;
p->start = start;
/* Different from injected via the mbox */
p->type = CXL_POISON_TYPE_INTERNAL;
- QLIST_INSERT_HEAD(&ct3d->poison_list, p, node);
- ct3d->poison_list_cnt++;
+ if (ct3d->poison_list_cnt < CXL_POISON_LIST_LIMIT) {
+ QLIST_INSERT_HEAD(&ct3d->poison_list, p, node);
+ ct3d->poison_list_cnt++;
+ } else {
+ if (!ct3d->poison_list_overflowed) {
+ cxl_set_poison_list_overflowed(ct3d);
+ }
+ QLIST_INSERT_HEAD(&ct3d->poison_list_bkp, p, node);
+ }
}
/* For uncorrectable errors include support for multiple header recording */
diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index 18898af..a788e69 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -48,6 +48,7 @@
VIRTIO_F_IOMMU_PLATFORM,
VIRTIO_F_RING_PACKED,
VIRTIO_F_RING_RESET,
+ VIRTIO_F_IN_ORDER,
VIRTIO_F_NOTIFICATION_DATA,
VIRTIO_NET_F_HASH_REPORT,
VHOST_INVALID_FEATURE_BIT
@@ -78,6 +79,7 @@
VIRTIO_F_IOMMU_PLATFORM,
VIRTIO_F_RING_PACKED,
VIRTIO_F_RING_RESET,
+ VIRTIO_F_IN_ORDER,
VIRTIO_NET_F_RSS,
VIRTIO_NET_F_HASH_REPORT,
VIRTIO_NET_F_GUEST_USO4,
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 8d25174..e86ea2e 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -203,6 +203,7 @@
#include "sysemu/hostmem.h"
#include "hw/pci/msix.h"
#include "hw/pci/pcie_sriov.h"
+#include "sysemu/spdm-socket.h"
#include "migration/vmstate.h"
#include "nvme.h"
@@ -8315,6 +8316,27 @@
return 0;
}
+static bool pcie_doe_spdm_rsp(DOECap *doe_cap)
+{
+    /* Request object written to the DOE mailbox and its length times 4 */
+    void *request = pcie_doe_get_write_mbox_ptr(doe_cap);
+    uint32_t request_len = pcie_doe_get_obj_len(request) * 4;
+    /* The response lands directly in the read mailbox */
+    void *response = doe_cap->read_mbox;
+    uint32_t response_cap = SPDM_SOCKET_MAX_MESSAGE_BUFFER_SIZE;
+    uint32_t nbytes;
+
+    nbytes = spdm_socket_rsp(doe_cap->spdm_socket,
+                             SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE,
+                             request, request_len, response, response_cap);
+
+    /* read_mbox_len grows by the received size rounded up to units of 4 */
+    doe_cap->read_mbox_len += DIV_ROUND_UP(nbytes, 4);
+
+    /* A zero-length reply is reported as failure */
+    return nbytes != 0;
+}
+
+static DOEProtocol doe_spdm_prot[] = { /* both entries share pcie_doe_spdm_rsp as handler */
+ { PCI_VENDOR_ID_PCI_SIG, PCI_SIG_DOE_CMA, pcie_doe_spdm_rsp },
+ { PCI_VENDOR_ID_PCI_SIG, PCI_SIG_DOE_SECURED_CMA, pcie_doe_spdm_rsp },
+ { } /* empty entry; presumably the end-of-table marker - TODO confirm */
+};
+
static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
{
ERRP_GUARD();
@@ -8402,6 +8424,25 @@
nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
+ pcie_cap_deverr_init(pci_dev);
+
+ /* DOE Initialisation */
+ if (pci_dev->spdm_port) {
+ uint16_t doe_offset = n->params.sriov_max_vfs ?
+ PCI_CONFIG_SPACE_SIZE + PCI_ARI_SIZEOF
+ : PCI_CONFIG_SPACE_SIZE;
+
+ pcie_doe_init(pci_dev, &pci_dev->doe_spdm, doe_offset,
+ doe_spdm_prot, true, 0);
+
+ pci_dev->doe_spdm.spdm_socket = spdm_socket_connect(pci_dev->spdm_port,
+ errp);
+
+ if (pci_dev->doe_spdm.spdm_socket < 0) {
+ return false;
+ }
+ }
+
if (n->params.cmb_size_mb) {
nvme_init_cmb(n, pci_dev);
}
@@ -8650,6 +8691,11 @@
g_free(n->cmb.buf);
}
+ if (pci_dev->doe_spdm.spdm_socket > 0) {
+ spdm_socket_close(pci_dev->doe_spdm.spdm_socket,
+ SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE);
+ }
+
if (n->pmr.dev) {
host_memory_backend_set_mapped(n->pmr.dev, false);
}
@@ -8695,6 +8741,7 @@
DEFINE_PROP_BOOL("msix-exclusive-bar", NvmeCtrl, params.msix_exclusive_bar,
false),
DEFINE_PROP_UINT16("mqes", NvmeCtrl, params.mqes, 0x7ff),
+ DEFINE_PROP_UINT16("spdm_port", PCIDevice, spdm_port, 0),
DEFINE_PROP_END_OF_LIST(),
};
@@ -8766,11 +8813,25 @@
{
uint16_t old_num_vfs = pcie_sriov_num_vfs(dev);
+ if (pcie_find_capability(dev, PCI_EXT_CAP_ID_DOE)) {
+ pcie_doe_write_config(&dev->doe_spdm, address, val, len);
+ }
pci_default_write_config(dev, address, val, len);
pcie_cap_flr_write_config(dev, address, val, len);
nvme_sriov_post_write_config(dev, old_num_vfs);
}
+static uint32_t nvme_pci_read_config(PCIDevice *dev, uint32_t address, int len)
+{
+    uint32_t doe_val;
+
+    /* Without an SPDM port or a DOE capability, use the default read */
+    if (!dev->spdm_port || !pcie_find_capability(dev, PCI_EXT_CAP_ID_DOE)) {
+        return pci_default_read_config(dev, address, len);
+    }
+
+    /* Let the DOE code answer first; it reports whether it handled it */
+    if (pcie_doe_read_config(&dev->doe_spdm, address, len, &doe_val)) {
+        return doe_val;
+    }
+
+    return pci_default_read_config(dev, address, len);
+}
+
static const VMStateDescription nvme_vmstate = {
.name = "nvme",
.unmigratable = 1,
@@ -8783,6 +8844,7 @@
pc->realize = nvme_realize;
pc->config_write = nvme_pci_write_config;
+ pc->config_read = nvme_pci_read_config;
pc->exit = nvme_exit;
pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
pc->revision = 2;
diff --git a/hw/pci-host/gpex-acpi.c b/hw/pci-host/gpex-acpi.c
index f69413e..391fabb 100644
--- a/hw/pci-host/gpex-acpi.c
+++ b/hw/pci-host/gpex-acpi.c
@@ -7,7 +7,8 @@
#include "hw/pci/pcie_host.h"
#include "hw/acpi/cxl.h"
-static void acpi_dsdt_add_pci_route_table(Aml *dev, uint32_t irq)
+static void acpi_dsdt_add_pci_route_table(Aml *dev, uint32_t irq,
+ Aml *scope, uint8_t bus_num)
{
Aml *method, *crs;
int i, slot_no;
@@ -20,7 +21,7 @@
Aml *pkg = aml_package(4);
aml_append(pkg, aml_int((slot_no << 16) | 0xFFFF));
aml_append(pkg, aml_int(i));
- aml_append(pkg, aml_name("GSI%d", gsi));
+ aml_append(pkg, aml_name("L%.02X%X", bus_num, gsi));
aml_append(pkg, aml_int(0));
aml_append(rt_pkg, pkg);
}
@@ -30,7 +31,7 @@
/* Create GSI link device */
for (i = 0; i < PCI_NUM_PINS; i++) {
uint32_t irqs = irq + i;
- Aml *dev_gsi = aml_device("GSI%d", i);
+ Aml *dev_gsi = aml_device("L%.02X%X", bus_num, i);
aml_append(dev_gsi, aml_name_decl("_HID", aml_string("PNP0C0F")));
aml_append(dev_gsi, aml_name_decl("_UID", aml_int(i)));
crs = aml_resource_template();
@@ -45,7 +46,7 @@
aml_append(dev_gsi, aml_name_decl("_CRS", crs));
method = aml_method("_SRS", 1, AML_NOTSERIALIZED);
aml_append(dev_gsi, method);
- aml_append(dev, dev_gsi);
+ aml_append(scope, dev_gsi);
}
}
@@ -174,7 +175,7 @@
aml_append(dev, aml_name_decl("_PXM", aml_int(numa_node)));
}
- acpi_dsdt_add_pci_route_table(dev, cfg->irq);
+ acpi_dsdt_add_pci_route_table(dev, cfg->irq, scope, bus_num);
/*
* Resources defined for PXBs are composed of the following parts:
@@ -205,7 +206,7 @@
aml_append(dev, aml_name_decl("_STR", aml_unicode("PCIe 0 Device")));
aml_append(dev, aml_name_decl("_CCA", aml_int(1)));
- acpi_dsdt_add_pci_route_table(dev, cfg->irq);
+ acpi_dsdt_add_pci_route_table(dev, cfg->irq, scope, 0);
method = aml_method("_CBA", 0, AML_NOTSERIALIZED);
aml_append(method, aml_return(aml_int(cfg->ecam.base)));
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 4c7be52..8ad5d7e 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -85,6 +85,7 @@
QEMU_PCIE_ERR_UNC_MASK_BITNR, true),
DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present,
QEMU_PCIE_ARI_NEXTFN_1_BITNR, false),
+ DEFINE_PROP_STRING("sriov-pf", PCIDevice, sriov_pf),
DEFINE_PROP_END_OF_LIST()
};
@@ -959,13 +960,8 @@
dev->config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
}
- /*
- * With SR/IOV and ARI, a device at function 0 need not be a multifunction
- * device, as it may just be a VF that ended up with function 0 in
- * the legacy PCI interpretation. Avoid failing in such cases:
- */
- if (pci_is_vf(dev) &&
- dev->exp.sriov_vf.pf->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
+ /* SR/IOV is not handled here. */
+ if (pci_is_vf(dev)) {
return;
}
@@ -998,7 +994,8 @@
}
/* function 0 indicates single function, so function > 0 must be NULL */
for (func = 1; func < PCI_FUNC_MAX; ++func) {
- if (bus->devices[PCI_DEVFN(slot, func)]) {
+ PCIDevice *device = bus->devices[PCI_DEVFN(slot, func)];
+ if (device && !pci_is_vf(device)) {
error_setg(errp, "PCI: %x.0 indicates single function, "
"but %x.%x is already populated.",
slot, slot, func);
@@ -1283,6 +1280,7 @@
pci_unregister_io_regions(pci_dev);
pci_del_option_rom(pci_dev);
+ pcie_sriov_unregister_device(pci_dev);
if (pc->exit) {
pc->exit(pci_dev);
@@ -1314,7 +1312,6 @@
pcibus_t size = memory_region_size(memory);
uint8_t hdr_type;
- assert(!pci_is_vf(pci_dev)); /* VFs must use pcie_sriov_vf_register_bar */
assert(region_num >= 0);
assert(region_num < PCI_NUM_REGIONS);
assert(is_power_of_2(size));
@@ -1325,7 +1322,6 @@
assert(hdr_type != PCI_HEADER_TYPE_BRIDGE || region_num < 2);
r = &pci_dev->io_regions[region_num];
- r->addr = PCI_BAR_UNMAPPED;
r->size = size;
r->type = type;
r->memory = memory;
@@ -1333,22 +1329,35 @@
? pci_get_bus(pci_dev)->address_space_io
: pci_get_bus(pci_dev)->address_space_mem;
- wmask = ~(size - 1);
- if (region_num == PCI_ROM_SLOT) {
- /* ROM enable bit is writable */
- wmask |= PCI_ROM_ADDRESS_ENABLE;
- }
+ if (pci_is_vf(pci_dev)) {
+ PCIDevice *pf = pci_dev->exp.sriov_vf.pf;
+ assert(!pf || type == pf->exp.sriov_pf.vf_bar_type[region_num]);
- addr = pci_bar(pci_dev, region_num);
- pci_set_long(pci_dev->config + addr, type);
-
- if (!(r->type & PCI_BASE_ADDRESS_SPACE_IO) &&
- r->type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
- pci_set_quad(pci_dev->wmask + addr, wmask);
- pci_set_quad(pci_dev->cmask + addr, ~0ULL);
+ r->addr = pci_bar_address(pci_dev, region_num, r->type, r->size);
+ if (r->addr != PCI_BAR_UNMAPPED) {
+ memory_region_add_subregion_overlap(r->address_space,
+ r->addr, r->memory, 1);
+ }
} else {
- pci_set_long(pci_dev->wmask + addr, wmask & 0xffffffff);
- pci_set_long(pci_dev->cmask + addr, 0xffffffff);
+ r->addr = PCI_BAR_UNMAPPED;
+
+ wmask = ~(size - 1);
+ if (region_num == PCI_ROM_SLOT) {
+ /* ROM enable bit is writable */
+ wmask |= PCI_ROM_ADDRESS_ENABLE;
+ }
+
+ addr = pci_bar(pci_dev, region_num);
+ pci_set_long(pci_dev->config + addr, type);
+
+ if (!(r->type & PCI_BASE_ADDRESS_SPACE_IO) &&
+ r->type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
+ pci_set_quad(pci_dev->wmask + addr, wmask);
+ pci_set_quad(pci_dev->cmask + addr, ~0ULL);
+ } else {
+ pci_set_long(pci_dev->wmask + addr, wmask & 0xffffffff);
+ pci_set_long(pci_dev->cmask + addr, 0xffffffff);
+ }
}
}
@@ -1437,7 +1446,11 @@
pci_get_word(pf->config + sriov_cap + PCI_SRIOV_VF_OFFSET);
uint16_t vf_stride =
pci_get_word(pf->config + sriov_cap + PCI_SRIOV_VF_STRIDE);
- uint32_t vf_num = (d->devfn - (pf->devfn + vf_offset)) / vf_stride;
+ uint32_t vf_num = d->devfn - (pf->devfn + vf_offset);
+
+ if (vf_num) {
+ vf_num /= vf_stride;
+ }
if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
new_addr = pci_get_quad(pf->config + bar);
@@ -2105,6 +2118,11 @@
}
}
+ if (!pcie_sriov_register_device(pci_dev, errp)) {
+ pci_qdev_unrealize(DEVICE(pci_dev));
+ return;
+ }
+
/*
* A PCIe Downstream Port that do not have ARI Forwarding enabled must
* associate only Device 0 with the device attached to the bus
diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c
index 56523ab..0fc9f81 100644
--- a/hw/pci/pcie_sriov.c
+++ b/hw/pci/pcie_sriov.c
@@ -20,6 +20,8 @@
#include "qapi/error.h"
#include "trace.h"
+static GHashTable *pfs;
+
static void unparent_vfs(PCIDevice *dev, uint16_t total_vfs)
{
for (uint16_t i = 0; i < total_vfs; i++) {
@@ -31,17 +33,62 @@
dev->exp.sriov_pf.vf = NULL;
}
-bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset,
- const char *vfname, uint16_t vf_dev_id,
- uint16_t init_vfs, uint16_t total_vfs,
- uint16_t vf_offset, uint16_t vf_stride,
- Error **errp)
+static void clear_ctrl_vfe(PCIDevice *dev)
{
- BusState *bus = qdev_get_parent_bus(&dev->qdev);
- int32_t devfn = dev->devfn + vf_offset;
+ uint8_t *ctrl = dev->config + dev->exp.sriov_cap + PCI_SRIOV_CTRL;
+ pci_set_word(ctrl, pci_get_word(ctrl) & ~PCI_SRIOV_CTRL_VFE);
+}
+
+static void register_vfs(PCIDevice *dev)
+{
+ uint16_t num_vfs;
+ uint16_t i;
+ uint16_t sriov_cap = dev->exp.sriov_cap;
+
+ assert(sriov_cap > 0);
+ num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
+ if (num_vfs > pci_get_word(dev->config + sriov_cap + PCI_SRIOV_TOTAL_VF)) {
+ clear_ctrl_vfe(dev);
+ return;
+ }
+
+ trace_sriov_register_vfs(dev->name, PCI_SLOT(dev->devfn),
+ PCI_FUNC(dev->devfn), num_vfs);
+ for (i = 0; i < num_vfs; i++) {
+ pci_set_enabled(dev->exp.sriov_pf.vf[i], true);
+ }
+}
+
+static void unregister_vfs(PCIDevice *dev)
+{
+ uint16_t i;
+ uint8_t *cfg = dev->config + dev->exp.sriov_cap;
+
+ trace_sriov_unregister_vfs(dev->name, PCI_SLOT(dev->devfn),
+ PCI_FUNC(dev->devfn));
+ for (i = 0; i < pci_get_word(cfg + PCI_SRIOV_TOTAL_VF); i++) {
+ pci_set_enabled(dev->exp.sriov_pf.vf[i], false);
+ }
+}
+
+static bool pcie_sriov_pf_init_common(PCIDevice *dev, uint16_t offset,
+ uint16_t vf_dev_id, uint16_t init_vfs,
+ uint16_t total_vfs, uint16_t vf_offset,
+ uint16_t vf_stride, Error **errp)
+{
uint8_t *cfg = dev->config + offset;
uint8_t *wmask;
+ if (!pci_is_express(dev)) {
+ error_setg(errp, "PCI Express is required for SR-IOV PF");
+ return false;
+ }
+
+ if (pci_is_vf(dev)) {
+ error_setg(errp, "a device cannot be both an SR-IOV PF and a VF");
+ return false;
+ }
+
if (total_vfs) {
uint16_t ari_cap = pcie_find_capability(dev, PCI_EXT_CAP_ID_ARI);
uint16_t first_vf_devfn = dev->devfn + vf_offset;
@@ -90,6 +137,28 @@
qdev_prop_set_bit(&dev->qdev, "multifunction", true);
+ return true;
+}
+
+bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset,
+ const char *vfname, uint16_t vf_dev_id,
+ uint16_t init_vfs, uint16_t total_vfs,
+ uint16_t vf_offset, uint16_t vf_stride,
+ Error **errp)
+{
+ BusState *bus = qdev_get_parent_bus(&dev->qdev);
+ int32_t devfn = dev->devfn + vf_offset;
+
+ if (pfs && g_hash_table_contains(pfs, dev->qdev.id)) {
+ error_setg(errp, "attaching user-created SR-IOV VF unsupported");
+ return false;
+ }
+
+ if (!pcie_sriov_pf_init_common(dev, offset, vf_dev_id, init_vfs,
+ total_vfs, vf_offset, vf_stride, errp)) {
+ return false;
+ }
+
dev->exp.sriov_pf.vf = g_new(PCIDevice *, total_vfs);
for (uint16_t i = 0; i < total_vfs; i++) {
@@ -119,7 +188,24 @@
{
uint8_t *cfg = dev->config + dev->exp.sriov_cap;
- unparent_vfs(dev, pci_get_word(cfg + PCI_SRIOV_TOTAL_VF));
+ if (dev->exp.sriov_pf.vf_user_created) {
+ uint16_t ven_id = pci_get_word(dev->config + PCI_VENDOR_ID);
+ uint16_t total_vfs = pci_get_word(dev->config + PCI_SRIOV_TOTAL_VF);
+ uint16_t vf_dev_id = pci_get_word(dev->config + PCI_SRIOV_VF_DID);
+
+ unregister_vfs(dev);
+
+ for (uint16_t i = 0; i < total_vfs; i++) {
+ PCIDevice *vf = dev->exp.sriov_pf.vf[i];
+
+ vf->exp.sriov_vf.pf = NULL;
+
+ pci_config_set_vendor_id(vf->config, ven_id);
+ pci_config_set_device_id(vf->config, vf_dev_id);
+ }
+ } else {
+ unparent_vfs(dev, pci_get_word(cfg + PCI_SRIOV_TOTAL_VF));
+ }
}
void pcie_sriov_pf_init_vf_bar(PCIDevice *dev, int region_num,
@@ -152,74 +238,172 @@
void pcie_sriov_vf_register_bar(PCIDevice *dev, int region_num,
MemoryRegion *memory)
{
- PCIIORegion *r;
- PCIBus *bus = pci_get_bus(dev);
uint8_t type;
- pcibus_t size = memory_region_size(memory);
- assert(pci_is_vf(dev)); /* PFs must use pci_register_bar */
- assert(region_num >= 0);
- assert(region_num < PCI_NUM_REGIONS);
+ assert(dev->exp.sriov_vf.pf);
type = dev->exp.sriov_vf.pf->exp.sriov_pf.vf_bar_type[region_num];
- if (!is_power_of_2(size)) {
- error_report("%s: PCI region size must be a power"
- " of two - type=0x%x, size=0x%"FMT_PCIBUS,
- __func__, type, size);
- exit(1);
- }
-
- r = &dev->io_regions[region_num];
- r->memory = memory;
- r->address_space =
- type & PCI_BASE_ADDRESS_SPACE_IO
- ? bus->address_space_io
- : bus->address_space_mem;
- r->size = size;
- r->type = type;
-
- r->addr = pci_bar_address(dev, region_num, r->type, r->size);
- if (r->addr != PCI_BAR_UNMAPPED) {
- memory_region_add_subregion_overlap(r->address_space,
- r->addr, r->memory, 1);
- }
+ return pci_register_bar(dev, region_num, type, memory);
}
-static void clear_ctrl_vfe(PCIDevice *dev)
+static gint compare_vf_devfns(gconstpointer a, gconstpointer b)
{
- uint8_t *ctrl = dev->config + dev->exp.sriov_cap + PCI_SRIOV_CTRL;
- pci_set_word(ctrl, pci_get_word(ctrl) & ~PCI_SRIOV_CTRL_VFE);
+ return (*(PCIDevice **)a)->devfn - (*(PCIDevice **)b)->devfn;
}
-static void register_vfs(PCIDevice *dev)
+int16_t pcie_sriov_pf_init_from_user_created_vfs(PCIDevice *dev,
+ uint16_t offset,
+ Error **errp)
{
- uint16_t num_vfs;
+ GPtrArray *pf;
+ PCIDevice **vfs;
+ BusState *bus = qdev_get_parent_bus(DEVICE(dev));
+ uint16_t ven_id = pci_get_word(dev->config + PCI_VENDOR_ID);
+ uint16_t vf_dev_id;
+ uint16_t vf_offset;
+ uint16_t vf_stride;
uint16_t i;
- uint16_t sriov_cap = dev->exp.sriov_cap;
- assert(sriov_cap > 0);
- num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
- if (num_vfs > pci_get_word(dev->config + sriov_cap + PCI_SRIOV_TOTAL_VF)) {
- clear_ctrl_vfe(dev);
- return;
+ if (!pfs || !dev->qdev.id) {
+ return 0;
}
- trace_sriov_register_vfs(dev->name, PCI_SLOT(dev->devfn),
- PCI_FUNC(dev->devfn), num_vfs);
- for (i = 0; i < num_vfs; i++) {
- pci_set_enabled(dev->exp.sriov_pf.vf[i], true);
+ pf = g_hash_table_lookup(pfs, dev->qdev.id);
+ if (!pf) {
+ return 0;
}
+
+ if (pf->len > UINT16_MAX) {
+ error_setg(errp, "too many VFs");
+ return -1;
+ }
+
+ g_ptr_array_sort(pf, compare_vf_devfns);
+ vfs = (void *)pf->pdata;
+
+ if (vfs[0]->devfn <= dev->devfn) {
+ error_setg(errp, "a VF function number is less than the PF function number");
+ return -1;
+ }
+
+ vf_dev_id = pci_get_word(vfs[0]->config + PCI_DEVICE_ID);
+ vf_offset = vfs[0]->devfn - dev->devfn;
+ vf_stride = pf->len < 2 ? 0 : vfs[1]->devfn - vfs[0]->devfn;
+
+ for (i = 0; i < pf->len; i++) {
+ if (bus != qdev_get_parent_bus(&vfs[i]->qdev)) {
+ error_setg(errp, "SR-IOV VF parent bus mismatches with PF");
+ return -1;
+ }
+
+ if (ven_id != pci_get_word(vfs[i]->config + PCI_VENDOR_ID)) {
+ error_setg(errp, "SR-IOV VF vendor ID mismatches with PF");
+ return -1;
+ }
+
+ if (vf_dev_id != pci_get_word(vfs[i]->config + PCI_DEVICE_ID)) {
+ error_setg(errp, "inconsistent SR-IOV VF device IDs");
+ return -1;
+ }
+
+ for (size_t j = 0; j < PCI_NUM_REGIONS; j++) {
+ if (vfs[i]->io_regions[j].size != vfs[0]->io_regions[j].size ||
+ vfs[i]->io_regions[j].type != vfs[0]->io_regions[j].type) {
+ error_setg(errp, "inconsistent SR-IOV BARs");
+ return -1;
+ }
+ }
+
+ if (vfs[i]->devfn - vfs[0]->devfn != vf_stride * i) {
+ error_setg(errp, "inconsistent SR-IOV stride");
+ return -1;
+ }
+ }
+
+ if (!pcie_sriov_pf_init_common(dev, offset, vf_dev_id, pf->len,
+ pf->len, vf_offset, vf_stride, errp)) {
+ return -1;
+ }
+
+ for (i = 0; i < pf->len; i++) {
+ vfs[i]->exp.sriov_vf.pf = dev;
+ vfs[i]->exp.sriov_vf.vf_number = i;
+
+ /* set vid/did according to sr/iov spec - they are not used */
+ pci_config_set_vendor_id(vfs[i]->config, 0xffff);
+ pci_config_set_device_id(vfs[i]->config, 0xffff);
+ }
+
+ dev->exp.sriov_pf.vf = vfs;
+ dev->exp.sriov_pf.vf_user_created = true;
+
+ for (i = 0; i < PCI_NUM_REGIONS; i++) {
+ PCIIORegion *region = &vfs[0]->io_regions[i];
+
+ if (region->size) {
+ pcie_sriov_pf_init_vf_bar(dev, i, region->type, region->size);
+ }
+ }
+
+ return PCI_EXT_CAP_SRIOV_SIZEOF;
}
-static void unregister_vfs(PCIDevice *dev)
+bool pcie_sriov_register_device(PCIDevice *dev, Error **errp)
{
- uint16_t i;
- uint8_t *cfg = dev->config + dev->exp.sriov_cap;
+ if (!dev->exp.sriov_pf.vf && dev->qdev.id &&
+ pfs && g_hash_table_contains(pfs, dev->qdev.id)) {
+ error_setg(errp, "attaching user-created SR-IOV VF unsupported");
+ return false;
+ }
- trace_sriov_unregister_vfs(dev->name, PCI_SLOT(dev->devfn),
- PCI_FUNC(dev->devfn));
- for (i = 0; i < pci_get_word(cfg + PCI_SRIOV_TOTAL_VF); i++) {
- pci_set_enabled(dev->exp.sriov_pf.vf[i], false);
+ if (dev->sriov_pf) {
+ PCIDevice *pci_pf;
+ GPtrArray *pf;
+
+ if (!PCI_DEVICE_GET_CLASS(dev)->sriov_vf_user_creatable) {
+ error_setg(errp, "user cannot create SR-IOV VF with this device type");
+ return false;
+ }
+
+ if (!pci_is_express(dev)) {
+ error_setg(errp, "PCI Express is required for SR-IOV VF");
+ return false;
+ }
+
+ if (!pci_qdev_find_device(dev->sriov_pf, &pci_pf)) {
+ error_setg(errp, "PCI device specified as SR-IOV PF already exists");
+ return false;
+ }
+
+ if (!pfs) {
+ pfs = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL);
+ }
+
+ pf = g_hash_table_lookup(pfs, dev->sriov_pf);
+ if (!pf) {
+ pf = g_ptr_array_new();
+ g_hash_table_insert(pfs, g_strdup(dev->sriov_pf), pf);
+ }
+
+ g_ptr_array_add(pf, dev);
+ }
+
+ return true;
+}
+
+void pcie_sriov_unregister_device(PCIDevice *dev)
+{
+ if (dev->sriov_pf && pfs) {
+ GPtrArray *pf = g_hash_table_lookup(pfs, dev->sriov_pf);
+
+ if (pf) {
+ g_ptr_array_remove_fast(pf, dev);
+
+ if (!pf->len) {
+ g_hash_table_remove(pfs, dev->sriov_pf);
+ g_ptr_array_free(pf, FALSE);
+ }
+ }
}
}
@@ -306,7 +490,7 @@
uint16_t pcie_sriov_vf_number(PCIDevice *dev)
{
- assert(pci_is_vf(dev));
+ assert(dev->exp.sriov_vf.pf);
return dev->exp.sriov_vf.vf_number;
}
diff --git a/hw/riscv/virt-acpi-build.c b/hw/riscv/virt-acpi-build.c
index 0925528..36d6a3a 100644
--- a/hw/riscv/virt-acpi-build.c
+++ b/hw/riscv/virt-acpi-build.c
@@ -141,12 +141,36 @@
}
}
+static void acpi_dsdt_add_plic_aplic(Aml *scope, uint8_t socket_count,
+ uint64_t mmio_base, uint64_t mmio_size,
+ const char *hid)
+{
+ uint64_t plic_aplic_addr;
+ uint32_t gsi_base;
+ uint8_t socket;
+
+ for (socket = 0; socket < socket_count; socket++) {
+ plic_aplic_addr = mmio_base + mmio_size * socket;
+ gsi_base = VIRT_IRQCHIP_NUM_SOURCES * socket;
+ Aml *dev = aml_device("IC%.02X", socket);
+ aml_append(dev, aml_name_decl("_HID", aml_string("%s", hid)));
+ aml_append(dev, aml_name_decl("_UID", aml_int(socket)));
+ aml_append(dev, aml_name_decl("_GSB", aml_int(gsi_base)));
+
+ Aml *crs = aml_resource_template();
+ aml_append(crs, aml_memory32_fixed(plic_aplic_addr, mmio_size,
+ AML_READ_WRITE));
+ aml_append(dev, aml_name_decl("_CRS", crs));
+ aml_append(scope, dev);
+ }
+}
+
static void
acpi_dsdt_add_uart(Aml *scope, const MemMapEntry *uart_memmap,
uint32_t uart_irq)
{
Aml *dev = aml_device("COM0");
- aml_append(dev, aml_name_decl("_HID", aml_string("PNP0501")));
+ aml_append(dev, aml_name_decl("_HID", aml_string("RSCV0003")));
aml_append(dev, aml_name_decl("_UID", aml_int(0)));
Aml *crs = aml_resource_template();
@@ -411,6 +435,14 @@
socket_count = riscv_socket_count(ms);
+ if (s->aia_type == VIRT_AIA_TYPE_NONE) {
+ acpi_dsdt_add_plic_aplic(scope, socket_count, memmap[VIRT_PLIC].base,
+ memmap[VIRT_PLIC].size, "RSCV0001");
+ } else {
+ acpi_dsdt_add_plic_aplic(scope, socket_count, memmap[VIRT_APLIC_S].base,
+ memmap[VIRT_APLIC_S].size, "RSCV0002");
+ }
+
acpi_dsdt_add_uart(scope, &memmap[VIRT_UART0], UART0_IRQ);
if (socket_count == 1) {
diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c
index 3d5fe09..49cff2a 100644
--- a/hw/scsi/vhost-scsi.c
+++ b/hw/scsi/vhost-scsi.c
@@ -38,6 +38,7 @@
VIRTIO_RING_F_EVENT_IDX,
VIRTIO_SCSI_F_HOTPLUG,
VIRTIO_F_RING_RESET,
+ VIRTIO_F_IN_ORDER,
VIRTIO_F_NOTIFICATION_DATA,
VHOST_INVALID_FEATURE_BIT
};
diff --git a/hw/scsi/vhost-user-scsi.c b/hw/scsi/vhost-user-scsi.c
index cc91ade..55e4be5 100644
--- a/hw/scsi/vhost-user-scsi.c
+++ b/hw/scsi/vhost-user-scsi.c
@@ -36,6 +36,7 @@
VIRTIO_RING_F_EVENT_IDX,
VIRTIO_SCSI_F_HOTPLUG,
VIRTIO_F_RING_RESET,
+ VIRTIO_F_IN_ORDER,
VIRTIO_F_NOTIFICATION_DATA,
VHOST_INVALID_FEATURE_BIT
};
diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c
index 3b77034..a394514 100644
--- a/hw/smbios/smbios.c
+++ b/hw/smbios/smbios.c
@@ -1093,6 +1093,7 @@
Error **errp)
{
unsigned i, dimm_cnt, offset;
+ MachineClass *mc = MACHINE_GET_CLASS(ms);
ERRP_GUARD();
assert(ep_type == SMBIOS_ENTRY_POINT_TYPE_32 ||
@@ -1123,12 +1124,12 @@
smbios_build_type_9_table(errp);
smbios_build_type_11_table();
-#define MAX_DIMM_SZ (16 * GiB)
-#define GET_DIMM_SZ ((i < dimm_cnt - 1) ? MAX_DIMM_SZ \
- : ((current_machine->ram_size - 1) % MAX_DIMM_SZ) + 1)
+#define GET_DIMM_SZ ((i < dimm_cnt - 1) ? mc->smbios_memory_device_size \
+ : ((current_machine->ram_size - 1) % mc->smbios_memory_device_size) + 1)
- dimm_cnt = QEMU_ALIGN_UP(current_machine->ram_size, MAX_DIMM_SZ) /
- MAX_DIMM_SZ;
+ dimm_cnt = QEMU_ALIGN_UP(current_machine->ram_size,
+ mc->smbios_memory_device_size) /
+ mc->smbios_memory_device_size;
/*
* The offset determines if we need to keep additional space between
diff --git a/hw/timer/hpet.c b/hw/timer/hpet.c
index 4cb5393..471950a 100644
--- a/hw/timer/hpet.c
+++ b/hw/timer/hpet.c
@@ -54,10 +54,12 @@
uint64_t cmp; /* comparator */
uint64_t fsb; /* FSB route */
/* Hidden register state */
+ uint64_t cmp64; /* comparator (extended to counter width) */
uint64_t period; /* Last value written to comparator */
uint8_t wrap_flag; /* timer pop will indicate wrap for one-shot 32-bit
* mode. Next pop will be actual timer expiration.
*/
+ uint64_t last; /* last value armed, to avoid timer storms */
} HPETTimer;
struct HPETState {
@@ -116,11 +118,6 @@
static uint32_t hpet_time_after(uint64_t a, uint64_t b)
{
- return ((int32_t)(b - a) < 0);
-}
-
-static uint32_t hpet_time_after64(uint64_t a, uint64_t b)
-{
return ((int64_t)(b - a) < 0);
}
@@ -156,29 +153,34 @@
return ns_to_ticks(qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + s->hpet_offset);
}
-/*
- * calculate diff between comparator value and current ticks
- */
-static inline uint64_t hpet_calculate_diff(HPETTimer *t, uint64_t current)
+static uint64_t hpet_get_ns(HPETState *s, uint64_t tick)
{
+ return ticks_to_ns(tick) - s->hpet_offset;
+}
+/*
+ * calculate next value of the general counter that matches the
+ * target (either entirely, or the low 32-bit only depending on
+ * the timer mode).
+ */
+static uint64_t hpet_calculate_cmp64(HPETTimer *t, uint64_t cur_tick, uint64_t target)
+{
if (t->config & HPET_TN_32BIT) {
- uint32_t diff, cmp;
-
- cmp = (uint32_t)t->cmp;
- diff = cmp - (uint32_t)current;
- diff = (int32_t)diff > 0 ? diff : (uint32_t)1;
- return (uint64_t)diff;
+ uint64_t result = deposit64(cur_tick, 0, 32, target);
+ if (result < cur_tick) {
+ result += 0x100000000ULL;
+ }
+ return result;
} else {
- uint64_t diff, cmp;
-
- cmp = t->cmp;
- diff = cmp - current;
- diff = (int64_t)diff > 0 ? diff : (uint64_t)1;
- return diff;
+ return target;
}
}
+static uint64_t hpet_next_wrap(uint64_t cur_tick)
+{
+ return (cur_tick | 0xffffffffU) + 1;
+}
+
static void update_irq(struct HPETTimer *timer, int set)
{
uint64_t mask;
@@ -196,21 +198,31 @@
}
s = timer->state;
mask = 1 << timer->tn;
- if (!set || !timer_enabled(timer) || !hpet_enabled(timer->state)) {
+
+ if (set && (timer->config & HPET_TN_TYPE_LEVEL)) {
+ /*
+ * If HPET_TN_ENABLE bit is 0, "the timer will still operate and
+ * generate appropriate status bits, but will not cause an interrupt"
+ */
+ s->isr |= mask;
+ } else {
s->isr &= ~mask;
+ }
+
+ if (set && timer_enabled(timer) && hpet_enabled(s)) {
+ if (timer_fsb_route(timer)) {
+ address_space_stl_le(&address_space_memory, timer->fsb >> 32,
+ timer->fsb & 0xffffffff, MEMTXATTRS_UNSPECIFIED,
+ NULL);
+ } else if (timer->config & HPET_TN_TYPE_LEVEL) {
+ qemu_irq_raise(s->irqs[route]);
+ } else {
+ qemu_irq_pulse(s->irqs[route]);
+ }
+ } else {
if (!timer_fsb_route(timer)) {
qemu_irq_lower(s->irqs[route]);
}
- } else if (timer_fsb_route(timer)) {
- address_space_stl_le(&address_space_memory, timer->fsb >> 32,
- timer->fsb & 0xffffffff, MEMTXATTRS_UNSPECIFIED,
- NULL);
- } else if (timer->config & HPET_TN_TYPE_LEVEL) {
- s->isr |= mask;
- qemu_irq_raise(s->irqs[route]);
- } else {
- s->isr &= ~mask;
- qemu_irq_pulse(s->irqs[route]);
}
}
@@ -250,7 +262,13 @@
static int hpet_post_load(void *opaque, int version_id)
{
HPETState *s = opaque;
+ int i;
+ for (i = 0; i < s->num_timers; i++) {
+ HPETTimer *t = &s->timer[i];
+ t->cmp64 = hpet_calculate_cmp64(t, s->hpet_counter, t->cmp);
+ t->last = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) - NANOSECONDS_PER_SECOND;
+ }
/* Recalculate the offset between the main counter and guest time */
if (!s->hpet_offset_saved) {
s->hpet_offset = ticks_to_ns(s->hpet_counter)
@@ -346,14 +364,17 @@
}
};
-static void hpet_arm(HPETTimer *t, uint64_t ticks)
+static void hpet_arm(HPETTimer *t, uint64_t tick)
{
- if (ticks < ns_to_ticks(INT64_MAX / 2)) {
- timer_mod(t->qemu_timer,
- qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + ticks_to_ns(ticks));
- } else {
- timer_del(t->qemu_timer);
+ uint64_t ns = hpet_get_ns(t->state, tick);
+
+ /* Clamp period to reasonable min value (1 us) */
+ if (timer_is_periodic(t) && ns - t->last < 1000) {
+ ns = t->last + 1000;
}
+
+ t->last = ns;
+ timer_mod(t->qemu_timer, ns);
}
/*
@@ -362,72 +383,68 @@
static void hpet_timer(void *opaque)
{
HPETTimer *t = opaque;
- uint64_t diff;
-
uint64_t period = t->period;
uint64_t cur_tick = hpet_get_ticks(t->state);
if (timer_is_periodic(t) && period != 0) {
+ while (hpet_time_after(cur_tick, t->cmp64)) {
+ t->cmp64 += period;
+ }
if (t->config & HPET_TN_32BIT) {
- while (hpet_time_after(cur_tick, t->cmp)) {
- t->cmp = (uint32_t)(t->cmp + t->period);
- }
+ t->cmp = (uint32_t)t->cmp64;
} else {
- while (hpet_time_after64(cur_tick, t->cmp)) {
- t->cmp += period;
- }
+ t->cmp = t->cmp64;
}
- diff = hpet_calculate_diff(t, cur_tick);
- hpet_arm(t, diff);
- } else if (t->config & HPET_TN_32BIT && !timer_is_periodic(t)) {
- if (t->wrap_flag) {
- diff = hpet_calculate_diff(t, cur_tick);
- hpet_arm(t, diff);
- t->wrap_flag = 0;
- }
+ hpet_arm(t, t->cmp64);
+ } else if (t->wrap_flag) {
+ t->wrap_flag = 0;
+ hpet_arm(t, t->cmp64);
}
update_irq(t, 1);
}
static void hpet_set_timer(HPETTimer *t)
{
- uint64_t diff;
- uint32_t wrap_diff; /* how many ticks until we wrap? */
uint64_t cur_tick = hpet_get_ticks(t->state);
- /* whenever new timer is being set up, make sure wrap_flag is 0 */
t->wrap_flag = 0;
- diff = hpet_calculate_diff(t, cur_tick);
+ t->cmp64 = hpet_calculate_cmp64(t, cur_tick, t->cmp);
+ if (t->config & HPET_TN_32BIT) {
- /* hpet spec says in one-shot 32-bit mode, generate an interrupt when
- * counter wraps in addition to an interrupt with comparator match.
- */
- if (t->config & HPET_TN_32BIT && !timer_is_periodic(t)) {
- wrap_diff = 0xffffffff - (uint32_t)cur_tick;
- if (wrap_diff < (uint32_t)diff) {
- diff = wrap_diff;
+ /* hpet spec says in one-shot 32-bit mode, generate an interrupt when
+ * counter wraps in addition to an interrupt with comparator match.
+ */
+ if (!timer_is_periodic(t) && t->cmp64 > hpet_next_wrap(cur_tick)) {
t->wrap_flag = 1;
+ hpet_arm(t, hpet_next_wrap(cur_tick));
+ return;
}
}
- hpet_arm(t, diff);
+ hpet_arm(t, t->cmp64);
}
static void hpet_del_timer(HPETTimer *t)
{
+ HPETState *s = t->state;
timer_del(t->qemu_timer);
- update_irq(t, 0);
+
+ if (s->isr & (1 << t->tn)) {
+ /* For level-triggered interrupt, this leaves ISR set but lowers irq. */
+ update_irq(t, 1);
+ }
}
static uint64_t hpet_ram_read(void *opaque, hwaddr addr,
unsigned size)
{
HPETState *s = opaque;
- uint64_t cur_tick, index;
+ int shift = (addr & 4) * 8;
+ uint64_t cur_tick;
trace_hpet_ram_read(addr);
- index = addr;
+
/*address range of all TN regs*/
- if (index >= 0x100 && index <= 0x3ff) {
+ if (addr >= 0x100 && addr <= 0x3ff) {
uint8_t timer_id = (addr - 0x100) / 0x20;
HPETTimer *timer = &s->timer[timer_id];
@@ -436,52 +453,33 @@
return 0;
}
- switch ((addr - 0x100) % 0x20) {
- case HPET_TN_CFG:
- return timer->config;
- case HPET_TN_CFG + 4: // Interrupt capabilities
- return timer->config >> 32;
+ switch (addr & 0x18) {
+ case HPET_TN_CFG: // including interrupt capabilities
+ return timer->config >> shift;
case HPET_TN_CMP: // comparator register
- return timer->cmp;
- case HPET_TN_CMP + 4:
- return timer->cmp >> 32;
+ return timer->cmp >> shift;
case HPET_TN_ROUTE:
- return timer->fsb;
- case HPET_TN_ROUTE + 4:
- return timer->fsb >> 32;
+ return timer->fsb >> shift;
default:
trace_hpet_ram_read_invalid();
break;
}
} else {
- switch (index) {
- case HPET_ID:
- return s->capability;
- case HPET_PERIOD:
- return s->capability >> 32;
+ switch (addr & ~4) {
+ case HPET_ID: // including HPET_PERIOD
+ return s->capability >> shift;
case HPET_CFG:
- return s->config;
- case HPET_CFG + 4:
- trace_hpet_invalid_hpet_cfg(4);
- return 0;
+ return s->config >> shift;
case HPET_COUNTER:
if (hpet_enabled(s)) {
cur_tick = hpet_get_ticks(s);
} else {
cur_tick = s->hpet_counter;
}
- trace_hpet_ram_read_reading_counter(0, cur_tick);
- return cur_tick;
- case HPET_COUNTER + 4:
- if (hpet_enabled(s)) {
- cur_tick = hpet_get_ticks(s);
- } else {
- cur_tick = s->hpet_counter;
- }
- trace_hpet_ram_read_reading_counter(4, cur_tick);
- return cur_tick >> 32;
+ trace_hpet_ram_read_reading_counter(addr & 4, cur_tick);
+ return cur_tick >> shift;
case HPET_STATUS:
- return s->isr;
+ return s->isr >> shift;
default:
trace_hpet_ram_read_invalid();
break;
@@ -495,15 +493,14 @@
{
int i;
HPETState *s = opaque;
- uint64_t old_val, new_val, val, index;
+ int shift = (addr & 4) * 8;
+ int len = MIN(size * 8, 64 - shift);
+ uint64_t old_val, new_val, cleared;
trace_hpet_ram_write(addr, value);
- index = addr;
- old_val = hpet_ram_read(opaque, addr, 4);
- new_val = value;
/*address range of all TN regs*/
- if (index >= 0x100 && index <= 0x3ff) {
+ if (addr >= 0x100 && addr <= 0x3ff) {
uint8_t timer_id = (addr - 0x100) / 0x20;
HPETTimer *timer = &s->timer[timer_id];
@@ -512,71 +509,49 @@
trace_hpet_timer_id_out_of_range(timer_id);
return;
}
- switch ((addr - 0x100) % 0x20) {
+ switch (addr & 0x18) {
case HPET_TN_CFG:
- trace_hpet_ram_write_tn_cfg();
- if (activating_bit(old_val, new_val, HPET_TN_FSB_ENABLE)) {
+ trace_hpet_ram_write_tn_cfg(addr & 4);
+ old_val = timer->config;
+ new_val = deposit64(old_val, shift, len, value);
+ new_val = hpet_fixup_reg(new_val, old_val, HPET_TN_CFG_WRITE_MASK);
+ if (deactivating_bit(old_val, new_val, HPET_TN_TYPE_LEVEL)) {
+ /*
+ * Do this before changing timer->config; otherwise, if
+ * HPET_TN_FSB is set, update_irq will not lower the qemu_irq.
+ */
update_irq(timer, 0);
}
- val = hpet_fixup_reg(new_val, old_val, HPET_TN_CFG_WRITE_MASK);
- timer->config = (timer->config & 0xffffffff00000000ULL) | val;
+ timer->config = new_val;
+ if (activating_bit(old_val, new_val, HPET_TN_ENABLE)
+ && (s->isr & (1 << timer_id))) {
+ update_irq(timer, 1);
+ }
if (new_val & HPET_TN_32BIT) {
timer->cmp = (uint32_t)timer->cmp;
timer->period = (uint32_t)timer->period;
}
- if (activating_bit(old_val, new_val, HPET_TN_ENABLE) &&
- hpet_enabled(s)) {
- hpet_set_timer(timer);
- } else if (deactivating_bit(old_val, new_val, HPET_TN_ENABLE)) {
- hpet_del_timer(timer);
- }
- break;
- case HPET_TN_CFG + 4: // Interrupt capabilities
- trace_hpet_ram_write_invalid_tn_cfg(4);
- break;
- case HPET_TN_CMP: // comparator register
- trace_hpet_ram_write_tn_cmp(0);
- if (timer->config & HPET_TN_32BIT) {
- new_val = (uint32_t)new_val;
- }
- if (!timer_is_periodic(timer)
- || (timer->config & HPET_TN_SETVAL)) {
- timer->cmp = (timer->cmp & 0xffffffff00000000ULL) | new_val;
- }
- if (timer_is_periodic(timer)) {
- /*
- * FIXME: Clamp period to reasonable min value?
- * Clamp period to reasonable max value
- */
- if (timer->config & HPET_TN_32BIT) {
- new_val = MIN(new_val, ~0u >> 1);
- }
- timer->period =
- (timer->period & 0xffffffff00000000ULL) | new_val;
- }
- /*
- * FIXME: on a 64-bit write, HPET_TN_SETVAL should apply to the
- * high bits part as well.
- */
- timer->config &= ~HPET_TN_SETVAL;
if (hpet_enabled(s)) {
hpet_set_timer(timer);
}
break;
- case HPET_TN_CMP + 4: // comparator register high order
- trace_hpet_ram_write_tn_cmp(4);
+ case HPET_TN_CMP: // comparator register
+ if (timer->config & HPET_TN_32BIT) {
+ /* High 32-bits are zero, leave them untouched. */
+ if (shift) {
+ trace_hpet_ram_write_invalid_tn_cmp();
+ break;
+ }
+ len = 64;
+ value = (uint32_t) value;
+ }
+ trace_hpet_ram_write_tn_cmp(addr & 4);
if (!timer_is_periodic(timer)
|| (timer->config & HPET_TN_SETVAL)) {
- timer->cmp = (timer->cmp & 0xffffffffULL) | new_val << 32;
+ timer->cmp = deposit64(timer->cmp, shift, len, value);
}
if (timer_is_periodic(timer)) {
- /*
- * FIXME: Clamp period to reasonable min value?
- * Clamp period to reasonable max value
- */
- new_val = MIN(new_val, ~0u >> 1);
- timer->period =
- (timer->period & 0xffffffffULL) | new_val << 32;
+ timer->period = deposit64(timer->period, shift, len, value);
}
timer->config &= ~HPET_TN_SETVAL;
if (hpet_enabled(s)) {
@@ -584,10 +559,7 @@
}
break;
case HPET_TN_ROUTE:
- timer->fsb = (timer->fsb & 0xffffffff00000000ULL) | new_val;
- break;
- case HPET_TN_ROUTE + 4:
- timer->fsb = (new_val << 32) | (timer->fsb & 0xffffffff);
+ timer->fsb = deposit64(timer->fsb, shift, len, value);
break;
default:
trace_hpet_ram_write_invalid();
@@ -595,20 +567,23 @@
}
return;
} else {
- switch (index) {
+ switch (addr & ~4) {
case HPET_ID:
return;
case HPET_CFG:
- val = hpet_fixup_reg(new_val, old_val, HPET_CFG_WRITE_MASK);
- s->config = (s->config & 0xffffffff00000000ULL) | val;
+ old_val = s->config;
+ new_val = deposit64(old_val, shift, len, value);
+ new_val = hpet_fixup_reg(new_val, old_val, HPET_CFG_WRITE_MASK);
+ s->config = new_val;
if (activating_bit(old_val, new_val, HPET_CFG_ENABLE)) {
/* Enable main counter and interrupt generation. */
s->hpet_offset =
ticks_to_ns(s->hpet_counter) - qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
for (i = 0; i < s->num_timers; i++) {
- if ((&s->timer[i])->cmp != ~0ULL) {
- hpet_set_timer(&s->timer[i]);
+ if (timer_enabled(&s->timer[i]) && (s->isr & (1 << i))) {
+ update_irq(&s->timer[i], 1);
}
+ hpet_set_timer(&s->timer[i]);
}
} else if (deactivating_bit(old_val, new_val, HPET_CFG_ENABLE)) {
/* Halt main counter and disable interrupt generation. */
@@ -629,13 +604,11 @@
qemu_set_irq(s->irqs[RTC_ISA_IRQ], s->rtc_irq_level);
}
break;
- case HPET_CFG + 4:
- trace_hpet_invalid_hpet_cfg(4);
- break;
case HPET_STATUS:
- val = new_val & s->isr;
+ new_val = value << shift;
+ cleared = new_val & s->isr;
for (i = 0; i < s->num_timers; i++) {
- if (val & (1 << i)) {
+ if (cleared & (1 << i)) {
update_irq(&s->timer[i], 0);
}
}
@@ -644,15 +617,7 @@
if (hpet_enabled(s)) {
trace_hpet_ram_write_counter_write_while_enabled();
}
- s->hpet_counter =
- (s->hpet_counter & 0xffffffff00000000ULL) | value;
- trace_hpet_ram_write_counter_written(0, value, s->hpet_counter);
- break;
- case HPET_COUNTER + 4:
- trace_hpet_ram_write_counter_write_while_enabled();
- s->hpet_counter =
- (s->hpet_counter & 0xffffffffULL) | (((uint64_t)value) << 32);
- trace_hpet_ram_write_counter_written(4, value, s->hpet_counter);
+ s->hpet_counter = deposit64(s->hpet_counter, shift, len, value);
break;
default:
trace_hpet_ram_write_invalid();
@@ -666,7 +631,11 @@
.write = hpet_ram_write,
.valid = {
.min_access_size = 4,
- .max_access_size = 4,
+ .max_access_size = 8,
+ },
+ .impl = {
+ .min_access_size = 4,
+ .max_access_size = 8,
},
.endianness = DEVICE_NATIVE_ENDIAN,
};
diff --git a/hw/timer/trace-events b/hw/timer/trace-events
index de769f4..f48a712 100644
--- a/hw/timer/trace-events
+++ b/hw/timer/trace-events
@@ -108,9 +108,9 @@
hpet_ram_read_invalid(void) "invalid hpet_ram_readl"
hpet_ram_write(uint64_t addr, uint64_t value) "enter hpet_ram_writel at 0x%" PRIx64 " = 0x%" PRIx64
hpet_ram_write_timer_id(uint64_t timer_id) "hpet_ram_writel timer_id = 0x%" PRIx64
-hpet_ram_write_tn_cfg(void) "hpet_ram_writel HPET_TN_CFG"
-hpet_ram_write_invalid_tn_cfg(uint8_t reg_off) "invalid HPET_TN_CFG + %" PRIu8 " write"
+hpet_ram_write_tn_cfg(uint8_t reg_off) "hpet_ram_writel HPET_TN_CFG + %" PRIu8
hpet_ram_write_tn_cmp(uint8_t reg_off) "hpet_ram_writel HPET_TN_CMP + %" PRIu8
+hpet_ram_write_invalid_tn_cmp(void) "invalid HPET_TN_CMP + 4 write"
hpet_ram_write_invalid(void) "invalid hpet_ram_writel"
hpet_ram_write_counter_write_while_enabled(void) "Writing counter while HPET enabled!"
hpet_ram_write_counter_written(uint8_t reg_off, uint64_t value, uint64_t counter) "HPET counter + %" PRIu8 "written. crt = 0x%" PRIx64 " -> 0x%" PRIx64
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 17a5865..36d0cf6 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -602,7 +602,7 @@
IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
int iommu_idx;
- trace_vfio_listener_region_add_iommu(iova, end);
+ trace_vfio_listener_region_add_iommu(section->mr->name, iova, end);
/*
* FIXME: For VFIO iommu types which have KVM acceleration to
* avoid bouncing all map/unmaps through qemu this way, this
@@ -728,6 +728,7 @@
if (memory_region_is_iommu(section->mr)) {
VFIOGuestIOMMU *giommu;
+ trace_vfio_listener_region_del_iommu(section->mr->name);
QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) {
if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
giommu->n.start == section->offset_within_region) {
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index e16179b..98bd4dc 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -95,7 +95,8 @@
vfio_iommu_map_notify(const char *op, uint64_t iova_start, uint64_t iova_end) "iommu %s @ 0x%"PRIx64" - 0x%"PRIx64
vfio_listener_region_skip(const char *name, uint64_t start, uint64_t end) "SKIPPING %s 0x%"PRIx64" - 0x%"PRIx64
vfio_spapr_group_attach(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d"
-vfio_listener_region_add_iommu(uint64_t start, uint64_t end) "region_add [iommu] 0x%"PRIx64" - 0x%"PRIx64
+vfio_listener_region_add_iommu(const char* name, uint64_t start, uint64_t end) "region_add [iommu] %s 0x%"PRIx64" - 0x%"PRIx64
+vfio_listener_region_del_iommu(const char *name) "region_del [iommu] %s"
vfio_listener_region_add_ram(uint64_t iova_start, uint64_t iova_end, void *vaddr) "region_add [ram] 0x%"PRIx64" - 0x%"PRIx64" [%p]"
vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_within_region, uintptr_t page_size) "Region \"%s\" iova=0x%"PRIx64" offset_within_region=0x%"PRIx64" qemu_real_host_page_size=0x%"PRIxPTR
vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA"
diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index b7c04f0..04e36ae 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -116,6 +116,7 @@
virtio_iommu_set_config(uint8_t bypass) "bypass=0x%x"
virtio_iommu_attach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d"
virtio_iommu_detach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d"
+virtio_iommu_detach_endpoint_from_domain(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d"
virtio_iommu_map(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end, uint64_t phys_start, uint32_t flags) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64 " phys_start=0x%"PRIx64" flags=%d"
virtio_iommu_unmap(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64
virtio_iommu_unmap_done(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64
diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c
index ae48cc1..32ee7f4 100644
--- a/hw/virtio/vhost-user-fs.c
+++ b/hw/virtio/vhost-user-fs.c
@@ -33,6 +33,7 @@
VIRTIO_F_RING_PACKED,
VIRTIO_F_IOMMU_PLATFORM,
VIRTIO_F_RING_RESET,
+ VIRTIO_F_IN_ORDER,
VIRTIO_F_NOTIFICATION_DATA,
VHOST_INVALID_FEATURE_BIT
};
diff --git a/hw/virtio/vhost-user-vsock.c b/hw/virtio/vhost-user-vsock.c
index 802b44a..da3b0e0 100644
--- a/hw/virtio/vhost-user-vsock.c
+++ b/hw/virtio/vhost-user-vsock.c
@@ -21,6 +21,7 @@
VIRTIO_RING_F_INDIRECT_DESC,
VIRTIO_RING_F_EVENT_IDX,
VIRTIO_F_NOTIFY_ON_EMPTY,
+ VIRTIO_F_IN_ORDER,
VIRTIO_F_NOTIFICATION_DATA,
VHOST_INVALID_FEATURE_BIT
};
diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c
index bbe8aa4..5034768 100644
--- a/hw/virtio/virtio-crypto.c
+++ b/hw/virtio/virtio-crypto.c
@@ -205,6 +205,7 @@
int queue_index;
uint32_t algo, keytype, keylen;
+ sreq->info.op_code = opcode;
algo = ldl_le_p(&sess_req->para.algo);
keytype = ldl_le_p(&sess_req->para.keytype);
keylen = ldl_le_p(&sess_req->para.keylen);
@@ -224,7 +225,6 @@
iov_discard_front(&iov, &out_num, keylen);
}
- sreq->info.op_code = opcode;
asym_info = &sreq->info.u.asym_sess_info;
asym_info->algo = algo;
asym_info->keytype = keytype;
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index 33ae61c..59ef4fb 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -308,6 +308,7 @@
if (!ep->domain) {
return;
}
+ trace_virtio_iommu_detach_endpoint_from_domain(domain->id, ep->id);
g_tree_foreach(domain->mappings, virtio_iommu_notify_unmap_cb,
ep->iommu_mr);
QLIST_REMOVE(ep, next);
@@ -467,26 +468,6 @@
return &sdev->as;
}
-static void virtio_iommu_device_clear(VirtIOIOMMU *s, PCIBus *bus, int devfn)
-{
- IOMMUPciBus *sbus = g_hash_table_lookup(s->as_by_busptr, bus);
- IOMMUDevice *sdev;
-
- if (!sbus) {
- return;
- }
-
- sdev = sbus->pbdev[devfn];
- if (!sdev) {
- return;
- }
-
- g_list_free_full(sdev->resv_regions, g_free);
- sdev->resv_regions = NULL;
- g_free(sdev);
- sbus->pbdev[devfn] = NULL;
-}
-
static gboolean hiod_equal(gconstpointer v1, gconstpointer v2)
{
const struct hiod_key *key1 = v1;
@@ -558,8 +539,6 @@
{
IOMMUPciBus *sbus = g_hash_table_lookup(s->as_by_busptr, bus);
IOMMUDevice *sdev;
- GList *current_ranges;
- GList *l, *tmp, *new_ranges = NULL;
int ret = -EINVAL;
if (!sbus) {
@@ -573,35 +552,10 @@
return ret;
}
- current_ranges = sdev->host_resv_ranges;
-
- g_assert(!sdev->probe_done);
-
- /* check that each new resv region is included in an existing one */
if (sdev->host_resv_ranges) {
- range_inverse_array(iova_ranges,
- &new_ranges,
- 0, UINT64_MAX);
-
- for (tmp = new_ranges; tmp; tmp = tmp->next) {
- Range *newr = (Range *)tmp->data;
- bool included = false;
-
- for (l = current_ranges; l; l = l->next) {
- Range * r = (Range *)l->data;
-
- if (range_contains_range(r, newr)) {
- included = true;
- break;
- }
- }
- if (!included) {
- goto error;
- }
- }
- /* all new reserved ranges are included in existing ones */
- ret = 0;
- goto out;
+ error_setg(errp, "%s virtio-iommu does not support aliased BDF",
+ __func__);
+ return ret;
}
range_inverse_array(iova_ranges,
@@ -610,14 +564,31 @@
rebuild_resv_regions(sdev);
return 0;
-error:
- error_setg(errp, "%s Conflicting host reserved ranges set!",
- __func__);
-out:
- g_list_free_full(new_ranges, g_free);
- return ret;
}
+/*
+ * Drop the host reserved ranges recorded for the device at @bus/@devfn and
+ * rebuild its reserved regions from the property-provided ones only.
+ * Does nothing when no IOMMUPciBus or IOMMUDevice exists for that slot.
+ */
+static void virtio_iommu_unset_host_iova_ranges(VirtIOIOMMU *s, PCIBus *bus,
+                                                int devfn)
+{
+    IOMMUPciBus *pci_bus = g_hash_table_lookup(s->as_by_busptr, bus);
+    IOMMUDevice *dev = pci_bus ? pci_bus->pbdev[devfn] : NULL;
+
+    if (!dev) {
+        return;
+    }
+
+    /* g_steal_pointer() leaves each list head NULL before the list is freed */
+    g_list_free_full(g_steal_pointer(&dev->host_resv_ranges), g_free);
+    g_list_free_full(g_steal_pointer(&dev->resv_regions), g_free);
+    add_prop_resv_regions(dev);
+}
+
+
static bool check_page_size_mask(VirtIOIOMMU *viommu, uint64_t new_mask,
Error **errp)
{
@@ -726,9 +697,10 @@
if (!hiod) {
return;
}
+ virtio_iommu_unset_host_iova_ranges(viommu, hiod->aliased_bus,
+ hiod->aliased_devfn);
g_hash_table_remove(viommu->host_iommu_devices, &key);
- virtio_iommu_device_clear(viommu, bus, devfn);
}
static const PCIIOMMUOps virtio_iommu_ops = {
@@ -815,6 +787,7 @@
if (QLIST_EMPTY(&domain->endpoint_list)) {
g_tree_remove(s->domains, GUINT_TO_POINTER(domain->id));
}
+ g_tree_remove(s->endpoints, GUINT_TO_POINTER(ep_id));
return VIRTIO_IOMMU_S_OK;
}
@@ -977,7 +950,6 @@
}
buf += count;
free -= count;
- sdev->probe_done = true;
return VIRTIO_IOMMU_S_OK;
}
diff --git a/hw/virtio/virtio-net-pci.c b/hw/virtio/virtio-net-pci.c
index e03543a..dba4987 100644
--- a/hw/virtio/virtio-net-pci.c
+++ b/hw/virtio/virtio-net-pci.c
@@ -75,6 +75,7 @@
k->device_id = PCI_DEVICE_ID_VIRTIO_NET;
k->revision = VIRTIO_PCI_ABI_VERSION;
k->class_id = PCI_CLASS_NETWORK_ETHERNET;
+ k->sriov_vf_user_creatable = true;
set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
device_class_set_props(dc, virtio_net_properties);
vpciklass->realize = virtio_net_pci_realize;
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 9534730..0c8fcc5 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -1955,6 +1955,7 @@
uint8_t *config;
uint32_t size;
VirtIODevice *vdev = virtio_bus_get_device(bus);
+ int16_t res;
/*
* Virtio capabilities present without
@@ -2100,6 +2101,14 @@
pci_register_bar(&proxy->pci_dev, proxy->legacy_io_bar_idx,
PCI_BASE_ADDRESS_SPACE_IO, &proxy->bar);
}
+
+ res = pcie_sriov_pf_init_from_user_created_vfs(&proxy->pci_dev,
+ proxy->last_pcie_cap_offset,
+ errp);
+ if (res > 0) {
+ proxy->last_pcie_cap_offset += res;
+ virtio_add_feature(&vdev->host_features, VIRTIO_F_SR_IOV);
+ }
}
static void virtio_pci_device_unplugged(DeviceState *d)
@@ -2187,7 +2196,7 @@
if (pcie_port && pci_is_express(pci_dev)) {
int pos;
- uint16_t last_pcie_cap_offset = PCI_CONFIG_SPACE_SIZE;
+ proxy->last_pcie_cap_offset = PCI_CONFIG_SPACE_SIZE;
pos = pcie_endpoint_cap_init(pci_dev, 0);
assert(pos > 0);
@@ -2207,9 +2216,9 @@
pci_set_word(pci_dev->config + pos + PCI_PM_PMC, 0x3);
if (proxy->flags & VIRTIO_PCI_FLAG_AER) {
- pcie_aer_init(pci_dev, PCI_ERR_VER, last_pcie_cap_offset,
+ pcie_aer_init(pci_dev, PCI_ERR_VER, proxy->last_pcie_cap_offset,
PCI_ERR_SIZEOF, NULL);
- last_pcie_cap_offset += PCI_ERR_SIZEOF;
+ proxy->last_pcie_cap_offset += PCI_ERR_SIZEOF;
}
if (proxy->flags & VIRTIO_PCI_FLAG_INIT_DEVERR) {
@@ -2234,9 +2243,9 @@
}
if (proxy->flags & VIRTIO_PCI_FLAG_ATS) {
- pcie_ats_init(pci_dev, last_pcie_cap_offset,
+ pcie_ats_init(pci_dev, proxy->last_pcie_cap_offset,
proxy->flags & VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED);
- last_pcie_cap_offset += PCI_EXT_CAP_ATS_SIZEOF;
+ proxy->last_pcie_cap_offset += PCI_EXT_CAP_ATS_SIZEOF;
}
if (proxy->flags & VIRTIO_PCI_FLAG_INIT_FLR) {
@@ -2263,6 +2272,7 @@
bool pcie_port = pci_bus_is_express(pci_get_bus(pci_dev)) &&
!pci_bus_is_root(pci_get_bus(pci_dev));
+ pcie_sriov_pf_exit(&proxy->pci_dev);
msix_uninit_exclusive_bar(pci_dev);
if (proxy->flags & VIRTIO_PCI_FLAG_AER && pcie_port &&
pci_is_express(pci_dev)) {
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 583a224..397c261 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -872,6 +872,46 @@
vq->used_elems[idx].ndescs = elem->ndescs;
}
+/*
+ * virtqueue_ordered_fill:
+ * @vq: virtqueue whose used_elems array is searched
+ * @elem: element being completed; matched against used_elems by 'index'
+ * @len: number of bytes to record as written for the element
+ *
+ * Locate @elem in vq->used_elems, starting at used_idx and hopping from
+ * one element to the next by each entry's 'ndescs'.  On a match the
+ * entry's length is set to @len and it is marked in_order_filled.
+ * If no entry matches, a LOG_GUEST_ERROR message is emitted.
+ */
+static void virtqueue_ordered_fill(VirtQueue *vq, const VirtQueueElement *elem,
+                                   unsigned int len)
+{
+    unsigned int i, steps, max_steps, ndescs;
+    bool found = false;
+
+    i = vq->used_idx % vq->vring.num;
+    steps = 0;
+    /*
+     * We shouldn't need to increase 'i' by more than the distance
+     * between used_idx and last_avail_idx.
+     */
+    max_steps = (vq->last_avail_idx - vq->used_idx) % vq->vring.num;
+
+    /* Search for element in vq->used_elems */
+    while (steps <= max_steps) {
+        /* Found element, set length and mark as filled */
+        if (vq->used_elems[i].index == elem->index) {
+            vq->used_elems[i].len = len;
+            vq->used_elems[i].in_order_filled = true;
+            found = true;
+            break;
+        }
+
+        /*
+         * Read the current entry's descriptor count once, before 'i'
+         * moves: after the increment 'i' refers to a different slot
+         * and may not have been wrapped back into the ring yet.
+         */
+        ndescs = vq->used_elems[i].ndescs;
+        i += ndescs;
+        steps += ndescs;
+
+        if (i >= vq->vring.num) {
+            i -= vq->vring.num;
+        }
+    }
+
+    /*
+     * We should be able to find a matching VirtQueueElement in
+     * used_elems. If we don't, this is an error.
+     */
+    if (!found) {
+        qemu_log_mask(LOG_GUEST_ERROR, "%s: %s cannot fill buffer id %u\n",
+                      __func__, vq->vdev->name, elem->index);
+    }
+}
+
static void virtqueue_packed_fill_desc(VirtQueue *vq,
const VirtQueueElement *elem,
unsigned int idx,
@@ -922,7 +962,9 @@
return;
}
- if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) {
+ if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_IN_ORDER)) {
+ virtqueue_ordered_fill(vq, elem, len);
+ } else if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) {
virtqueue_packed_fill(vq, elem, len, idx);
} else {
virtqueue_split_fill(vq, elem, len, idx);
@@ -981,6 +1023,73 @@
}
}
+/*
+ * virtqueue_ordered_flush:
+ * @vq: virtqueue to flush
+ *
+ * Publish completed elements in order.  Starting at used_idx, walk
+ * vq->used_elems while consecutive entries are marked in_order_filled,
+ * write each one out (packed descriptor or split used-ring entry), clear
+ * its in_order_filled flag, and advance by its 'ndescs'.  Afterwards the
+ * used index is advanced and vq->inuse is reduced by the total number of
+ * descriptors flushed.  Returns without doing anything if the ring is not
+ * set up or the first expected element has not been filled yet.
+ */
+static void virtqueue_ordered_flush(VirtQueue *vq)
+{
+ unsigned int i = vq->used_idx % vq->vring.num;
+ unsigned int ndescs = 0;
+ uint16_t old = vq->used_idx;
+ uint16_t new;
+ bool packed;
+ VRingUsedElem uelem;
+
+ packed = virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED);
+
+ /* Bail out if the ring area this layout writes to is not set up */
+ if (packed) {
+ if (unlikely(!vq->vring.desc)) {
+ return;
+ }
+ } else if (unlikely(!vq->vring.used)) {
+ return;
+ }
+
+ /* First expected in-order element isn't ready, nothing to do */
+ if (!vq->used_elems[i].in_order_filled) {
+ return;
+ }
+
+ /* Search for filled elements in-order */
+ while (vq->used_elems[i].in_order_filled) {
+ /*
+ * First entry for packed VQs is written last so the guest
+ * doesn't see invalid descriptors.
+ */
+ if (packed && i != vq->used_idx) {
+ virtqueue_packed_fill_desc(vq, &vq->used_elems[i], ndescs, false);
+ } else if (!packed) {
+ uelem.id = vq->used_elems[i].index;
+ uelem.len = vq->used_elems[i].len;
+ vring_used_write(vq, &uelem, i);
+ }
+
+ /* Consume the entry and hop to the next element's first slot */
+ vq->used_elems[i].in_order_filled = false;
+ ndescs += vq->used_elems[i].ndescs;
+ i += vq->used_elems[i].ndescs;
+ if (i >= vq->vring.num) {
+ i -= vq->vring.num;
+ }
+ }
+
+ if (packed) {
+ /* Now write the head entry that the loop above skipped */
+ virtqueue_packed_fill_desc(vq, &vq->used_elems[vq->used_idx], 0, true);
+ vq->used_idx += ndescs;
+ /* Wrapping past the end of the ring flips the used wrap counter */
+ if (vq->used_idx >= vq->vring.num) {
+ vq->used_idx -= vq->vring.num;
+ vq->used_wrap_counter ^= 1;
+ vq->signalled_used_valid = false;
+ }
+ } else {
+ /* Make sure buffer is written before we update index. */
+ smp_wmb();
+ new = old + ndescs;
+ vring_used_idx_set(vq, new);
+ if (unlikely((int16_t)(new - vq->signalled_used) <
+ (uint16_t)(new - old))) {
+ vq->signalled_used_valid = false;
+ }
+ }
+ vq->inuse -= ndescs;
+}
+
void virtqueue_flush(VirtQueue *vq, unsigned int count)
{
if (virtio_device_disabled(vq->vdev)) {
@@ -988,7 +1097,9 @@
return;
}
- if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) {
+ if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_IN_ORDER)) {
+ virtqueue_ordered_flush(vq);
+ } else if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) {
virtqueue_packed_flush(vq, count);
} else {
virtqueue_split_flush(vq, count);
@@ -1505,7 +1616,7 @@
static void *virtqueue_split_pop(VirtQueue *vq, size_t sz)
{
- unsigned int i, head, max;
+ unsigned int i, head, max, idx;
VRingMemoryRegionCaches *caches;
MemoryRegionCache indirect_desc_cache;
MemoryRegionCache *desc_cache;
@@ -1629,6 +1740,13 @@
elem->in_sg[i] = iov[out_num + i];
}
+ if (virtio_vdev_has_feature(vdev, VIRTIO_F_IN_ORDER)) {
+ idx = (vq->last_avail_idx - 1) % vq->vring.num;
+ vq->used_elems[idx].index = elem->index;
+ vq->used_elems[idx].len = elem->len;
+ vq->used_elems[idx].ndescs = elem->ndescs;
+ }
+
vq->inuse++;
trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num);
@@ -1762,6 +1880,13 @@
elem->index = id;
elem->ndescs = (desc_cache == &indirect_desc_cache) ? 1 : elem_entries;
+
+ if (virtio_vdev_has_feature(vdev, VIRTIO_F_IN_ORDER)) {
+ vq->used_elems[vq->last_avail_idx].index = elem->index;
+ vq->used_elems[vq->last_avail_idx].len = elem->len;
+ vq->used_elems[vq->last_avail_idx].ndescs = elem->ndescs;
+ }
+
vq->last_avail_idx += elem->ndescs;
vq->inuse += elem->ndescs;
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 815342d..240ee04 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -129,6 +129,14 @@
*/
void cpu_address_space_init(CPUState *cpu, int asidx,
const char *prefix, MemoryRegion *mr);
+/**
+ * cpu_address_space_destroy:
+ * @cpu: CPU for which address space needs to be destroyed
+ * @asidx: integer index of this address space
+ *
+ * Note that with KVM only one address space is supported.
+ */
+void cpu_address_space_destroy(CPUState *cpu, int asidx);
void cpu_physical_memory_rw(hwaddr addr, void *buf,
hwaddr len, bool is_write);
diff --git a/include/exec/gdbstub.h b/include/exec/gdbstub.h
index 1bd2c4e..d73f424 100644
--- a/include/exec/gdbstub.h
+++ b/include/exec/gdbstub.h
@@ -41,6 +41,12 @@
const GDBFeature *feature, int g_pos);
/**
+ * gdb_unregister_coprocessor_all() - unregisters supplemental set of registers
+ * @cpu - the CPU associated with registers
+ */
+void gdb_unregister_coprocessor_all(CPUState *cpu);
+
+/**
* gdbserver_start: start the gdb server
* @port_or_device: connection spec for gdb
*
diff --git a/include/hw/acpi/cpu.h b/include/hw/acpi/cpu.h
index e6e1a9e..32654dc 100644
--- a/include/hw/acpi/cpu.h
+++ b/include/hw/acpi/cpu.h
@@ -19,6 +19,8 @@
#include "hw/boards.h"
#include "hw/hotplug.h"
+#define ACPI_CPU_HOTPLUG_REG_LEN 12
+
typedef struct AcpiCpuStatus {
CPUState *cpu;
uint64_t arch_id;
@@ -61,9 +63,10 @@
GArray *entry, bool force_enabled);
void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts,
- build_madt_cpu_fn build_madt_cpu, hwaddr io_base,
+ build_madt_cpu_fn build_madt_cpu, hwaddr base_addr,
const char *res_root,
- const char *event_handler_method);
+ const char *event_handler_method,
+ AmlRegionSpace rs);
void acpi_cpu_ospm_status(CPUHotplugState *cpu_st, ACPIOSTInfoList ***list);
diff --git a/include/hw/acpi/generic_event_device.h b/include/hw/acpi/generic_event_device.h
index ba84ce0..40af355 100644
--- a/include/hw/acpi/generic_event_device.h
+++ b/include/hw/acpi/generic_event_device.h
@@ -62,6 +62,7 @@
#include "hw/sysbus.h"
#include "hw/acpi/memory_hotplug.h"
#include "hw/acpi/ghes.h"
+#include "hw/acpi/cpu.h"
#include "qom/object.h"
#define ACPI_POWER_BUTTON_DEVICE "PWRB"
@@ -86,6 +87,7 @@
#define GED_DEVICE "GED"
#define AML_GED_EVT_REG "EREG"
#define AML_GED_EVT_SEL "ESEL"
+#define AML_GED_EVT_CPU_SCAN_METHOD "\\_SB.GED.CSCN"
/*
* Platforms need to specify the GED event bitmap
@@ -95,6 +97,7 @@
#define ACPI_GED_MEM_HOTPLUG_EVT 0x1
#define ACPI_GED_PWR_DOWN_EVT 0x2
#define ACPI_GED_NVDIMM_HOTPLUG_EVT 0x4
+#define ACPI_GED_CPU_HOTPLUG_EVT 0x8
typedef struct GEDState {
MemoryRegion evt;
@@ -106,6 +109,8 @@
SysBusDevice parent_obj;
MemHotplugState memhp_state;
MemoryRegion container_memhp;
+ CPUHotplugState cpuhp_state;
+ MemoryRegion container_cpuhp;
GEDState ged_state;
uint32_t ged_event_bitmap;
qemu_irq irq;
diff --git a/include/hw/boards.h b/include/hw/boards.h
index ef6f18f..48ff6d8 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
@@ -237,6 +237,9 @@
* purposes only.
* Applies only to default memory backend, i.e., explicit memory backend
* wasn't used.
+ * @smbios_memory_device_size:
+ * Default size of memory device,
+ * SMBIOS 3.1.0 "7.18 Memory Device (Type 17)"
*/
struct MachineClass {
/*< private >*/
@@ -304,6 +307,7 @@
const CPUArchIdList *(*possible_cpu_arch_ids)(MachineState *machine);
int64_t (*get_default_cpu_node_id)(const MachineState *ms, int idx);
ram_addr_t (*fixup_ram_size)(ram_addr_t size);
+ uint64_t smbios_memory_device_size;
};
/**
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index d946161..1c9c775 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -496,6 +496,7 @@
QSIMPLEQ_HEAD(, qemu_work_item) work_list;
struct CPUAddressSpace *cpu_ases;
+ int cpu_ases_count;
int num_ases;
AddressSpace *as;
MemoryRegion *memory;
diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h
index 0a4fcb2..fdd0f4e 100644
--- a/include/hw/cxl/cxl_device.h
+++ b/include/hw/cxl/cxl_device.h
@@ -181,6 +181,21 @@
uint64_t runtime;
QEMUTimer *timer;
} bg;
+
+ /* firmware update */
+ struct {
+ uint8_t active_slot;
+ uint8_t staged_slot;
+ bool slot[4];
+ uint8_t curr_action;
+ uint8_t curr_slot;
+ /* handle partial transfers */
+ bool transferring;
+ size_t prev_offset;
+ size_t prev_len;
+ time_t last_partxfer;
+ } fw;
+
size_t payload_max;
/* Pointer to device hosting the CCI */
DeviceState *d;
@@ -397,9 +412,14 @@
#define cxl_dev_enable_media(cxlds) \
do { __toggle_media((cxlds), 0x1); } while (0)
-static inline bool sanitize_running(CXLCCI *cci)
+static inline bool cxl_dev_media_disabled(CXLDeviceState *cxl_dstate)
{
- return !!cci->bg.runtime && cci->bg.opcode == 0x4400;
+ uint64_t dev_status_reg = cxl_dstate->mbox_reg_state64[R_CXL_MEM_DEV_STS];
+ return FIELD_EX64(dev_status_reg, CXL_MEM_DEV_STS, MEDIA_STATUS) == 0x3;
+}
+static inline bool scan_media_running(CXLCCI *cci)
+{
+ return !!cci->bg.runtime && cci->bg.opcode == 0x4304;
}
typedef struct CXLError {
@@ -422,6 +442,47 @@
typedef QLIST_HEAD(, CXLPoison) CXLPoisonList;
#define CXL_POISON_LIST_LIMIT 256
+/* CXL memory device patrol scrub control attributes */
+typedef struct CXLMemPatrolScrubReadAttrs {
+ uint8_t scrub_cycle_cap;
+ uint16_t scrub_cycle;
+ uint8_t scrub_flags;
+} QEMU_PACKED CXLMemPatrolScrubReadAttrs;
+
+typedef struct CXLMemPatrolScrubWriteAttrs {
+ uint8_t scrub_cycle_hr;
+ uint8_t scrub_flags;
+} QEMU_PACKED CXLMemPatrolScrubWriteAttrs;
+
+#define CXL_MEMDEV_PS_GET_FEATURE_VERSION 0x01
+#define CXL_MEMDEV_PS_SET_FEATURE_VERSION 0x01
+#define CXL_MEMDEV_PS_SCRUB_CYCLE_CHANGE_CAP_DEFAULT BIT(0)
+#define CXL_MEMDEV_PS_SCRUB_REALTIME_REPORT_CAP_DEFAULT BIT(1)
+#define CXL_MEMDEV_PS_CUR_SCRUB_CYCLE_DEFAULT 12
+#define CXL_MEMDEV_PS_MIN_SCRUB_CYCLE_DEFAULT 1
+#define CXL_MEMDEV_PS_ENABLE_DEFAULT 0
+
+/* CXL memory device DDR5 ECS control attributes */
+typedef struct CXLMemECSReadAttrs {
+ uint8_t ecs_log_cap;
+ uint8_t ecs_cap;
+ uint16_t ecs_config;
+ uint8_t ecs_flags;
+} QEMU_PACKED CXLMemECSReadAttrs;
+
+typedef struct CXLMemECSWriteAttrs {
+ uint8_t ecs_log_cap;
+ uint16_t ecs_config;
+} QEMU_PACKED CXLMemECSWriteAttrs;
+
+#define CXL_ECS_GET_FEATURE_VERSION 0x01
+#define CXL_ECS_SET_FEATURE_VERSION 0x01
+#define CXL_ECS_LOG_ENTRY_TYPE_DEFAULT 0x01
+#define CXL_ECS_REALTIME_REPORT_CAP_DEFAULT 1
+#define CXL_ECS_THRESHOLD_COUNT_DEFAULT 3 /* 3: 256, 4: 1024, 5: 4096 */
+#define CXL_ECS_MODE_DEFAULT 0
+#define CXL_ECS_NUM_MEDIA_FRUS 3 /* Default */
+
#define DCD_MAX_NUM_REGION 8
typedef struct CXLDCExtentRaw {
@@ -459,6 +520,14 @@
unsigned long *blk_bitmap;
} CXLDCRegion;
+typedef struct CXLSetFeatureInfo {
+ QemuUUID uuid;
+ uint8_t data_transfer_flag;
+ bool data_saved_across_reset;
+ uint16_t data_offset;
+ size_t data_size;
+} CXLSetFeatureInfo;
+
struct CXLType3Dev {
/* Private */
PCIDevice parent_obj;
@@ -491,6 +560,19 @@
unsigned int poison_list_cnt;
bool poison_list_overflowed;
uint64_t poison_list_overflow_ts;
+ /* Poison Injection - backup */
+ CXLPoisonList poison_list_bkp;
+ CXLPoisonList scan_media_results;
+ bool scan_media_hasrun;
+
+ CXLSetFeatureInfo set_feat_info;
+
+ /* Patrol scrub control attributes */
+ CXLMemPatrolScrubReadAttrs patrol_scrub_attrs;
+ CXLMemPatrolScrubWriteAttrs patrol_scrub_wr_attrs;
+ /* ECS control attributes */
+ CXLMemECSReadAttrs ecs_attrs[CXL_ECS_NUM_MEDIA_FRUS];
+ CXLMemECSWriteAttrs ecs_wr_attrs[CXL_ECS_NUM_MEDIA_FRUS];
struct dynamic_capacity {
HostMemoryBackend *host_dc;
@@ -554,10 +636,12 @@
size_t *len);
CXLRetCode cxl_event_clear_records(CXLDeviceState *cxlds,
CXLClearEventPayload *pl);
+void cxl_discard_all_event_records(CXLDeviceState *cxlds);
void cxl_event_irq_assert(CXLType3Dev *ct3d);
void cxl_set_poison_list_overflowed(CXLType3Dev *ct3d);
+void cxl_clear_poison_list_overflowed(CXLType3Dev *ct3d);
CXLDCRegion *cxl_find_dc_region(CXLType3Dev *ct3d, uint64_t dpa, uint64_t len);
diff --git a/include/hw/cxl/cxl_mailbox.h b/include/hw/cxl/cxl_mailbox.h
new file mode 100644
index 0000000..beb0480
--- /dev/null
+++ b/include/hw/cxl/cxl_mailbox.h
@@ -0,0 +1,18 @@
+/*
+ * QEMU CXL Mailbox
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See the
+ * COPYING file in the top-level directory.
+ */
+
+#ifndef CXL_MAILBOX_H
+#define CXL_MAILBOX_H
+
+#define CXL_MBOX_IMMEDIATE_CONFIG_CHANGE (1 << 1)
+#define CXL_MBOX_IMMEDIATE_DATA_CHANGE (1 << 2)
+#define CXL_MBOX_IMMEDIATE_POLICY_CHANGE (1 << 3)
+#define CXL_MBOX_IMMEDIATE_LOG_CHANGE (1 << 4)
+#define CXL_MBOX_SECURITY_STATE_CHANGE (1 << 5)
+#define CXL_MBOX_BACKGROUND_OPERATION (1 << 6)
+
+#endif
diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
index ca15132..e7e41cb 100644
--- a/include/hw/pci/pci_device.h
+++ b/include/hw/pci/pci_device.h
@@ -3,6 +3,7 @@
#include "hw/pci/pci.h"
#include "hw/pci/pcie.h"
+#include "hw/pci/pcie_doe.h"
#define TYPE_PCI_DEVICE "pci-device"
typedef struct PCIDeviceClass PCIDeviceClass;
@@ -37,6 +38,8 @@
uint16_t subsystem_id; /* only for header type = 0 */
const char *romfile; /* rom bar */
+
+ bool sriov_vf_user_creatable;
};
enum PCIReqIDType {
@@ -157,9 +160,17 @@
MSIVectorReleaseNotifier msix_vector_release_notifier;
MSIVectorPollNotifier msix_vector_poll_notifier;
+ /* SPDM */
+ uint16_t spdm_port;
+
+ /* DOE */
+ DOECap doe_spdm;
+
/* ID of standby device in net_failover pair */
char *failover_pair_id;
uint32_t acpi_index;
+
+ char *sriov_pf;
};
static inline int pci_intx(PCIDevice *pci_dev)
@@ -192,7 +203,7 @@
static inline int pci_is_vf(const PCIDevice *d)
{
- return d->exp.sriov_vf.pf != NULL;
+ return d->sriov_pf || d->exp.sriov_vf.pf != NULL;
}
static inline uint32_t pci_config_size(const PCIDevice *d)
diff --git a/include/hw/pci/pcie_doe.h b/include/hw/pci/pcie_doe.h
index 87dc17d..9e1275d 100644
--- a/include/hw/pci/pcie_doe.h
+++ b/include/hw/pci/pcie_doe.h
@@ -46,6 +46,8 @@
/* PCI-SIG defined Data Object Types - r6.0 Table 6-32 */
#define PCI_SIG_DOE_DISCOVERY 0x00
+#define PCI_SIG_DOE_CMA 0x01
+#define PCI_SIG_DOE_SECURED_CMA 0x02
#define PCI_DOE_DW_SIZE_MAX (1 << 18)
#define PCI_DOE_PROTOCOL_NUM_MAX 256
@@ -106,6 +108,9 @@
/* Protocols and its callback response */
DOEProtocol *protocols;
uint16_t protocol_num;
+
+ /* Used for spdm-socket */
+ int spdm_socket;
};
void pcie_doe_init(PCIDevice *pdev, DOECap *doe_cap, uint16_t offset,
diff --git a/include/hw/pci/pcie_sriov.h b/include/hw/pci/pcie_sriov.h
index c5d2d31..f75b8f2 100644
--- a/include/hw/pci/pcie_sriov.h
+++ b/include/hw/pci/pcie_sriov.h
@@ -18,6 +18,7 @@
typedef struct PCIESriovPF {
uint8_t vf_bar_type[PCI_NUM_REGIONS]; /* Store type for each VF bar */
PCIDevice **vf; /* Pointer to an array of num_vfs VF devices */
+ bool vf_user_created; /* If VFs are created by user */
} PCIESriovPF;
typedef struct PCIESriovVF {
@@ -40,6 +41,23 @@
void pcie_sriov_vf_register_bar(PCIDevice *dev, int region_num,
MemoryRegion *memory);
+/**
+ * pcie_sriov_pf_init_from_user_created_vfs() - Initialize PF with user-created
+ * VFs.
+ * @dev: A PCIe device being realized.
+ * @offset: The offset of the SR-IOV capability.
+ * @errp: pointer to Error*, to store an error if it happens.
+ *
+ * Return: The size of added capability. 0 if the user did not create VFs.
+ * -1 if failed.
+ */
+int16_t pcie_sriov_pf_init_from_user_created_vfs(PCIDevice *dev,
+ uint16_t offset,
+ Error **errp);
+
+bool pcie_sriov_register_device(PCIDevice *dev, Error **errp);
+void pcie_sriov_unregister_device(PCIDevice *dev);
+
/*
* Default (minimal) page size support values
* as required by the SR/IOV standard:
diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h
index bdb3da7..7db4210 100644
--- a/include/hw/virtio/virtio-iommu.h
+++ b/include/hw/virtio/virtio-iommu.h
@@ -43,7 +43,6 @@
MemoryRegion bypass_mr; /* The alias of shared memory MR */
GList *resv_regions;
GList *host_resv_ranges;
- bool probe_done;
} IOMMUDevice;
typedef struct IOMMUPciBus {
diff --git a/include/hw/virtio/virtio-pci.h b/include/hw/virtio/virtio-pci.h
index 9e67ba3..34539f2 100644
--- a/include/hw/virtio/virtio-pci.h
+++ b/include/hw/virtio/virtio-pci.h
@@ -152,6 +152,7 @@
uint32_t modern_io_bar_idx;
uint32_t modern_mem_bar_idx;
int config_cap;
+ uint16_t last_pcie_cap_offset;
uint32_t flags;
bool disable_modern;
bool ignore_backend_features;
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 7512afb..d2a1938 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -69,6 +69,8 @@
unsigned int ndescs;
unsigned int out_num;
unsigned int in_num;
+ /* Element has been processed (VIRTIO_F_IN_ORDER) */
+ bool in_order_filled;
hwaddr *in_addr;
hwaddr *out_addr;
struct iovec *in_sg;
@@ -371,7 +373,9 @@
DEFINE_PROP_BIT64("packed", _state, _field, \
VIRTIO_F_RING_PACKED, false), \
DEFINE_PROP_BIT64("queue_reset", _state, _field, \
- VIRTIO_F_RING_RESET, true)
+ VIRTIO_F_RING_RESET, true), \
+ DEFINE_PROP_BIT64("in_order", _state, _field, \
+ VIRTIO_F_IN_ORDER, false)
hwaddr virtio_queue_get_desc_addr(VirtIODevice *vdev, int n);
bool virtio_queue_enabled_legacy(VirtIODevice *vdev, int n);
diff --git a/include/io/channel.h b/include/io/channel.h
index 7986c49..bdf0bca 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -160,6 +160,9 @@
void *opaque);
int (*io_flush)(QIOChannel *ioc,
Error **errp);
+ int (*io_peerpid)(QIOChannel *ioc,
+ unsigned int *pid,
+ Error **errp);
};
/* General I/O handling functions */
@@ -981,4 +984,22 @@
int qio_channel_flush(QIOChannel *ioc,
Error **errp);
+/**
+ * qio_channel_get_peerpid:
+ * @ioc: the channel object
+ * @pid: pointer to pid
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Returns the pid of the peer process connected to this socket.
+ *
+ * The use of this function is possible only for connected
+ * AF_UNIX stream sockets and for AF_UNIX stream and datagram
+ * socket pairs on Linux.
+ * Return -1 on error with pid -1 for the non-Linux OS.
+ *
+ */
+int qio_channel_get_peerpid(QIOChannel *ioc,
+ unsigned int *pid,
+ Error **errp);
+
#endif /* QIO_CHANNEL_H */
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index c31d9c7..c4a914b 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -313,6 +313,31 @@
*/
bool kvm_device_supported(int vmfd, uint64_t type);
+/**
+ * kvm_create_vcpu - Gets a parked KVM vCPU or creates a KVM vCPU
+ * @cpu: QOM CPUState object for which KVM vCPU has to be fetched/created.
+ *
+ * @returns: 0 when success, errno (<0) when failed.
+ */
+int kvm_create_vcpu(CPUState *cpu);
+
+/**
+ * kvm_park_vcpu - Park QEMU KVM vCPU context
+ * @cpu: QOM CPUState object for which QEMU KVM vCPU context has to be parked.
+ *
+ * @returns: none
+ */
+void kvm_park_vcpu(CPUState *cpu);
+
+/**
+ * kvm_unpark_vcpu - unpark QEMU KVM vCPU context
+ * @s: KVM State
+ * @vcpu_id: Architecture vCPU ID of the parked vCPU
+ *
+ * @returns: KVM fd
+ */
+int kvm_unpark_vcpu(KVMState *s, unsigned long vcpu_id);
+
/* Arch specific hooks */
extern const KVMCapabilityInfo kvm_arch_required_capabilities[];
diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
index 3f3d13f..1d8fb14 100644
--- a/include/sysemu/kvm_int.h
+++ b/include/sysemu/kvm_int.h
@@ -14,6 +14,9 @@
#include "qemu/accel.h"
#include "qemu/queue.h"
#include "sysemu/kvm.h"
+#include "hw/boards.h"
+#include "hw/i386/topology.h"
+#include "io/channel-socket.h"
typedef struct KVMSlot
{
@@ -50,6 +53,34 @@
#define KVM_MSI_HASHTAB_SIZE 256
+typedef struct KVMHostTopoInfo {
+ /* Number of package on the Host */
+ unsigned int maxpkgs;
+ /* Number of cpus on the Host */
+ unsigned int maxcpus;
+ /* Number of cpus on each different package */
+ unsigned int *pkg_cpu_count;
+ /* Each package can have different maxticks */
+ unsigned int *maxticks;
+} KVMHostTopoInfo;
+
+struct KVMMsrEnergy {
+ pid_t pid;
+ bool enable;
+ char *socket_path;
+ QIOChannelSocket *sioc;
+ QemuThread msr_thr;
+ unsigned int guest_vcpus;
+ unsigned int guest_vsockets;
+ X86CPUTopoInfo guest_topo_info;
+ KVMHostTopoInfo host_topo;
+ const CPUArchIdList *guest_cpu_list;
+ uint64_t *msr_value;
+ uint64_t msr_unit;
+ uint64_t msr_limit;
+ uint64_t msr_info;
+};
+
enum KVMDirtyRingReaperState {
KVM_DIRTY_RING_REAPER_NONE = 0,
/* The reaper is sleeping */
@@ -117,6 +148,7 @@
bool kvm_dirty_ring_with_bitmap;
uint64_t kvm_eager_split_size; /* Eager Page Splitting chunk size */
struct KVMDirtyRingReaper reaper;
+ struct KVMMsrEnergy msr_energy;
NotifyVmexitOption notify_vmexit;
uint32_t notify_window;
uint32_t xen_version;
diff --git a/include/sysemu/spdm-socket.h b/include/sysemu/spdm-socket.h
new file mode 100644
index 0000000..5d8bd9a
--- /dev/null
+++ b/include/sysemu/spdm-socket.h
@@ -0,0 +1,74 @@
+/*
+ * QEMU SPDM socket support
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef SPDM_REQUESTER_H
+#define SPDM_REQUESTER_H
+
+/**
+ * spdm_socket_connect: connect to an external SPDM socket
+ * @port: port to connect to
+ * @errp: error object handle
+ *
+ * This will connect to an external SPDM socket server. On error
+ * it will return -1 and errp will be set. On success this function
+ * will return the socket number.
+ */
+int spdm_socket_connect(uint16_t port, Error **errp);
+
+/**
+ * spdm_socket_rsp: send and receive a message to a SPDM server
+ * @socket: socket returned from spdm_socket_connect()
+ * @transport_type: SPDM_SOCKET_TRANSPORT_TYPE_* macro
+ * @req: request buffer
+ * @req_len: request buffer length
+ * @rsp: response buffer
+ * @rsp_len: response buffer length
+ *
+ * Send platform data to a SPDM server on socket and then receive
+ * a response.
+ */
+uint32_t spdm_socket_rsp(const int socket, uint32_t transport_type,
+ void *req, uint32_t req_len,
+ void *rsp, uint32_t rsp_len);
+
+/**
+ * spdm_socket_close: send a shutdown command to the server
+ * @socket: socket returned from spdm_socket_connect()
+ * @transport_type: SPDM_SOCKET_TRANSPORT_TYPE_* macro
+ *
+ * This will issue a shutdown command to the server.
+ */
+void spdm_socket_close(const int socket, uint32_t transport_type);
+
+#define SPDM_SOCKET_COMMAND_NORMAL 0x0001
+#define SPDM_SOCKET_COMMAND_OOB_ENCAP_KEY_UPDATE 0x8001
+#define SPDM_SOCKET_COMMAND_CONTINUE 0xFFFD
+#define SPDM_SOCKET_COMMAND_SHUTDOWN 0xFFFE
+#define SPDM_SOCKET_COMMAND_UNKOWN 0xFFFF
+#define SPDM_SOCKET_COMMAND_TEST 0xDEAD
+
+#define SPDM_SOCKET_TRANSPORT_TYPE_MCTP 0x01
+#define SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE 0x02
+
+#define SPDM_SOCKET_MAX_MESSAGE_BUFFER_SIZE 0x1200
+
+#endif
diff --git a/io/channel-socket.c b/io/channel-socket.c
index 3a899b0..608bcf0 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -841,6 +841,33 @@
socket_set_cork(sioc->fd, v);
}
+/*
+ * Socket implementation of the io_peerpid hook.
+ *
+ * On Linux builds, asks the socket for SO_PEERCRED and stores the peer's
+ * pid in @pid, returning 0.  If getsockopt() fails, or on non-Linux
+ * builds, an error is set through @errp, @pid is set to -1 and -1 is
+ * returned.
+ */
+static int
+qio_channel_socket_get_peerpid(QIOChannel *ioc,
+                               unsigned int *pid,
+                               Error **errp)
+{
+#ifdef CONFIG_LINUX
+    QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
+    struct ucred peer;
+    socklen_t peer_len = sizeof(struct ucred);
+    Error *local_err = NULL;
+
+    if (getsockopt(sioc->fd, SOL_SOCKET, SO_PEERCRED,
+                   &peer, &peer_len) != -1) {
+        *pid = (unsigned int)peer.pid;
+        return 0;
+    }
+
+    error_setg_errno(&local_err, errno, "Unable to get peer credentials");
+    error_propagate(errp, local_err);
+    *pid = -1;
+    return -1;
+#else
+    error_setg(errp, "Unsupported feature");
+    *pid = -1;
+    return -1;
+#endif
+}
static int
qio_channel_socket_close(QIOChannel *ioc,
@@ -938,6 +965,7 @@
#ifdef QEMU_MSG_ZEROCOPY
ioc_klass->io_flush = qio_channel_socket_flush;
#endif
+ ioc_klass->io_peerpid = qio_channel_socket_get_peerpid;
}
static const TypeInfo qio_channel_socket_info = {
diff --git a/io/channel.c b/io/channel.c
index a1f12f8..e3f17c2 100644
--- a/io/channel.c
+++ b/io/channel.c
@@ -548,6 +548,19 @@
}
}
+/*
+ * Query the PID of the process at the other end of @ioc.
+ *
+ * Dispatches to the channel class's io_peerpid callback.  Returns the
+ * callback's result (the socket implementation returns 0 on success and
+ * -1 with @errp set on failure).  Returns -1 with @errp set when the
+ * channel class provides no io_peerpid callback.
+ */
+int qio_channel_get_peerpid(QIOChannel *ioc,
+                            unsigned int *pid,
+                            Error **errp)
+{
+    QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc);
+
+    if (!klass->io_peerpid) {
+        error_setg(errp, "Channel does not support peer pid");
+        return -1;
+    }
+
+    /* Hand the callback's status to the caller instead of always 0. */
+    return klass->io_peerpid(ioc, pid, errp);
+}
off_t qio_channel_io_seek(QIOChannel *ioc,
off_t offset,
diff --git a/meson.build b/meson.build
index a1e5127..af9f038 100644
--- a/meson.build
+++ b/meson.build
@@ -2187,6 +2187,19 @@
.require(libcap_ng.found(), error_message: 'the virtfs proxy helper requires libcap-ng') \
.allowed()
+qga_fsfreeze = false
+qga_fstrim = false
+if host_os == 'linux'
+ if cc.has_header_symbol('linux/fs.h', 'FIFREEZE')
+ qga_fsfreeze = true
+ endif
+ if cc.has_header_symbol('linux/fs.h', 'FITRIM')
+ qga_fstrim = true
+ endif
+elif host_os == 'freebsd' and cc.has_header_symbol('ufs/ffs/fs.h', 'UFSSUSPEND')
+ qga_fsfreeze = true
+endif
+
if get_option('block_drv_ro_whitelist') == ''
config_host_data.set('CONFIG_BDRV_RO_WHITELIST', '')
else
@@ -2263,6 +2276,7 @@
config_host_data.set('CONFIG_BDRV_WHITELIST_TOOLS', get_option('block_drv_whitelist_in_tools'))
config_host_data.set('CONFIG_BRLAPI', brlapi.found())
config_host_data.set('CONFIG_BSD', host_os in bsd_oses)
+config_host_data.set('CONFIG_FREEBSD', host_os == 'freebsd')
config_host_data.set('CONFIG_CAPSTONE', capstone.found())
config_host_data.set('CONFIG_COCOA', cocoa.found())
config_host_data.set('CONFIG_DARWIN', host_os == 'darwin')
@@ -2423,6 +2437,8 @@
config_host_data.set('CONFIG_DEBUG_REMAP', get_option('debug_remap'))
config_host_data.set('CONFIG_QOM_CAST_DEBUG', get_option('qom_cast_debug'))
config_host_data.set('CONFIG_REPLICATION', get_option('replication').allowed())
+config_host_data.set('CONFIG_FSFREEZE', qga_fsfreeze)
+config_host_data.set('CONFIG_FSTRIM', qga_fstrim)
# has_header
config_host_data.set('CONFIG_EPOLL', cc.has_header('sys/epoll.h'))
@@ -4073,6 +4089,13 @@
dependencies: [authz, crypto, io, qom, qemuutil,
libcap_ng, mpathpersist],
install: true)
+
+ if cpu in ['x86', 'x86_64']
+ executable('qemu-vmsr-helper', files('tools/i386/qemu-vmsr-helper.c'),
+ dependencies: [authz, crypto, io, qom, qemuutil,
+ libcap_ng, mpathpersist],
+ install: true)
+ endif
endif
if have_ivshmem
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index daa3842..03457ea 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -62,6 +62,7 @@
VIRTIO_F_RING_PACKED,
VIRTIO_F_RING_RESET,
VIRTIO_F_VERSION_1,
+ VIRTIO_F_IN_ORDER,
VIRTIO_F_NOTIFICATION_DATA,
VIRTIO_NET_F_CSUM,
VIRTIO_NET_F_CTRL_GUEST_OFFLOADS,
diff --git a/qga/commands-bsd.c b/qga/commands-bsd.c
index 17bddda..9ce48af 100644
--- a/qga/commands-bsd.c
+++ b/qga/commands-bsd.c
@@ -149,30 +149,6 @@
}
return ret;
}
-
-GuestFilesystemInfoList *qmp_guest_get_fsinfo(Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
- return NULL;
-}
-
-GuestDiskInfoList *qmp_guest_get_disks(Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
- return NULL;
-}
-
-GuestDiskStatsInfoList *qmp_guest_get_diskstats(Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
- return NULL;
-}
-
-GuestCpuStatsList *qmp_guest_get_cpustats(Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
- return NULL;
-}
#endif /* CONFIG_FSFREEZE */
#ifdef HAVE_GETIFADDRS
diff --git a/qga/commands-common.h b/qga/commands-common.h
index 8c1c56a..263e7c0 100644
--- a/qga/commands-common.h
+++ b/qga/commands-common.h
@@ -15,19 +15,10 @@
#if defined(__linux__)
#include <linux/fs.h>
-#ifdef FIFREEZE
-#define CONFIG_FSFREEZE
-#endif
-#ifdef FITRIM
-#define CONFIG_FSTRIM
-#endif
#endif /* __linux__ */
#ifdef __FreeBSD__
#include <ufs/ffs/fs.h>
-#ifdef UFSSUSPEND
-#define CONFIG_FSFREEZE
-#endif
#endif /* __FreeBSD__ */
#if defined(CONFIG_FSFREEZE) || defined(CONFIG_FSTRIM)
diff --git a/qga/commands-linux.c b/qga/commands-linux.c
index 214e408..51d5e3d 100644
--- a/qga/commands-linux.c
+++ b/qga/commands-linux.c
@@ -13,10 +13,26 @@
#include "qemu/osdep.h"
#include "qapi/error.h"
+#include "qga-qapi-commands.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qerror.h"
#include "commands-common.h"
#include "cutils.h"
#include <mntent.h>
#include <sys/ioctl.h>
+#include <mntent.h>
+#include <linux/nvme_ioctl.h>
+#include "block/nvme.h"
+
+#ifdef CONFIG_LIBUDEV
+#include <libudev.h>
+#endif
+
+#ifdef HAVE_GETIFADDRS
+#include <net/if.h>
+#endif
+
+#include <sys/statvfs.h>
#if defined(CONFIG_FSFREEZE) || defined(CONFIG_FSTRIM)
static int dev_major_minor(const char *devpath,
@@ -284,3 +300,1925 @@
return i;
}
#endif /* CONFIG_FSFREEZE */
+
+#if defined(CONFIG_FSFREEZE)
+
+/*
+ * Return the basename of the target of the "<device>/driver" symlink,
+ * where <device> is the first @pathlen bytes of @syspath.
+ *
+ * The result is newly allocated and must be released with g_free().
+ * NULL is returned when readlink() fails.  @errp is not used.
+ */
+static char *get_pci_driver(char const *syspath, int pathlen, Error **errp)
+{
+    g_autofree char *dev_dir = g_strndup(syspath, pathlen);
+    g_autofree char *link_path = g_strdup_printf("%s/driver", dev_dir);
+    char target[PATH_MAX];
+    ssize_t n;
+
+    n = readlink(link_path, target, sizeof(target) - 1);
+    if (n == -1) {
+        return NULL;
+    }
+
+    /* readlink() does not NUL-terminate its output */
+    target[n] = 0;
+    return g_path_get_basename(target);
+}
+
+/*
+ * qsort() comparator ordering unsigned int values in ascending order.
+ * Returns -1, 0 or 1.
+ */
+static int compare_uint(const void *_a, const void *_b)
+{
+    unsigned int lhs = *(unsigned int *)_a;
+    unsigned int rhs = *(unsigned int *)_b;
+
+    if (lhs < rhs) {
+        return -1;
+    }
+    if (lhs > rhs) {
+        return 1;
+    }
+    return 0;
+}
+
+/*
+ * Walk the specified sysfs and build a sorted list of host or ata numbers.
+ *
+ * @syspath: sysfs path of the device
+ * @host: pointer into @syspath; the directory that is scanned is the
+ *        prefix of @syspath ending just before @host
+ * @ata: true to collect "ata<N>" entries, false to collect "host<N>"
+ * @hosts: output array, sorted in ascending order on return
+ * @hosts_max: capacity of @hosts; scanning stops once it is full
+ *
+ * Returns the number of entries stored in @hosts, or -1 with @errp set
+ * when the directory cannot be opened.
+ */
+static int build_hosts(char const *syspath, char const *host, bool ata,
+                       unsigned int *hosts, int hosts_max, Error **errp)
+{
+    g_autofree char *path = g_strndup(syspath, host - syspath);
+    struct dirent *entry;
+    DIR *dir;
+    int i = 0;
+
+    dir = opendir(path);
+    if (!dir) {
+        error_setg_errno(errp, errno, "opendir(\"%s\")", path);
+        return -1;
+    }
+
+    while (i < hosts_max) {
+        int matched;
+
+        entry = readdir(dir);
+        if (!entry) {
+            break;
+        }
+
+        /* @hosts holds unsigned ints, so the conversion must be %u. */
+        if (ata) {
+            matched = sscanf(entry->d_name, "ata%u", hosts + i);
+        } else {
+            matched = sscanf(entry->d_name, "host%u", hosts + i);
+        }
+        if (matched == 1) {
+            ++i;
+        }
+    }
+    closedir(dir);
+
+    qsort(hosts, i, sizeof(hosts[0]), compare_uint);
+    return i;
+}
+
+/*
+ * Store disk device info for devices on the PCI bus.
+ *
+ * Parses the PCI address, the controller driver and, where present, the
+ * SCSI target and ata/host number out of @syspath.  The PCI address is
+ * written to @disk->pci_controller; @disk->bus_type and, depending on the
+ * driver, @disk->bus and @disk->unit are filled in as well.
+ *
+ * Returns true if information has been stored, or false for failure.
+ */
+static bool build_guest_fsinfo_for_pci_dev(char const *syspath,
+                                           GuestDiskAddress *disk,
+                                           Error **errp)
+{
+    unsigned int pci[4], host, hosts[8], tgt[3];
+    int i, nhosts = 0, pcilen;
+    GuestPCIAddress *pciaddr = disk->pci_controller;
+    bool has_ata = false, has_host = false, has_tgt = false;
+    char *p, *q = NULL, *driver = NULL;
+    bool ret = false;
+
+    p = strstr(syspath, "/devices/pci");
+    if (!p || sscanf(p + 12, "%*x:%*x/%x:%x:%x.%x%n",
+                     pci, pci + 1, pci + 2, pci + 3, &pcilen) < 4) {
+        g_debug("only pci device is supported: sysfs path '%s'", syspath);
+        return false;
+    }
+
+    /*
+     * Walk down the chain of PCI addresses in the path until a device
+     * bound to one of the supported controller drivers is found.
+     */
+    p += 12 + pcilen;
+    while (true) {
+        driver = get_pci_driver(syspath, p - syspath, errp);
+        if (driver && (g_str_equal(driver, "ata_piix") ||
+                       g_str_equal(driver, "sym53c8xx") ||
+                       g_str_equal(driver, "virtio-pci") ||
+                       g_str_equal(driver, "ahci") ||
+                       g_str_equal(driver, "nvme") ||
+                       g_str_equal(driver, "xhci_hcd") ||
+                       g_str_equal(driver, "ehci-pci"))) {
+            break;
+        }
+
+        g_free(driver);
+        if (sscanf(p, "/%x:%x:%x.%x%n",
+                   pci, pci + 1, pci + 2, pci + 3, &pcilen) == 4) {
+            p += pcilen;
+            continue;
+        }
+
+        g_debug("unsupported driver or sysfs path '%s'", syspath);
+        return false;
+    }
+
+    p = strstr(syspath, "/target");
+    if (p && sscanf(p + 7, "%*u:%*u:%*u/%*u:%u:%u:%u",
+                    tgt, tgt + 1, tgt + 2) == 3) {
+        has_tgt = true;
+    }
+
+    p = strstr(syspath, "/ata");
+    if (p) {
+        q = p + 4;
+        has_ata = true;
+    } else {
+        p = strstr(syspath, "/host");
+        if (p) {
+            /* Only step past "/host" when it was actually found. */
+            q = p + 5;
+        }
+    }
+    if (p && sscanf(q, "%u", &host) == 1) {
+        has_host = true;
+        nhosts = build_hosts(syspath, p, has_ata, hosts,
+                             ARRAY_SIZE(hosts), errp);
+        if (nhosts < 0) {
+            goto cleanup;
+        }
+    }
+
+    pciaddr->domain = pci[0];
+    pciaddr->bus = pci[1];
+    pciaddr->slot = pci[2];
+    pciaddr->function = pci[3];
+
+    if (strcmp(driver, "ata_piix") == 0) {
+        /* a host per ide bus, target*:0:<unit>:0 */
+        if (!has_host || !has_tgt) {
+            g_debug("invalid sysfs path '%s' (driver '%s')", syspath, driver);
+            goto cleanup;
+        }
+        for (i = 0; i < nhosts; i++) {
+            if (host == hosts[i]) {
+                disk->bus_type = GUEST_DISK_BUS_TYPE_IDE;
+                disk->bus = i;
+                disk->unit = tgt[1];
+                break;
+            }
+        }
+        if (i >= nhosts) {
+            g_debug("no host for '%s' (driver '%s')", syspath, driver);
+            goto cleanup;
+        }
+    } else if (strcmp(driver, "sym53c8xx") == 0) {
+        /* scsi(LSI Logic): target*:0:<unit>:0 */
+        if (!has_tgt) {
+            g_debug("invalid sysfs path '%s' (driver '%s')", syspath, driver);
+            goto cleanup;
+        }
+        disk->bus_type = GUEST_DISK_BUS_TYPE_SCSI;
+        disk->unit = tgt[1];
+    } else if (strcmp(driver, "virtio-pci") == 0) {
+        if (has_tgt) {
+            /* virtio-scsi: target*:0:0:<unit> */
+            disk->bus_type = GUEST_DISK_BUS_TYPE_SCSI;
+            disk->unit = tgt[2];
+        } else {
+            /* virtio-blk: 1 disk per 1 device */
+            disk->bus_type = GUEST_DISK_BUS_TYPE_VIRTIO;
+        }
+    } else if (strcmp(driver, "ahci") == 0) {
+        /* ahci: 1 host per 1 unit */
+        if (!has_host || !has_tgt) {
+            g_debug("invalid sysfs path '%s' (driver '%s')", syspath, driver);
+            goto cleanup;
+        }
+        for (i = 0; i < nhosts; i++) {
+            if (host == hosts[i]) {
+                disk->unit = i;
+                disk->bus_type = GUEST_DISK_BUS_TYPE_SATA;
+                break;
+            }
+        }
+        if (i >= nhosts) {
+            g_debug("no host for '%s' (driver '%s')", syspath, driver);
+            goto cleanup;
+        }
+    } else if (strcmp(driver, "nvme") == 0) {
+        disk->bus_type = GUEST_DISK_BUS_TYPE_NVME;
+    } else if (strcmp(driver, "ehci-pci") == 0 ||
+               strcmp(driver, "xhci_hcd") == 0) {
+        disk->bus_type = GUEST_DISK_BUS_TYPE_USB;
+    } else {
+        g_debug("unknown driver '%s' (sysfs path '%s')", driver, syspath);
+        goto cleanup;
+    }
+
+    ret = true;
+
+cleanup:
+    g_free(driver);
+    return ret;
+}
+
+/*
+ * Store disk device info for non-PCI virtio devices (for example s390x
+ * channel I/O devices).
+ *
+ * @syspath must contain both "/virtio" and "/block".  When it also has a
+ * parsable "/target" component the disk is recorded as SCSI with its bus,
+ * target and unit numbers; otherwise it is recorded as a virtio disk.
+ *
+ * Returns true if information has been stored, or false for failure.
+ */
+static bool build_guest_fsinfo_for_nonpci_virtio(char const *syspath,
+                                                 GuestDiskAddress *disk,
+                                                 Error **errp)
+{
+    unsigned int bus_no, target_no, unit_no;
+    char *tgt_pos;
+
+    if (strstr(syspath, "/virtio") == NULL ||
+        strstr(syspath, "/block") == NULL) {
+        g_debug("Unsupported virtio device '%s'", syspath);
+        return false;
+    }
+
+    tgt_pos = strstr(syspath, "/target");
+    if (tgt_pos == NULL ||
+        sscanf(tgt_pos + 7, "%*u:%*u:%*u/%*u:%u:%u:%u",
+               &bus_no, &target_no, &unit_no) != 3) {
+        /* virtio-blk: 1 disk per 1 device */
+        disk->bus_type = GUEST_DISK_BUS_TYPE_VIRTIO;
+        return true;
+    }
+
+    /* virtio-scsi: target*:0:<target>:<unit> */
+    disk->bus_type = GUEST_DISK_BUS_TYPE_SCSI;
+    disk->bus = bus_no;
+    disk->target = target_no;
+    disk->unit = unit_no;
+    return true;
+}
+
+/*
+ * Store disk device info for CCW devices (s390x channel I/O devices).
+ *
+ * Parses cssid, ssid, subchno and devno from the "/devices/css" part of
+ * @syspath into a newly allocated @disk->ccw_address.  If the path also
+ * contains "/virtio", the non-PCI virtio helper is run on it as well; its
+ * result does not affect the return value.
+ *
+ * Returns true if information has been stored, or false for failure.
+ */
+static bool build_guest_fsinfo_for_ccw_dev(char const *syspath,
+                                           GuestDiskAddress *disk,
+                                           Error **errp)
+{
+    unsigned int id[4];
+    GuestCCWAddress *ccw;
+    char *css;
+
+    css = strstr(syspath, "/devices/css");
+    if (css == NULL ||
+        sscanf(css + 12, "%*x/%x.%x.%x/%*x.%*x.%x/",
+               &id[0], &id[1], &id[2], &id[3]) < 4) {
+        g_debug("could not parse ccw device sysfs path: %s", syspath);
+        return false;
+    }
+
+    ccw = g_new0(GuestCCWAddress, 1);
+    disk->ccw_address = ccw;
+    ccw->cssid = id[0];
+    ccw->ssid = id[1];
+    ccw->subchno = id[2];
+    ccw->devno = id[3];
+
+    if (strstr(css, "/virtio") != NULL) {
+        build_guest_fsinfo_for_nonpci_virtio(syspath, disk, errp);
+    }
+
+    return true;
+}
+
+/*
+ * Store disk device info specified by @syspath into @fs.
+ *
+ * A GuestDiskAddress is built from udev data (device node and ID_SERIAL,
+ * only with CONFIG_LIBUDEV) and from the bus-specific parsers chosen by
+ * the shape of @syspath.  It is prepended to @fs->disk when any of these
+ * produced information, and freed otherwise.
+ */
+static void build_guest_fsinfo_for_real_device(char const *syspath,
+                                               GuestFilesystemInfo *fs,
+                                               Error **errp)
+{
+    GuestPCIAddress *pci_ctrl = g_new0(GuestPCIAddress, 1);
+    GuestDiskAddress *addr = g_new0(GuestDiskAddress, 1);
+    bool have_hw_info = false;
+#ifdef CONFIG_LIBUDEV
+    struct udev *udev_ctx = NULL;
+    struct udev_device *udev_dev = NULL;
+#endif
+
+    /* -1 means field is invalid */
+    pci_ctrl->domain = -1;
+    pci_ctrl->bus = -1;
+    pci_ctrl->slot = -1;
+    pci_ctrl->function = -1;
+
+    addr->pci_controller = pci_ctrl;
+    addr->bus_type = GUEST_DISK_BUS_TYPE_UNKNOWN;
+
+#ifdef CONFIG_LIBUDEV
+    udev_ctx = udev_new();
+    udev_dev = udev_device_new_from_syspath(udev_ctx, syspath);
+    if (udev_ctx != NULL && udev_dev != NULL) {
+        const char *node = udev_device_get_devnode(udev_dev);
+        const char *serial;
+
+        if (node != NULL) {
+            addr->dev = g_strdup(node);
+        }
+        serial = udev_device_get_property_value(udev_dev, "ID_SERIAL");
+        if (serial != NULL && *serial != 0) {
+            addr->serial = g_strdup(serial);
+        }
+    } else {
+        g_debug("failed to query udev");
+    }
+
+    udev_unref(udev_ctx);
+    udev_device_unref(udev_dev);
+#endif
+
+    if (strstr(syspath, "/devices/pci")) {
+        have_hw_info = build_guest_fsinfo_for_pci_dev(syspath, addr, errp);
+    } else if (strstr(syspath, "/devices/css")) {
+        have_hw_info = build_guest_fsinfo_for_ccw_dev(syspath, addr, errp);
+    } else if (strstr(syspath, "/virtio")) {
+        have_hw_info = build_guest_fsinfo_for_nonpci_virtio(syspath, addr,
+                                                            errp);
+    } else {
+        g_debug("Unsupported device type for '%s'", syspath);
+    }
+
+    if (!have_hw_info && !addr->dev && !addr->serial) {
+        qapi_free_GuestDiskAddress(addr);
+        return;
+    }
+    QAPI_LIST_PREPEND(fs->disk, addr);
+}
+
+static void build_guest_fsinfo_for_device(char const *devpath,
+ GuestFilesystemInfo *fs,
+ Error **errp);
+
+/*
+ * Store a list of slave devices of virtual volume specified by @syspath
+ * into @fs.
+ *
+ * Every symlink found in "<syspath>/slaves" is passed to
+ * build_guest_fsinfo_for_device().  A missing "slaves" directory is not
+ * an error; other opendir()/readdir() failures and the first error from
+ * a slave device are reported through @errp.
+ */
+static void build_guest_fsinfo_for_virtual_device(char const *syspath,
+                                                  GuestFilesystemInfo *fs,
+                                                  Error **errp)
+{
+    g_autofree char *slaves_dir = g_strdup_printf("%s/slaves", syspath);
+    Error *local_err = NULL;
+    struct dirent *ent;
+    DIR *dp;
+
+    dp = opendir(slaves_dir);
+    if (dp == NULL) {
+        if (errno != ENOENT) {
+            error_setg_errno(errp, errno, "opendir(\"%s\")", slaves_dir);
+        }
+        return;
+    }
+
+    while (true) {
+        char *slave_path;
+
+        /* readdir() returns NULL both at end of directory and on error */
+        errno = 0;
+        ent = readdir(dp);
+        if (ent == NULL) {
+            if (errno) {
+                error_setg_errno(errp, errno, "readdir(\"%s\")", slaves_dir);
+            }
+            break;
+        }
+
+        if (ent->d_type != DT_LNK) {
+            continue;
+        }
+
+        g_debug(" slave device '%s'", ent->d_name);
+        slave_path = g_strdup_printf("%s/slaves/%s", syspath, ent->d_name);
+        build_guest_fsinfo_for_device(slave_path, fs, &local_err);
+        g_free(slave_path);
+
+        if (local_err) {
+            error_propagate(errp, local_err);
+            break;
+        }
+    }
+
+    closedir(dp);
+}
+
+/*
+ * Tell whether @devpath resolves (via realpath()) to a path containing
+ * "/devices/virtual/block/".
+ *
+ * Returns false with @errp set when realpath() fails.
+ */
+static bool is_disk_virtual(const char *devpath, Error **errp)
+{
+    g_autofree char *resolved = realpath(devpath, NULL);
+
+    if (resolved == NULL) {
+        error_setg_errno(errp, errno, "realpath(\"%s\")", devpath);
+        return false;
+    }
+
+    if (strstr(resolved, "/devices/virtual/block/")) {
+        return true;
+    }
+    return false;
+}
+
+/*
+ * Dispatch to functions for virtual/real device.
+ *
+ * @devpath is resolved with realpath().  @fs->name, when still unset, is
+ * set to the basename of the resolved path, or of @devpath itself if the
+ * path does not exist (ENOENT, which is not treated as an error).  Any
+ * other realpath() failure is reported through @errp.
+ */
+static void build_guest_fsinfo_for_device(char const *devpath,
+                                          GuestFilesystemInfo *fs,
+                                          Error **errp)
+{
+    ERRP_GUARD();
+    g_autofree char *resolved = NULL;
+    bool virtual_disk;
+
+    resolved = realpath(devpath, NULL);
+    if (resolved == NULL) {
+        if (errno == ENOENT) {
+            /* ENOENT: This devpath may not exist because of container config */
+            if (!fs->name) {
+                fs->name = g_path_get_basename(devpath);
+            }
+        } else {
+            error_setg_errno(errp, errno, "realpath(\"%s\")", devpath);
+        }
+        return;
+    }
+
+    if (!fs->name) {
+        fs->name = g_path_get_basename(resolved);
+    }
+
+    g_debug(" parse sysfs path '%s'", resolved);
+    virtual_disk = is_disk_virtual(resolved, errp);
+    if (*errp != NULL) {
+        return;
+    }
+
+    if (!virtual_disk) {
+        build_guest_fsinfo_for_real_device(resolved, fs, errp);
+        return;
+    }
+    build_guest_fsinfo_for_virtual_device(resolved, fs, errp);
+}
+
+#ifdef CONFIG_LIBUDEV
+
+/*
+ * Wrapper around build_guest_fsinfo_for_device() for getting just
+ * the disk address.
+ *
+ * Returns the first GuestDiskAddress collected for @syspath, with
+ * ownership passed to the caller, or NULL when none was collected.
+ */
+static GuestDiskAddress *get_disk_address(const char *syspath, Error **errp)
+{
+    g_autoptr(GuestFilesystemInfo) scratch = g_new0(GuestFilesystemInfo, 1);
+
+    build_guest_fsinfo_for_device(syspath, scratch, errp);
+    if (scratch->disk == NULL) {
+        return NULL;
+    }
+
+    /* Detach the address so freeing the scratch object leaves it alive */
+    return g_steal_pointer(&scratch->disk->value);
+}
+
+/*
+ * Return a newly allocated copy of the udev "DM_NAME" property of the
+ * device at @syspath, or NULL when udev cannot be queried, the property
+ * is missing, or it is an empty string.
+ */
+static char *get_alias_for_syspath(const char *syspath)
+{
+    struct udev *ctx = udev_new();
+    struct udev_device *dev = NULL;
+    char *result = NULL;
+
+    if (ctx != NULL) {
+        dev = udev_device_new_from_syspath(ctx, syspath);
+        if (dev != NULL) {
+            const char *alias = udev_device_get_property_value(dev,
+                                                               "DM_NAME");
+            /*
+             * NULL means there was an error and empty string means there
+             * is no alias. In case of no alias we return NULL instead of
+             * empty string.
+             */
+            if (alias == NULL) {
+                g_debug("failed to query udev for device alias for: %s",
+                        syspath);
+            } else if (*alias != 0) {
+                result = g_strdup(alias);
+            }
+        } else {
+            g_debug("failed to query udev for path: %s", syspath);
+        }
+    } else {
+        g_debug("failed to query udev");
+    }
+
+    udev_unref(ctx);
+    udev_device_unref(dev);
+    return result;
+}
+
+/*
+ * Return a newly allocated copy of the udev device node of the device at
+ * @syspath, or NULL when udev cannot be queried or reports no node.
+ */
+static char *get_device_for_syspath(const char *syspath)
+{
+    struct udev *ctx = udev_new();
+    struct udev_device *dev = NULL;
+    char *node = NULL;
+
+    if (ctx != NULL) {
+        dev = udev_device_new_from_syspath(ctx, syspath);
+        if (dev != NULL) {
+            node = g_strdup(udev_device_get_devnode(dev));
+        } else {
+            g_debug("failed to query udev for path: %s", syspath);
+        }
+    } else {
+        g_debug("failed to query udev");
+    }
+
+    udev_unref(ctx);
+    udev_device_unref(dev);
+    return node;
+}
+
+/*
+ * Fill @disk->dependencies with the device nodes of the entries found in
+ * "<disk_dir>/slaves".
+ *
+ * When that directory cannot be opened, @disk is left untouched.
+ * Otherwise has_dependencies is set, even if no entry yields a node.
+ */
+static void get_disk_deps(const char *disk_dir, GuestDiskInfo *disk)
+{
+    g_autofree char *slaves_dir = g_strdup_printf("%s/slaves", disk_dir);
+    const gchar *entry;
+    GDir *slaves;
+
+    /* List dependent disks */
+    g_debug(" listing entries in: %s", slaves_dir);
+    slaves = g_dir_open(slaves_dir, 0, NULL);
+    if (slaves == NULL) {
+        g_debug("failed to list entries in %s", slaves_dir);
+        return;
+    }
+
+    disk->has_dependencies = true;
+    for (entry = g_dir_read_name(slaves);
+         entry != NULL;
+         entry = g_dir_read_name(slaves)) {
+        g_autofree char *entry_path = g_strdup_printf("%s/%s",
+                                                      slaves_dir, entry);
+        char *node = get_device_for_syspath(entry_path);
+
+        /* Add dependent disks */
+        if (node == NULL) {
+            continue;
+        }
+        g_debug(" adding dependent device: %s", node);
+        QAPI_LIST_PREPEND(disk->dependencies, node);
+    }
+    g_dir_close(slaves);
+}
+
+/*
+ * Detect partitions subdirectory, name is "<disk_name><number>" or
+ * "<disk_name>p<number>"
+ *
+ * @list -- list to which the detected partitions are prepended
+ * @disk_name -- last component of /sys path (e.g. sda)
+ * @disk_dir -- sys path of the disk (e.g. /sys/block/sda)
+ * @disk_dev -- device node of the disk (e.g. /dev/sda)
+ *
+ * Returns the new head of the list.  When @disk_dir cannot be opened,
+ * @list is returned unchanged.
+ */
+static GuestDiskInfoList *get_disk_partitions(
+    GuestDiskInfoList *list,
+    const char *disk_name, const char *disk_dir,
+    const char *disk_dev)
+{
+    GuestDiskInfoList *ret = list;
+    struct dirent *de_disk;
+    DIR *dp_disk = NULL;
+    size_t len = strlen(disk_name);
+
+    dp_disk = opendir(disk_dir);
+    if (dp_disk == NULL) {
+        /* readdir(NULL) would crash; treat as "no partitions found" */
+        g_debug("failed to list entries in %s: %s",
+                disk_dir, g_strerror(errno));
+        return ret;
+    }
+
+    while ((de_disk = readdir(dp_disk)) != NULL) {
+        g_autofree char *partition_dir = NULL;
+        const char *suffix;
+        char *dev_name;
+        GuestDiskInfo *partition;
+
+        if (!(de_disk->d_type & DT_DIR)) {
+            continue;
+        }
+
+        /* The entry must be the disk name, an optional 'p', then a digit */
+        if (strncmp(disk_name, de_disk->d_name, len) != 0) {
+            continue;
+        }
+        suffix = de_disk->d_name + len;
+        if (*suffix == 'p') {
+            suffix++;
+        }
+        /* <ctype.h> functions need an unsigned char value */
+        if (!isdigit((unsigned char)*suffix)) {
+            continue;
+        }
+
+        partition_dir = g_strdup_printf("%s/%s",
+                                        disk_dir, de_disk->d_name);
+        dev_name = get_device_for_syspath(partition_dir);
+        if (dev_name == NULL) {
+            g_debug("Failed to get device name for syspath: %s",
+                    partition_dir);
+            continue;
+        }
+        partition = g_new0(GuestDiskInfo, 1);
+        partition->name = dev_name;
+        partition->partition = true;
+        partition->has_dependencies = true;
+        /* Add parent disk as dependent for easier tracking of hierarchy */
+        QAPI_LIST_PREPEND(partition->dependencies, g_strdup(disk_dev));
+
+        QAPI_LIST_PREPEND(ret, partition);
+    }
+    closedir(dp_disk);
+
+    return ret;
+}
+
+/*
+ * Read the NVMe SMART log of @disk and store it in @disk->smart.
+ *
+ * The device node named by @disk->name is opened read-only and a Get Log
+ * Page admin command is issued through NVME_IOCTL_ADMIN_CMD.  When the
+ * open or the ioctl fails, a debug message is logged and @disk->smart is
+ * left untouched.
+ */
+static void get_nvme_smart(GuestDiskInfo *disk)
+{
+    int fd;
+    GuestNVMeSmart *smart;
+    NvmeSmartLog log = {0};
+    /*
+     * cdw10 carries the log page id in its low bits and, from bit 16 up,
+     * the size of the log buffer in 4-byte units minus one.
+     */
+    struct nvme_admin_cmd cmd = {
+        .opcode = NVME_ADM_CMD_GET_LOG_PAGE,
+        .nsid = NVME_NSID_BROADCAST,
+        .addr = (uintptr_t)&log,
+        .data_len = sizeof(log),
+        .cdw10 = NVME_LOG_SMART_INFO | (1 << 15) /* RAE bit */
+                 | (((sizeof(log) >> 2) - 1) << 16)
+    };
+
+    fd = qga_open_cloexec(disk->name, O_RDONLY, 0);
+    if (fd == -1) {
+        g_debug("Failed to open device: %s: %s", disk->name, g_strerror(errno));
+        return;
+    }
+
+    if (ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd)) {
+        /* errno is formatted before close() gets a chance to change it */
+        g_debug("Failed to get smart: %s: %s", disk->name, g_strerror(errno));
+        close(fd);
+        return;
+    }
+
+    disk->smart = g_new0(GuestDiskSmart, 1);
+    disk->smart->type = GUEST_DISK_BUS_TYPE_NVME;
+
+    /*
+     * Copy the log field by field.  Each two-element counter is split
+     * into a _lo ([0]) and a _hi ([1]) output field, with every element
+     * converted by le64_to_cpu().
+     */
+    smart = &disk->smart->u.nvme;
+    smart->critical_warning = log.critical_warning;
+    smart->temperature = lduw_le_p(&log.temperature); /* unaligned field */
+    smart->available_spare = log.available_spare;
+    smart->available_spare_threshold = log.available_spare_threshold;
+    smart->percentage_used = log.percentage_used;
+    smart->data_units_read_lo = le64_to_cpu(log.data_units_read[0]);
+    smart->data_units_read_hi = le64_to_cpu(log.data_units_read[1]);
+    smart->data_units_written_lo = le64_to_cpu(log.data_units_written[0]);
+    smart->data_units_written_hi = le64_to_cpu(log.data_units_written[1]);
+    smart->host_read_commands_lo = le64_to_cpu(log.host_read_commands[0]);
+    smart->host_read_commands_hi = le64_to_cpu(log.host_read_commands[1]);
+    smart->host_write_commands_lo = le64_to_cpu(log.host_write_commands[0]);
+    smart->host_write_commands_hi = le64_to_cpu(log.host_write_commands[1]);
+    smart->controller_busy_time_lo = le64_to_cpu(log.controller_busy_time[0]);
+    smart->controller_busy_time_hi = le64_to_cpu(log.controller_busy_time[1]);
+    smart->power_cycles_lo = le64_to_cpu(log.power_cycles[0]);
+    smart->power_cycles_hi = le64_to_cpu(log.power_cycles[1]);
+    smart->power_on_hours_lo = le64_to_cpu(log.power_on_hours[0]);
+    smart->power_on_hours_hi = le64_to_cpu(log.power_on_hours[1]);
+    smart->unsafe_shutdowns_lo = le64_to_cpu(log.unsafe_shutdowns[0]);
+    smart->unsafe_shutdowns_hi = le64_to_cpu(log.unsafe_shutdowns[1]);
+    smart->media_errors_lo = le64_to_cpu(log.media_errors[0]);
+    smart->media_errors_hi = le64_to_cpu(log.media_errors[1]);
+    smart->number_of_error_log_entries_lo =
+        le64_to_cpu(log.number_of_error_log_entries[0]);
+    smart->number_of_error_log_entries_hi =
+        le64_to_cpu(log.number_of_error_log_entries[1]);
+
+    close(fd);
+}
+
+/*
+ * Collect SMART data for @disk.  Only disks whose address is known and
+ * whose bus type is NVMe are handled.
+ */
+static void get_disk_smart(GuestDiskInfo *disk)
+{
+    if (!disk->address) {
+        return;
+    }
+
+    if (disk->address->bus_type == GUEST_DISK_BUS_TYPE_NVME) {
+        get_nvme_smart(disk);
+    }
+}
+
+/*
+ * guest-get-disks: list the disks found under /sys/block together with
+ * their partitions.
+ *
+ * Entries that are not symlinks, whose size cannot be read, whose size is
+ * "0", or for which no device node is found are skipped.  Errors while
+ * determining a disk's address are logged and ignored.  Returns NULL
+ * with @errp set only when /sys/block itself cannot be opened.
+ */
+GuestDiskInfoList *qmp_guest_get_disks(Error **errp)
+{
+    GuestDiskInfoList *disks = NULL;
+    struct dirent *ent;
+    DIR *sysblock;
+
+    g_debug("listing /sys/block directory");
+    sysblock = opendir("/sys/block");
+    if (sysblock == NULL) {
+        error_setg_errno(errp, errno, "Can't open directory \"/sys/block\"");
+        return NULL;
+    }
+
+    for (ent = readdir(sysblock); ent != NULL; ent = readdir(sysblock)) {
+        g_autofree char *disk_dir = NULL;
+        g_autofree char *size_text = NULL;
+        g_autofree char *size_path = NULL;
+        Error *local_err = NULL;
+        GuestDiskInfo *disk;
+        char *dev_name;
+        bool want_address;
+
+        if (ent->d_type != DT_LNK) {
+            g_debug(" skipping entry: %s", ent->d_name);
+            continue;
+        }
+
+        /* Check size and skip zero-sized disks */
+        g_debug(" checking disk size");
+        size_path = g_strdup_printf("/sys/block/%s/size", ent->d_name);
+        if (!g_file_get_contents(size_path, &size_text, NULL, NULL)) {
+            g_debug(" failed to read disk size");
+            continue;
+        }
+        if (g_strcmp0(size_text, "0\n") == 0) {
+            g_debug(" skipping zero-sized disk");
+            continue;
+        }
+
+        g_debug(" adding %s", ent->d_name);
+        disk_dir = g_strdup_printf("/sys/block/%s", ent->d_name);
+        dev_name = get_device_for_syspath(disk_dir);
+        if (dev_name == NULL) {
+            g_debug("Failed to get device name for syspath: %s",
+                    disk_dir);
+            continue;
+        }
+
+        disk = g_new0(GuestDiskInfo, 1);
+        disk->name = dev_name;
+        disk->partition = false;
+        disk->alias = get_alias_for_syspath(disk_dir);
+        QAPI_LIST_PREPEND(disks, disk);
+
+        /* Get address for non-virtual devices */
+        want_address = !is_disk_virtual(disk_dir, &local_err);
+        if (local_err != NULL) {
+            g_debug(" failed to check disk path, ignoring error: %s",
+                    error_get_pretty(local_err));
+            error_free(local_err);
+            local_err = NULL;
+            /* Don't try to get the address */
+            want_address = false;
+        }
+        if (want_address) {
+            disk->address = get_disk_address(disk_dir, &local_err);
+            if (local_err != NULL) {
+                g_debug(" failed to get device info, ignoring error: %s",
+                        error_get_pretty(local_err));
+                error_free(local_err);
+                local_err = NULL;
+            }
+        }
+
+        get_disk_deps(disk_dir, disk);
+        get_disk_smart(disk);
+        disks = get_disk_partitions(disks, ent->d_name, disk_dir, dev_name);
+    }
+
+    closedir(sysblock);
+
+    return disks;
+}
+
+#endif
+
+/*
+ * Return a list of the disk device(s)' info which @mount lies on.
+ *
+ * The returned GuestFilesystemInfo always carries the mount point and
+ * filesystem type.  Disk information is added from
+ * "/sys/dev/block/<major>:<minor>", and errors from that step are
+ * reported through @errp while the object is still returned.  Usage
+ * figures are filled in only when statvfs() on the mount point succeeds.
+ */
+static GuestFilesystemInfo *build_guest_fsinfo(struct FsMount *mount,
+                                               Error **errp)
+{
+    GuestFilesystemInfo *fs = g_malloc0(sizeof(*fs));
+    struct statvfs buf;
+    /*
+     * 64-bit arithmetic: unsigned long is 32 bits wide on ILP32 hosts,
+     * where blocks * fragment size would wrap above 4 GiB.
+     */
+    uint64_t used, nonroot_total, fr_size;
+    char *devpath = g_strdup_printf("/sys/dev/block/%u:%u",
+                                    mount->devmajor, mount->devminor);
+
+    fs->mountpoint = g_strdup(mount->dirname);
+    fs->type = g_strdup(mount->devtype);
+    build_guest_fsinfo_for_device(devpath, fs, errp);
+
+    if (statvfs(fs->mountpoint, &buf) == 0) {
+        fr_size = buf.f_frsize;
+        used = buf.f_blocks - buf.f_bfree;
+        /* f_bavail counts only the blocks available to non-root users */
+        nonroot_total = used + buf.f_bavail;
+        fs->used_bytes = used * fr_size;
+        fs->total_bytes = nonroot_total * fr_size;
+        fs->total_bytes_privileged = (uint64_t)buf.f_blocks * fr_size;
+
+        fs->has_total_bytes = true;
+        fs->has_total_bytes_privileged = true;
+        fs->has_used_bytes = true;
+    }
+
+    g_free(devpath);
+
+    return fs;
+}
+
+/*
+ * guest-get-fsinfo: build one GuestFilesystemInfo per mounted filesystem.
+ *
+ * Returns NULL with @errp set when the mount list cannot be built or when
+ * collecting the information for any mount fails.
+ */
+GuestFilesystemInfoList *qmp_guest_get_fsinfo(Error **errp)
+{
+    GuestFilesystemInfoList *result = NULL;
+    Error *local_err = NULL;
+    struct FsMount *mnt;
+    FsMountList mount_list;
+
+    QTAILQ_INIT(&mount_list);
+    if (!build_fs_mount_list(&mount_list, &local_err)) {
+        error_propagate(errp, local_err);
+        return NULL;
+    }
+
+    QTAILQ_FOREACH(mnt, &mount_list, next) {
+        GuestFilesystemInfo *info;
+
+        g_debug("Building guest fsinfo for '%s'", mnt->dirname);
+
+        /* Prepend before checking so the entry is freed with the list */
+        info = build_guest_fsinfo(mnt, &local_err);
+        QAPI_LIST_PREPEND(result, info);
+        if (!local_err) {
+            continue;
+        }
+
+        error_propagate(errp, local_err);
+        qapi_free_GuestFilesystemInfoList(result);
+        result = NULL;
+        break;
+    }
+
+    free_fs_mount_list(&mount_list);
+    return result;
+}
+#endif /* CONFIG_FSFREEZE */
+
+#if defined(CONFIG_FSTRIM)
+/*
+ * Walk list of mounted file systems in the guest, and trim them.
+ *
+ * One result entry is produced per mount.  Failures to open or trim a
+ * mount are recorded in that entry's error string and do not abort the
+ * walk.  Returns NULL with @errp set only when the mount list cannot be
+ * built.
+ */
+GuestFilesystemTrimResponse *
+qmp_guest_fstrim(bool has_minimum, int64_t minimum, Error **errp)
+{
+    GuestFilesystemTrimResponse *response;
+    struct FsMount *mnt;
+    FsMountList mount_list;
+
+    slog("guest-fstrim called");
+
+    QTAILQ_INIT(&mount_list);
+    if (!build_fs_mount_list(&mount_list, errp)) {
+        return NULL;
+    }
+
+    response = g_malloc0(sizeof(*response));
+
+    QTAILQ_FOREACH(mnt, &mount_list, next) {
+        GuestFilesystemTrimResult *entry = g_malloc0(sizeof(*entry));
+        struct fstrim_range range;
+        int fd;
+
+        entry->path = g_strdup(mnt->dirname);
+        QAPI_LIST_PREPEND(response->paths, entry);
+
+        fd = qga_open_cloexec(mnt->dirname, O_RDONLY, 0);
+        if (fd == -1) {
+            entry->error = g_strdup_printf("failed to open: %s",
+                                           strerror(errno));
+            continue;
+        }
+
+        /* We try to cull filesystems we know won't work in advance, but other
+         * filesystems may not implement fstrim for less obvious reasons.
+         * These will report EOPNOTSUPP; while in some other cases ENOTTY
+         * will be reported (e.g. CD-ROMs).
+         * Any other error means an unexpected error.
+         */
+        range.start = 0;
+        range.len = -1;
+        range.minlen = has_minimum ? minimum : 0;
+        if (ioctl(fd, FITRIM, &range) == -1) {
+            if (errno == ENOTTY || errno == EOPNOTSUPP) {
+                entry->error = g_strdup("trim not supported");
+            } else {
+                entry->error = g_strdup_printf("failed to trim: %s",
+                                               strerror(errno));
+            }
+        } else {
+            entry->has_minimum = true;
+            entry->minimum = range.minlen;
+            entry->has_trimmed = true;
+            entry->trimmed = range.len;
+        }
+        close(fd);
+    }
+
+    free_fs_mount_list(&mount_list);
+    return response;
+}
+#endif /* CONFIG_FSTRIM */
+
+#define LINUX_SYS_STATE_FILE "/sys/power/state"
+#define SUSPEND_SUPPORTED 0
+#define SUSPEND_NOT_SUPPORTED 1
+
+typedef enum {
+ SUSPEND_MODE_DISK = 0,
+ SUSPEND_MODE_RAM = 1,
+ SUSPEND_MODE_HYBRID = 2,
+} SuspendMode;
+
+/*
+ * Run @command in a child process with g_spawn_sync(), searching PATH and
+ * discarding the child's stdout and stderr.
+ *
+ * Returns the child's exit status (an int >= 0) when the process could be
+ * spawned.
+ *
+ * If the program wasn't found in path, returns -1 and leaves errp alone.
+ *
+ * If any other problem happened when creating the child process,
+ * returns -1 and errp is set.
+ */
+static int run_process_child(const char *command[], Error **errp)
+{
+    const int flags = G_SPAWN_SEARCH_PATH | G_SPAWN_STDOUT_TO_DEV_NULL |
+                      G_SPAWN_STDERR_TO_DEV_NULL;
+    GError *spawn_err = NULL;
+    int wait_status;
+
+    if (g_spawn_sync(NULL, (char **)command, NULL, flags,
+                     NULL, NULL, NULL, NULL,
+                     &wait_status, &spawn_err)) {
+        return WEXITSTATUS(wait_status);
+    }
+
+    /* A missing program is reported by the return value alone */
+    if (spawn_err && (spawn_err->code != G_SPAWN_ERROR_NOENT)) {
+        error_setg(errp, "failed to create child process, error '%s'",
+                   spawn_err->message);
+    }
+
+    g_error_free(spawn_err);
+    return -1;
+}
+
+/*
+ * Ask "systemctl status" about the systemd unit that implements @mode and
+ * report whether the suspend mode looks supported.
+ */
+static bool systemd_supports_mode(SuspendMode mode, Error **errp)
+{
+    const char *units[3] = {"systemd-hibernate", "systemd-suspend",
+                            "systemd-hybrid-sleep"};
+    const char *argv[4] = {"systemctl", "status", units[mode], NULL};
+    int rc = run_process_child(argv, errp);
+
+    /*
+     * systemctl status uses LSB return codes so we can expect
+     * status > 0 and be ok. To assert if the guest has support
+     * for the selected suspend mode, status should be < 4. 4 is
+     * the code for unknown service status, the return value when
+     * the service does not exist. A common value is status = 3
+     * (program is not running).
+     */
+    return rc > 0 && rc < 4;
+}
+
+/*
+ * Suspend the guest by running "systemctl <verb>" for @mode.
+ *
+ * @errp is set when the child cannot be spawned, systemctl is not found,
+ * or it exits with a non-zero status.
+ */
+static void systemd_suspend(SuspendMode mode, Error **errp)
+{
+    const char *verbs[3] = {"hibernate", "suspend", "hybrid-sleep"};
+    const char *argv[3] = {"systemctl", verbs[mode], NULL};
+    Error *local_err = NULL;
+    int rc;
+
+    rc = run_process_child(argv, &local_err);
+    if (rc == 0) {
+        return;
+    }
+
+    if (local_err) {
+        /* spawning the child failed for a reason other than "not found" */
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    if (rc == -1) {
+        error_setg(errp, "the helper program 'systemctl %s' was not found",
+                   verbs[mode]);
+        return;
+    }
+
+    error_setg(errp, "the helper program 'systemctl %s' returned an "
+               "unexpected exit status code (%d)",
+               verbs[mode], rc);
+}
+
+/*
+ * Run "pm-is-supported" with the option matching @mode.
+ *
+ * Returns true when it exits with SUSPEND_SUPPORTED.  Returns false
+ * without an error when the program is not found, and false with @errp
+ * set for spawn failures or any other exit status.
+ */
+static bool pmutils_supports_mode(SuspendMode mode, Error **errp)
+{
+    const char *options[3] = {"--hibernate", "--suspend",
+                              "--suspend-hybrid"};
+    const char *argv[3] = {"pm-is-supported", options[mode], NULL};
+    Error *local_err = NULL;
+    int rc;
+
+    rc = run_process_child(argv, &local_err);
+    if (rc == SUSPEND_SUPPORTED) {
+        return true;
+    }
+
+    if (local_err) {
+        error_propagate(errp, local_err);
+    } else if (rc != -1) {
+        error_setg(errp,
+                   "the helper program '%s' returned an unexpected exit"
+                   " status code (%d)", "pm-is-supported", rc);
+    }
+
+    return false;
+}
+
+/*
+ * Suspend the guest by running the pm-utils binary matching @mode.
+ *
+ * @errp is set when the child cannot be spawned, the binary is not found,
+ * or it exits with a non-zero status.
+ */
+static void pmutils_suspend(SuspendMode mode, Error **errp)
+{
+    const char *binaries[3] = {"pm-hibernate", "pm-suspend",
+                               "pm-suspend-hybrid"};
+    const char *argv[2] = {binaries[mode], NULL};
+    Error *local_err = NULL;
+    int rc;
+
+    rc = run_process_child(argv, &local_err);
+    if (rc == 0) {
+        return;
+    }
+
+    if (local_err) {
+        /* spawning the child failed for a reason other than "not found" */
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    if (rc == -1) {
+        error_setg(errp, "the helper program '%s' was not found",
+                   binaries[mode]);
+        return;
+    }
+
+    error_setg(errp,
+               "the helper program '%s' returned an unexpected exit"
+               " status code (%d)", binaries[mode], rc);
+}
+
+/*
+ * Check whether LINUX_SYS_STATE_FILE advertises the keyword that
+ * corresponds to @mode.
+ *
+ * Returns true when the keyword occurs in the first 31 bytes of the file.
+ * Returns false when the file cannot be opened or read, or when the
+ * keyword is absent; @errp is only set when @mode has no keyword.
+ */
+static bool linux_sys_state_supports_mode(SuspendMode mode, Error **errp)
+{
+    /* indexed by @mode; the last mode has no sysfs keyword */
+    const char *sysfile_strs[3] = {"disk", "mem", NULL};
+    const char *keyword = sysfile_strs[mode];
+    char contents[32]; /* hopefully big enough */
+    ssize_t len;
+    int fd;
+
+    if (keyword == NULL) {
+        error_setg(errp, "unknown guest suspend mode");
+        return false;
+    }
+
+    fd = open(LINUX_SYS_STATE_FILE, O_RDONLY);
+    if (fd < 0) {
+        return false;
+    }
+
+    /* leave room for the terminator added below */
+    len = read(fd, contents, sizeof(contents) - 1);
+    close(fd);
+    if (len <= 0) {
+        return false;
+    }
+    contents[len] = '\0';
+
+    return strstr(contents, keyword) != NULL;
+}
+
+/*
+ * Suspend the guest by writing the keyword that corresponds to @mode into
+ * LINUX_SYS_STATE_FILE.
+ *
+ * @errp is set when @mode has no keyword or when the write fails.
+ */
+static void linux_sys_state_suspend(SuspendMode mode, Error **errp)
+{
+    /* indexed by @mode; the last mode has no sysfs keyword */
+    const char *sysfile_strs[3] = {"disk", "mem", NULL};
+    const char *keyword = sysfile_strs[mode];
+    g_autoptr(GError) write_err = NULL;
+
+    if (keyword == NULL) {
+        error_setg(errp, "unknown guest suspend mode");
+        return;
+    }
+
+    if (g_file_set_contents(LINUX_SYS_STATE_FILE, keyword, -1, &write_err)) {
+        return;
+    }
+
+    error_setg(errp, "suspend: cannot write to '%s': %s",
+               LINUX_SYS_STATE_FILE, write_err->message);
+}
+
+/*
+ * Suspend the guest in the given @mode, trying the available back ends in
+ * a fixed order: systemd, then pm-utils, then the sysfs state file.
+ *
+ * The first back end that both supports the mode and suspends without
+ * error ends the function.  Errors from the systemd and pm-utils attempts
+ * are discarded before moving on to the next back end.  If no back end
+ * claimed support for @mode, @errp is set to a "not supported" error;
+ * otherwise @errp receives whatever error is left from the sysfs attempt
+ * (possibly none).
+ */
+static void guest_suspend(SuspendMode mode, Error **errp)
+{
+    Error *err = NULL;
+    bool have_backend = false;
+
+    /* first choice: systemd */
+    if (systemd_supports_mode(mode, &err)) {
+        have_backend = true;
+        systemd_suspend(mode, &err);
+        if (err == NULL) {
+            return;
+        }
+    }
+    error_free(err);
+    err = NULL;
+
+    /* second choice: pm-utils */
+    if (pmutils_supports_mode(mode, &err)) {
+        have_backend = true;
+        pmutils_suspend(mode, &err);
+        if (err == NULL) {
+            return;
+        }
+    }
+    error_free(err);
+    err = NULL;
+
+    /* last resort: write to the sysfs state file directly */
+    if (linux_sys_state_supports_mode(mode, &err)) {
+        have_backend = true;
+        linux_sys_state_suspend(mode, &err);
+    }
+
+    if (have_backend) {
+        error_propagate(errp, err);
+        return;
+    }
+
+    error_free(err);
+    error_setg(errp,
+               "the requested suspend mode is not supported by the guest");
+}
+
+/*
+ * QMP entry point: request SUSPEND_MODE_DISK from guest_suspend().
+ * Failures are reported through @errp.
+ */
+void qmp_guest_suspend_disk(Error **errp)
+{
+ guest_suspend(SUSPEND_MODE_DISK, errp);
+}
+
+/*
+ * QMP entry point: request SUSPEND_MODE_RAM from guest_suspend().
+ * Failures are reported through @errp.
+ */
+void qmp_guest_suspend_ram(Error **errp)
+{
+ guest_suspend(SUSPEND_MODE_RAM, errp);
+}
+
+/*
+ * QMP entry point: request SUSPEND_MODE_HYBRID from guest_suspend().
+ * Failures are reported through @errp.
+ */
+void qmp_guest_suspend_hybrid(Error **errp)
+{
+ guest_suspend(SUSPEND_MODE_HYBRID, errp);
+}
+
+/* Transfer online/offline status between @vcpu and the guest system.
+ *
+ * @dirpath is the directory holding the processor's "online" attribute.
+ * @sys2vcpu selects the direction: true copies the system state into
+ * @vcpu, false applies @vcpu->online to the system.
+ *
+ * On input either @errp or *@errp must be NULL.
+ *
+ * In system-to-@vcpu direction, the following @vcpu fields are accessed:
+ * - R: vcpu->logical_id
+ * - W: vcpu->online
+ * - W: vcpu->can_offline
+ *
+ * In @vcpu-to-system direction, the following @vcpu fields are accessed:
+ * - R: vcpu->logical_id
+ * - R: vcpu->online
+ *
+ * Written members remain unmodified on error.
+ */
+static void transfer_vcpu(GuestLogicalProcessor *vcpu, bool sys2vcpu,
+ char *dirpath, Error **errp)
+{
+ int fd;
+ int res;
+ int dirfd;
+ static const char fn[] = "online";
+
+ dirfd = open(dirpath, O_RDONLY | O_DIRECTORY);
+ if (dirfd == -1) {
+ error_setg_errno(errp, errno, "open(\"%s\")", dirpath);
+ return;
+ }
+
+ /* read-only is enough when only querying the current state */
+ fd = openat(dirfd, fn, sys2vcpu ? O_RDONLY : O_RDWR);
+ if (fd == -1) {
+ /*
+ * A missing "online" attribute is not an error by itself: the
+ * processor is reported as online and not offlinable, and only a
+ * request to take it offline fails.
+ */
+ if (errno != ENOENT) {
+ error_setg_errno(errp, errno, "open(\"%s/%s\")", dirpath, fn);
+ } else if (sys2vcpu) {
+ vcpu->online = true;
+ vcpu->can_offline = false;
+ } else if (!vcpu->online) {
+ error_setg(errp, "logical processor #%" PRId64 " can't be "
+ "offlined", vcpu->logical_id);
+ } /* otherwise pretend successful re-onlining */
+ } else {
+ unsigned char status;
+
+ /* the first byte of the attribute is '0' when offline */
+ res = pread(fd, &status, 1, 0);
+ if (res == -1) {
+ error_setg_errno(errp, errno, "pread(\"%s/%s\")", dirpath, fn);
+ } else if (res == 0) {
+ error_setg(errp, "pread(\"%s/%s\"): unexpected EOF", dirpath,
+ fn);
+ } else if (sys2vcpu) {
+ vcpu->online = (status != '0');
+ vcpu->can_offline = true;
+ } else if (vcpu->online != (status != '0')) {
+ /* only write when the requested state differs from the current one */
+ status = '0' + vcpu->online;
+ if (pwrite(fd, &status, 1, 0) == -1) {
+ error_setg_errno(errp, errno, "pwrite(\"%s/%s\")", dirpath,
+ fn);
+ }
+ } /* otherwise pretend successful re-(on|off)-lining */
+
+ res = close(fd);
+ g_assert(res == 0);
+ }
+
+ res = close(dirfd);
+ g_assert(res == 0);
+}
+
+/*
+ * Build the list of logical processors from the "cpuN" entries found in
+ * /sys/devices/system/cpu, querying each one with transfer_vcpu().
+ *
+ * Returns the list on success.  On failure returns NULL and sets @errp:
+ * either the directory could not be opened, or transfer_vcpu() failed for
+ * one of the entries (entries collected so far are freed).
+ */
+GuestLogicalProcessorList *qmp_guest_get_vcpus(Error **errp)
+{
+    GuestLogicalProcessorList *head = NULL, **tail = &head;
+    const char *cpu_dir = "/sys/devices/system/cpu";
+    const gchar *line;
+    g_autoptr(GDir) cpu_gdir = NULL;
+    Error *local_err = NULL;
+
+    cpu_gdir = g_dir_open(cpu_dir, 0, NULL);
+    if (cpu_gdir == NULL) {
+        error_setg_errno(errp, errno, "failed to list entries: %s", cpu_dir);
+        return NULL;
+    }
+
+    while (local_err == NULL && (line = g_dir_read_name(cpu_gdir)) != NULL) {
+        g_autofree char *path = NULL;
+        GuestLogicalProcessor *vcpu;
+        int64_t id;
+
+        /*
+         * Only a return value of 1 means @id was stored.  sscanf() returns
+         * 0 when the number is missing (e.g. "cpufreq") but EOF, which is
+         * non-zero, when the name ends before the "cpu" prefix is matched.
+         */
+        if (sscanf(line, "cpu%" SCNd64, &id) != 1) {
+            continue;
+        }
+
+        path = g_strdup_printf("/sys/devices/system/cpu/"
+                               "cpu%" PRId64 "/", id);
+        vcpu = g_malloc0(sizeof *vcpu);
+        vcpu->logical_id = id;
+        vcpu->has_can_offline = true; /* lolspeak ftw */
+        transfer_vcpu(vcpu, true, path, &local_err);
+        /* appended even on failure so that freeing the list frees @vcpu */
+        QAPI_LIST_APPEND(tail, vcpu);
+    }
+
+    if (local_err == NULL) {
+        /* there's no guest with zero VCPUs */
+        g_assert(head != NULL);
+        return head;
+    }
+
+    qapi_free_GuestLogicalProcessorList(head);
+    error_propagate(errp, local_err);
+    return NULL;
+}
+
+/*
+ * Apply the requested online state of every entry in @vcpus, in list
+ * order, stopping at the first entry for which transfer_vcpu() fails.
+ *
+ * Returns the number of entries processed successfully.  @errp is set
+ * only when the very first entry fails; a failure after at least one
+ * success is dropped and is visible to the caller only through the
+ * returned count.
+ */
+int64_t qmp_guest_set_vcpus(GuestLogicalProcessorList *vcpus, Error **errp)
+{
+    GuestLogicalProcessorList *entry;
+    Error *local_err = NULL;
+    int64_t processed = 0;
+
+    for (entry = vcpus; entry != NULL; entry = entry->next) {
+        g_autofree char *path =
+            g_strdup_printf("/sys/devices/system/cpu/cpu%" PRId64 "/",
+                            entry->value->logical_id);
+
+        transfer_vcpu(entry->value, false, path, &local_err);
+        if (local_err != NULL) {
+            if (processed == 0) {
+                error_propagate(errp, local_err);
+            } else {
+                error_free(local_err);
+            }
+            break;
+        }
+        processed++;
+    }
+
+    return processed;
+}
+
+
+/*
+ * Read up to @size bytes from the start of @pathname (resolved relative
+ * to @dirfd) into @buf.  @buf is NOT NUL-terminated by this function.
+ *
+ * @errp is set when the file cannot be opened, when pread() fails, or
+ * when the file is empty.  On return errno holds the value left by the
+ * failing openat()/pread() call (0 if neither failed), so callers may
+ * inspect it, e.g. to recognise ENOENT.
+ */
+static void ga_read_sysfs_file(int dirfd, const char *pathname, char *buf,
+                               int size, Error **errp)
+{
+    int fd;
+    int saved_errno;
+    ssize_t res;
+
+    errno = 0;
+    fd = openat(dirfd, pathname, O_RDONLY);
+    if (fd == -1) {
+        saved_errno = errno;
+        error_setg_errno(errp, saved_errno, "open sysfs file \"%s\"",
+                         pathname);
+        errno = saved_errno;
+        return;
+    }
+
+    res = pread(fd, buf, size, 0);
+    saved_errno = errno;
+    if (res == -1) {
+        error_setg_errno(errp, saved_errno, "pread sysfs file \"%s\"",
+                         pathname);
+    } else if (res == 0) {
+        error_setg(errp, "pread sysfs file \"%s\": unexpected EOF", pathname);
+    }
+
+    /* close() must not overwrite the errno that callers look at */
+    close(fd);
+    errno = saved_errno;
+}
+
+/*
+ * Write @size bytes from @buf to the start of @pathname (resolved
+ * relative to @dirfd).
+ *
+ * @errp is set when the file cannot be opened or when pwrite() fails.  On
+ * return errno holds the value left by the failing openat()/pwrite() call
+ * (0 if neither failed), so callers may record it.
+ */
+static void ga_write_sysfs_file(int dirfd, const char *pathname,
+                                const char *buf, int size, Error **errp)
+{
+    int fd;
+    int saved_errno;
+
+    errno = 0;
+    fd = openat(dirfd, pathname, O_WRONLY);
+    if (fd == -1) {
+        saved_errno = errno;
+        error_setg_errno(errp, saved_errno, "open sysfs file \"%s\"",
+                         pathname);
+        errno = saved_errno;
+        return;
+    }
+
+    if (pwrite(fd, buf, size, 0) == -1) {
+        saved_errno = errno;
+        error_setg_errno(errp, saved_errno, "pwrite sysfs file \"%s\"",
+                         pathname);
+    } else {
+        saved_errno = errno;
+    }
+
+    /* close() must not overwrite the errno that callers look at */
+    close(fd);
+    errno = saved_errno;
+}
+
+/* Transfer online/offline status between @mem_blk and the guest system.
+ *
+ * On input either @errp or *@errp must be NULL.
+ *
+ * In system-to-@mem_blk direction (@sys2memblk is true), @result is not
+ * used and failures are reported through @errp.  The following @mem_blk
+ * fields are accessed:
+ * - R: mem_blk->phys_index
+ * - W: mem_blk->online
+ * - W: mem_blk->can_offline
+ *
+ * In @mem_blk-to-system direction (@sys2memblk is false), @result must not
+ * be NULL; failures are reported through result->response together with
+ * result->has_error_code / result->error_code, and @errp is only set when
+ * @result is NULL.  The following @mem_blk fields are accessed:
+ * - R: mem_blk->phys_index
+ * - R: mem_blk->online
+ *
+ * Written members remain unmodified on error.
+ */
+static void transfer_memory_block(GuestMemoryBlock *mem_blk, bool sys2memblk,
+                                  GuestMemoryBlockResponse *result,
+                                  Error **errp)
+{
+    char *dirpath;
+    int dirfd;
+    int saved_errno;
+    char *status;
+    Error *local_err = NULL;
+
+    if (!sys2memblk) {
+        DIR *dp;
+
+        if (!result) {
+            error_setg(errp, "Internal error, 'result' should not be NULL");
+            return;
+        }
+        errno = 0;
+        dp = opendir("/sys/devices/system/memory/");
+        /* if there is no 'memory' directory in sysfs,
+         * we think this VM does not support online/offline memory block,
+         * any other solution?
+         */
+        if (!dp) {
+            if (errno == ENOENT) {
+                result->response =
+                    GUEST_MEMORY_BLOCK_RESPONSE_TYPE_OPERATION_NOT_SUPPORTED;
+            } else {
+                /*
+                 * Any other failure must not leave the response at its
+                 * initial value while an error code is being recorded.
+                 */
+                result->response =
+                    GUEST_MEMORY_BLOCK_RESPONSE_TYPE_OPERATION_FAILED;
+            }
+            goto out1;
+        }
+        closedir(dp);
+    }
+
+    dirpath = g_strdup_printf("/sys/devices/system/memory/memory%" PRId64 "/",
+                              mem_blk->phys_index);
+    dirfd = open(dirpath, O_RDONLY | O_DIRECTORY);
+    if (dirfd == -1) {
+        /* keep open()'s errno safe from the calls made while reporting */
+        saved_errno = errno;
+        if (sys2memblk) {
+            error_setg_errno(errp, saved_errno, "open(\"%s\")", dirpath);
+        } else {
+            if (saved_errno == ENOENT) {
+                result->response = GUEST_MEMORY_BLOCK_RESPONSE_TYPE_NOT_FOUND;
+            } else {
+                result->response =
+                    GUEST_MEMORY_BLOCK_RESPONSE_TYPE_OPERATION_FAILED;
+            }
+        }
+        g_free(dirpath);
+        errno = saved_errno;
+        goto out1;
+    }
+    g_free(dirpath);
+
+    status = g_malloc0(10);
+    ga_read_sysfs_file(dirfd, "state", status, 10, &local_err);
+    if (local_err) {
+        saved_errno = errno;
+        /* treat with sysfs file that not exist in old kernel */
+        if (saved_errno == ENOENT) {
+            error_free(local_err);
+            if (sys2memblk) {
+                mem_blk->online = true;
+                mem_blk->can_offline = false;
+            } else if (!mem_blk->online) {
+                result->response =
+                    GUEST_MEMORY_BLOCK_RESPONSE_TYPE_OPERATION_NOT_SUPPORTED;
+            }
+            /*
+             * NOTE(review): a request to online a block without a "state"
+             * file leaves result->response untouched but still records an
+             * error code below -- confirm this is intended.
+             */
+        } else {
+            if (sys2memblk) {
+                error_propagate(errp, local_err);
+            } else {
+                error_free(local_err);
+                result->response =
+                    GUEST_MEMORY_BLOCK_RESPONSE_TYPE_OPERATION_FAILED;
+            }
+        }
+        errno = saved_errno;
+        goto out2;
+    }
+
+    if (sys2memblk) {
+        char removable = '0';
+
+        mem_blk->online = (strncmp(status, "online", 6) == 0);
+
+        ga_read_sysfs_file(dirfd, "removable", &removable, 1, &local_err);
+        if (local_err) {
+            /* if no 'removable' file, it doesn't support offline mem blk */
+            if (errno == ENOENT) {
+                error_free(local_err);
+                mem_blk->can_offline = false;
+            } else {
+                error_propagate(errp, local_err);
+            }
+        } else {
+            mem_blk->can_offline = (removable != '0');
+        }
+    } else {
+        if (mem_blk->online != (strncmp(status, "online", 6) == 0)) {
+            const char *new_state = mem_blk->online ? "online" : "offline";
+
+            ga_write_sysfs_file(dirfd, "state", new_state, strlen(new_state),
+                                &local_err);
+            if (local_err) {
+                saved_errno = errno;
+                error_free(local_err);
+                result->response =
+                    GUEST_MEMORY_BLOCK_RESPONSE_TYPE_OPERATION_FAILED;
+                errno = saved_errno;
+                goto out2;
+            }
+
+            result->response = GUEST_MEMORY_BLOCK_RESPONSE_TYPE_SUCCESS;
+            result->has_error_code = false;
+        } /* otherwise pretend successful re-(on|off)-lining */
+    }
+    g_free(status);
+    close(dirfd);
+    return;
+
+out2:
+    /* the cleanup calls must not change the errno recorded at out1 */
+    saved_errno = errno;
+    g_free(status);
+    close(dirfd);
+    errno = saved_errno;
+out1:
+    if (!sys2memblk) {
+        result->has_error_code = true;
+        result->error_code = errno;
+    }
+}
+
+/*
+ * Build the list of memory blocks from the "memoryN" directories found in
+ * /sys/devices/system/memory/, querying each one with
+ * transfer_memory_block().
+ *
+ * Returns the list on success.  Returns NULL without setting @errp when
+ * the sysfs directory does not exist.  Returns NULL and sets @errp when
+ * the directory cannot be opened for another reason, when querying a
+ * block fails, or when no block was found.
+ */
+GuestMemoryBlockList *qmp_guest_get_memory_blocks(Error **errp)
+{
+    GuestMemoryBlockList *head, **tail;
+    Error *local_err = NULL;
+    struct dirent *de;
+    DIR *dp;
+
+    head = NULL;
+    tail = &head;
+
+    dp = opendir("/sys/devices/system/memory/");
+    if (!dp) {
+        /* it's ok if this happens to be a system that doesn't expose
+         * memory blocks via sysfs, but otherwise we should report
+         * an error
+         */
+        if (errno != ENOENT) {
+            error_setg_errno(errp, errno, "Can't open directory"
+                             "\"/sys/devices/system/memory/\"");
+        }
+        return NULL;
+    }
+
+    /* Note: the phys_index of memory block may be discontinuous,
+     * this is because a memblk is the unit of the Sparse Memory design, which
+     * allows discontinuous memory ranges (ex. NUMA), so here we should
+     * traverse the memory block directory.
+     */
+    while ((de = readdir(dp)) != NULL) {
+        GuestMemoryBlock *mem_blk;
+
+        /* d_type is an enumerated value, not a bit mask */
+        if ((strncmp(de->d_name, "memory", 6) != 0) ||
+            de->d_type != DT_DIR) {
+            continue;
+        }
+
+        mem_blk = g_malloc0(sizeof *mem_blk);
+        /* The d_name is "memoryXXX", phys_index is block id, same as XXX */
+        mem_blk->phys_index = strtoul(&de->d_name[6], NULL, 10);
+        mem_blk->has_can_offline = true; /* lolspeak ftw */
+        transfer_memory_block(mem_blk, true, NULL, &local_err);
+        if (local_err) {
+            /* not on the list yet, so it has to be released here */
+            qapi_free_GuestMemoryBlock(mem_blk);
+            break;
+        }
+
+        QAPI_LIST_APPEND(tail, mem_blk);
+    }
+
+    closedir(dp);
+    if (local_err == NULL) {
+        /* there's no guest with zero memory blocks */
+        if (head == NULL) {
+            error_setg(errp, "guest reported zero memory blocks!");
+        }
+        return head;
+    }
+
+    qapi_free_GuestMemoryBlockList(head);
+    error_propagate(errp, local_err);
+    return NULL;
+}
+
+/*
+ * Apply the requested online state of every entry in @mem_blks, in list
+ * order, and return one response per entry.
+ *
+ * Per-block failures are reported inside the corresponding response.
+ * NULL is returned with @errp set only if transfer_memory_block() itself
+ * reports an error, in which case all responses built so far are freed.
+ */
+GuestMemoryBlockResponseList *
+qmp_guest_set_memory_blocks(GuestMemoryBlockList *mem_blks, Error **errp)
+{
+    GuestMemoryBlockResponseList *head, **tail;
+    Error *local_err = NULL;
+
+    head = NULL;
+    tail = &head;
+
+    while (mem_blks != NULL) {
+        GuestMemoryBlockResponse *result;
+        GuestMemoryBlock *current_mem_blk = mem_blks->value;
+
+        result = g_malloc0(sizeof(*result));
+        result->phys_index = current_mem_blk->phys_index;
+        transfer_memory_block(current_mem_blk, false, result, &local_err);
+        if (local_err) { /* should never happen */
+            /* not on the list yet, so it has to be released here */
+            qapi_free_GuestMemoryBlockResponse(result);
+            goto err;
+        }
+
+        QAPI_LIST_APPEND(tail, result);
+        mem_blks = mem_blks->next;
+    }
+
+    return head;
+err:
+    qapi_free_GuestMemoryBlockResponseList(head);
+    error_propagate(errp, local_err);
+    return NULL;
+}
+
+/*
+ * Report the memory block size, read as a hexadecimal number from
+ * /sys/devices/system/memory/block_size_bytes.
+ *
+ * Returns a newly allocated GuestMemoryBlockInfo, or NULL with @errp set
+ * when the directory cannot be opened or the file cannot be read.
+ */
+GuestMemoryBlockInfo *qmp_guest_get_memory_block_info(Error **errp)
+{
+    static const char dirpath[] = "/sys/devices/system/memory/";
+    Error *local_err = NULL;
+    char buf[20] = "";
+    int dirfd;
+    GuestMemoryBlockInfo *info;
+
+    dirfd = open(dirpath, O_RDONLY | O_DIRECTORY);
+    if (dirfd == -1) {
+        error_setg_errno(errp, errno, "open(\"%s\")", dirpath);
+        return NULL;
+    }
+
+    /*
+     * Read one byte less than the zero-filled buffer holds so that the
+     * text handed to strtoull() is always NUL-terminated.
+     */
+    ga_read_sysfs_file(dirfd, "block_size_bytes", buf, sizeof(buf) - 1,
+                       &local_err);
+    close(dirfd);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return NULL;
+    }
+
+    info = g_new0(GuestMemoryBlockInfo, 1);
+    /* unsigned conversion: a size never is negative; the unit is bytes */
+    info->size = strtoull(buf, NULL, 16);
+
+    return info;
+}
+
+/*
+ * Longest device name accepted from /proc/diskstats.  Must stay a plain
+ * decimal literal: it is pasted into the sscanf() format below.
+ */
+#define MAX_NAME_LEN 128
+
+/*
+ * Parse /proc/diskstats into a list with one entry per line that carries
+ * at least the four-counter layout (7 parsed fields).
+ *
+ * Which counters are reported depends on how many fields a line has:
+ * exactly 7 gives read/write ios and sectors, 14 or more adds the full
+ * read/write/in-flight set, 18 or more adds the discard counters and 20
+ * or more adds the flush counters.
+ *
+ * Returns the list (NULL when no line qualified), or NULL with @errp set
+ * when the file cannot be opened.
+ */
+static GuestDiskStatsInfoList *guest_get_diskstats(Error **errp)
+{
+    GuestDiskStatsInfoList *head = NULL, **tail = &head;
+    const char *diskstats = "/proc/diskstats";
+    FILE *fp;
+    size_t n = 0;
+    char *line = NULL;
+
+    fp = fopen(diskstats, "r");
+    if (fp == NULL) {
+        error_setg_errno(errp, errno, "open(\"%s\")", diskstats);
+        return NULL;
+    }
+
+    while (getline(&line, &n, fp) != -1) {
+        g_autofree GuestDiskStatsInfo *diskstatinfo = NULL;
+        g_autofree GuestDiskStats *diskstat = NULL;
+        char dev_name[MAX_NAME_LEN + 1]; /* +1 for the terminator */
+        unsigned int ios_pgr, tot_ticks, rq_ticks, wr_ticks, dc_ticks, fl_ticks;
+        unsigned long rd_ios, rd_merges_or_rd_sec, rd_ticks_or_wr_sec, wr_ios;
+        unsigned long wr_merges, rd_sec_or_wr_ios, wr_sec;
+        unsigned long dc_ios, dc_merges, dc_sec, fl_ios;
+        unsigned int major, minor;
+        int i;
+
+        /* the width on %s keeps an over-long name from overrunning dev_name */
+        i = sscanf(line, "%u %u %" G_STRINGIFY(MAX_NAME_LEN) "s %lu %lu %lu"
+                   "%lu %lu %lu %lu %u %u %u %u"
+                   "%lu %lu %lu %u %lu %u",
+                   &major, &minor, dev_name,
+                   &rd_ios, &rd_merges_or_rd_sec, &rd_sec_or_wr_ios,
+                   &rd_ticks_or_wr_sec, &wr_ios, &wr_merges, &wr_sec,
+                   &wr_ticks, &ios_pgr, &tot_ticks, &rq_ticks,
+                   &dc_ios, &dc_merges, &dc_sec, &dc_ticks,
+                   &fl_ios, &fl_ticks);
+
+        if (i < 7) {
+            continue;
+        }
+
+        diskstatinfo = g_new0(GuestDiskStatsInfo, 1);
+        diskstatinfo->name = g_strdup(dev_name);
+        diskstatinfo->major = major;
+        diskstatinfo->minor = minor;
+
+        diskstat = g_new0(GuestDiskStats, 1);
+        if (i == 7) {
+            /* short layout: the four counters have a different meaning */
+            diskstat->has_read_ios = true;
+            diskstat->read_ios = rd_ios;
+            diskstat->has_read_sectors = true;
+            diskstat->read_sectors = rd_merges_or_rd_sec;
+            diskstat->has_write_ios = true;
+            diskstat->write_ios = rd_sec_or_wr_ios;
+            diskstat->has_write_sectors = true;
+            diskstat->write_sectors = rd_ticks_or_wr_sec;
+        }
+        if (i >= 14) {
+            diskstat->has_read_ios = true;
+            diskstat->read_ios = rd_ios;
+            diskstat->has_read_sectors = true;
+            diskstat->read_sectors = rd_sec_or_wr_ios;
+            diskstat->has_read_merges = true;
+            diskstat->read_merges = rd_merges_or_rd_sec;
+            diskstat->has_read_ticks = true;
+            diskstat->read_ticks = rd_ticks_or_wr_sec;
+            diskstat->has_write_ios = true;
+            diskstat->write_ios = wr_ios;
+            diskstat->has_write_sectors = true;
+            diskstat->write_sectors = wr_sec;
+            diskstat->has_write_merges = true;
+            diskstat->write_merges = wr_merges;
+            diskstat->has_write_ticks = true;
+            diskstat->write_ticks = wr_ticks;
+            diskstat->has_ios_pgr = true;
+            diskstat->ios_pgr = ios_pgr;
+            diskstat->has_total_ticks = true;
+            diskstat->total_ticks = tot_ticks;
+            diskstat->has_weight_ticks = true;
+            diskstat->weight_ticks = rq_ticks;
+        }
+        if (i >= 18) {
+            diskstat->has_discard_ios = true;
+            diskstat->discard_ios = dc_ios;
+            diskstat->has_discard_merges = true;
+            diskstat->discard_merges = dc_merges;
+            diskstat->has_discard_sectors = true;
+            diskstat->discard_sectors = dc_sec;
+            diskstat->has_discard_ticks = true;
+            diskstat->discard_ticks = dc_ticks;
+        }
+        if (i >= 20) {
+            diskstat->has_flush_ios = true;
+            diskstat->flush_ios = fl_ios;
+            diskstat->has_flush_ticks = true;
+            diskstat->flush_ticks = fl_ticks;
+        }
+
+        /* ownership moves to the list; disarm the automatic cleanup */
+        diskstatinfo->stats = g_steal_pointer(&diskstat);
+        QAPI_LIST_APPEND(tail, diskstatinfo);
+        diskstatinfo = NULL;
+    }
+    free(line);
+    fclose(fp);
+    return head;
+}
+
+/*
+ * QMP entry point: return the list built by guest_get_diskstats().
+ * Failures are reported through @errp.
+ */
+GuestDiskStatsInfoList *qmp_guest_get_diskstats(Error **errp)
+{
+ return guest_get_diskstats(errp);
+}
+
+/*
+ * Parse the per-CPU "cpuN ..." lines of /proc/stat into a list.  The
+ * aggregate "cpu ..." line and all non-CPU lines are skipped.
+ *
+ * Every counter is scaled by 1000 / sysconf(_SC_CLK_TCK) before being
+ * stored.  user, nice, system and idle are mandatory; each later column
+ * is reported only if it was actually present on the line.  Parsing stops
+ * (with a log message, not an error) at the first CPU line that lacks the
+ * mandatory columns.
+ *
+ * Returns the list, or NULL with @errp set when the clock tick rate is
+ * unavailable or the file cannot be opened.
+ */
+GuestCpuStatsList *qmp_guest_get_cpustats(Error **errp)
+{
+    GuestCpuStatsList *head = NULL, **tail = &head;
+    const char *cpustats = "/proc/stat";
+    long clk_tck = sysconf(_SC_CLK_TCK);
+    FILE *fp;
+    size_t n = 0;
+    char *line = NULL;
+
+    /* used as a divisor below, so it has to be a positive number */
+    if (clk_tck <= 0) {
+        error_setg(errp, "sysconf(_SC_CLK_TCK) failed");
+        return NULL;
+    }
+
+    fp = fopen(cpustats, "r");
+    if (fp == NULL) {
+        error_setg_errno(errp, errno, "open(\"%s\")", cpustats);
+        return NULL;
+    }
+
+    while (getline(&line, &n, fp) != -1) {
+        GuestCpuStats *cpustat = NULL;
+        GuestLinuxCpuStats *linuxcpustat;
+        int i;
+        unsigned long user, system, idle, iowait, irq, softirq, steal, guest;
+        unsigned long nice, guest_nice;
+        char name[64];
+
+        /*
+         * i counts the parsed fields including the name, i.e. the N-th
+         * counter is valid only if i > N.  The width on %s is
+         * sizeof(name) - 1.
+         */
+        i = sscanf(line, "%63s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
+                   name, &user, &nice, &system, &idle, &iowait, &irq, &softirq,
+                   &steal, &guest, &guest_nice);
+
+        /* drop "cpu 1 2 3 ...", get "cpuX 1 2 3 ..." only */
+        if ((i == EOF) || strncmp(name, "cpu", 3) || (name[3] == '\0')) {
+            continue;
+        }
+
+        if (i < 5) {
+            slog("Parsing cpu stat from %s failed, see \"man proc\"", cpustats);
+            break;
+        }
+
+        cpustat = g_new0(GuestCpuStats, 1);
+        cpustat->type = GUEST_CPU_STATS_TYPE_LINUX;
+
+        linuxcpustat = &cpustat->u.q_linux;
+        linuxcpustat->cpu = atoi(&name[3]);
+        linuxcpustat->user = user * 1000 / clk_tck;
+        linuxcpustat->nice = nice * 1000 / clk_tck;
+        linuxcpustat->system = system * 1000 / clk_tck;
+        linuxcpustat->idle = idle * 1000 / clk_tck;
+
+        if (i > 5) {
+            linuxcpustat->has_iowait = true;
+            linuxcpustat->iowait = iowait * 1000 / clk_tck;
+        }
+
+        if (i > 6) {
+            linuxcpustat->has_irq = true;
+            linuxcpustat->irq = irq * 1000 / clk_tck;
+        }
+
+        /* softirq is the column after irq; do not read it unless parsed */
+        if (i > 7) {
+            linuxcpustat->has_softirq = true;
+            linuxcpustat->softirq = softirq * 1000 / clk_tck;
+        }
+
+        if (i > 8) {
+            linuxcpustat->has_steal = true;
+            linuxcpustat->steal = steal * 1000 / clk_tck;
+        }
+
+        if (i > 9) {
+            linuxcpustat->has_guest = true;
+            linuxcpustat->guest = guest * 1000 / clk_tck;
+        }
+
+        if (i > 10) {
+            linuxcpustat->has_guestnice = true;
+            linuxcpustat->guestnice = guest_nice * 1000 / clk_tck;
+        }
+
+        QAPI_LIST_APPEND(tail, cpustat);
+    }
+
+    free(line);
+    fclose(fp);
+    return head;
+}
+
+/*
+ * Convert an address taken from a /proc/net route table into its
+ * presentation form.
+ *
+ * When @is_ipv6 is non-zero, @hexValue points to a string of 32
+ * hexadecimal digits (two per address byte).  Otherwise @hexValue points
+ * to an unsigned int holding the parsed IPv4 column; its bytes in memory
+ * are taken as the address bytes in order, which gives the same result as
+ * the byte-swapping arithmetic on little-endian hosts and the correct
+ * result on big-endian hosts as well.
+ *
+ * Returns a newly allocated string that the caller must g_free().  Bytes
+ * that cannot be parsed are treated as zero.
+ */
+static char *hexToIPAddress(const void *hexValue, int is_ipv6)
+{
+    if (is_ipv6) {
+        char addr[INET6_ADDRSTRLEN] = "";
+        struct in6_addr in6;
+        const char *hexStr = (const char *)hexValue;
+        int i;
+
+        /* start from all-zero so an unparsable byte is well defined */
+        memset(&in6, 0, sizeof(in6));
+        for (i = 0; i < 16; i++) {
+            if (sscanf(&hexStr[i * 2], "%02hhx", &in6.s6_addr[i]) != 1) {
+                break;
+            }
+        }
+        if (!inet_ntop(AF_INET6, &in6, addr, sizeof(addr))) {
+            addr[0] = '\0';
+        }
+
+        return g_strdup(addr);
+    } else {
+        char addr[INET_ADDRSTRLEN] = "";
+        struct in_addr in4;
+
+        /* copy the raw bytes instead of shifting, see function comment */
+        memcpy(&in4.s_addr, hexValue, sizeof(in4.s_addr));
+        if (!inet_ntop(AF_INET, &in4, addr, sizeof(addr))) {
+            addr[0] = '\0';
+        }
+
+        return g_strdup(addr);
+    }
+}
+
+/*
+ * Build the guest's routing table from /proc/net/route (IPv4) followed by
+ * /proc/net/ipv6_route (IPv6).  Lines that do not match the expected
+ * column layout are skipped, as is the header line of the IPv4 table.
+ *
+ * Returns the list of routes.  If either file cannot be opened, @errp is
+ * set, everything collected so far is freed and NULL is returned.
+ */
+GuestNetworkRouteList *qmp_guest_network_get_route(Error **errp)
+{
+    GuestNetworkRouteList *head = NULL, **tail = &head;
+    const char *routeFiles[] = {"/proc/net/route", "/proc/net/ipv6_route"};
+    FILE *fp;
+    size_t n = 0;
+    char *line = NULL;
+    int firstLine;
+    int is_ipv6;
+    int i;
+
+    for (i = 0; i < 2; i++) {
+        firstLine = 1;
+        is_ipv6 = (i == 1);
+        fp = fopen(routeFiles[i], "r");
+        if (fp == NULL) {
+            /*
+             * Stop right away: continuing could set @errp a second time
+             * and would hand back a list together with an error.
+             */
+            error_setg_errno(errp, errno, "open(\"%s\")", routeFiles[i]);
+            free(line);
+            qapi_free_GuestNetworkRouteList(head);
+            return NULL;
+        }
+
+        while (getline(&line, &n, fp) != -1) {
+            GuestNetworkRoute *route = NULL;
+            GuestNetworkRoute *networkroute;
+            char Iface[IFNAMSIZ];
+
+            if (firstLine && !is_ipv6) {
+                firstLine = 0;
+                continue;
+            }
+
+            if (is_ipv6) {
+                char Destination[33], Source[33], NextHop[33];
+                /* %x stores into unsigned int; also avoids sign extension */
+                unsigned int DesPrefixlen, SrcPrefixlen, Metric;
+                unsigned int RefCnt, Use, Flags;
+
+                /*
+                 * Parse the line and extract the values.  The %15s width
+                 * assumes IFNAMSIZ is 16, as it is on Linux.
+                 */
+                if (sscanf(line, "%32s %x %32s %x %32s %x %x %x %x %15s",
+                           Destination, &DesPrefixlen, Source,
+                           &SrcPrefixlen, NextHop, &Metric, &RefCnt,
+                           &Use, &Flags, Iface) != 10) {
+                    continue;
+                }
+
+                route = g_new0(GuestNetworkRoute, 1);
+                networkroute = route;
+                networkroute->iface = g_strdup(Iface);
+                networkroute->destination = hexToIPAddress(Destination, 1);
+                networkroute->metric = Metric;
+                networkroute->source = hexToIPAddress(Source, 1);
+                networkroute->desprefixlen = g_strdup_printf(
+                    "%u", DesPrefixlen
+                );
+                networkroute->srcprefixlen = g_strdup_printf(
+                    "%u", SrcPrefixlen
+                );
+                networkroute->nexthop = hexToIPAddress(NextHop, 1);
+                networkroute->has_flags = true;
+                networkroute->flags = Flags;
+                networkroute->has_refcnt = true;
+                networkroute->refcnt = RefCnt;
+                networkroute->has_use = true;
+                networkroute->use = Use;
+                networkroute->version = 6;
+            } else {
+                unsigned int Destination, Gateway, Mask, Flags;
+                int RefCnt, Use, Metric, MTU, Window, IRTT;
+
+                /*
+                 * Parse the line and extract the values.  The %15s width
+                 * assumes IFNAMSIZ is 16, as it is on Linux.
+                 */
+                if (sscanf(line, "%15s %X %X %x %d %d %d %X %d %d %d",
+                           Iface, &Destination, &Gateway, &Flags, &RefCnt,
+                           &Use, &Metric, &Mask, &MTU, &Window, &IRTT) != 11) {
+                    continue;
+                }
+
+                route = g_new0(GuestNetworkRoute, 1);
+                networkroute = route;
+                networkroute->iface = g_strdup(Iface);
+                networkroute->destination = hexToIPAddress(&Destination, 0);
+                networkroute->gateway = hexToIPAddress(&Gateway, 0);
+                networkroute->mask = hexToIPAddress(&Mask, 0);
+                networkroute->metric = Metric;
+                networkroute->has_flags = true;
+                networkroute->flags = Flags;
+                networkroute->has_refcnt = true;
+                networkroute->refcnt = RefCnt;
+                networkroute->has_use = true;
+                networkroute->use = Use;
+                networkroute->has_mtu = true;
+                networkroute->mtu = MTU;
+                networkroute->has_window = true;
+                networkroute->window = Window;
+                networkroute->has_irtt = true;
+                networkroute->irtt = IRTT;
+                networkroute->version = 4;
+            }
+
+            QAPI_LIST_APPEND(tail, route);
+        }
+
+        fclose(fp);
+    }
+
+    /*
+     * The getline() buffer is shared by both files, so it may only be
+     * released once all reading is done.
+     */
+    free(line);
+
+    return head;
+}
diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 7f05996..c2bd0b4 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -24,23 +24,12 @@
#include "qemu/base64.h"
#include "qemu/cutils.h"
#include "commands-common.h"
-#include "block/nvme.h"
#include "cutils.h"
#ifdef HAVE_UTMPX
#include <utmpx.h>
#endif
-#if defined(__linux__)
-#include <mntent.h>
-#include <sys/statvfs.h>
-#include <linux/nvme_ioctl.h>
-
-#ifdef CONFIG_LIBUDEV
-#include <libudev.h>
-#endif
-#endif
-
#ifdef HAVE_GETIFADDRS
#include <arpa/inet.h>
#include <sys/socket.h>
@@ -59,7 +48,7 @@
#endif
#endif
-static void ga_wait_child(pid_t pid, int *status, Error **errp)
+static bool ga_wait_child(pid_t pid, int *status, Error **errp)
{
pid_t rpid;
@@ -70,10 +59,11 @@
if (rpid == -1) {
error_setg_errno(errp, errno, "failed to wait for child (pid: %d)",
pid);
- return;
+ return false;
}
g_assert(rpid == pid);
+ return true;
}
static ssize_t ga_pipe_read_str(int fd[2], char **str)
@@ -178,8 +168,7 @@
goto out;
}
- ga_wait_child(pid, &status, errp);
- if (*errp) {
+ if (!ga_wait_child(pid, &status, errp)) {
goto out;
}
@@ -842,1308 +831,6 @@
}
#endif
-/* linux-specific implementations. avoid this if at all possible. */
-#if defined(__linux__)
-#if defined(CONFIG_FSFREEZE)
-
-static char *get_pci_driver(char const *syspath, int pathlen, Error **errp)
-{
- char *path;
- char *dpath;
- char *driver = NULL;
- char buf[PATH_MAX];
- ssize_t len;
-
- path = g_strndup(syspath, pathlen);
- dpath = g_strdup_printf("%s/driver", path);
- len = readlink(dpath, buf, sizeof(buf) - 1);
- if (len != -1) {
- buf[len] = 0;
- driver = g_path_get_basename(buf);
- }
- g_free(dpath);
- g_free(path);
- return driver;
-}
-
-static int compare_uint(const void *_a, const void *_b)
-{
- unsigned int a = *(unsigned int *)_a;
- unsigned int b = *(unsigned int *)_b;
-
- return a < b ? -1 : a > b ? 1 : 0;
-}
-
-/* Walk the specified sysfs and build a sorted list of host or ata numbers */
-static int build_hosts(char const *syspath, char const *host, bool ata,
- unsigned int *hosts, int hosts_max, Error **errp)
-{
- char *path;
- DIR *dir;
- struct dirent *entry;
- int i = 0;
-
- path = g_strndup(syspath, host - syspath);
- dir = opendir(path);
- if (!dir) {
- error_setg_errno(errp, errno, "opendir(\"%s\")", path);
- g_free(path);
- return -1;
- }
-
- while (i < hosts_max) {
- entry = readdir(dir);
- if (!entry) {
- break;
- }
- if (ata && sscanf(entry->d_name, "ata%d", hosts + i) == 1) {
- ++i;
- } else if (!ata && sscanf(entry->d_name, "host%d", hosts + i) == 1) {
- ++i;
- }
- }
-
- qsort(hosts, i, sizeof(hosts[0]), compare_uint);
-
- g_free(path);
- closedir(dir);
- return i;
-}
-
-/*
- * Store disk device info for devices on the PCI bus.
- * Returns true if information has been stored, or false for failure.
- */
-static bool build_guest_fsinfo_for_pci_dev(char const *syspath,
- GuestDiskAddress *disk,
- Error **errp)
-{
- unsigned int pci[4], host, hosts[8], tgt[3];
- int i, nhosts = 0, pcilen;
- GuestPCIAddress *pciaddr = disk->pci_controller;
- bool has_ata = false, has_host = false, has_tgt = false;
- char *p, *q, *driver = NULL;
- bool ret = false;
-
- p = strstr(syspath, "/devices/pci");
- if (!p || sscanf(p + 12, "%*x:%*x/%x:%x:%x.%x%n",
- pci, pci + 1, pci + 2, pci + 3, &pcilen) < 4) {
- g_debug("only pci device is supported: sysfs path '%s'", syspath);
- return false;
- }
-
- p += 12 + pcilen;
- while (true) {
- driver = get_pci_driver(syspath, p - syspath, errp);
- if (driver && (g_str_equal(driver, "ata_piix") ||
- g_str_equal(driver, "sym53c8xx") ||
- g_str_equal(driver, "virtio-pci") ||
- g_str_equal(driver, "ahci") ||
- g_str_equal(driver, "nvme") ||
- g_str_equal(driver, "xhci_hcd") ||
- g_str_equal(driver, "ehci-pci"))) {
- break;
- }
-
- g_free(driver);
- if (sscanf(p, "/%x:%x:%x.%x%n",
- pci, pci + 1, pci + 2, pci + 3, &pcilen) == 4) {
- p += pcilen;
- continue;
- }
-
- g_debug("unsupported driver or sysfs path '%s'", syspath);
- return false;
- }
-
- p = strstr(syspath, "/target");
- if (p && sscanf(p + 7, "%*u:%*u:%*u/%*u:%u:%u:%u",
- tgt, tgt + 1, tgt + 2) == 3) {
- has_tgt = true;
- }
-
- p = strstr(syspath, "/ata");
- if (p) {
- q = p + 4;
- has_ata = true;
- } else {
- p = strstr(syspath, "/host");
- q = p + 5;
- }
- if (p && sscanf(q, "%u", &host) == 1) {
- has_host = true;
- nhosts = build_hosts(syspath, p, has_ata, hosts,
- ARRAY_SIZE(hosts), errp);
- if (nhosts < 0) {
- goto cleanup;
- }
- }
-
- pciaddr->domain = pci[0];
- pciaddr->bus = pci[1];
- pciaddr->slot = pci[2];
- pciaddr->function = pci[3];
-
- if (strcmp(driver, "ata_piix") == 0) {
- /* a host per ide bus, target*:0:<unit>:0 */
- if (!has_host || !has_tgt) {
- g_debug("invalid sysfs path '%s' (driver '%s')", syspath, driver);
- goto cleanup;
- }
- for (i = 0; i < nhosts; i++) {
- if (host == hosts[i]) {
- disk->bus_type = GUEST_DISK_BUS_TYPE_IDE;
- disk->bus = i;
- disk->unit = tgt[1];
- break;
- }
- }
- if (i >= nhosts) {
- g_debug("no host for '%s' (driver '%s')", syspath, driver);
- goto cleanup;
- }
- } else if (strcmp(driver, "sym53c8xx") == 0) {
- /* scsi(LSI Logic): target*:0:<unit>:0 */
- if (!has_tgt) {
- g_debug("invalid sysfs path '%s' (driver '%s')", syspath, driver);
- goto cleanup;
- }
- disk->bus_type = GUEST_DISK_BUS_TYPE_SCSI;
- disk->unit = tgt[1];
- } else if (strcmp(driver, "virtio-pci") == 0) {
- if (has_tgt) {
- /* virtio-scsi: target*:0:0:<unit> */
- disk->bus_type = GUEST_DISK_BUS_TYPE_SCSI;
- disk->unit = tgt[2];
- } else {
- /* virtio-blk: 1 disk per 1 device */
- disk->bus_type = GUEST_DISK_BUS_TYPE_VIRTIO;
- }
- } else if (strcmp(driver, "ahci") == 0) {
- /* ahci: 1 host per 1 unit */
- if (!has_host || !has_tgt) {
- g_debug("invalid sysfs path '%s' (driver '%s')", syspath, driver);
- goto cleanup;
- }
- for (i = 0; i < nhosts; i++) {
- if (host == hosts[i]) {
- disk->unit = i;
- disk->bus_type = GUEST_DISK_BUS_TYPE_SATA;
- break;
- }
- }
- if (i >= nhosts) {
- g_debug("no host for '%s' (driver '%s')", syspath, driver);
- goto cleanup;
- }
- } else if (strcmp(driver, "nvme") == 0) {
- disk->bus_type = GUEST_DISK_BUS_TYPE_NVME;
- } else if (strcmp(driver, "ehci-pci") == 0 || strcmp(driver, "xhci_hcd") == 0) {
- disk->bus_type = GUEST_DISK_BUS_TYPE_USB;
- } else {
- g_debug("unknown driver '%s' (sysfs path '%s')", driver, syspath);
- goto cleanup;
- }
-
- ret = true;
-
-cleanup:
- g_free(driver);
- return ret;
-}
-
-/*
- * Store disk device info for non-PCI virtio devices (for example s390x
- * channel I/O devices). Returns true if information has been stored, or
- * false for failure.
- */
-static bool build_guest_fsinfo_for_nonpci_virtio(char const *syspath,
- GuestDiskAddress *disk,
- Error **errp)
-{
- unsigned int tgt[3];
- char *p;
-
- if (!strstr(syspath, "/virtio") || !strstr(syspath, "/block")) {
- g_debug("Unsupported virtio device '%s'", syspath);
- return false;
- }
-
- p = strstr(syspath, "/target");
- if (p && sscanf(p + 7, "%*u:%*u:%*u/%*u:%u:%u:%u",
- &tgt[0], &tgt[1], &tgt[2]) == 3) {
- /* virtio-scsi: target*:0:<target>:<unit> */
- disk->bus_type = GUEST_DISK_BUS_TYPE_SCSI;
- disk->bus = tgt[0];
- disk->target = tgt[1];
- disk->unit = tgt[2];
- } else {
- /* virtio-blk: 1 disk per 1 device */
- disk->bus_type = GUEST_DISK_BUS_TYPE_VIRTIO;
- }
-
- return true;
-}
-
-/*
- * Store disk device info for CCW devices (s390x channel I/O devices).
- * Returns true if information has been stored, or false for failure.
- */
-static bool build_guest_fsinfo_for_ccw_dev(char const *syspath,
- GuestDiskAddress *disk,
- Error **errp)
-{
- unsigned int cssid, ssid, subchno, devno;
- char *p;
-
- p = strstr(syspath, "/devices/css");
- if (!p || sscanf(p + 12, "%*x/%x.%x.%x/%*x.%*x.%x/",
- &cssid, &ssid, &subchno, &devno) < 4) {
- g_debug("could not parse ccw device sysfs path: %s", syspath);
- return false;
- }
-
- disk->ccw_address = g_new0(GuestCCWAddress, 1);
- disk->ccw_address->cssid = cssid;
- disk->ccw_address->ssid = ssid;
- disk->ccw_address->subchno = subchno;
- disk->ccw_address->devno = devno;
-
- if (strstr(p, "/virtio")) {
- build_guest_fsinfo_for_nonpci_virtio(syspath, disk, errp);
- }
-
- return true;
-}
-
-/* Store disk device info specified by @sysfs into @fs */
-static void build_guest_fsinfo_for_real_device(char const *syspath,
- GuestFilesystemInfo *fs,
- Error **errp)
-{
- GuestDiskAddress *disk;
- GuestPCIAddress *pciaddr;
- bool has_hwinf;
-#ifdef CONFIG_LIBUDEV
- struct udev *udev = NULL;
- struct udev_device *udevice = NULL;
-#endif
-
- pciaddr = g_new0(GuestPCIAddress, 1);
- pciaddr->domain = -1; /* -1 means field is invalid */
- pciaddr->bus = -1;
- pciaddr->slot = -1;
- pciaddr->function = -1;
-
- disk = g_new0(GuestDiskAddress, 1);
- disk->pci_controller = pciaddr;
- disk->bus_type = GUEST_DISK_BUS_TYPE_UNKNOWN;
-
-#ifdef CONFIG_LIBUDEV
- udev = udev_new();
- udevice = udev_device_new_from_syspath(udev, syspath);
- if (udev == NULL || udevice == NULL) {
- g_debug("failed to query udev");
- } else {
- const char *devnode, *serial;
- devnode = udev_device_get_devnode(udevice);
- if (devnode != NULL) {
- disk->dev = g_strdup(devnode);
- }
- serial = udev_device_get_property_value(udevice, "ID_SERIAL");
- if (serial != NULL && *serial != 0) {
- disk->serial = g_strdup(serial);
- }
- }
-
- udev_unref(udev);
- udev_device_unref(udevice);
-#endif
-
- if (strstr(syspath, "/devices/pci")) {
- has_hwinf = build_guest_fsinfo_for_pci_dev(syspath, disk, errp);
- } else if (strstr(syspath, "/devices/css")) {
- has_hwinf = build_guest_fsinfo_for_ccw_dev(syspath, disk, errp);
- } else if (strstr(syspath, "/virtio")) {
- has_hwinf = build_guest_fsinfo_for_nonpci_virtio(syspath, disk, errp);
- } else {
- g_debug("Unsupported device type for '%s'", syspath);
- has_hwinf = false;
- }
-
- if (has_hwinf || disk->dev || disk->serial) {
- QAPI_LIST_PREPEND(fs->disk, disk);
- } else {
- qapi_free_GuestDiskAddress(disk);
- }
-}
-
-static void build_guest_fsinfo_for_device(char const *devpath,
- GuestFilesystemInfo *fs,
- Error **errp);
-
-/* Store a list of slave devices of virtual volume specified by @syspath into
- * @fs */
-static void build_guest_fsinfo_for_virtual_device(char const *syspath,
- GuestFilesystemInfo *fs,
- Error **errp)
-{
- Error *err = NULL;
- DIR *dir;
- char *dirpath;
- struct dirent *entry;
-
- dirpath = g_strdup_printf("%s/slaves", syspath);
- dir = opendir(dirpath);
- if (!dir) {
- if (errno != ENOENT) {
- error_setg_errno(errp, errno, "opendir(\"%s\")", dirpath);
- }
- g_free(dirpath);
- return;
- }
-
- for (;;) {
- errno = 0;
- entry = readdir(dir);
- if (entry == NULL) {
- if (errno) {
- error_setg_errno(errp, errno, "readdir(\"%s\")", dirpath);
- }
- break;
- }
-
- if (entry->d_type == DT_LNK) {
- char *path;
-
- g_debug(" slave device '%s'", entry->d_name);
- path = g_strdup_printf("%s/slaves/%s", syspath, entry->d_name);
- build_guest_fsinfo_for_device(path, fs, &err);
- g_free(path);
-
- if (err) {
- error_propagate(errp, err);
- break;
- }
- }
- }
-
- g_free(dirpath);
- closedir(dir);
-}
-
-static bool is_disk_virtual(const char *devpath, Error **errp)
-{
- g_autofree char *syspath = realpath(devpath, NULL);
-
- if (!syspath) {
- error_setg_errno(errp, errno, "realpath(\"%s\")", devpath);
- return false;
- }
- return strstr(syspath, "/devices/virtual/block/") != NULL;
-}
-
-/* Dispatch to functions for virtual/real device */
-static void build_guest_fsinfo_for_device(char const *devpath,
- GuestFilesystemInfo *fs,
- Error **errp)
-{
- ERRP_GUARD();
- g_autofree char *syspath = NULL;
- bool is_virtual = false;
-
- syspath = realpath(devpath, NULL);
- if (!syspath) {
- if (errno != ENOENT) {
- error_setg_errno(errp, errno, "realpath(\"%s\")", devpath);
- return;
- }
-
- /* ENOENT: This devpath may not exist because of container config */
- if (!fs->name) {
- fs->name = g_path_get_basename(devpath);
- }
- return;
- }
-
- if (!fs->name) {
- fs->name = g_path_get_basename(syspath);
- }
-
- g_debug(" parse sysfs path '%s'", syspath);
- is_virtual = is_disk_virtual(syspath, errp);
- if (*errp != NULL) {
- return;
- }
- if (is_virtual) {
- build_guest_fsinfo_for_virtual_device(syspath, fs, errp);
- } else {
- build_guest_fsinfo_for_real_device(syspath, fs, errp);
- }
-}
-
-#ifdef CONFIG_LIBUDEV
-
-/*
- * Wrapper around build_guest_fsinfo_for_device() for getting just
- * the disk address.
- */
-static GuestDiskAddress *get_disk_address(const char *syspath, Error **errp)
-{
- g_autoptr(GuestFilesystemInfo) fs = NULL;
-
- fs = g_new0(GuestFilesystemInfo, 1);
- build_guest_fsinfo_for_device(syspath, fs, errp);
- if (fs->disk != NULL) {
- return g_steal_pointer(&fs->disk->value);
- }
- return NULL;
-}
-
-static char *get_alias_for_syspath(const char *syspath)
-{
- struct udev *udev = NULL;
- struct udev_device *udevice = NULL;
- char *ret = NULL;
-
- udev = udev_new();
- if (udev == NULL) {
- g_debug("failed to query udev");
- goto out;
- }
- udevice = udev_device_new_from_syspath(udev, syspath);
- if (udevice == NULL) {
- g_debug("failed to query udev for path: %s", syspath);
- goto out;
- } else {
- const char *alias = udev_device_get_property_value(
- udevice, "DM_NAME");
- /*
- * NULL means there was an error and empty string means there is no
- * alias. In case of no alias we return NULL instead of empty string.
- */
- if (alias == NULL) {
- g_debug("failed to query udev for device alias for: %s",
- syspath);
- } else if (*alias != 0) {
- ret = g_strdup(alias);
- }
- }
-
-out:
- udev_unref(udev);
- udev_device_unref(udevice);
- return ret;
-}
-
-static char *get_device_for_syspath(const char *syspath)
-{
- struct udev *udev = NULL;
- struct udev_device *udevice = NULL;
- char *ret = NULL;
-
- udev = udev_new();
- if (udev == NULL) {
- g_debug("failed to query udev");
- goto out;
- }
- udevice = udev_device_new_from_syspath(udev, syspath);
- if (udevice == NULL) {
- g_debug("failed to query udev for path: %s", syspath);
- goto out;
- } else {
- ret = g_strdup(udev_device_get_devnode(udevice));
- }
-
-out:
- udev_unref(udev);
- udev_device_unref(udevice);
- return ret;
-}
-
-static void get_disk_deps(const char *disk_dir, GuestDiskInfo *disk)
-{
- g_autofree char *deps_dir = NULL;
- const gchar *dep;
- GDir *dp_deps = NULL;
-
- /* List dependent disks */
- deps_dir = g_strdup_printf("%s/slaves", disk_dir);
- g_debug(" listing entries in: %s", deps_dir);
- dp_deps = g_dir_open(deps_dir, 0, NULL);
- if (dp_deps == NULL) {
- g_debug("failed to list entries in %s", deps_dir);
- return;
- }
- disk->has_dependencies = true;
- while ((dep = g_dir_read_name(dp_deps)) != NULL) {
- g_autofree char *dep_dir = NULL;
- char *dev_name;
-
- /* Add dependent disks */
- dep_dir = g_strdup_printf("%s/%s", deps_dir, dep);
- dev_name = get_device_for_syspath(dep_dir);
- if (dev_name != NULL) {
- g_debug(" adding dependent device: %s", dev_name);
- QAPI_LIST_PREPEND(disk->dependencies, dev_name);
- }
- }
- g_dir_close(dp_deps);
-}
-
-/*
- * Detect partitions subdirectory, name is "<disk_name><number>" or
- * "<disk_name>p<number>"
- *
- * @disk_name -- last component of /sys path (e.g. sda)
- * @disk_dir -- sys path of the disk (e.g. /sys/block/sda)
- * @disk_dev -- device node of the disk (e.g. /dev/sda)
- */
-static GuestDiskInfoList *get_disk_partitions(
- GuestDiskInfoList *list,
- const char *disk_name, const char *disk_dir,
- const char *disk_dev)
-{
- GuestDiskInfoList *ret = list;
- struct dirent *de_disk;
- DIR *dp_disk = NULL;
- size_t len = strlen(disk_name);
-
- dp_disk = opendir(disk_dir);
- while ((de_disk = readdir(dp_disk)) != NULL) {
- g_autofree char *partition_dir = NULL;
- char *dev_name;
- GuestDiskInfo *partition;
-
- if (!(de_disk->d_type & DT_DIR)) {
- continue;
- }
-
- if (!(strncmp(disk_name, de_disk->d_name, len) == 0 &&
- ((*(de_disk->d_name + len) == 'p' &&
- isdigit(*(de_disk->d_name + len + 1))) ||
- isdigit(*(de_disk->d_name + len))))) {
- continue;
- }
-
- partition_dir = g_strdup_printf("%s/%s",
- disk_dir, de_disk->d_name);
- dev_name = get_device_for_syspath(partition_dir);
- if (dev_name == NULL) {
- g_debug("Failed to get device name for syspath: %s",
- disk_dir);
- continue;
- }
- partition = g_new0(GuestDiskInfo, 1);
- partition->name = dev_name;
- partition->partition = true;
- partition->has_dependencies = true;
- /* Add parent disk as dependent for easier tracking of hierarchy */
- QAPI_LIST_PREPEND(partition->dependencies, g_strdup(disk_dev));
-
- QAPI_LIST_PREPEND(ret, partition);
- }
- closedir(dp_disk);
-
- return ret;
-}
-
-static void get_nvme_smart(GuestDiskInfo *disk)
-{
- int fd;
- GuestNVMeSmart *smart;
- NvmeSmartLog log = {0};
- struct nvme_admin_cmd cmd = {
- .opcode = NVME_ADM_CMD_GET_LOG_PAGE,
- .nsid = NVME_NSID_BROADCAST,
- .addr = (uintptr_t)&log,
- .data_len = sizeof(log),
- .cdw10 = NVME_LOG_SMART_INFO | (1 << 15) /* RAE bit */
- | (((sizeof(log) >> 2) - 1) << 16)
- };
-
- fd = qga_open_cloexec(disk->name, O_RDONLY, 0);
- if (fd == -1) {
- g_debug("Failed to open device: %s: %s", disk->name, g_strerror(errno));
- return;
- }
-
- if (ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd)) {
- g_debug("Failed to get smart: %s: %s", disk->name, g_strerror(errno));
- close(fd);
- return;
- }
-
- disk->smart = g_new0(GuestDiskSmart, 1);
- disk->smart->type = GUEST_DISK_BUS_TYPE_NVME;
-
- smart = &disk->smart->u.nvme;
- smart->critical_warning = log.critical_warning;
- smart->temperature = lduw_le_p(&log.temperature); /* unaligned field */
- smart->available_spare = log.available_spare;
- smart->available_spare_threshold = log.available_spare_threshold;
- smart->percentage_used = log.percentage_used;
- smart->data_units_read_lo = le64_to_cpu(log.data_units_read[0]);
- smart->data_units_read_hi = le64_to_cpu(log.data_units_read[1]);
- smart->data_units_written_lo = le64_to_cpu(log.data_units_written[0]);
- smart->data_units_written_hi = le64_to_cpu(log.data_units_written[1]);
- smart->host_read_commands_lo = le64_to_cpu(log.host_read_commands[0]);
- smart->host_read_commands_hi = le64_to_cpu(log.host_read_commands[1]);
- smart->host_write_commands_lo = le64_to_cpu(log.host_write_commands[0]);
- smart->host_write_commands_hi = le64_to_cpu(log.host_write_commands[1]);
- smart->controller_busy_time_lo = le64_to_cpu(log.controller_busy_time[0]);
- smart->controller_busy_time_hi = le64_to_cpu(log.controller_busy_time[1]);
- smart->power_cycles_lo = le64_to_cpu(log.power_cycles[0]);
- smart->power_cycles_hi = le64_to_cpu(log.power_cycles[1]);
- smart->power_on_hours_lo = le64_to_cpu(log.power_on_hours[0]);
- smart->power_on_hours_hi = le64_to_cpu(log.power_on_hours[1]);
- smart->unsafe_shutdowns_lo = le64_to_cpu(log.unsafe_shutdowns[0]);
- smart->unsafe_shutdowns_hi = le64_to_cpu(log.unsafe_shutdowns[1]);
- smart->media_errors_lo = le64_to_cpu(log.media_errors[0]);
- smart->media_errors_hi = le64_to_cpu(log.media_errors[1]);
- smart->number_of_error_log_entries_lo =
- le64_to_cpu(log.number_of_error_log_entries[0]);
- smart->number_of_error_log_entries_hi =
- le64_to_cpu(log.number_of_error_log_entries[1]);
-
- close(fd);
-}
-
-static void get_disk_smart(GuestDiskInfo *disk)
-{
- if (disk->address
- && (disk->address->bus_type == GUEST_DISK_BUS_TYPE_NVME)) {
- get_nvme_smart(disk);
- }
-}
-
-GuestDiskInfoList *qmp_guest_get_disks(Error **errp)
-{
- GuestDiskInfoList *ret = NULL;
- GuestDiskInfo *disk;
- DIR *dp = NULL;
- struct dirent *de = NULL;
-
- g_debug("listing /sys/block directory");
- dp = opendir("/sys/block");
- if (dp == NULL) {
- error_setg_errno(errp, errno, "Can't open directory \"/sys/block\"");
- return NULL;
- }
- while ((de = readdir(dp)) != NULL) {
- g_autofree char *disk_dir = NULL, *line = NULL,
- *size_path = NULL;
- char *dev_name;
- Error *local_err = NULL;
- if (de->d_type != DT_LNK) {
- g_debug(" skipping entry: %s", de->d_name);
- continue;
- }
-
- /* Check size and skip zero-sized disks */
- g_debug(" checking disk size");
- size_path = g_strdup_printf("/sys/block/%s/size", de->d_name);
- if (!g_file_get_contents(size_path, &line, NULL, NULL)) {
- g_debug(" failed to read disk size");
- continue;
- }
- if (g_strcmp0(line, "0\n") == 0) {
- g_debug(" skipping zero-sized disk");
- continue;
- }
-
- g_debug(" adding %s", de->d_name);
- disk_dir = g_strdup_printf("/sys/block/%s", de->d_name);
- dev_name = get_device_for_syspath(disk_dir);
- if (dev_name == NULL) {
- g_debug("Failed to get device name for syspath: %s",
- disk_dir);
- continue;
- }
- disk = g_new0(GuestDiskInfo, 1);
- disk->name = dev_name;
- disk->partition = false;
- disk->alias = get_alias_for_syspath(disk_dir);
- QAPI_LIST_PREPEND(ret, disk);
-
- /* Get address for non-virtual devices */
- bool is_virtual = is_disk_virtual(disk_dir, &local_err);
- if (local_err != NULL) {
- g_debug(" failed to check disk path, ignoring error: %s",
- error_get_pretty(local_err));
- error_free(local_err);
- local_err = NULL;
- /* Don't try to get the address */
- is_virtual = true;
- }
- if (!is_virtual) {
- disk->address = get_disk_address(disk_dir, &local_err);
- if (local_err != NULL) {
- g_debug(" failed to get device info, ignoring error: %s",
- error_get_pretty(local_err));
- error_free(local_err);
- local_err = NULL;
- }
- }
-
- get_disk_deps(disk_dir, disk);
- get_disk_smart(disk);
- ret = get_disk_partitions(ret, de->d_name, disk_dir, dev_name);
- }
-
- closedir(dp);
-
- return ret;
-}
-
-#else
-
-GuestDiskInfoList *qmp_guest_get_disks(Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
- return NULL;
-}
-
-#endif
-
-/* Return a list of the disk device(s)' info which @mount lies on */
-static GuestFilesystemInfo *build_guest_fsinfo(struct FsMount *mount,
- Error **errp)
-{
- GuestFilesystemInfo *fs = g_malloc0(sizeof(*fs));
- struct statvfs buf;
- unsigned long used, nonroot_total, fr_size;
- char *devpath = g_strdup_printf("/sys/dev/block/%u:%u",
- mount->devmajor, mount->devminor);
-
- fs->mountpoint = g_strdup(mount->dirname);
- fs->type = g_strdup(mount->devtype);
- build_guest_fsinfo_for_device(devpath, fs, errp);
-
- if (statvfs(fs->mountpoint, &buf) == 0) {
- fr_size = buf.f_frsize;
- used = buf.f_blocks - buf.f_bfree;
- nonroot_total = used + buf.f_bavail;
- fs->used_bytes = used * fr_size;
- fs->total_bytes = nonroot_total * fr_size;
- fs->total_bytes_privileged = buf.f_blocks * fr_size;
-
- fs->has_total_bytes = true;
- fs->has_total_bytes_privileged = true;
- fs->has_used_bytes = true;
- }
-
- g_free(devpath);
-
- return fs;
-}
-
-GuestFilesystemInfoList *qmp_guest_get_fsinfo(Error **errp)
-{
- FsMountList mounts;
- struct FsMount *mount;
- GuestFilesystemInfoList *ret = NULL;
- Error *local_err = NULL;
-
- QTAILQ_INIT(&mounts);
- if (!build_fs_mount_list(&mounts, &local_err)) {
- error_propagate(errp, local_err);
- return NULL;
- }
-
- QTAILQ_FOREACH(mount, &mounts, next) {
- g_debug("Building guest fsinfo for '%s'", mount->dirname);
-
- QAPI_LIST_PREPEND(ret, build_guest_fsinfo(mount, &local_err));
- if (local_err) {
- error_propagate(errp, local_err);
- qapi_free_GuestFilesystemInfoList(ret);
- ret = NULL;
- break;
- }
- }
-
- free_fs_mount_list(&mounts);
- return ret;
-}
-#endif /* CONFIG_FSFREEZE */
-
-#if defined(CONFIG_FSTRIM)
-/*
- * Walk list of mounted file systems in the guest, and trim them.
- */
-GuestFilesystemTrimResponse *
-qmp_guest_fstrim(bool has_minimum, int64_t minimum, Error **errp)
-{
- GuestFilesystemTrimResponse *response;
- GuestFilesystemTrimResult *result;
- int ret = 0;
- FsMountList mounts;
- struct FsMount *mount;
- int fd;
- struct fstrim_range r;
-
- slog("guest-fstrim called");
-
- QTAILQ_INIT(&mounts);
- if (!build_fs_mount_list(&mounts, errp)) {
- return NULL;
- }
-
- response = g_malloc0(sizeof(*response));
-
- QTAILQ_FOREACH(mount, &mounts, next) {
- result = g_malloc0(sizeof(*result));
- result->path = g_strdup(mount->dirname);
-
- QAPI_LIST_PREPEND(response->paths, result);
-
- fd = qga_open_cloexec(mount->dirname, O_RDONLY, 0);
- if (fd == -1) {
- result->error = g_strdup_printf("failed to open: %s",
- strerror(errno));
- continue;
- }
-
- /* We try to cull filesystems we know won't work in advance, but other
- * filesystems may not implement fstrim for less obvious reasons.
- * These will report EOPNOTSUPP; while in some other cases ENOTTY
- * will be reported (e.g. CD-ROMs).
- * Any other error means an unexpected error.
- */
- r.start = 0;
- r.len = -1;
- r.minlen = has_minimum ? minimum : 0;
- ret = ioctl(fd, FITRIM, &r);
- if (ret == -1) {
- if (errno == ENOTTY || errno == EOPNOTSUPP) {
- result->error = g_strdup("trim not supported");
- } else {
- result->error = g_strdup_printf("failed to trim: %s",
- strerror(errno));
- }
- close(fd);
- continue;
- }
-
- result->has_minimum = true;
- result->minimum = r.minlen;
- result->has_trimmed = true;
- result->trimmed = r.len;
- close(fd);
- }
-
- free_fs_mount_list(&mounts);
- return response;
-}
-#endif /* CONFIG_FSTRIM */
-
-
-#define LINUX_SYS_STATE_FILE "/sys/power/state"
-#define SUSPEND_SUPPORTED 0
-#define SUSPEND_NOT_SUPPORTED 1
-
-typedef enum {
- SUSPEND_MODE_DISK = 0,
- SUSPEND_MODE_RAM = 1,
- SUSPEND_MODE_HYBRID = 2,
-} SuspendMode;
-
-/*
- * Executes a command in a child process using g_spawn_sync,
- * returning an int >= 0 representing the exit status of the
- * process.
- *
- * If the program wasn't found in path, returns -1.
- *
- * If a problem happened when creating the child process,
- * returns -1 and errp is set.
- */
-static int run_process_child(const char *command[], Error **errp)
-{
- int exit_status, spawn_flag;
- GError *g_err = NULL;
- bool success;
-
- spawn_flag = G_SPAWN_SEARCH_PATH | G_SPAWN_STDOUT_TO_DEV_NULL |
- G_SPAWN_STDERR_TO_DEV_NULL;
-
- success = g_spawn_sync(NULL, (char **)command, NULL, spawn_flag,
- NULL, NULL, NULL, NULL,
- &exit_status, &g_err);
-
- if (success) {
- return WEXITSTATUS(exit_status);
- }
-
- if (g_err && (g_err->code != G_SPAWN_ERROR_NOENT)) {
- error_setg(errp, "failed to create child process, error '%s'",
- g_err->message);
- }
-
- g_error_free(g_err);
- return -1;
-}
-
-static bool systemd_supports_mode(SuspendMode mode, Error **errp)
-{
- const char *systemctl_args[3] = {"systemd-hibernate", "systemd-suspend",
- "systemd-hybrid-sleep"};
- const char *cmd[4] = {"systemctl", "status", systemctl_args[mode], NULL};
- int status;
-
- status = run_process_child(cmd, errp);
-
- /*
- * systemctl status uses LSB return codes so we can expect
- * status > 0 and be ok. To assert if the guest has support
- * for the selected suspend mode, status should be < 4. 4 is
- * the code for unknown service status, the return value when
- * the service does not exist. A common value is status = 3
- * (program is not running).
- */
- if (status > 0 && status < 4) {
- return true;
- }
-
- return false;
-}
-
-static void systemd_suspend(SuspendMode mode, Error **errp)
-{
- Error *local_err = NULL;
- const char *systemctl_args[3] = {"hibernate", "suspend", "hybrid-sleep"};
- const char *cmd[3] = {"systemctl", systemctl_args[mode], NULL};
- int status;
-
- status = run_process_child(cmd, &local_err);
-
- if (status == 0) {
- return;
- }
-
- if ((status == -1) && !local_err) {
- error_setg(errp, "the helper program 'systemctl %s' was not found",
- systemctl_args[mode]);
- return;
- }
-
- if (local_err) {
- error_propagate(errp, local_err);
- } else {
- error_setg(errp, "the helper program 'systemctl %s' returned an "
- "unexpected exit status code (%d)",
- systemctl_args[mode], status);
- }
-}
-
-static bool pmutils_supports_mode(SuspendMode mode, Error **errp)
-{
- Error *local_err = NULL;
- const char *pmutils_args[3] = {"--hibernate", "--suspend",
- "--suspend-hybrid"};
- const char *cmd[3] = {"pm-is-supported", pmutils_args[mode], NULL};
- int status;
-
- status = run_process_child(cmd, &local_err);
-
- if (status == SUSPEND_SUPPORTED) {
- return true;
- }
-
- if ((status == -1) && !local_err) {
- return false;
- }
-
- if (local_err) {
- error_propagate(errp, local_err);
- } else {
- error_setg(errp,
- "the helper program '%s' returned an unexpected exit"
- " status code (%d)", "pm-is-supported", status);
- }
-
- return false;
-}
-
-static void pmutils_suspend(SuspendMode mode, Error **errp)
-{
- Error *local_err = NULL;
- const char *pmutils_binaries[3] = {"pm-hibernate", "pm-suspend",
- "pm-suspend-hybrid"};
- const char *cmd[2] = {pmutils_binaries[mode], NULL};
- int status;
-
- status = run_process_child(cmd, &local_err);
-
- if (status == 0) {
- return;
- }
-
- if ((status == -1) && !local_err) {
- error_setg(errp, "the helper program '%s' was not found",
- pmutils_binaries[mode]);
- return;
- }
-
- if (local_err) {
- error_propagate(errp, local_err);
- } else {
- error_setg(errp,
- "the helper program '%s' returned an unexpected exit"
- " status code (%d)", pmutils_binaries[mode], status);
- }
-}
-
-static bool linux_sys_state_supports_mode(SuspendMode mode, Error **errp)
-{
- const char *sysfile_strs[3] = {"disk", "mem", NULL};
- const char *sysfile_str = sysfile_strs[mode];
- char buf[32]; /* hopefully big enough */
- int fd;
- ssize_t ret;
-
- if (!sysfile_str) {
- error_setg(errp, "unknown guest suspend mode");
- return false;
- }
-
- fd = open(LINUX_SYS_STATE_FILE, O_RDONLY);
- if (fd < 0) {
- return false;
- }
-
- ret = read(fd, buf, sizeof(buf) - 1);
- close(fd);
- if (ret <= 0) {
- return false;
- }
- buf[ret] = '\0';
-
- if (strstr(buf, sysfile_str)) {
- return true;
- }
- return false;
-}
-
-static void linux_sys_state_suspend(SuspendMode mode, Error **errp)
-{
- g_autoptr(GError) local_gerr = NULL;
- const char *sysfile_strs[3] = {"disk", "mem", NULL};
- const char *sysfile_str = sysfile_strs[mode];
-
- if (!sysfile_str) {
- error_setg(errp, "unknown guest suspend mode");
- return;
- }
-
- if (!g_file_set_contents(LINUX_SYS_STATE_FILE, sysfile_str,
- -1, &local_gerr)) {
- error_setg(errp, "suspend: cannot write to '%s': %s",
- LINUX_SYS_STATE_FILE, local_gerr->message);
- return;
- }
-}
-
-static void guest_suspend(SuspendMode mode, Error **errp)
-{
- Error *local_err = NULL;
- bool mode_supported = false;
-
- if (systemd_supports_mode(mode, &local_err)) {
- mode_supported = true;
- systemd_suspend(mode, &local_err);
-
- if (!local_err) {
- return;
- }
- }
-
- error_free(local_err);
- local_err = NULL;
-
- if (pmutils_supports_mode(mode, &local_err)) {
- mode_supported = true;
- pmutils_suspend(mode, &local_err);
-
- if (!local_err) {
- return;
- }
- }
-
- error_free(local_err);
- local_err = NULL;
-
- if (linux_sys_state_supports_mode(mode, &local_err)) {
- mode_supported = true;
- linux_sys_state_suspend(mode, &local_err);
- }
-
- if (!mode_supported) {
- error_free(local_err);
- error_setg(errp,
- "the requested suspend mode is not supported by the guest");
- } else {
- error_propagate(errp, local_err);
- }
-}
-
-void qmp_guest_suspend_disk(Error **errp)
-{
- guest_suspend(SUSPEND_MODE_DISK, errp);
-}
-
-void qmp_guest_suspend_ram(Error **errp)
-{
- guest_suspend(SUSPEND_MODE_RAM, errp);
-}
-
-void qmp_guest_suspend_hybrid(Error **errp)
-{
- guest_suspend(SUSPEND_MODE_HYBRID, errp);
-}
-
-/* Transfer online/offline status between @vcpu and the guest system.
- *
- * On input either @errp or *@errp must be NULL.
- *
- * In system-to-@vcpu direction, the following @vcpu fields are accessed:
- * - R: vcpu->logical_id
- * - W: vcpu->online
- * - W: vcpu->can_offline
- *
- * In @vcpu-to-system direction, the following @vcpu fields are accessed:
- * - R: vcpu->logical_id
- * - R: vcpu->online
- *
- * Written members remain unmodified on error.
- */
-static void transfer_vcpu(GuestLogicalProcessor *vcpu, bool sys2vcpu,
- char *dirpath, Error **errp)
-{
- int fd;
- int res;
- int dirfd;
- static const char fn[] = "online";
-
- dirfd = open(dirpath, O_RDONLY | O_DIRECTORY);
- if (dirfd == -1) {
- error_setg_errno(errp, errno, "open(\"%s\")", dirpath);
- return;
- }
-
- fd = openat(dirfd, fn, sys2vcpu ? O_RDONLY : O_RDWR);
- if (fd == -1) {
- if (errno != ENOENT) {
- error_setg_errno(errp, errno, "open(\"%s/%s\")", dirpath, fn);
- } else if (sys2vcpu) {
- vcpu->online = true;
- vcpu->can_offline = false;
- } else if (!vcpu->online) {
- error_setg(errp, "logical processor #%" PRId64 " can't be "
- "offlined", vcpu->logical_id);
- } /* otherwise pretend successful re-onlining */
- } else {
- unsigned char status;
-
- res = pread(fd, &status, 1, 0);
- if (res == -1) {
- error_setg_errno(errp, errno, "pread(\"%s/%s\")", dirpath, fn);
- } else if (res == 0) {
- error_setg(errp, "pread(\"%s/%s\"): unexpected EOF", dirpath,
- fn);
- } else if (sys2vcpu) {
- vcpu->online = (status != '0');
- vcpu->can_offline = true;
- } else if (vcpu->online != (status != '0')) {
- status = '0' + vcpu->online;
- if (pwrite(fd, &status, 1, 0) == -1) {
- error_setg_errno(errp, errno, "pwrite(\"%s/%s\")", dirpath,
- fn);
- }
- } /* otherwise pretend successful re-(on|off)-lining */
-
- res = close(fd);
- g_assert(res == 0);
- }
-
- res = close(dirfd);
- g_assert(res == 0);
-}
-
-GuestLogicalProcessorList *qmp_guest_get_vcpus(Error **errp)
-{
- GuestLogicalProcessorList *head, **tail;
- const char *cpu_dir = "/sys/devices/system/cpu";
- const gchar *line;
- g_autoptr(GDir) cpu_gdir = NULL;
- Error *local_err = NULL;
-
- head = NULL;
- tail = &head;
- cpu_gdir = g_dir_open(cpu_dir, 0, NULL);
-
- if (cpu_gdir == NULL) {
- error_setg_errno(errp, errno, "failed to list entries: %s", cpu_dir);
- return NULL;
- }
-
- while (local_err == NULL && (line = g_dir_read_name(cpu_gdir)) != NULL) {
- GuestLogicalProcessor *vcpu;
- int64_t id;
- if (sscanf(line, "cpu%" PRId64, &id)) {
- g_autofree char *path = g_strdup_printf("/sys/devices/system/cpu/"
- "cpu%" PRId64 "/", id);
- vcpu = g_malloc0(sizeof *vcpu);
- vcpu->logical_id = id;
- vcpu->has_can_offline = true; /* lolspeak ftw */
- transfer_vcpu(vcpu, true, path, &local_err);
- QAPI_LIST_APPEND(tail, vcpu);
- }
- }
-
- if (local_err == NULL) {
- /* there's no guest with zero VCPUs */
- g_assert(head != NULL);
- return head;
- }
-
- qapi_free_GuestLogicalProcessorList(head);
- error_propagate(errp, local_err);
- return NULL;
-}
-
-int64_t qmp_guest_set_vcpus(GuestLogicalProcessorList *vcpus, Error **errp)
-{
- int64_t processed;
- Error *local_err = NULL;
-
- processed = 0;
- while (vcpus != NULL) {
- char *path = g_strdup_printf("/sys/devices/system/cpu/cpu%" PRId64 "/",
- vcpus->value->logical_id);
-
- transfer_vcpu(vcpus->value, false, path, &local_err);
- g_free(path);
- if (local_err != NULL) {
- break;
- }
- ++processed;
- vcpus = vcpus->next;
- }
-
- if (local_err != NULL) {
- if (processed == 0) {
- error_propagate(errp, local_err);
- } else {
- error_free(local_err);
- }
- }
-
- return processed;
-}
-#endif /* __linux__ */
-
#if defined(__linux__) || defined(__FreeBSD__)
void qmp_guest_set_user_password(const char *username,
const char *password,
@@ -2190,574 +877,8 @@
return;
}
}
-#else /* __linux__ || __FreeBSD__ */
-void qmp_guest_set_user_password(const char *username,
- const char *password,
- bool crypted,
- Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
-}
#endif /* __linux__ || __FreeBSD__ */
-#ifdef __linux__
-static void ga_read_sysfs_file(int dirfd, const char *pathname, char *buf,
- int size, Error **errp)
-{
- int fd;
- int res;
-
- errno = 0;
- fd = openat(dirfd, pathname, O_RDONLY);
- if (fd == -1) {
- error_setg_errno(errp, errno, "open sysfs file \"%s\"", pathname);
- return;
- }
-
- res = pread(fd, buf, size, 0);
- if (res == -1) {
- error_setg_errno(errp, errno, "pread sysfs file \"%s\"", pathname);
- } else if (res == 0) {
- error_setg(errp, "pread sysfs file \"%s\": unexpected EOF", pathname);
- }
- close(fd);
-}
-
-static void ga_write_sysfs_file(int dirfd, const char *pathname,
- const char *buf, int size, Error **errp)
-{
- int fd;
-
- errno = 0;
- fd = openat(dirfd, pathname, O_WRONLY);
- if (fd == -1) {
- error_setg_errno(errp, errno, "open sysfs file \"%s\"", pathname);
- return;
- }
-
- if (pwrite(fd, buf, size, 0) == -1) {
- error_setg_errno(errp, errno, "pwrite sysfs file \"%s\"", pathname);
- }
-
- close(fd);
-}
-
-/* Transfer online/offline status between @mem_blk and the guest system.
- *
- * On input either @errp or *@errp must be NULL.
- *
- * In system-to-@mem_blk direction, the following @mem_blk fields are accessed:
- * - R: mem_blk->phys_index
- * - W: mem_blk->online
- * - W: mem_blk->can_offline
- *
- * In @mem_blk-to-system direction, the following @mem_blk fields are accessed:
- * - R: mem_blk->phys_index
- * - R: mem_blk->online
- *- R: mem_blk->can_offline
- * Written members remain unmodified on error.
- */
-static void transfer_memory_block(GuestMemoryBlock *mem_blk, bool sys2memblk,
- GuestMemoryBlockResponse *result,
- Error **errp)
-{
- char *dirpath;
- int dirfd;
- char *status;
- Error *local_err = NULL;
-
- if (!sys2memblk) {
- DIR *dp;
-
- if (!result) {
- error_setg(errp, "Internal error, 'result' should not be NULL");
- return;
- }
- errno = 0;
- dp = opendir("/sys/devices/system/memory/");
- /* if there is no 'memory' directory in sysfs,
- * we think this VM does not support online/offline memory block,
- * any other solution?
- */
- if (!dp) {
- if (errno == ENOENT) {
- result->response =
- GUEST_MEMORY_BLOCK_RESPONSE_TYPE_OPERATION_NOT_SUPPORTED;
- }
- goto out1;
- }
- closedir(dp);
- }
-
- dirpath = g_strdup_printf("/sys/devices/system/memory/memory%" PRId64 "/",
- mem_blk->phys_index);
- dirfd = open(dirpath, O_RDONLY | O_DIRECTORY);
- if (dirfd == -1) {
- if (sys2memblk) {
- error_setg_errno(errp, errno, "open(\"%s\")", dirpath);
- } else {
- if (errno == ENOENT) {
- result->response = GUEST_MEMORY_BLOCK_RESPONSE_TYPE_NOT_FOUND;
- } else {
- result->response =
- GUEST_MEMORY_BLOCK_RESPONSE_TYPE_OPERATION_FAILED;
- }
- }
- g_free(dirpath);
- goto out1;
- }
- g_free(dirpath);
-
- status = g_malloc0(10);
- ga_read_sysfs_file(dirfd, "state", status, 10, &local_err);
- if (local_err) {
- /* treat with sysfs file that not exist in old kernel */
- if (errno == ENOENT) {
- error_free(local_err);
- if (sys2memblk) {
- mem_blk->online = true;
- mem_blk->can_offline = false;
- } else if (!mem_blk->online) {
- result->response =
- GUEST_MEMORY_BLOCK_RESPONSE_TYPE_OPERATION_NOT_SUPPORTED;
- }
- } else {
- if (sys2memblk) {
- error_propagate(errp, local_err);
- } else {
- error_free(local_err);
- result->response =
- GUEST_MEMORY_BLOCK_RESPONSE_TYPE_OPERATION_FAILED;
- }
- }
- goto out2;
- }
-
- if (sys2memblk) {
- char removable = '0';
-
- mem_blk->online = (strncmp(status, "online", 6) == 0);
-
- ga_read_sysfs_file(dirfd, "removable", &removable, 1, &local_err);
- if (local_err) {
- /* if no 'removable' file, it doesn't support offline mem blk */
- if (errno == ENOENT) {
- error_free(local_err);
- mem_blk->can_offline = false;
- } else {
- error_propagate(errp, local_err);
- }
- } else {
- mem_blk->can_offline = (removable != '0');
- }
- } else {
- if (mem_blk->online != (strncmp(status, "online", 6) == 0)) {
- const char *new_state = mem_blk->online ? "online" : "offline";
-
- ga_write_sysfs_file(dirfd, "state", new_state, strlen(new_state),
- &local_err);
- if (local_err) {
- error_free(local_err);
- result->response =
- GUEST_MEMORY_BLOCK_RESPONSE_TYPE_OPERATION_FAILED;
- goto out2;
- }
-
- result->response = GUEST_MEMORY_BLOCK_RESPONSE_TYPE_SUCCESS;
- result->has_error_code = false;
- } /* otherwise pretend successful re-(on|off)-lining */
- }
- g_free(status);
- close(dirfd);
- return;
-
-out2:
- g_free(status);
- close(dirfd);
-out1:
- if (!sys2memblk) {
- result->has_error_code = true;
- result->error_code = errno;
- }
-}
-
-GuestMemoryBlockList *qmp_guest_get_memory_blocks(Error **errp)
-{
- GuestMemoryBlockList *head, **tail;
- Error *local_err = NULL;
- struct dirent *de;
- DIR *dp;
-
- head = NULL;
- tail = &head;
-
- dp = opendir("/sys/devices/system/memory/");
- if (!dp) {
- /* it's ok if this happens to be a system that doesn't expose
- * memory blocks via sysfs, but otherwise we should report
- * an error
- */
- if (errno != ENOENT) {
- error_setg_errno(errp, errno, "Can't open directory"
- "\"/sys/devices/system/memory/\"");
- }
- return NULL;
- }
-
- /* Note: the phys_index of memory block may be discontinuous,
- * this is because a memblk is the unit of the Sparse Memory design, which
- * allows discontinuous memory ranges (ex. NUMA), so here we should
- * traverse the memory block directory.
- */
- while ((de = readdir(dp)) != NULL) {
- GuestMemoryBlock *mem_blk;
-
- if ((strncmp(de->d_name, "memory", 6) != 0) ||
- !(de->d_type & DT_DIR)) {
- continue;
- }
-
- mem_blk = g_malloc0(sizeof *mem_blk);
- /* The d_name is "memoryXXX", phys_index is block id, same as XXX */
- mem_blk->phys_index = strtoul(&de->d_name[6], NULL, 10);
- mem_blk->has_can_offline = true; /* lolspeak ftw */
- transfer_memory_block(mem_blk, true, NULL, &local_err);
- if (local_err) {
- break;
- }
-
- QAPI_LIST_APPEND(tail, mem_blk);
- }
-
- closedir(dp);
- if (local_err == NULL) {
- /* there's no guest with zero memory blocks */
- if (head == NULL) {
- error_setg(errp, "guest reported zero memory blocks!");
- }
- return head;
- }
-
- qapi_free_GuestMemoryBlockList(head);
- error_propagate(errp, local_err);
- return NULL;
-}
-
-GuestMemoryBlockResponseList *
-qmp_guest_set_memory_blocks(GuestMemoryBlockList *mem_blks, Error **errp)
-{
- GuestMemoryBlockResponseList *head, **tail;
- Error *local_err = NULL;
-
- head = NULL;
- tail = &head;
-
- while (mem_blks != NULL) {
- GuestMemoryBlockResponse *result;
- GuestMemoryBlock *current_mem_blk = mem_blks->value;
-
- result = g_malloc0(sizeof(*result));
- result->phys_index = current_mem_blk->phys_index;
- transfer_memory_block(current_mem_blk, false, result, &local_err);
- if (local_err) { /* should never happen */
- goto err;
- }
-
- QAPI_LIST_APPEND(tail, result);
- mem_blks = mem_blks->next;
- }
-
- return head;
-err:
- qapi_free_GuestMemoryBlockResponseList(head);
- error_propagate(errp, local_err);
- return NULL;
-}
-
-GuestMemoryBlockInfo *qmp_guest_get_memory_block_info(Error **errp)
-{
- Error *local_err = NULL;
- char *dirpath;
- int dirfd;
- char *buf;
- GuestMemoryBlockInfo *info;
-
- dirpath = g_strdup_printf("/sys/devices/system/memory/");
- dirfd = open(dirpath, O_RDONLY | O_DIRECTORY);
- if (dirfd == -1) {
- error_setg_errno(errp, errno, "open(\"%s\")", dirpath);
- g_free(dirpath);
- return NULL;
- }
- g_free(dirpath);
-
- buf = g_malloc0(20);
- ga_read_sysfs_file(dirfd, "block_size_bytes", buf, 20, &local_err);
- close(dirfd);
- if (local_err) {
- g_free(buf);
- error_propagate(errp, local_err);
- return NULL;
- }
-
- info = g_new0(GuestMemoryBlockInfo, 1);
- info->size = strtol(buf, NULL, 16); /* the unit is bytes */
-
- g_free(buf);
-
- return info;
-}
-
-#define MAX_NAME_LEN 128
-static GuestDiskStatsInfoList *guest_get_diskstats(Error **errp)
-{
-#ifdef CONFIG_LINUX
- GuestDiskStatsInfoList *head = NULL, **tail = &head;
- const char *diskstats = "/proc/diskstats";
- FILE *fp;
- size_t n;
- char *line = NULL;
-
- fp = fopen(diskstats, "r");
- if (fp == NULL) {
- error_setg_errno(errp, errno, "open(\"%s\")", diskstats);
- return NULL;
- }
-
- while (getline(&line, &n, fp) != -1) {
- g_autofree GuestDiskStatsInfo *diskstatinfo = NULL;
- g_autofree GuestDiskStats *diskstat = NULL;
- char dev_name[MAX_NAME_LEN];
- unsigned int ios_pgr, tot_ticks, rq_ticks, wr_ticks, dc_ticks, fl_ticks;
- unsigned long rd_ios, rd_merges_or_rd_sec, rd_ticks_or_wr_sec, wr_ios;
- unsigned long wr_merges, rd_sec_or_wr_ios, wr_sec;
- unsigned long dc_ios, dc_merges, dc_sec, fl_ios;
- unsigned int major, minor;
- int i;
-
- i = sscanf(line, "%u %u %s %lu %lu %lu"
- "%lu %lu %lu %lu %u %u %u %u"
- "%lu %lu %lu %u %lu %u",
- &major, &minor, dev_name,
- &rd_ios, &rd_merges_or_rd_sec, &rd_sec_or_wr_ios,
- &rd_ticks_or_wr_sec, &wr_ios, &wr_merges, &wr_sec,
- &wr_ticks, &ios_pgr, &tot_ticks, &rq_ticks,
- &dc_ios, &dc_merges, &dc_sec, &dc_ticks,
- &fl_ios, &fl_ticks);
-
- if (i < 7) {
- continue;
- }
-
- diskstatinfo = g_new0(GuestDiskStatsInfo, 1);
- diskstatinfo->name = g_strdup(dev_name);
- diskstatinfo->major = major;
- diskstatinfo->minor = minor;
-
- diskstat = g_new0(GuestDiskStats, 1);
- if (i == 7) {
- diskstat->has_read_ios = true;
- diskstat->read_ios = rd_ios;
- diskstat->has_read_sectors = true;
- diskstat->read_sectors = rd_merges_or_rd_sec;
- diskstat->has_write_ios = true;
- diskstat->write_ios = rd_sec_or_wr_ios;
- diskstat->has_write_sectors = true;
- diskstat->write_sectors = rd_ticks_or_wr_sec;
- }
- if (i >= 14) {
- diskstat->has_read_ios = true;
- diskstat->read_ios = rd_ios;
- diskstat->has_read_sectors = true;
- diskstat->read_sectors = rd_sec_or_wr_ios;
- diskstat->has_read_merges = true;
- diskstat->read_merges = rd_merges_or_rd_sec;
- diskstat->has_read_ticks = true;
- diskstat->read_ticks = rd_ticks_or_wr_sec;
- diskstat->has_write_ios = true;
- diskstat->write_ios = wr_ios;
- diskstat->has_write_sectors = true;
- diskstat->write_sectors = wr_sec;
- diskstat->has_write_merges = true;
- diskstat->write_merges = wr_merges;
- diskstat->has_write_ticks = true;
- diskstat->write_ticks = wr_ticks;
- diskstat->has_ios_pgr = true;
- diskstat->ios_pgr = ios_pgr;
- diskstat->has_total_ticks = true;
- diskstat->total_ticks = tot_ticks;
- diskstat->has_weight_ticks = true;
- diskstat->weight_ticks = rq_ticks;
- }
- if (i >= 18) {
- diskstat->has_discard_ios = true;
- diskstat->discard_ios = dc_ios;
- diskstat->has_discard_merges = true;
- diskstat->discard_merges = dc_merges;
- diskstat->has_discard_sectors = true;
- diskstat->discard_sectors = dc_sec;
- diskstat->has_discard_ticks = true;
- diskstat->discard_ticks = dc_ticks;
- }
- if (i >= 20) {
- diskstat->has_flush_ios = true;
- diskstat->flush_ios = fl_ios;
- diskstat->has_flush_ticks = true;
- diskstat->flush_ticks = fl_ticks;
- }
-
- diskstatinfo->stats = g_steal_pointer(&diskstat);
- QAPI_LIST_APPEND(tail, diskstatinfo);
- diskstatinfo = NULL;
- }
- free(line);
- fclose(fp);
- return head;
-#else
- g_debug("disk stats reporting available only for Linux");
- return NULL;
-#endif
-}
-
-GuestDiskStatsInfoList *qmp_guest_get_diskstats(Error **errp)
-{
- return guest_get_diskstats(errp);
-}
-
-GuestCpuStatsList *qmp_guest_get_cpustats(Error **errp)
-{
- GuestCpuStatsList *head = NULL, **tail = &head;
- const char *cpustats = "/proc/stat";
- int clk_tck = sysconf(_SC_CLK_TCK);
- FILE *fp;
- size_t n;
- char *line = NULL;
-
- fp = fopen(cpustats, "r");
- if (fp == NULL) {
- error_setg_errno(errp, errno, "open(\"%s\")", cpustats);
- return NULL;
- }
-
- while (getline(&line, &n, fp) != -1) {
- GuestCpuStats *cpustat = NULL;
- GuestLinuxCpuStats *linuxcpustat;
- int i;
- unsigned long user, system, idle, iowait, irq, softirq, steal, guest;
- unsigned long nice, guest_nice;
- char name[64];
-
- i = sscanf(line, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
- name, &user, &nice, &system, &idle, &iowait, &irq, &softirq,
- &steal, &guest, &guest_nice);
-
- /* drop "cpu 1 2 3 ...", get "cpuX 1 2 3 ..." only */
- if ((i == EOF) || strncmp(name, "cpu", 3) || (name[3] == '\0')) {
- continue;
- }
-
- if (i < 5) {
- slog("Parsing cpu stat from %s failed, see \"man proc\"", cpustats);
- break;
- }
-
- cpustat = g_new0(GuestCpuStats, 1);
- cpustat->type = GUEST_CPU_STATS_TYPE_LINUX;
-
- linuxcpustat = &cpustat->u.q_linux;
- linuxcpustat->cpu = atoi(&name[3]);
- linuxcpustat->user = user * 1000 / clk_tck;
- linuxcpustat->nice = nice * 1000 / clk_tck;
- linuxcpustat->system = system * 1000 / clk_tck;
- linuxcpustat->idle = idle * 1000 / clk_tck;
-
- if (i > 5) {
- linuxcpustat->has_iowait = true;
- linuxcpustat->iowait = iowait * 1000 / clk_tck;
- }
-
- if (i > 6) {
- linuxcpustat->has_irq = true;
- linuxcpustat->irq = irq * 1000 / clk_tck;
- linuxcpustat->has_softirq = true;
- linuxcpustat->softirq = softirq * 1000 / clk_tck;
- }
-
- if (i > 8) {
- linuxcpustat->has_steal = true;
- linuxcpustat->steal = steal * 1000 / clk_tck;
- }
-
- if (i > 9) {
- linuxcpustat->has_guest = true;
- linuxcpustat->guest = guest * 1000 / clk_tck;
- }
-
- if (i > 10) {
- linuxcpustat->has_guest = true;
- linuxcpustat->guest = guest * 1000 / clk_tck;
- linuxcpustat->has_guestnice = true;
- linuxcpustat->guestnice = guest_nice * 1000 / clk_tck;
- }
-
- QAPI_LIST_APPEND(tail, cpustat);
- }
-
- free(line);
- fclose(fp);
- return head;
-}
-
-#else /* defined(__linux__) */
-
-void qmp_guest_suspend_disk(Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
-}
-
-void qmp_guest_suspend_ram(Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
-}
-
-void qmp_guest_suspend_hybrid(Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
-}
-
-GuestLogicalProcessorList *qmp_guest_get_vcpus(Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
- return NULL;
-}
-
-int64_t qmp_guest_set_vcpus(GuestLogicalProcessorList *vcpus, Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
- return -1;
-}
-
-GuestMemoryBlockList *qmp_guest_get_memory_blocks(Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
- return NULL;
-}
-
-GuestMemoryBlockResponseList *
-qmp_guest_set_memory_blocks(GuestMemoryBlockList *mem_blks, Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
- return NULL;
-}
-
-GuestMemoryBlockInfo *qmp_guest_get_memory_block_info(Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
- return NULL;
-}
-
-#endif
-
#ifdef HAVE_GETIFADDRS
static GuestNetworkInterface *
guest_find_interface(GuestNetworkInterfaceList *head,
@@ -3013,131 +1134,8 @@
return NULL;
}
-#else
-
-GuestNetworkInterfaceList *qmp_guest_network_get_interfaces(Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
- return NULL;
-}
-
#endif /* HAVE_GETIFADDRS */
-#if !defined(CONFIG_FSFREEZE)
-
-GuestFilesystemInfoList *qmp_guest_get_fsinfo(Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
- return NULL;
-}
-
-GuestFsfreezeStatus qmp_guest_fsfreeze_status(Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
-
- return 0;
-}
-
-int64_t qmp_guest_fsfreeze_freeze(Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
-
- return 0;
-}
-
-int64_t qmp_guest_fsfreeze_freeze_list(bool has_mountpoints,
- strList *mountpoints,
- Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
-
- return 0;
-}
-
-int64_t qmp_guest_fsfreeze_thaw(Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
-
- return 0;
-}
-
-GuestDiskInfoList *qmp_guest_get_disks(Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
- return NULL;
-}
-
-GuestDiskStatsInfoList *qmp_guest_get_diskstats(Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
- return NULL;
-}
-
-GuestCpuStatsList *qmp_guest_get_cpustats(Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
- return NULL;
-}
-
-#endif /* CONFIG_FSFREEZE */
-
-#if !defined(CONFIG_FSTRIM)
-GuestFilesystemTrimResponse *
-qmp_guest_fstrim(bool has_minimum, int64_t minimum, Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
- return NULL;
-}
-#endif
-
-/* add unsupported commands to the list of blocked RPCs */
-GList *ga_command_init_blockedrpcs(GList *blockedrpcs)
-{
-#if !defined(__linux__)
- {
- const char *list[] = {
- "guest-suspend-disk", "guest-suspend-ram",
- "guest-suspend-hybrid", "guest-get-vcpus", "guest-set-vcpus",
- "guest-get-memory-blocks", "guest-set-memory-blocks",
- "guest-get-memory-block-size", "guest-get-memory-block-info",
- NULL};
- char **p = (char **)list;
-
- while (*p) {
- blockedrpcs = g_list_append(blockedrpcs, g_strdup(*p++));
- }
- }
-#endif
-
-#if !defined(HAVE_GETIFADDRS)
- blockedrpcs = g_list_append(blockedrpcs,
- g_strdup("guest-network-get-interfaces"));
-#endif
-
-#if !defined(CONFIG_FSFREEZE)
- {
- const char *list[] = {
- "guest-get-fsinfo", "guest-fsfreeze-status",
- "guest-fsfreeze-freeze", "guest-fsfreeze-freeze-list",
- "guest-fsfreeze-thaw", "guest-get-fsinfo",
- "guest-get-disks", NULL};
- char **p = (char **)list;
-
- while (*p) {
- blockedrpcs = g_list_append(blockedrpcs, g_strdup(*p++));
- }
- }
-#endif
-
-#if !defined(CONFIG_FSTRIM)
- blockedrpcs = g_list_append(blockedrpcs, g_strdup("guest-fstrim"));
-#endif
-
- blockedrpcs = g_list_append(blockedrpcs, g_strdup("guest-get-devices"));
-
- return blockedrpcs;
-}
-
/* register init/cleanup routines for stateful command groups */
void ga_command_state_init(GAState *s, GACommandState *cs)
{
@@ -3200,15 +1198,7 @@
return head;
}
-#else
-
-GuestUserList *qmp_guest_get_users(Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
- return NULL;
-}
-
-#endif
+#endif /* HAVE_UTMPX */
/* Replace escaped special characters with their real values. The replacement
* is done in place -- returned value is in the original string.
@@ -3345,13 +1335,6 @@
return info;
}
-GuestDeviceInfoList *qmp_guest_get_devices(Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
-
- return NULL;
-}
-
#ifndef HOST_NAME_MAX
# ifdef _POSIX_HOST_NAME_MAX
# define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
diff --git a/qga/commands-win32.c b/qga/commands-win32.c
index 0d1b836..61b36da 100644
--- a/qga/commands-win32.c
+++ b/qga/commands-win32.c
@@ -1203,7 +1203,7 @@
GuestFsfreezeStatus qmp_guest_fsfreeze_status(Error **errp)
{
if (!vss_initialized()) {
- error_setg(errp, QERR_UNSUPPORTED);
+ error_setg(errp, "fsfreeze not possible as VSS failed to initialize");
return 0;
}
@@ -1231,7 +1231,7 @@
Error *local_err = NULL;
if (!vss_initialized()) {
- error_setg(errp, QERR_UNSUPPORTED);
+ error_setg(errp, "fsfreeze not possible as VSS failed to initialize");
return 0;
}
@@ -1266,7 +1266,7 @@
int i;
if (!vss_initialized()) {
- error_setg(errp, QERR_UNSUPPORTED);
+ error_setg(errp, "fsfreeze not possible as VSS failed to initialize");
return 0;
}
@@ -1494,11 +1494,6 @@
}
}
-void qmp_guest_suspend_hybrid(Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
-}
-
static IP_ADAPTER_ADDRESSES *guest_get_adapters_addresses(Error **errp)
{
IP_ADAPTER_ADDRESSES *adptr_addrs = NULL;
@@ -1862,12 +1857,6 @@
return NULL;
}
-int64_t qmp_guest_set_vcpus(GuestLogicalProcessorList *vcpus, Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
- return -1;
-}
-
static gchar *
get_net_error_message(gint error)
{
@@ -1969,55 +1958,6 @@
g_free(rawpasswddata);
}
-GuestMemoryBlockList *qmp_guest_get_memory_blocks(Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
- return NULL;
-}
-
-GuestMemoryBlockResponseList *
-qmp_guest_set_memory_blocks(GuestMemoryBlockList *mem_blks, Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
- return NULL;
-}
-
-GuestMemoryBlockInfo *qmp_guest_get_memory_block_info(Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
- return NULL;
-}
-
-/* add unsupported commands to the list of blocked RPCs */
-GList *ga_command_init_blockedrpcs(GList *blockedrpcs)
-{
- const char *list_unsupported[] = {
- "guest-suspend-hybrid",
- "guest-set-vcpus",
- "guest-get-memory-blocks", "guest-set-memory-blocks",
- "guest-get-memory-block-size", "guest-get-memory-block-info",
- NULL};
- char **p = (char **)list_unsupported;
-
- while (*p) {
- blockedrpcs = g_list_append(blockedrpcs, g_strdup(*p++));
- }
-
- if (!vss_init(true)) {
- g_debug("vss_init failed, vss commands are going to be disabled");
- const char *list[] = {
- "guest-get-fsinfo", "guest-fsfreeze-status",
- "guest-fsfreeze-freeze", "guest-fsfreeze-thaw", NULL};
- p = (char **)list;
-
- while (*p) {
- blockedrpcs = g_list_append(blockedrpcs, g_strdup(*p++));
- }
- }
-
- return blockedrpcs;
-}
-
/* register init/cleanup routines for stateful command groups */
void ga_command_state_init(GAState *s, GACommandState *cs)
{
@@ -2505,15 +2445,3 @@
return g_utf16_to_utf8(tmp, size, NULL, NULL, NULL);
}
-
-GuestDiskStatsInfoList *qmp_guest_get_diskstats(Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
- return NULL;
-}
-
-GuestCpuStatsList *qmp_guest_get_cpustats(Error **errp)
-{
- error_setg(errp, QERR_UNSUPPORTED);
- return NULL;
-}
diff --git a/qga/main.c b/qga/main.c
index f4d5f15..b8f7b1e 100644
--- a/qga/main.c
+++ b/qga/main.c
@@ -70,6 +70,28 @@
typedef struct GAConfig GAConfig;
+struct GAConfig { /* agent settings filled in by config_load() and config_parse() */
+ char *channel_path; /* presumably the -p/--path device/socket path; TODO confirm */
+ char *method; /* presumably the -m/--method transport name; TODO confirm */
+ char *log_filepath;
+ char *pid_filepath; /* handed to become_daemon() when daemonizing */
+#ifdef CONFIG_FSFREEZE
+ char *fsfreeze_hook; /* presumably the fsfreeze hook script path; TODO confirm */
+#endif
+ char *state_dir; /* NOTE(review): looks like the agent state directory; confirm */
+#ifdef _WIN32
+ const char *service; /* Windows builds only; meaning not visible here, verify */
+#endif
+ gchar *bliststr; /* blockedrpcs may point to this string */
+ gchar *aliststr; /* allowedrpcs may point to this string */
+ GList *blockedrpcs; /* command names that ga_command_is_allowed() rejects */
+ GList *allowedrpcs; /* if non-NULL, commands not listed here are rejected */
+ int daemonize; /* non-zero requests become_daemon() */
+ GLogLevelFlags log_level;
+ int dumpconf; /* presumably set by -D/--dump-conf; TODO confirm */
+ bool retry_path; /* presumably the "retry-path" setting; TODO confirm */
+};
+
struct GAState {
JSONMessageParser parser;
GMainLoop *main_loop;
@@ -226,12 +248,16 @@
#ifdef CONFIG_FSFREEZE
g_autofree char *fsfreeze_hook = get_relocated_path(QGA_FSFREEZE_HOOK_DEFAULT);
#endif
+ g_autofree char *conf_path = get_relocated_path(QGA_CONF_DEFAULT);
printf(
"Usage: %s [-m <method> -p <path>] [<options>]\n"
"QEMU Guest Agent " QEMU_FULL_VERSION "\n"
QEMU_COPYRIGHT "\n"
"\n"
+" -c, --config=PATH configuration file path (default is\n"
+" %s/qemu-ga.conf\n"
+" unless overridden by the QGA_CONF environment variable)\n"
" -m, --method transport method: one of unix-listen, virtio-serial,\n"
" isa-serial, or vsock-listen (virtio-serial is the default)\n"
" -p, --path device/socket path (the default for virtio-serial is:\n"
@@ -272,8 +298,8 @@
" plug/unplug, etc.)\n"
" -h, --help display this help and exit\n"
"\n"
-QEMU_HELP_BOTTOM "\n"
- , cmd, QGA_VIRTIO_PATH_DEFAULT, QGA_SERIAL_PATH_DEFAULT,
+QEMU_HELP_BOTTOM "\n",
+ cmd, conf_path, QGA_VIRTIO_PATH_DEFAULT, QGA_SERIAL_PATH_DEFAULT,
dfl_pathnames.pidfile,
#ifdef CONFIG_FSFREEZE
fsfreeze_hook,
@@ -397,58 +423,77 @@
return strcmp(str1, str2);
}
-/* disable commands that aren't safe for fsfreeze */
-static void ga_disable_not_allowed_freeze(const QmpCommand *cmd, void *opaque)
+static bool ga_command_is_allowed(const QmpCommand *cmd, GAState *state)
{
- bool allowed = false;
int i = 0;
+ GAConfig *config = state->config;
const char *name = qmp_command_name(cmd);
+ /* Fallback policy is allow everything */
+ bool allowed = true;
- while (ga_freeze_allowlist[i] != NULL) {
- if (strcmp(name, ga_freeze_allowlist[i]) == 0) {
+ if (config->allowedrpcs) {
+ /*
+ * If an allow-list is given, this changes the fallback
+ * policy to deny everything
+ */
+ allowed = false;
+
+ if (g_list_find_custom(config->allowedrpcs, name, ga_strcmp) != NULL) {
allowed = true;
}
- i++;
}
- if (!allowed) {
- g_debug("disabling command: %s", name);
- qmp_disable_command(&ga_commands, name, "the agent is in frozen state");
+
+ /*
+ * If both allowedrpcs and blockedrpcs are set, the blocked
+ * list will take priority
+ */
+ if (config->blockedrpcs) {
+ if (g_list_find_custom(config->blockedrpcs, name, ga_strcmp) != NULL) {
+ allowed = false;
+ }
}
+
+ /*
+ * If frozen, this filtering must take priority over
+ * absolutely everything
+ */
+ if (state->frozen) {
+ allowed = false;
+
+ while (ga_freeze_allowlist[i] != NULL) {
+ if (strcmp(name, ga_freeze_allowlist[i]) == 0) {
+ allowed = true;
+ }
+ i++;
+ }
+ }
+
+ return allowed;
}
-/* [re-]enable all commands, except those explicitly blocked by user */
-static void ga_enable_non_blocked(const QmpCommand *cmd, void *opaque)
+static void ga_apply_command_filters_iter(const QmpCommand *cmd, void *opaque)
{
- GAState *s = opaque;
- GList *blockedrpcs = s->blockedrpcs;
- GList *allowedrpcs = s->allowedrpcs;
+ GAState *state = opaque;
+ bool want = ga_command_is_allowed(cmd, state);
+ bool have = qmp_command_is_enabled(cmd);
const char *name = qmp_command_name(cmd);
- if (g_list_find_custom(blockedrpcs, name, ga_strcmp) == NULL) {
- if (qmp_command_is_enabled(cmd)) {
- return;
- }
+ if (want == have) {
+ return;
+ }
- if (allowedrpcs &&
- g_list_find_custom(allowedrpcs, name, ga_strcmp) == NULL) {
- return;
- }
-
+ if (have) {
+ g_debug("disabling command: %s", name);
+ qmp_disable_command(&ga_commands, name, "the command is not allowed");
+ } else {
g_debug("enabling command: %s", name);
qmp_enable_command(&ga_commands, name);
}
}
-/* disable commands that aren't allowed */
-static void ga_disable_not_allowed(const QmpCommand *cmd, void *opaque)
+static void ga_apply_command_filters(GAState *state)
{
- GList *allowedrpcs = opaque;
- const char *name = qmp_command_name(cmd);
-
- if (g_list_find_custom(allowedrpcs, name, ga_strcmp) == NULL) {
- g_debug("disabling command: %s", name);
- qmp_disable_command(&ga_commands, name, "the command is not allowed");
- }
+ qmp_for_each_command(&ga_commands, ga_apply_command_filters_iter, state);
}
static bool ga_create_file(const char *path)
@@ -483,15 +528,14 @@
if (ga_is_frozen(s)) {
return;
}
- /* disable all forbidden (for frozen state) commands */
- qmp_for_each_command(&ga_commands, ga_disable_not_allowed_freeze, NULL);
g_warning("disabling logging due to filesystem freeze");
- ga_disable_logging(s);
s->frozen = true;
if (!ga_create_file(s->state_filepath_isfrozen)) {
g_warning("unable to create %s, fsfreeze may not function properly",
s->state_filepath_isfrozen);
}
+ ga_apply_command_filters(s);
+ ga_disable_logging(s);
}
void ga_unset_frozen(GAState *s)
@@ -523,12 +567,12 @@
}
/* enable all disabled, non-blocked and allowed commands */
- qmp_for_each_command(&ga_commands, ga_enable_non_blocked, s);
s->frozen = false;
if (!ga_delete_file(s->state_filepath_isfrozen)) {
g_warning("unable to delete %s, fsfreeze may not function properly",
s->state_filepath_isfrozen);
}
+ ga_apply_command_filters(s);
}
#ifdef CONFIG_FSFREEZE
@@ -996,38 +1040,14 @@
return list;
}
-struct GAConfig {
- char *channel_path;
- char *method;
- char *log_filepath;
- char *pid_filepath;
-#ifdef CONFIG_FSFREEZE
- char *fsfreeze_hook;
-#endif
- char *state_dir;
-#ifdef _WIN32
- const char *service;
-#endif
- gchar *bliststr; /* blockedrpcs may point to this string */
- gchar *aliststr; /* allowedrpcs may point to this string */
- GList *blockedrpcs;
- GList *allowedrpcs;
- int daemonize;
- GLogLevelFlags log_level;
- int dumpconf;
- bool retry_path;
-};
-
-static void config_load(GAConfig *config)
+static void config_load(GAConfig *config, const char *confpath, bool required)
{
GError *gerr = NULL;
GKeyFile *keyfile;
- g_autofree char *conf = g_strdup(g_getenv("QGA_CONF")) ?: get_relocated_path(QGA_CONF_DEFAULT);
- const gchar *blockrpcs_key = "block-rpcs";
/* read system config */
keyfile = g_key_file_new();
- if (!g_key_file_load_from_file(keyfile, conf, 0, &gerr)) {
+ if (!g_key_file_load_from_file(keyfile, confpath, 0, &gerr)) {
goto end;
}
if (g_key_file_has_key(keyfile, "general", "daemon", NULL)) {
@@ -1071,9 +1091,9 @@
g_key_file_get_boolean(keyfile, "general", "retry-path", &gerr);
}
- if (g_key_file_has_key(keyfile, "general", blockrpcs_key, NULL)) {
+ if (g_key_file_has_key(keyfile, "general", "block-rpcs", NULL)) {
config->bliststr =
- g_key_file_get_string(keyfile, "general", blockrpcs_key, &gerr);
+ g_key_file_get_string(keyfile, "general", "block-rpcs", &gerr);
config->blockedrpcs = g_list_concat(config->blockedrpcs,
split_list(config->bliststr, ","));
}
@@ -1084,19 +1104,12 @@
split_list(config->aliststr, ","));
}
- if (g_key_file_has_key(keyfile, "general", blockrpcs_key, NULL) &&
- g_key_file_has_key(keyfile, "general", "allow-rpcs", NULL)) {
- g_critical("wrong config, using 'block-rpcs' and 'allow-rpcs' keys at"
- " the same time is not allowed");
- exit(EXIT_FAILURE);
- }
-
end:
g_key_file_free(keyfile);
- if (gerr &&
- !(gerr->domain == G_FILE_ERROR && gerr->code == G_FILE_ERROR_NOENT)) {
+ if (gerr && (required ||
+ !(gerr->domain == G_FILE_ERROR && gerr->code == G_FILE_ERROR_NOENT))) {
g_critical("error loading configuration from path: %s, %s",
- conf, gerr->message);
+ confpath, gerr->message);
exit(EXIT_FAILURE);
}
g_clear_error(&gerr);
@@ -1168,12 +1181,12 @@
static void config_parse(GAConfig *config, int argc, char **argv)
{
- const char *sopt = "hVvdm:p:l:f:F::b:a:s:t:Dr";
+ const char *sopt = "hVvdc:m:p:l:f:F::b:a:s:t:Dr";
int opt_ind = 0, ch;
- bool block_rpcs = false, allow_rpcs = false;
const struct option lopt[] = {
{ "help", 0, NULL, 'h' },
{ "version", 0, NULL, 'V' },
+ { "config", 1, NULL, 'c' },
{ "dump-conf", 0, NULL, 'D' },
{ "logfile", 1, NULL, 'l' },
{ "pidfile", 1, NULL, 'f' },
@@ -1193,6 +1206,26 @@
{ "retry-path", 0, NULL, 'r' },
{ NULL, 0, NULL, 0 }
};
+ g_autofree char *confpath = g_strdup(g_getenv("QGA_CONF")) ?:
+ get_relocated_path(QGA_CONF_DEFAULT);
+ bool confrequired = false;
+
+ while ((ch = getopt_long(argc, argv, sopt, lopt, NULL)) != -1) {
+ switch (ch) {
+ case 'c':
+ g_free(confpath);
+ confpath = g_strdup(optarg);
+ confrequired = true;
+ break;
+ default:
+ break;
+ }
+ }
+
+ config_load(config, confpath, confrequired);
+
+ /* Reset for second pass */
+ optind = 1;
while ((ch = getopt_long(argc, argv, sopt, lopt, &opt_ind)) != -1) {
switch (ch) {
@@ -1245,7 +1278,6 @@
}
config->blockedrpcs = g_list_concat(config->blockedrpcs,
split_list(optarg, ","));
- block_rpcs = true;
break;
}
case 'a': {
@@ -1255,7 +1287,6 @@
}
config->allowedrpcs = g_list_concat(config->allowedrpcs,
split_list(optarg, ","));
- allow_rpcs = true;
break;
}
#ifdef _WIN32
@@ -1296,12 +1327,6 @@
exit(EXIT_FAILURE);
}
}
-
- if (block_rpcs && allow_rpcs) {
- g_critical("wrong commandline, using --block-rpcs and --allow-rpcs at the"
- " same time is not allowed");
- exit(EXIT_FAILURE);
- }
}
static void config_free(GAConfig *config)
@@ -1395,6 +1420,10 @@
" '%s': %s", config->state_dir, strerror(errno));
return NULL;
}
+
+ if (!vss_init(true)) {
+ g_debug("vss_init failed, vss commands will not function");
+ }
#endif
if (ga_is_frozen(s)) {
@@ -1408,7 +1437,6 @@
s->deferred_options.log_filepath = config->log_filepath;
}
ga_disable_logging(s);
- qmp_for_each_command(&ga_commands, ga_disable_not_allowed_freeze, NULL);
} else {
if (config->daemonize) {
become_daemon(config->pid_filepath);
@@ -1432,25 +1460,6 @@
return NULL;
}
- if (config->allowedrpcs) {
- qmp_for_each_command(&ga_commands, ga_disable_not_allowed, config->allowedrpcs);
- s->allowedrpcs = config->allowedrpcs;
- }
-
- /*
- * Some commands can be blocked due to system limitation.
- * Initialize blockedrpcs list even if allowedrpcs specified.
- */
- config->blockedrpcs = ga_command_init_blockedrpcs(config->blockedrpcs);
- if (config->blockedrpcs) {
- GList *l = config->blockedrpcs;
- s->blockedrpcs = config->blockedrpcs;
- do {
- g_debug("disabling command: %s", (char *)l->data);
- qmp_disable_command(&ga_commands, l->data, NULL);
- l = g_list_next(l);
- } while (l);
- }
s->command_state = ga_command_state_new();
ga_command_state_init(s, s->command_state);
ga_command_state_init_all(s->command_state);
@@ -1476,6 +1485,8 @@
}
#endif
+ ga_apply_command_filters(s);
+
ga_state = s;
return s;
}
@@ -1579,7 +1590,6 @@
qga_qmp_init_marshal(&ga_commands);
init_dfl_pathnames();
- config_load(config);
config_parse(config, argc, argv);
if (config->pid_filepath == NULL) {
diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json
index 1273d85..495706c 100644
--- a/qga/qapi-schema.json
+++ b/qga/qapi-schema.json
@@ -412,7 +412,8 @@
# Since: 0.15.0
##
{ 'enum': 'GuestFsfreezeStatus',
- 'data': [ 'thawed', 'frozen' ] }
+ 'data': [ 'thawed', 'frozen' ],
+ 'if': { 'any': ['CONFIG_WIN32', 'CONFIG_FSFREEZE'] } }
##
# @guest-fsfreeze-status:
@@ -429,7 +430,8 @@
# Since: 0.15.0
##
{ 'command': 'guest-fsfreeze-status',
- 'returns': 'GuestFsfreezeStatus' }
+ 'returns': 'GuestFsfreezeStatus',
+ 'if': { 'any': ['CONFIG_WIN32', 'CONFIG_FSFREEZE'] } }
##
# @guest-fsfreeze-freeze:
@@ -451,7 +453,8 @@
# Since: 0.15.0
##
{ 'command': 'guest-fsfreeze-freeze',
- 'returns': 'int' }
+ 'returns': 'int',
+ 'if': { 'any': ['CONFIG_WIN32', 'CONFIG_FSFREEZE'] } }
##
# @guest-fsfreeze-freeze-list:
@@ -471,7 +474,8 @@
##
{ 'command': 'guest-fsfreeze-freeze-list',
'data': { '*mountpoints': ['str'] },
- 'returns': 'int' }
+ 'returns': 'int',
+ 'if': { 'any': ['CONFIG_WIN32', 'CONFIG_FSFREEZE'] } }
##
# @guest-fsfreeze-thaw:
@@ -488,7 +492,8 @@
# Since: 0.15.0
##
{ 'command': 'guest-fsfreeze-thaw',
- 'returns': 'int' }
+ 'returns': 'int',
+ 'if': { 'any': ['CONFIG_WIN32', 'CONFIG_FSFREEZE'] } }
##
# @GuestFilesystemTrimResult:
@@ -505,7 +510,8 @@
##
{ 'struct': 'GuestFilesystemTrimResult',
'data': {'path': 'str',
- '*trimmed': 'int', '*minimum': 'int', '*error': 'str'} }
+ '*trimmed': 'int', '*minimum': 'int', '*error': 'str'},
+ 'if': { 'any': ['CONFIG_WIN32', 'CONFIG_FSTRIM'] } }
##
# @GuestFilesystemTrimResponse:
@@ -515,7 +521,8 @@
# Since: 2.4
##
{ 'struct': 'GuestFilesystemTrimResponse',
- 'data': {'paths': ['GuestFilesystemTrimResult']} }
+ 'data': {'paths': ['GuestFilesystemTrimResult']},
+ 'if': { 'any': ['CONFIG_WIN32', 'CONFIG_FSTRIM'] } }
##
# @guest-fstrim:
@@ -537,7 +544,8 @@
##
{ 'command': 'guest-fstrim',
'data': { '*minimum': 'int' },
- 'returns': 'GuestFilesystemTrimResponse' }
+ 'returns': 'GuestFilesystemTrimResponse',
+ 'if': { 'any': ['CONFIG_WIN32', 'CONFIG_FSTRIM'] } }
##
# @guest-suspend-disk:
@@ -566,7 +574,8 @@
#
# Since: 1.1
##
-{ 'command': 'guest-suspend-disk', 'success-response': false }
+{ 'command': 'guest-suspend-disk', 'success-response': false,
+ 'if': { 'any': ['CONFIG_LINUX', 'CONFIG_WIN32'] } }
##
# @guest-suspend-ram:
@@ -602,7 +611,8 @@
#
# Since: 1.1
##
-{ 'command': 'guest-suspend-ram', 'success-response': false }
+{ 'command': 'guest-suspend-ram', 'success-response': false,
+ 'if': { 'any': ['CONFIG_LINUX', 'CONFIG_WIN32'] } }
##
# @guest-suspend-hybrid:
@@ -637,7 +647,8 @@
#
# Since: 1.1
##
-{ 'command': 'guest-suspend-hybrid', 'success-response': false }
+{ 'command': 'guest-suspend-hybrid', 'success-response': false,
+ 'if': 'CONFIG_LINUX' }
##
# @GuestIpAddressType:
@@ -651,7 +662,8 @@
# Since: 1.1
##
{ 'enum': 'GuestIpAddressType',
- 'data': [ 'ipv4', 'ipv6' ] }
+ 'data': [ 'ipv4', 'ipv6' ],
+ 'if': { 'any': ['CONFIG_WIN32', 'HAVE_GETIFADDRS'] } }
##
# @GuestIpAddress:
@@ -667,7 +679,8 @@
{ 'struct': 'GuestIpAddress',
'data': {'ip-address': 'str',
'ip-address-type': 'GuestIpAddressType',
- 'prefix': 'int'} }
+ 'prefix': 'int'},
+ 'if': { 'any': ['CONFIG_WIN32', 'HAVE_GETIFADDRS'] } }
##
# @GuestNetworkInterfaceStat:
@@ -699,7 +712,8 @@
'tx-packets': 'uint64',
'tx-errs': 'uint64',
'tx-dropped': 'uint64'
- } }
+ },
+ 'if': { 'any': ['CONFIG_WIN32', 'HAVE_GETIFADDRS'] } }
##
# @GuestNetworkInterface:
@@ -719,7 +733,8 @@
'data': {'name': 'str',
'*hardware-address': 'str',
'*ip-addresses': ['GuestIpAddress'],
- '*statistics': 'GuestNetworkInterfaceStat' } }
+ '*statistics': 'GuestNetworkInterfaceStat' },
+ 'if': { 'any': ['CONFIG_WIN32', 'HAVE_GETIFADDRS'] } }
##
# @guest-network-get-interfaces:
@@ -731,7 +746,8 @@
# Since: 1.1
##
{ 'command': 'guest-network-get-interfaces',
- 'returns': ['GuestNetworkInterface'] }
+ 'returns': ['GuestNetworkInterface'],
+ 'if': { 'any': ['CONFIG_WIN32', 'HAVE_GETIFADDRS'] } }
##
# @GuestLogicalProcessor:
@@ -750,7 +766,8 @@
{ 'struct': 'GuestLogicalProcessor',
'data': {'logical-id': 'int',
'online': 'bool',
- '*can-offline': 'bool'} }
+ '*can-offline': 'bool'},
+ 'if': { 'any': ['CONFIG_LINUX', 'CONFIG_WIN32'] } }
##
# @guest-get-vcpus:
@@ -765,7 +782,8 @@
# Since: 1.5
##
{ 'command': 'guest-get-vcpus',
- 'returns': ['GuestLogicalProcessor'] }
+ 'returns': ['GuestLogicalProcessor'],
+ 'if': { 'any': ['CONFIG_LINUX', 'CONFIG_WIN32'] } }
##
# @guest-set-vcpus:
@@ -807,7 +825,8 @@
##
{ 'command': 'guest-set-vcpus',
'data': {'vcpus': ['GuestLogicalProcessor'] },
- 'returns': 'int' }
+ 'returns': 'int',
+ 'if': 'CONFIG_LINUX' }
##
# @GuestDiskBusType:
@@ -859,7 +878,8 @@
{ 'enum': 'GuestDiskBusType',
'data': [ 'ide', 'fdc', 'scsi', 'virtio', 'xen', 'usb', 'uml', 'sata',
'sd', 'unknown', 'ieee1394', 'ssa', 'fibre', 'raid', 'iscsi',
- 'sas', 'mmc', 'virtual', 'file-backed-virtual', 'nvme' ] }
+ 'sas', 'mmc', 'virtual', 'file-backed-virtual', 'nvme' ],
+ 'if': { 'any': [ 'CONFIG_WIN32', 'CONFIG_LINUX' ] } }
##
@@ -877,7 +897,8 @@
##
{ 'struct': 'GuestPCIAddress',
'data': {'domain': 'int', 'bus': 'int',
- 'slot': 'int', 'function': 'int'} }
+ 'slot': 'int', 'function': 'int'},
+ 'if': { 'any': [ 'CONFIG_WIN32', 'CONFIG_LINUX' ] } }
##
# @GuestCCWAddress:
@@ -896,7 +917,8 @@
'data': {'cssid': 'int',
'ssid': 'int',
'subchno': 'int',
- 'devno': 'int'} }
+ 'devno': 'int'},
+ 'if': { 'any': [ 'CONFIG_WIN32', 'CONFIG_LINUX' ] } }
##
# @GuestDiskAddress:
@@ -925,7 +947,8 @@
'bus-type': 'GuestDiskBusType',
'bus': 'int', 'target': 'int', 'unit': 'int',
'*serial': 'str', '*dev': 'str',
- '*ccw-address': 'GuestCCWAddress'} }
+ '*ccw-address': 'GuestCCWAddress'},
+ 'if': { 'any': [ 'CONFIG_WIN32', 'CONFIG_LINUX' ] } }
##
# @GuestNVMeSmart:
@@ -962,7 +985,8 @@
'media-errors-lo': 'uint64',
'media-errors-hi': 'uint64',
'number-of-error-log-entries-lo': 'uint64',
- 'number-of-error-log-entries-hi': 'uint64' } }
+ 'number-of-error-log-entries-hi': 'uint64' },
+ 'if': { 'any': [ 'CONFIG_WIN32', 'CONFIG_LIBUDEV' ] } }
##
# @GuestDiskSmart:
@@ -976,7 +1000,8 @@
{ 'union': 'GuestDiskSmart',
'base': { 'type': 'GuestDiskBusType' },
'discriminator': 'type',
- 'data': { 'nvme': 'GuestNVMeSmart' } }
+ 'data': { 'nvme': 'GuestNVMeSmart' },
+ 'if': { 'any': [ 'CONFIG_WIN32', 'CONFIG_LIBUDEV' ] } }
##
# @GuestDiskInfo:
@@ -1001,7 +1026,8 @@
{ 'struct': 'GuestDiskInfo',
'data': {'name': 'str', 'partition': 'bool', '*dependencies': ['str'],
'*address': 'GuestDiskAddress', '*alias': 'str',
- '*smart': 'GuestDiskSmart'} }
+ '*smart': 'GuestDiskSmart'},
+ 'if': { 'any': [ 'CONFIG_WIN32', 'CONFIG_LIBUDEV' ] } }
##
# @guest-get-disks:
@@ -1014,7 +1040,8 @@
# Since: 5.2
##
{ 'command': 'guest-get-disks',
- 'returns': ['GuestDiskInfo'] }
+ 'returns': ['GuestDiskInfo'],
+ 'if': { 'any': [ 'CONFIG_WIN32', 'CONFIG_LIBUDEV' ] } }
##
# @GuestFilesystemInfo:
@@ -1040,7 +1067,8 @@
{ 'struct': 'GuestFilesystemInfo',
'data': {'name': 'str', 'mountpoint': 'str', 'type': 'str',
'*used-bytes': 'uint64', '*total-bytes': 'uint64',
- '*total-bytes-privileged': 'uint64', 'disk': ['GuestDiskAddress']} }
+ '*total-bytes-privileged': 'uint64', 'disk': ['GuestDiskAddress']},
+ 'if': { 'any': [ 'CONFIG_WIN32', 'CONFIG_LINUX' ] } }
##
# @guest-get-fsinfo:
@@ -1053,7 +1081,8 @@
# Since: 2.2
##
{ 'command': 'guest-get-fsinfo',
- 'returns': ['GuestFilesystemInfo'] }
+ 'returns': ['GuestFilesystemInfo'],
+ 'if': { 'any': [ 'CONFIG_WIN32', 'CONFIG_LINUX' ] } }
##
# @guest-set-user-password:
@@ -1080,7 +1109,8 @@
# Since: 2.3
##
{ 'command': 'guest-set-user-password',
- 'data': { 'username': 'str', 'password': 'str', 'crypted': 'bool' } }
+ 'data': { 'username': 'str', 'password': 'str', 'crypted': 'bool' },
+ 'if': { 'any': [ 'CONFIG_WIN32', 'CONFIG_LINUX', 'CONFIG_FREEBSD'] } }
##
# @GuestMemoryBlock:
@@ -1100,7 +1130,8 @@
{ 'struct': 'GuestMemoryBlock',
'data': {'phys-index': 'uint64',
'online': 'bool',
- '*can-offline': 'bool'} }
+ '*can-offline': 'bool'},
+ 'if': 'CONFIG_LINUX' }
##
# @guest-get-memory-blocks:
@@ -1116,7 +1147,8 @@
# Since: 2.3
##
{ 'command': 'guest-get-memory-blocks',
- 'returns': ['GuestMemoryBlock'] }
+ 'returns': ['GuestMemoryBlock'],
+ 'if': 'CONFIG_LINUX' }
##
# @GuestMemoryBlockResponseType:
@@ -1139,7 +1171,8 @@
##
{ 'enum': 'GuestMemoryBlockResponseType',
'data': ['success', 'not-found', 'operation-not-supported',
- 'operation-failed'] }
+ 'operation-failed'],
+ 'if': 'CONFIG_LINUX' }
##
# @GuestMemoryBlockResponse:
@@ -1157,7 +1190,8 @@
{ 'struct': 'GuestMemoryBlockResponse',
'data': { 'phys-index': 'uint64',
'response': 'GuestMemoryBlockResponseType',
- '*error-code': 'int' }}
+ '*error-code': 'int' },
+ 'if': 'CONFIG_LINUX'}
##
# @guest-set-memory-blocks:
@@ -1188,7 +1222,8 @@
##
{ 'command': 'guest-set-memory-blocks',
'data': {'mem-blks': ['GuestMemoryBlock'] },
- 'returns': ['GuestMemoryBlockResponse'] }
+ 'returns': ['GuestMemoryBlockResponse'],
+ 'if': 'CONFIG_LINUX' }
##
# @GuestMemoryBlockInfo:
@@ -1200,7 +1235,8 @@
# Since: 2.3
##
{ 'struct': 'GuestMemoryBlockInfo',
- 'data': {'size': 'uint64'} }
+ 'data': {'size': 'uint64'},
+ 'if': 'CONFIG_LINUX' }
##
# @guest-get-memory-block-info:
@@ -1212,7 +1248,8 @@
# Since: 2.3
##
{ 'command': 'guest-get-memory-block-info',
- 'returns': 'GuestMemoryBlockInfo' }
+ 'returns': 'GuestMemoryBlockInfo',
+ 'if': 'CONFIG_LINUX' }
##
# @GuestExecStatus:
@@ -1378,7 +1415,8 @@
# Since: 2.10
##
{ 'struct': 'GuestUser',
- 'data': { 'user': 'str', 'login-time': 'number', '*domain': 'str' } }
+ 'data': { 'user': 'str', 'login-time': 'number', '*domain': 'str' },
+ 'if': { 'any': ['CONFIG_WIN32', 'HAVE_UTMPX' ] } }
##
# @guest-get-users:
@@ -1390,7 +1428,8 @@
# Since: 2.10
##
{ 'command': 'guest-get-users',
- 'returns': ['GuestUser'] }
+ 'returns': ['GuestUser'],
+ 'if': { 'any': ['CONFIG_WIN32', 'HAVE_UTMPX' ] } }
##
# @GuestTimezone:
@@ -1499,7 +1538,8 @@
# @pci: PCI device
##
{ 'enum': 'GuestDeviceType',
- 'data': [ 'pci' ] }
+ 'data': [ 'pci' ],
+ 'if': 'CONFIG_WIN32' }
##
# @GuestDeviceIdPCI:
@@ -1511,7 +1551,8 @@
# Since: 5.2
##
{ 'struct': 'GuestDeviceIdPCI',
- 'data': { 'vendor-id': 'uint16', 'device-id': 'uint16' } }
+ 'data': { 'vendor-id': 'uint16', 'device-id': 'uint16' },
+ 'if': 'CONFIG_WIN32' }
##
# @GuestDeviceId:
@@ -1525,7 +1566,8 @@
{ 'union': 'GuestDeviceId',
'base': { 'type': 'GuestDeviceType' },
'discriminator': 'type',
- 'data': { 'pci': 'GuestDeviceIdPCI' } }
+ 'data': { 'pci': 'GuestDeviceIdPCI' },
+ 'if': 'CONFIG_WIN32' }
##
# @GuestDeviceInfo:
@@ -1546,7 +1588,8 @@
'*driver-date': 'int',
'*driver-version': 'str',
'*id': 'GuestDeviceId'
- } }
+ },
+ 'if': 'CONFIG_WIN32' }
##
# @guest-get-devices:
@@ -1558,7 +1601,8 @@
# Since: 5.2
##
{ 'command': 'guest-get-devices',
- 'returns': ['GuestDeviceInfo'] }
+ 'returns': ['GuestDeviceInfo'],
+ 'if': 'CONFIG_WIN32' }
##
# @GuestAuthorizedKeys:
@@ -1685,7 +1729,8 @@
'*ios-pgr': 'uint64',
'*total-ticks': 'uint64',
'*weight-ticks': 'uint64'
- } }
+ },
+ 'if': 'CONFIG_LINUX' }
##
# @GuestDiskStatsInfo:
@@ -1702,7 +1747,8 @@
'data': {'name': 'str',
'major': 'uint64',
'minor': 'uint64',
- 'stats': 'GuestDiskStats' } }
+ 'stats': 'GuestDiskStats' },
+ 'if': 'CONFIG_LINUX' }
##
# @guest-get-diskstats:
@@ -1714,7 +1760,8 @@
# Since: 7.1
##
{ 'command': 'guest-get-diskstats',
- 'returns': ['GuestDiskStatsInfo']
+ 'returns': ['GuestDiskStatsInfo'],
+ 'if': 'CONFIG_LINUX'
}
##
@@ -1727,7 +1774,8 @@
# Since: 7.1
##
{ 'enum': 'GuestCpuStatsType',
- 'data': [ 'linux' ] }
+ 'data': [ 'linux' ],
+ 'if': 'CONFIG_LINUX' }
##
@@ -1772,7 +1820,8 @@
'*steal': 'uint64',
'*guest': 'uint64',
'*guestnice': 'uint64'
- } }
+ },
+ 'if': 'CONFIG_LINUX' }
##
# @GuestCpuStats:
@@ -1786,7 +1835,8 @@
{ 'union': 'GuestCpuStats',
'base': { 'type': 'GuestCpuStatsType' },
'discriminator': 'type',
- 'data': { 'linux': 'GuestLinuxCpuStats' } }
+ 'data': { 'linux': 'GuestLinuxCpuStats' },
+ 'if': 'CONFIG_LINUX' }
##
# @guest-get-cpustats:
@@ -1798,5 +1848,79 @@
# Since: 7.1
##
{ 'command': 'guest-get-cpustats',
- 'returns': ['GuestCpuStats']
+ 'returns': ['GuestCpuStats'],
+ 'if': 'CONFIG_LINUX'
+}
+
+##
+# @GuestNetworkRoute:
+#
+# Route information.  Currently only supported on Linux.
+#
+# @iface: The destination network or host's egress network interface in the routing table
+#
+# @destination: The IP address of the target network or host, i.e. the final destination of the packet
+#
+# @metric: Route metric
+#
+# @gateway: The IP address of the next hop router
+#
+# @mask: Subnet Mask (IPv4 only)
+#
+# @irtt: Initial round-trip delay (not for windows, IPv4 only)
+#
+# @flags: Route flags (not for windows)
+#
+# @refcnt: The route's reference count (not for windows)
+#
+# @use: Route usage count (not for windows)
+#
+# @window: TCP window size, used for flow control (not for windows, IPv4 only)
+#
+# @mtu: Data link layer maximum packet size (not for windows)
+#
+# @desprefixlen: Destination prefix length (for IPv6)
+#
+# @source: Source IP address (for IPv6)
+#
+# @srcprefixlen: Source prefix length (for IPv6)
+#
+# @nexthop: Next hop IP address (for IPv6)
+#
+# @version: IP version (4 or 6)
+#
+# Since: 9.1
+
+##
+{ 'struct': 'GuestNetworkRoute',
+ 'data': {'iface': 'str',
+ 'destination': 'str',
+ 'metric': 'int',
+ '*gateway': 'str',
+ '*mask': 'str',
+ '*irtt': 'int',
+ '*flags': 'uint64',
+ '*refcnt': 'int',
+ '*use': 'int',
+ '*window': 'int',
+ '*mtu': 'int',
+ '*desprefixlen': 'str',
+ '*source': 'str',
+ '*srcprefixlen': 'str',
+ '*nexthop': 'str',
+ 'version': 'int'
+ },
+ 'if': 'CONFIG_LINUX' }
+
+##
+# @guest-network-get-route:
+#
+# Retrieve information about the network routes of the guest.
+# Returns: List of the guest's route entries.
+#
+# Since: 9.1
+##
+{ 'command': 'guest-network-get-route',
+ 'returns': ['GuestNetworkRoute'],
+ 'if': 'CONFIG_LINUX'
}
diff --git a/system/physmem.c b/system/physmem.c
index 9a3b3a7..0e19186 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -763,6 +763,7 @@
if (!cpu->cpu_ases) {
cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
+ cpu->cpu_ases_count = cpu->num_ases;
}
newas = &cpu->cpu_ases[asidx];
@@ -776,6 +777,34 @@
}
}
+/*
+ * Tear down address space @asidx of @cpu.
+ *
+ * The TCG memory listener of the slot is unregistered (TCG only), the
+ * AddressSpace is destroyed and its memory is released through RCU.  When
+ * the last remaining address space goes away the cpu_ases array itself is
+ * freed and the pointer cleared.
+ */
+void cpu_address_space_destroy(CPUState *cpu, int asidx)
+{
+    CPUAddressSpace *slot;
+    AddressSpace *as;
+
+    assert(cpu->cpu_ases);
+    assert(asidx >= 0 && asidx < cpu->num_ases);
+    /* Under KVM only address space 0 can exist. */
+    assert(!(asidx != 0 && kvm_enabled()));
+
+    slot = &cpu->cpu_ases[asidx];
+    if (tcg_enabled()) {
+        memory_listener_unregister(&slot->tcg_as_listener);
+    }
+
+    as = slot->as;
+    address_space_destroy(as);
+    g_free_rcu(as, rcu);
+
+    if (asidx == 0) {
+        /* cpu->as is just a shortcut to address space 0: drop it too */
+        cpu->as = NULL;
+    }
+
+    cpu->cpu_ases_count--;
+    if (cpu->cpu_ases_count == 0) {
+        g_free(cpu->cpu_ases);
+        cpu->cpu_ases = NULL;
+    }
+}
+
AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx)
{
/* Return the AddressSpace corresponding to the specified index */
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 1e121ac..c6cc035 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -414,6 +414,10 @@
#define MSR_IA32_TSX_CTRL 0x122
#define MSR_IA32_TSCDEADLINE 0x6e0
#define MSR_IA32_PKRS 0x6e1
+#define MSR_RAPL_POWER_UNIT 0x00000606
+#define MSR_PKG_POWER_LIMIT 0x00000610
+#define MSR_PKG_ENERGY_STATUS 0x00000611
+#define MSR_PKG_POWER_INFO 0x00000614
#define MSR_ARCH_LBR_CTL 0x000014ce
#define MSR_ARCH_LBR_DEPTH 0x000014cf
#define MSR_ARCH_LBR_FROM_0 0x00001500
@@ -1880,6 +1884,10 @@
uintptr_t retaddr;
+ /* RAPL MSR */
+ uint64_t msr_rapl_power_unit;
+ uint64_t msr_pkg_energy_status;
+
/* Fields up to this point are cleared by a CPU reset */
struct {} end_reset_fields;
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index becca2e..b4aab9a 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -16,9 +16,12 @@
#include "qapi/qapi-events-run-state.h"
#include "qapi/error.h"
#include "qapi/visitor.h"
+#include <math.h>
#include <sys/ioctl.h>
#include <sys/utsname.h>
#include <sys/syscall.h>
+#include <sys/resource.h>
+#include <sys/time.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
@@ -27,6 +30,7 @@
#include "cpu.h"
#include "host-cpu.h"
+#include "vmsr_energy.h"
#include "sysemu/sysemu.h"
#include "sysemu/hw_accel.h"
#include "sysemu/kvm_int.h"
@@ -2559,7 +2563,8 @@
return ret;
}
-static bool kvm_rdmsr_core_thread_count(X86CPU *cpu, uint32_t msr,
+static bool kvm_rdmsr_core_thread_count(X86CPU *cpu,
+ uint32_t msr,
uint64_t *val)
{
CPUState *cs = CPU(cpu);
@@ -2570,6 +2575,53 @@
return true;
}
+/*
+ * RDMSR handler for MSR_RAPL_POWER_UNIT: hand back the value cached in
+ * KVMState::msr_energy.msr_unit.  @msr is unused.  Always returns true.
+ */
+static bool kvm_rdmsr_rapl_power_unit(X86CPU *cpu,
+                                      uint32_t msr,
+                                      uint64_t *val)
+{
+    KVMState *state = CPU(cpu)->kvm_state;
+
+    *val = state->msr_energy.msr_unit;
+    return true;
+}
+
+/*
+ * RDMSR handler for MSR_PKG_POWER_LIMIT: hand back the value cached in
+ * KVMState::msr_energy.msr_limit.  @msr is unused.  Always returns true.
+ */
+static bool kvm_rdmsr_pkg_power_limit(X86CPU *cpu,
+                                      uint32_t msr,
+                                      uint64_t *val)
+{
+    KVMState *state = CPU(cpu)->kvm_state;
+
+    *val = state->msr_energy.msr_limit;
+    return true;
+}
+
+/*
+ * RDMSR handler for MSR_PKG_POWER_INFO: hand back the value cached in
+ * KVMState::msr_energy.msr_info.  @msr is unused.  Always returns true.
+ */
+static bool kvm_rdmsr_pkg_power_info(X86CPU *cpu,
+                                     uint32_t msr,
+                                     uint64_t *val)
+{
+    KVMState *state = CPU(cpu)->kvm_state;
+
+    *val = state->msr_energy.msr_info;
+    return true;
+}
+
+/*
+ * RDMSR handler for MSR_PKG_ENERGY_STATUS: hand back the entry of
+ * KVMState::msr_energy.msr_value[] that belongs to the reading vCPU
+ * (indexed by its cpu_index).  @msr is unused.  Always returns true.
+ */
+static bool kvm_rdmsr_pkg_energy_status(X86CPU *cpu,
+                                        uint32_t msr,
+                                        uint64_t *val)
+{
+    CPUState *vcpu = CPU(cpu);
+    const uint64_t *counters = vcpu->kvm_state->msr_energy.msr_value;
+
+    *val = counters[vcpu->cpu_index];
+    return true;
+}
+
static Notifier smram_machine_done;
static KVMMemoryListener smram_listener;
static AddressSpace smram_address_space;
@@ -2604,6 +2656,340 @@
&smram_address_space, 1, "kvm-smram");
}
+/*
+ * Worker thread that emulates the RAPL package energy counter.
+ *
+ * Each iteration samples, for every QEMU thread, the scheduler ticks
+ * (utime + stime) and, for every host package, MSR_PKG_ENERGY_STATUS before
+ * and after sleeping MSR_ENERGY_THREAD_SLEEP_US.  The package energy delta
+ * is attributed to the vCPU threads in proportion to the ticks they
+ * consumed; the share of the non-vCPU threads is divided between the vCPUs
+ * found on the same host package.  The per virtual package totals are then
+ * stored in s->msr_energy.msr_value[], indexed by vCPU index.
+ *
+ * @data: the KVMState passed to qemu_thread_create().
+ *
+ * Returns NULL.  The loop only ends when the thread list cannot be read.
+ */
+static void *kvm_msr_energy_thread(void *data)
+{
+    KVMState *s = data;
+    struct KVMMsrEnergy *vmsr = &s->msr_energy;
+
+    g_autofree vmsr_package_energy_stat *pkg_stat = NULL;
+    g_autofree vmsr_thread_stat *thd_stat = NULL;
+    g_autofree unsigned int *vpkgs_energy_stat = NULL;
+    /* Plain cursor for CPU_FOREACH(): the CPUs are not ours to free */
+    CPUState *cpu = NULL;
+    unsigned int num_threads = 0;
+
+    X86CPUTopoIDs topo_ids;
+
+    rcu_register_thread();
+
+    /* Allocate memory for each package energy status */
+    pkg_stat = g_new0(vmsr_package_energy_stat, vmsr->host_topo.maxpkgs);
+
+    /* Allocate memory for thread stats */
+    thd_stat = g_new0(vmsr_thread_stat, 1);
+
+    /* Allocate memory for holding virtual package energy counter */
+    vpkgs_energy_stat = g_new0(unsigned int, vmsr->guest_vsockets);
+
+    /* Populate the max tick of each packages */
+    for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) {
+        /*
+         * Max numbers of ticks per package
+         * Time in second * Number of ticks/second * Number of cores/package
+         * ex: 100 ticks/second/CPU, 12 CPUs per Package gives 1200 ticks max
+         */
+        vmsr->host_topo.maxticks[i] = (MSR_ENERGY_THREAD_SLEEP_US / 1000000)
+                        * sysconf(_SC_CLK_TCK)
+                        * vmsr->host_topo.pkg_cpu_count[i];
+    }
+
+    while (true) {
+        /* Get all qemu threads id */
+        g_autofree pid_t *thread_ids =
+            vmsr_get_thread_ids(vmsr->pid, &num_threads);
+
+        if (thread_ids == NULL) {
+            break;
+        }
+
+        thd_stat = g_renew(vmsr_thread_stat, thd_stat, num_threads);
+        /* Unlike g_new0, g_renew0 function doesn't exist yet... */
+        memset(thd_stat, 0, num_threads * sizeof(vmsr_thread_stat));
+
+        /*
+         * nb_vcpu and e_ratio are accumulated below for the current sample
+         * only: restart them from zero so that previous iterations do not
+         * leak into this one.
+         */
+        for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) {
+            pkg_stat[i].nb_vcpu = 0;
+            pkg_stat[i].e_ratio = 0;
+        }
+
+        /* Populate all the thread stats (first sample, index 0) */
+        for (int i = 0; i < num_threads; i++) {
+            thd_stat[i].utime = g_new0(unsigned long long, 2);
+            thd_stat[i].stime = g_new0(unsigned long long, 2);
+            thd_stat[i].thread_id = thread_ids[i];
+            vmsr_read_thread_stat(vmsr->pid,
+                                  thd_stat[i].thread_id,
+                                  &thd_stat[i].utime[0],
+                                  &thd_stat[i].stime[0],
+                                  &thd_stat[i].cpu_id);
+            thd_stat[i].pkg_id =
+                vmsr_get_physical_package_id(thd_stat[i].cpu_id);
+        }
+
+        /* Retrieve all packages power plane energy counter */
+        for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) {
+            for (int j = 0; j < num_threads; j++) {
+                /*
+                 * Use the first thread we found that ran on the CPU
+                 * of the package to read the packages energy counter
+                 */
+                if (thd_stat[j].pkg_id == i) {
+                    pkg_stat[i].e_start =
+                        vmsr_read_msr(MSR_PKG_ENERGY_STATUS,
+                                      thd_stat[j].cpu_id,
+                                      thd_stat[j].thread_id,
+                                      s->msr_energy.sioc);
+                    break;
+                }
+            }
+        }
+
+        /* Sleep a short period while the other threads are working */
+        usleep(MSR_ENERGY_THREAD_SLEEP_US);
+
+        /*
+         * Retrieve all packages power plane energy counter
+         * Calculate the delta of all packages
+         */
+        for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) {
+            for (int j = 0; j < num_threads; j++) {
+                /*
+                 * Use the first thread we found that ran on the CPU
+                 * of the package to read the packages energy counter
+                 */
+                if (thd_stat[j].pkg_id == i) {
+                    pkg_stat[i].e_end =
+                        vmsr_read_msr(MSR_PKG_ENERGY_STATUS,
+                                      thd_stat[j].cpu_id,
+                                      thd_stat[j].thread_id,
+                                      s->msr_energy.sioc);
+                    /*
+                     * Guard against a migration of the VM during the
+                     * sleep period, or any other case where the energy
+                     * counter is lower after the sleep than before it.
+                     */
+                    if (pkg_stat[i].e_end > pkg_stat[i].e_start) {
+                        pkg_stat[i].e_delta =
+                            pkg_stat[i].e_end - pkg_stat[i].e_start;
+                    } else {
+                        pkg_stat[i].e_delta = 0;
+                    }
+                    break;
+                }
+            }
+        }
+
+        /* Delta of ticks spent by each thread between the two samples */
+        for (int i = 0; i < num_threads; i++) {
+            /* Second sample goes to index 1; index 0 holds the first one */
+            vmsr_read_thread_stat(vmsr->pid,
+                                  thd_stat[i].thread_id,
+                                  &thd_stat[i].utime[1],
+                                  &thd_stat[i].stime[1],
+                                  &thd_stat[i].cpu_id);
+
+            /*
+             * vmsr_read_thread_stat() leaves its outputs untouched when the
+             * stat file cannot be read, so a thread that exited during the
+             * sleep keeps a zeroed second sample.  Do not count such dead
+             * threads, nor any thread whose tick count went backwards
+             * (the unsigned subtraction would wrap around).
+             */
+            if (thd_stat[i].utime[1] + thd_stat[i].stime[1] <
+                thd_stat[i].utime[0] + thd_stat[i].stime[0]) {
+                thd_stat[i].delta_ticks = 0;
+            } else {
+                vmsr_delta_ticks(thd_stat, i);
+            }
+        }
+
+        /*
+         * Identify the vcpu threads
+         * Calculate the number of vcpu per package
+         */
+        CPU_FOREACH(cpu) {
+            for (int i = 0; i < num_threads; i++) {
+                if (cpu->thread_id == thd_stat[i].thread_id) {
+                    thd_stat[i].is_vcpu = true;
+                    thd_stat[i].vcpu_id = cpu->cpu_index;
+                    pkg_stat[thd_stat[i].pkg_id].nb_vcpu++;
+                    thd_stat[i].acpi_id = kvm_arch_vcpu_id(cpu);
+                    break;
+                }
+            }
+        }
+
+        /* Retrieve the virtual package number of each vCPU */
+        for (int i = 0; i < vmsr->guest_cpu_list->len; i++) {
+            for (int j = 0; j < num_threads; j++) {
+                if ((thd_stat[j].acpi_id ==
+                        vmsr->guest_cpu_list->cpus[i].arch_id)
+                    && (thd_stat[j].is_vcpu == true)) {
+                    x86_topo_ids_from_apicid(thd_stat[j].acpi_id,
+                        &vmsr->guest_topo_info, &topo_ids);
+                    thd_stat[j].vpkg_id = topo_ids.pkg_id;
+                }
+            }
+        }
+
+        /* Calculate the total energy of all non-vCPU thread */
+        for (int i = 0; i < num_threads; i++) {
+            if ((thd_stat[i].is_vcpu != true) &&
+                (thd_stat[i].delta_ticks > 0)) {
+                double temp;
+                temp = vmsr_get_ratio(pkg_stat[thd_stat[i].pkg_id].e_delta,
+                    thd_stat[i].delta_ticks,
+                    vmsr->host_topo.maxticks[thd_stat[i].pkg_id]);
+                pkg_stat[thd_stat[i].pkg_id].e_ratio
+                    += (uint64_t)lround(temp);
+            }
+        }
+
+        /* Calculate the ratio per non-vCPU thread of each package */
+        for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) {
+            if (pkg_stat[i].nb_vcpu > 0) {
+                pkg_stat[i].e_ratio = pkg_stat[i].e_ratio / pkg_stat[i].nb_vcpu;
+            }
+        }
+
+        /*
+         * Calculate the energy for each Package:
+         * Energy Package = sum of each vCPU energy that belongs to the package
+         */
+        for (int i = 0; i < num_threads; i++) {
+            if ((thd_stat[i].is_vcpu == true) &&
+                (thd_stat[i].delta_ticks > 0)) {
+                double temp;
+                temp = vmsr_get_ratio(pkg_stat[thd_stat[i].pkg_id].e_delta,
+                    thd_stat[i].delta_ticks,
+                    vmsr->host_topo.maxticks[thd_stat[i].pkg_id]);
+                vpkgs_energy_stat[thd_stat[i].vpkg_id] +=
+                    (uint64_t)lround(temp);
+                vpkgs_energy_stat[thd_stat[i].vpkg_id] +=
+                    pkg_stat[thd_stat[i].pkg_id].e_ratio;
+            }
+        }
+
+        /*
+         * Finally populate the vmsr register of each vCPU with the total
+         * package value to emulate the real hardware where each CPU return the
+         * value of the package it belongs.
+         */
+        for (int i = 0; i < num_threads; i++) {
+            if ((thd_stat[i].is_vcpu == true) &&
+                (thd_stat[i].delta_ticks > 0)) {
+                vmsr->msr_value[thd_stat[i].vcpu_id] =
+                    vpkgs_energy_stat[thd_stat[i].vpkg_id];
+            }
+        }
+
+        /* Release the per-thread sample arrays of this iteration */
+        for (int i = 0; i < num_threads; i++) {
+            g_free(thd_stat[i].utime);
+            g_free(thd_stat[i].stime);
+        }
+    }
+
+    rcu_unregister_thread();
+    return NULL;
+}
+
+/*
+ * Validate the host, fill in s->msr_energy and start the RAPL emulation
+ * thread.
+ *
+ * @s: KVM state whose msr_energy member is initialised
+ * @ms: machine providing the guest SMP configuration
+ *
+ * Returns 0 on success and 1 on any failure.  On failure nothing allocated
+ * here is released.
+ */
+static int kvm_msr_energy_thread_init(KVMState *s, MachineState *ms)
+{
+    MachineClass *mc = MACHINE_GET_CLASS(ms);
+    struct KVMMsrEnergy *r = &s->msr_energy;
+    int ret = 0;
+
+    /*
+     * Sanity check
+     * 1. Host cpu must be Intel cpu
+     * 2. RAPL must be enabled on the Host
+     *
+     * NOTE: is_host_cpu_intel() returns the raw strcmp() result, i.e. it is
+     * true when the vendor string differs from CPUID_VENDOR_INTEL, which is
+     * why a true result is the error case here.
+     */
+    if (is_host_cpu_intel()) {
+        /* Adjacent literals: a "\" continuation would embed indentation */
+        error_report("The RAPL feature can only be enabled on hosts "
+                     "with Intel CPU models");
+        ret = 1;
+        goto out;
+    }
+
+    if (!is_rapl_enabled()) {
+        ret = 1;
+        goto out;
+    }
+
+    /* Retrieve the virtual topology */
+    vmsr_init_topo_info(&r->guest_topo_info, ms);
+
+    /* Retrieve the number of vcpu */
+    r->guest_vcpus = ms->smp.cpus;
+
+    /* Retrieve the number of virtual sockets */
+    r->guest_vsockets = ms->smp.sockets;
+
+    /* Allocate register memory (MSR_PKG_STATUS) for each vcpu */
+    r->msr_value = g_new0(uint64_t, r->guest_vcpus);
+
+    /* Retrieve the CPUArchIDlist */
+    r->guest_cpu_list = mc->possible_cpu_arch_ids(ms);
+
+    /* Max number of cpus on the Host */
+    r->host_topo.maxcpus = vmsr_get_maxcpus();
+    if (r->host_topo.maxcpus == 0) {
+        error_report("host max cpus = 0");
+        ret = 1;
+        goto out;
+    }
+
+    /* Max number of packages on the host */
+    r->host_topo.maxpkgs = vmsr_get_max_physical_package(r->host_topo.maxcpus);
+    if (r->host_topo.maxpkgs == 0) {
+        error_report("host max pkgs = 0");
+        ret = 1;
+        goto out;
+    }
+
+    /* Allocate memory for each package on the host */
+    r->host_topo.pkg_cpu_count = g_new0(unsigned int, r->host_topo.maxpkgs);
+    r->host_topo.maxticks = g_new0(unsigned int, r->host_topo.maxpkgs);
+
+    vmsr_count_cpus_per_package(r->host_topo.pkg_cpu_count,
+                                r->host_topo.maxpkgs);
+    for (int i = 0; i < r->host_topo.maxpkgs; i++) {
+        if (r->host_topo.pkg_cpu_count[i] == 0) {
+            error_report("cpu per packages = 0 on package_%d", i);
+            ret = 1;
+            goto out;
+        }
+    }
+
+    /* Get QEMU PID */
+    r->pid = getpid();
+
+    /* Compute the socket path if necessary */
+    if (s->msr_energy.socket_path == NULL) {
+        s->msr_energy.socket_path = vmsr_compute_default_paths();
+    }
+
+    /* Open socket with vmsr helper */
+    s->msr_energy.sioc = vmsr_open_socket(s->msr_energy.socket_path);
+
+    if (s->msr_energy.sioc == NULL) {
+        error_report("vmsr socket opening failed");
+        ret = 1;
+        goto out;
+    }
+
+    /* Those MSR values should not change */
+    r->msr_unit  = vmsr_read_msr(MSR_RAPL_POWER_UNIT, 0, r->pid,
+                                 s->msr_energy.sioc);
+    r->msr_limit = vmsr_read_msr(MSR_PKG_POWER_LIMIT, 0, r->pid,
+                                 s->msr_energy.sioc);
+    r->msr_info  = vmsr_read_msr(MSR_PKG_POWER_INFO, 0, r->pid,
+                                 s->msr_energy.sioc);
+    /* vmsr_read_msr() yields 0 on failure: treat any zero as an error */
+    if (r->msr_unit == 0 || r->msr_limit == 0 || r->msr_info == 0) {
+        error_report("can't read any virtual msr");
+        ret = 1;
+        goto out;
+    }
+
+    qemu_thread_create(&r->msr_thr, "kvm-msr",
+                       kvm_msr_energy_thread,
+                       s, QEMU_THREAD_JOINABLE);
+out:
+    return ret;
+}
+
int kvm_arch_get_default_type(MachineState *ms)
{
return 0;
@@ -2804,6 +3190,49 @@
strerror(-ret));
exit(1);
}
+
+        /*
+         * Virtual RAPL MSRs: install one read handler per emulated MSR,
+         * then start the energy accounting thread.  Any failure is fatal.
+         * The messages use adjacent string literals because a "\"
+         * continuation inside a literal embeds the source indentation in
+         * the text that is printed.
+         */
+        if (s->msr_energy.enable == true) {
+            r = kvm_filter_msr(s, MSR_RAPL_POWER_UNIT,
+                               kvm_rdmsr_rapl_power_unit, NULL);
+            if (!r) {
+                error_report("Could not install MSR_RAPL_POWER_UNIT "
+                             "handler: %s",
+                             strerror(-ret));
+                exit(1);
+            }
+
+            r = kvm_filter_msr(s, MSR_PKG_POWER_LIMIT,
+                               kvm_rdmsr_pkg_power_limit, NULL);
+            if (!r) {
+                error_report("Could not install MSR_PKG_POWER_LIMIT "
+                             "handler: %s",
+                             strerror(-ret));
+                exit(1);
+            }
+
+            r = kvm_filter_msr(s, MSR_PKG_POWER_INFO,
+                               kvm_rdmsr_pkg_power_info, NULL);
+            if (!r) {
+                error_report("Could not install MSR_PKG_POWER_INFO "
+                             "handler: %s",
+                             strerror(-ret));
+                exit(1);
+            }
+
+            r = kvm_filter_msr(s, MSR_PKG_ENERGY_STATUS,
+                               kvm_rdmsr_pkg_energy_status, NULL);
+            if (!r) {
+                error_report("Could not install MSR_PKG_ENERGY_STATUS "
+                             "handler: %s",
+                             strerror(-ret));
+                exit(1);
+            }
+
+            r = kvm_msr_energy_thread_init(s, ms);
+            if (r) {
+                error_report("kvm : error RAPL feature requirement not meet");
+                exit(1);
+            }
+        }
}
return 0;
diff --git a/target/i386/kvm/meson.build b/target/i386/kvm/meson.build
index e785098..3996caf 100644
--- a/target/i386/kvm/meson.build
+++ b/target/i386/kvm/meson.build
@@ -3,6 +3,7 @@
i386_kvm_ss.add(files(
'kvm.c',
'kvm-cpu.c',
+ 'vmsr_energy.c',
))
i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files('xen-emu.c'))
diff --git a/target/i386/kvm/vmsr_energy.c b/target/i386/kvm/vmsr_energy.c
new file mode 100644
index 0000000..a1d78f2
--- /dev/null
+++ b/target/i386/kvm/vmsr_energy.c
@@ -0,0 +1,345 @@
+/*
+ * QEMU KVM support -- x86 virtual RAPL msr
+ *
+ * Copyright 2024 Red Hat, Inc. 2024
+ *
+ * Author:
+ * Anthony Harivel <aharivel@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "vmsr_energy.h"
+#include "io/channel.h"
+#include "io/channel-socket.h"
+#include "hw/boards.h"
+#include "cpu.h"
+#include "host-cpu.h"
+
+/*
+ * Build the default path of the vmsr helper socket:
+ * <local state dir>/run/qemu-vmsr-helper.sock
+ *
+ * Returns a newly allocated string that the caller must g_free().
+ */
+char *vmsr_compute_default_paths(void)
+{
+    g_autofree char *local_state_dir = qemu_get_local_state_dir();
+    char *sock_path;
+
+    sock_path = g_build_filename(local_state_dir, "run",
+                                 "qemu-vmsr-helper.sock", NULL);
+    return sock_path;
+}
+
+/*
+ * Compare the host CPUID vendor string with CPUID_VENDOR_INTEL.
+ *
+ * NOTE: the result is the strcmp() outcome converted to bool, so this
+ * returns false when the vendor string equals CPUID_VENDOR_INTEL and true
+ * when it differs.
+ */
+bool is_host_cpu_intel(void)
+{
+    char vendor[CPUID_VENDOR_SZ + 1];
+    int family;
+    int model;
+    int stepping;
+
+    host_cpu_vendor_fms(vendor, &family, &model, &stepping);
+
+    return strcmp(vendor, CPUID_VENDOR_INTEL) != 0;
+}
+
+/*
+ * Read the integer stored in /sys/class/powercap/intel-rapl/enabled.
+ *
+ * Returns the value read, or 0 (after reporting an error) when the file
+ * cannot be opened or does not start with an integer.
+ */
+int is_rapl_enabled(void)
+{
+    const char *path = "/sys/class/powercap/intel-rapl/enabled";
+    int value = 0;
+    FILE *file = fopen(path, "r");
+
+    if (file == NULL) {
+        error_report("Error opening %s", path);
+        return value;
+    }
+
+    if (fscanf(file, "%d", &value) != 1) {
+        error_report("INTEL RAPL not enabled");
+    }
+    fclose(file);
+
+    return value;
+}
+
+/*
+ * Connect to the vmsr helper over the UNIX socket at @path.
+ *
+ * Returns the connected channel, or NULL when the connection fails.
+ */
+QIOChannelSocket *vmsr_open_socket(const char *path)
+{
+    /* SocketAddress wants a writable string, hence the private copy */
+    g_autofree char *socket_path = g_strdup(path);
+
+    SocketAddress saddr = {
+        .type = SOCKET_ADDRESS_TYPE_UNIX,
+        .u.q_unix.path = socket_path
+    };
+
+    QIOChannelSocket *sioc = qio_channel_socket_new();
+
+    qio_channel_set_name(QIO_CHANNEL(sioc), "vmsr-helper");
+    /*
+     * The error details are not reported by this function, so pass a NULL
+     * errp rather than collecting an Error object that nobody frees.
+     */
+    if (qio_channel_socket_connect_sync(sioc, &saddr, NULL) < 0) {
+        /* Close socket. */
+        qio_channel_close(QIO_CHANNEL(sioc), NULL);
+        object_unref(OBJECT(sioc));
+        return NULL;
+    }
+
+    qio_channel_set_delay(QIO_CHANNEL(sioc), false);
+    return sioc;
+}
+
+/*
+ * Ask the vmsr helper for the value of an MSR.
+ *
+ * @reg: RAPL MSR register to read
+ * @cpu_id: host CPU on which the register is read
+ * @tid: thread ID sent along with the request
+ * @sioc: channel connected to the helper
+ *
+ * Returns the value received from the helper, or 0 when the request cannot
+ * be written or the answer cannot be read.
+ */
+uint64_t vmsr_read_msr(uint32_t reg, uint32_t cpu_id, uint32_t tid,
+                       QIOChannelSocket *sioc)
+{
+    uint64_t data = 0;
+    int r = 0;
+    uint32_t buffer[3];
+    /*
+     * Send the required arguments:
+     * 1. RAPL MSR register to read
+     * 2. On which CPU ID
+     * 3. From which vCPU (Thread ID)
+     */
+    buffer[0] = reg;
+    buffer[1] = cpu_id;
+    buffer[2] = tid;
+
+    /*
+     * Failures are only signalled through the 0 return value, so a NULL
+     * errp is passed instead of an Error object that is never freed.
+     */
+    r = qio_channel_write_all(QIO_CHANNEL(sioc),
+                              (char *)buffer, sizeof(buffer),
+                              NULL);
+    if (r < 0) {
+        return data;
+    }
+
+    r = qio_channel_read(QIO_CHANNEL(sioc),
+                         (char *)&data, sizeof(data),
+                         NULL);
+    if (r < 0) {
+        data = 0;
+    }
+
+    return data;
+}
+
+/*
+ * Retrieve the max number of physical package.
+ *
+ * Reads topology/physical_package_id of cpu0 .. cpu<max_cpus - 1> and
+ * counts the distinct ids.
+ *
+ * Returns 0 when one of the files cannot be opened, otherwise the number of
+ * distinct ids (at least 1).
+ */
+unsigned int vmsr_get_max_physical_package(unsigned int max_cpus)
+{
+    const char *dir = "/sys/devices/system/cpu/";
+    const char *topo_path = "topology/physical_package_id";
+    g_autofree int *uniquePackages = g_new0(int, max_cpus);
+    unsigned int packageCount = 0;
+    FILE *file = NULL;
+
+    for (int i = 0; i < max_cpus; i++) {
+        g_autofree char *filePath = NULL;
+        g_autofree char *cpuid = g_strdup_printf("cpu%d", i);
+        /*
+         * Start from an empty string: when fgets() fails the buffer is
+         * still parsed below and must not hold indeterminate bytes.
+         */
+        char packageId[10] = "";
+
+        filePath = g_build_filename(dir, cpuid, topo_path, NULL);
+
+        file = fopen(filePath, "r");
+
+        if (file == NULL) {
+            error_report("Error opening physical_package_id file");
+            return 0;
+        }
+
+        if (fgets(packageId, sizeof(packageId), file) == NULL) {
+            packageCount = 0;
+        }
+
+        fclose(file);
+
+        int currentPackageId = atoi(packageId);
+
+        bool isUnique = true;
+        for (int j = 0; j < packageCount; j++) {
+            if (uniquePackages[j] == currentPackageId) {
+                isUnique = false;
+                break;
+            }
+        }
+
+        if (isUnique) {
+            uniquePackages[packageCount] = currentPackageId;
+            packageCount++;
+
+            /* uniquePackages[] only has room for max_cpus entries */
+            if (packageCount >= max_cpus) {
+                break;
+            }
+        }
+    }
+
+    return (packageCount == 0) ? 1 : packageCount;
+}
+
+/*
+ * Retrieve the max number of physical cpu on the host.
+ *
+ * Counts the entries of /sys/devices/system/cpu/ whose name is "cpu"
+ * followed by a digit.
+ *
+ * Returns the count, or 0 when the directory cannot be opened.
+ */
+unsigned int vmsr_get_maxcpus(void)
+{
+    GDir *dir;
+    const gchar *entry_name;
+    unsigned int cpu_count = 0;
+    const char *path = "/sys/devices/system/cpu/";
+
+    dir = g_dir_open(path, 0, NULL);
+    if (dir == NULL) {
+        error_report("Unable to open cpu directory");
+        /*
+         * The return type is unsigned: 0 is the failure value (a -1 would
+         * turn into UINT_MAX and read as a huge CPU count).
+         */
+        return 0;
+    }
+
+    while ((entry_name = g_dir_read_name(dir)) != NULL) {
+        /* g_ascii_isdigit() is safe for any char value, unlike isdigit() */
+        if (g_ascii_strncasecmp(entry_name, "cpu", 3) == 0 &&
+            g_ascii_isdigit(entry_name[3])) {
+            cpu_count++;
+        }
+    }
+
+    g_dir_close(dir);
+
+    return cpu_count;
+}
+
+/*
+ * Count the number of physical cpu on each packages.
+ *
+ * @package_count: array of @max_pkgs counters, incremented in place
+ * @max_pkgs: number of entries in @package_count
+ *
+ * Walks cpu0, cpu1, ... until a physical_package_id file cannot be read.
+ * Ids outside [0, max_pkgs) are ignored.  Always returns 0.
+ */
+unsigned int vmsr_count_cpus_per_package(unsigned int *package_count,
+                                         unsigned int max_pkgs)
+{
+    /* Iterate over cpus and count cpus in each package */
+    for (int cpu_id = 0; ; cpu_id++) {
+        /*
+         * Declared inside the loop so that every iteration releases its
+         * own buffers instead of overwriting (and leaking) the previous
+         * ones.
+         */
+        g_autofree char *file_contents = NULL;
+        g_autofree char *path =
+            g_strdup_printf("/sys/devices/system/cpu/cpu%d/"
+                            "topology/physical_package_id", cpu_id);
+        gsize length;
+        int package_id;
+
+        if (!g_file_get_contents(path, &file_contents, &length, NULL)) {
+            break; /* No more cpus */
+        }
+
+        /* Get the physical package ID for this CPU */
+        package_id = atoi(file_contents);
+
+        /* Check if the package ID is within the known number of packages */
+        if (package_id >= 0 && package_id < max_pkgs) {
+            /* If yes, count the cpu for this package */
+            package_count[package_id]++;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Look up the physical package of host CPU @cpu_id.
+ *
+ * Returns the integer parsed from the CPU's topology/physical_package_id
+ * file, or -1 when that file cannot be read.
+ */
+int vmsr_get_physical_package_id(int cpu_id)
+{
+    g_autofree char *contents = NULL;
+    g_autofree char *sysfs_path =
+        g_strdup_printf("/sys/devices/system/cpu/cpu%d"
+                        "/topology/physical_package_id", cpu_id);
+    gsize length;
+    int package_id = -1;
+
+    if (g_file_get_contents(sysfs_path, &contents, &length, NULL)) {
+        package_id = atoi(contents);
+    }
+
+    return package_id;
+}
+
+/*
+ * Read the scheduled time for a given thread of a given pid.
+ *
+ * @pid: process owning the thread
+ * @thread_id: thread whose /proc/<pid>/task/<tid>/stat file is parsed
+ * @utime: receives the first %llu field that is extracted from the file
+ * @stime: receives the second %llu field that is extracted from the file
+ * @cpu_id: receives the trailing %u field that is extracted from the file
+ *
+ * When the file cannot be opened the outputs are left untouched.  No error
+ * is reported to the caller in either failure case.
+ */
+void vmsr_read_thread_stat(pid_t pid,
+                      unsigned int thread_id,
+                      unsigned long long *utime,
+                      unsigned long long *stime,
+                      unsigned int *cpu_id)
+{
+    g_autofree char *path = NULL;
+
+    path = g_strdup_printf("/proc/%d/task/%u/stat", pid, thread_id);
+
+    FILE *file = fopen(path, "r");
+    if (file == NULL) {
+        return;
+    }
+
+    /*
+     * The result is deliberately ignored: the function has no way to
+     * report a parse failure, but the stream must be closed in all cases.
+     */
+    (void)fscanf(file,
+                 "%*d (%*[^)]) %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u"
+                 " %llu %llu %*d %*d %*d %*d %*d %*d %*u %*u %*d %*u %*u"
+                 " %*u %*u %*u %*u %*u %*u %*u %*u %*u %*d %*u %*u %u",
+                 utime, stime, cpu_id);
+
+    fclose(file);
+}
+
+/*
+ * Read QEMU stat task folder to retrieve all QEMU threads ID.
+ *
+ * @pid: process whose /proc/<pid>/task directory is listed
+ * @num_threads: receives the number of ids returned
+ *
+ * The entry whose id equals @pid is skipped.
+ *
+ * Returns a newly allocated array that the caller must g_free(), or NULL
+ * when the directory cannot be opened or no other thread was found.
+ */
+pid_t *vmsr_get_thread_ids(pid_t pid, unsigned int *num_threads)
+{
+    g_autofree char *task_path = g_strdup_printf("%d/task", pid);
+    g_autofree char *path = g_build_filename("/proc", task_path, NULL);
+
+    DIR *dir = opendir(path);
+    if (dir == NULL) {
+        error_report("Error opening /proc/qemu/task");
+        return NULL;
+    }
+
+    pid_t *thread_ids = NULL;
+    unsigned int thread_count = 0;
+
+    /*
+     * The entry returned by readdir() belongs to the directory stream and
+     * must never be freed, so this is a plain pointer.
+     */
+    struct dirent *ent;
+    while ((ent = readdir(dir)) != NULL) {
+        if (ent->d_name[0] == '.') {
+            continue;
+        }
+        pid_t tid = atoi(ent->d_name);
+        if (pid != tid) {
+            thread_ids = g_renew(pid_t, thread_ids, (thread_count + 1));
+            thread_ids[thread_count] = tid;
+            thread_count++;
+        }
+    }
+
+    closedir(dir);
+
+    *num_threads = thread_count;
+    return thread_ids;
+}
+
+/*
+ * Store in thd_stat[i].delta_ticks the difference between the second
+ * (index 1) and the first (index 0) utime + stime sample of entry @i.
+ */
+void vmsr_delta_ticks(vmsr_thread_stat *thd_stat, int i)
+{
+    vmsr_thread_stat *entry = &thd_stat[i];
+    unsigned long long second = entry->utime[1] + entry->stime[1];
+    unsigned long long first = entry->utime[0] + entry->stime[0];
+
+    entry->delta_ticks = second - first;
+}
+/*
+ * Scale an energy delta by a tick share.
+ *
+ * @e_delta: energy delta to be shared out
+ * @delta_ticks: ticks consumed by the thread
+ * @maxticks: tick budget the share is computed against; a value of 0
+ *            makes the division below yield a non-finite result
+ *
+ * Returns (e_delta / 100) * (100 / maxticks * delta_ticks) as a double,
+ * i.e. e_delta weighted by delta_ticks / maxticks.
+ */
+double vmsr_get_ratio(uint64_t e_delta,
+ unsigned long long delta_ticks,
+ unsigned int maxticks)
+{
+ return (e_delta / 100.0) * ((100.0 / maxticks) * delta_ticks);
+}
+
+/*
+ * Copy the SMP layout of @ms (threads, cores, modules and dies) into the
+ * matching per-level fields of @topo_info.  Other fields of @topo_info are
+ * left untouched.
+ */
+void vmsr_init_topo_info(X86CPUTopoInfo *topo_info,
+                           const MachineState *ms)
+{
+    /* innermost topology level first */
+    topo_info->threads_per_core = ms->smp.threads;
+    topo_info->cores_per_module = ms->smp.cores;
+    topo_info->modules_per_die = ms->smp.modules;
+    topo_info->dies_per_pkg = ms->smp.dies;
+}
+
diff --git a/target/i386/kvm/vmsr_energy.h b/target/i386/kvm/vmsr_energy.h
new file mode 100644
index 0000000..16cc1f4
--- /dev/null
+++ b/target/i386/kvm/vmsr_energy.h
@@ -0,0 +1,99 @@
+/*
+ * QEMU KVM support -- x86 virtual energy-related MSR.
+ *
+ * Copyright 2024 Red Hat, Inc. 2024
+ *
+ * Author:
+ * Anthony Harivel <aharivel@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef VMSR_ENERGY_H
+#define VMSR_ENERGY_H
+
+#include <stdint.h>
+#include "qemu/osdep.h"
+#include "io/channel-socket.h"
+#include "hw/i386/topology.h"
+
+/*
+ * Define the interval time in micro seconds between 2 samples of
+ * energy related MSRs
+ */
+#define MSR_ENERGY_THREAD_SLEEP_US 1000000.0
+
+/*
+ * Thread statistic
+ * @ thread_id: TID (thread ID)
+ * @ is_vcpu: true if TID is vCPU thread
+ * @ cpu_id: CPU number last executed on
+ * @ pkg_id: package number of the CPU
+ * @ vpkg_id: virtual package number
+ * @ vcpu_id: vCPU ID
+ * @ acpi_id: APIC id of the vCPU
+ * @ utime: pointer to the samples of the amount of clock ticks
+ * the thread has been scheduled in User mode;
+ * vmsr_delta_ticks() reads entries [0] and [1]
+ * @ stime: pointer to the samples of the amount of clock ticks
+ * the thread has been scheduled in System mode;
+ * vmsr_delta_ticks() reads entries [0] and [1]
+ * @ delta_ticks: delta of utime+stime between
+ * the two samples (before/after sleep)
+ */
+struct vmsr_thread_stat {
+ unsigned int thread_id;
+ bool is_vcpu;
+ unsigned int cpu_id;
+ unsigned int pkg_id;
+ unsigned int vpkg_id;
+ unsigned int vcpu_id;
+ unsigned long acpi_id;
+ unsigned long long *utime;
+ unsigned long long *stime;
+ unsigned long long delta_ticks;
+};
+
+/*
+ * Package statistic (one entry per host package)
+ * @ e_start: package energy counter before the sleep
+ * @ e_end: package energy counter after the sleep
+ * @ e_delta: delta of package energy counter (e_end - e_start)
+ * @ e_ratio: store the energy ratio of non-vCPU thread
+ * @ nb_vcpu: number of vCPU running on this package
+ */
+struct vmsr_package_energy_stat {
+ uint64_t e_start;
+ uint64_t e_end;
+ uint64_t e_delta;
+ uint64_t e_ratio;
+ unsigned int nb_vcpu;
+};
+
+typedef struct vmsr_thread_stat vmsr_thread_stat;
+typedef struct vmsr_package_energy_stat vmsr_package_energy_stat;
+
+char *vmsr_compute_default_paths(void);
+void vmsr_read_thread_stat(pid_t pid,
+ unsigned int thread_id,
+ unsigned long long *utime,
+ unsigned long long *stime,
+ unsigned int *cpu_id);
+
+QIOChannelSocket *vmsr_open_socket(const char *path);
+uint64_t vmsr_read_msr(uint32_t reg, uint32_t cpu_id,
+ uint32_t tid, QIOChannelSocket *sioc);
+void vmsr_delta_ticks(vmsr_thread_stat *thd_stat, int i);
+unsigned int vmsr_get_maxcpus(void);
+unsigned int vmsr_get_max_physical_package(unsigned int max_cpus);
+unsigned int vmsr_count_cpus_per_package(unsigned int *package_count,
+ unsigned int max_pkgs);
+int vmsr_get_physical_package_id(int cpu_id);
+pid_t *vmsr_get_thread_ids(pid_t pid, unsigned int *num_threads);
+double vmsr_get_ratio(uint64_t e_delta,
+ unsigned long long delta_ticks,
+ unsigned int maxticks);
+void vmsr_init_topo_info(X86CPUTopoInfo *topo_info, const MachineState *ms);
+bool is_host_cpu_intel(void);
+int is_rapl_enabled(void);
+#endif /* VMSR_ENERGY_H */
diff --git a/tests/data/acpi/aarch64/virt/DSDT b/tests/data/acpi/aarch64/virt/DSDT
index c475039..36d3e5d 100644
--- a/tests/data/acpi/aarch64/virt/DSDT
+++ b/tests/data/acpi/aarch64/virt/DSDT
Binary files differ
diff --git a/tests/data/acpi/aarch64/virt/DSDT.acpihmatvirt b/tests/data/acpi/aarch64/virt/DSDT.acpihmatvirt
index aee6ba0..e6154d0 100644
--- a/tests/data/acpi/aarch64/virt/DSDT.acpihmatvirt
+++ b/tests/data/acpi/aarch64/virt/DSDT.acpihmatvirt
Binary files differ
diff --git a/tests/data/acpi/aarch64/virt/DSDT.memhp b/tests/data/acpi/aarch64/virt/DSDT.memhp
index bae36cd..33f011d 100644
--- a/tests/data/acpi/aarch64/virt/DSDT.memhp
+++ b/tests/data/acpi/aarch64/virt/DSDT.memhp
Binary files differ
diff --git a/tests/data/acpi/aarch64/virt/DSDT.pxb b/tests/data/acpi/aarch64/virt/DSDT.pxb
index fbd78f4..c0fdc6e 100644
--- a/tests/data/acpi/aarch64/virt/DSDT.pxb
+++ b/tests/data/acpi/aarch64/virt/DSDT.pxb
Binary files differ
diff --git a/tests/data/acpi/aarch64/virt/DSDT.topology b/tests/data/acpi/aarch64/virt/DSDT.topology
index 501314c..029d03e 100644
--- a/tests/data/acpi/aarch64/virt/DSDT.topology
+++ b/tests/data/acpi/aarch64/virt/DSDT.topology
Binary files differ
diff --git a/tests/data/acpi/riscv64/virt/APIC b/tests/data/acpi/riscv64/virt/APIC
new file mode 100644
index 0000000..66a25df
--- /dev/null
+++ b/tests/data/acpi/riscv64/virt/APIC
Binary files differ
diff --git a/tests/data/acpi/riscv64/virt/DSDT b/tests/data/acpi/riscv64/virt/DSDT
new file mode 100644
index 0000000..6a33f56
--- /dev/null
+++ b/tests/data/acpi/riscv64/virt/DSDT
Binary files differ
diff --git a/tests/data/acpi/riscv64/virt/FACP b/tests/data/acpi/riscv64/virt/FACP
new file mode 100644
index 0000000..a5276b6
--- /dev/null
+++ b/tests/data/acpi/riscv64/virt/FACP
Binary files differ
diff --git a/tests/data/acpi/riscv64/virt/MCFG b/tests/data/acpi/riscv64/virt/MCFG
new file mode 100644
index 0000000..37eb923
--- /dev/null
+++ b/tests/data/acpi/riscv64/virt/MCFG
Binary files differ
diff --git a/tests/data/acpi/riscv64/virt/RHCT b/tests/data/acpi/riscv64/virt/RHCT
new file mode 100644
index 0000000..4f23173
--- /dev/null
+++ b/tests/data/acpi/riscv64/virt/RHCT
Binary files differ
diff --git a/tests/data/acpi/riscv64/virt/SPCR b/tests/data/acpi/riscv64/virt/SPCR
new file mode 100644
index 0000000..4da9daf
--- /dev/null
+++ b/tests/data/acpi/riscv64/virt/SPCR
Binary files differ
diff --git a/tests/data/acpi/x86/microvm/DSDT.pcie b/tests/data/acpi/x86/microvm/DSDT.pcie
index 765f14e..8eacd21 100644
--- a/tests/data/acpi/x86/microvm/DSDT.pcie
+++ b/tests/data/acpi/x86/microvm/DSDT.pcie
Binary files differ
diff --git a/tests/qtest/bios-tables-test.c b/tests/qtest/bios-tables-test.c
index f4c4704..36e5c0a 100644
--- a/tests/qtest/bios-tables-test.c
+++ b/tests/qtest/bios-tables-test.c
@@ -267,15 +267,6 @@
data->arch, data->machine,
sdt->aml, ext);
- /*
- * To keep test cases not failing before the DATA files are moved to
- * ${arch}/${machine} folder, add this check as well.
- */
- if (!g_file_test(aml_file, G_FILE_TEST_EXISTS)) {
- aml_file = g_strdup_printf("%s/%s/%.4s%s", data_dir,
- data->machine, sdt->aml, ext);
- }
-
if (!g_file_test(aml_file, G_FILE_TEST_EXISTS) &&
sdt->aml_len == exp_sdt->aml_len &&
!memcmp(sdt->aml, exp_sdt->aml, sdt->aml_len)) {
@@ -412,11 +403,6 @@
try_again:
aml_file = g_strdup_printf("%s/%s/%s/%.4s%s", data_dir, data->arch,
data->machine, sdt->aml, ext);
- if (!g_file_test(aml_file, G_FILE_TEST_EXISTS)) {
- aml_file = g_strdup_printf("%s/%s/%.4s%s", data_dir, data->machine,
- sdt->aml, ext);
- }
-
if (verbosity_level >= 2) {
fprintf(stderr, "Looking for expected file '%s'\n", aml_file);
}
@@ -1977,6 +1963,28 @@
}
#endif /* CONFIG_POSIX */
+/*
+ * ACPI table test for the riscv64 "virt" machine.
+ *
+ * Builds a TCG-only test_data pointing at the EDK2 RISC-V firmware
+ * images and the riscv64 boot ISO, runs test_acpi_one() with the
+ * rva22s64 CPU profile, then releases the test data.
+ */
+static void test_acpi_riscv64_virt_tcg(void)
+{
+ test_data data = {
+ .machine = "virt",
+ .arch = "riscv64",
+ .tcg_only = true,
+ .uefi_fl1 = "pc-bios/edk2-riscv-code.fd",
+ .uefi_fl2 = "pc-bios/edk2-riscv-vars.fd",
+ .cd = "tests/data/uefi-boot-images/bios-tables-test.riscv64.iso.qcow2",
+ .ram_start = 0x80000000ULL,
+ .scan_len = 128ULL * 1024 * 1024,
+ };
+
+ /*
+ * RHCT will have ISA string encoded. To reduce the effort
+ * of updating expected AML file for any new default ISA extension,
+ * use the profile rva22s64.
+ */
+ test_acpi_one("-cpu rva22s64 ", &data);
+ free_test_data(&data);
+}
+
static void test_acpi_aarch64_virt_tcg(void)
{
test_data data = {
@@ -2455,6 +2463,10 @@
qtest_add_func("acpi/virt/viot", test_acpi_aarch64_virt_viot);
}
}
+ } else if (strcmp(arch, "riscv64") == 0) {
+ if (has_tcg && qtest_has_device("virtio-blk-pci")) {
+ qtest_add_func("acpi/virt", test_acpi_riscv64_virt_tcg);
+ }
}
ret = g_test_run();
boot_sector_cleanup(disk);
diff --git a/tools/i386/qemu-vmsr-helper.c b/tools/i386/qemu-vmsr-helper.c
new file mode 100644
index 0000000..ebf562c
--- /dev/null
+++ b/tools/i386/qemu-vmsr-helper.c
@@ -0,0 +1,530 @@
+/*
+ * Privileged RAPL MSR helper commands for QEMU
+ *
+ * Copyright (C) 2024 Red Hat, Inc. <aharivel@redhat.com>
+ *
+ * Author: Anthony Harivel <aharivel@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include <getopt.h>
+#include <stdbool.h>
+#include <sys/ioctl.h>
+#ifdef CONFIG_LIBCAP_NG
+#include <cap-ng.h>
+#endif
+#include <pwd.h>
+#include <grp.h>
+
+#include "qemu/help-texts.h"
+#include "qapi/error.h"
+#include "qemu/cutils.h"
+#include "qemu/main-loop.h"
+#include "qemu/module.h"
+#include "qemu/error-report.h"
+#include "qemu/config-file.h"
+#include "qemu-version.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/log.h"
+#include "qemu/systemd.h"
+#include "io/channel.h"
+#include "io/channel-socket.h"
+#include "trace/control.h"
+#include "qemu-version.h"
+#include "rapl-msr-index.h"
+
+#define MSR_PATH_TEMPLATE "/dev/cpu/%u/msr"
+
+/* Unix socket path and PID file path; defaults come from compute_default_paths(). */
+static char *socket_path;
+static char *pidfile;
+/*
+ * Shutdown state machine: the signal handler moves RUNNING -> TERMINATE,
+ * the main loop then moves TERMINATE -> TERMINATING and closes the server.
+ */
+static enum { RUNNING, TERMINATE, TERMINATING } state;
+/* Listening channel and the id of the watch installed on it in main(). */
+static QIOChannelSocket *server_ioc;
+static int server_watch;
+/*
+ * main() loops while this is > 0; it starts at 1 for the server socket
+ * and is decremented by close_server_socket().
+ */
+static int num_active_sockets = 1;
+
+#ifdef CONFIG_LIBCAP_NG
+/*
+ * Target user/group parsed from -u/-g (-1 = not given).
+ * NOTE(review): nothing visible in this file reads these after parsing;
+ * confirm whether drop_privileges() was meant to use them.
+ */
+static int uid = -1;
+static int gid = -1;
+#endif
+
+/*
+ * Initialise the default socket and PID file locations, both placed in
+ * the "run" sub-directory of the directory returned by
+ * qemu_get_local_state_dir().
+ */
+static void compute_default_paths(void)
+{
+    g_autofree char *state_dir = qemu_get_local_state_dir();
+
+    pidfile = g_build_filename(state_dir, "run", "qemu-vmsr-helper.pid", NULL);
+    socket_path = g_build_filename(state_dir, "run", "qemu-vmsr-helper.sock",
+                                   NULL);
+}
+
+/*
+ * Report whether the host CPU identifies itself as "GenuineIntel".
+ *
+ * Returns 1 when the CPUID leaf 0 vendor string matches, 0 otherwise.
+ */
+static int is_intel_processor(void)
+{
+    /* Leaf 0 (basic identification) is selected through EAX. */
+    unsigned int eax = 0;
+    unsigned int ebx, ecx, edx;
+
+    /*
+     * CPUID overwrites EAX as well as EBX/ECX/EDX, so EAX has to be a
+     * read-write operand: an input-only operand must not be modified by
+     * the asm, otherwise the compiler may keep assuming it still holds 0.
+     */
+    asm volatile (
+        "cpuid"
+        : "+a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
+    );
+
+    /*
+     * The vendor string is returned in EBX, EDX, ECX order:
+     *  0x756e6547 = "Genu"
+     *  0x49656e69 = "ineI"
+     *  0x6c65746e = "ntel"
+     */
+    return (ebx == 0x756e6547) && (edx == 0x49656e69) && (ecx == 0x6c65746e);
+}
+
+/*
+ * Read the integer stored in the powercap "intel-rapl/enabled" sysfs file.
+ *
+ * Returns the parsed value, or 0 when the file cannot be opened or does
+ * not contain an integer (an error is reported in both cases).
+ */
+static int is_rapl_enabled(void)
+{
+    const char *path = "/sys/class/powercap/intel-rapl/enabled";
+    int enabled = 0;
+    FILE *fp = fopen(path, "r");
+
+    if (fp == NULL) {
+        error_report("Error opening %s", path);
+        return enabled;
+    }
+
+    if (fscanf(fp, "%d", &enabled) != 1) {
+        error_report("INTEL RAPL not enabled");
+    }
+    fclose(fp);
+
+    return enabled;
+}
+
+/*
+ * Check that the TID which requests the MSR read belongs to the peer
+ * process, i.e. that /proc/<pid>/task/<tid> exists.
+ * It should be the TID of a vCPU.
+ *
+ * Returns true when the task directory is accessible; otherwise reports
+ * an error and returns false.
+ */
+static bool is_tid_present(pid_t pid, pid_t tid)
+{
+    g_autofree char *task_dir = g_strdup_printf("/proc/%d/task/%d", pid, tid);
+    bool found = access(task_dir, F_OK) == 0;
+
+    if (!found) {
+        error_report("Failed to open /proc at %s", task_dir);
+    }
+
+    return found;
+}
+
+/*
+ * Allow-list check: only the four RAPL MSR indexes named below may be
+ * read on behalf of a client.
+ */
+static bool is_msr_allowed(uint32_t reg)
+{
+    return reg == MSR_RAPL_POWER_UNIT ||
+           reg == MSR_PKG_POWER_LIMIT ||
+           reg == MSR_PKG_ENERGY_STATUS ||
+           reg == MSR_PKG_POWER_INFO;
+}
+
+/*
+ * Read one 64-bit MSR through the per-CPU MSR device node.
+ *
+ * @msr_register: MSR index, used as the file offset for pread()
+ * @cpu_id: CPU number substituted into MSR_PATH_TEMPLATE
+ *
+ * Returns the value read, or 0 when the device cannot be opened or the
+ * read is short (an error is reported in both cases).
+ */
+static uint64_t vmsr_read_msr(uint32_t msr_register, unsigned int cpu_id)
+{
+    g_autofree char *msr_path = g_strdup_printf(MSR_PATH_TEMPLATE, cpu_id);
+    uint64_t value = 0;
+    int msr_fd = open(msr_path, O_RDONLY);
+
+    if (msr_fd < 0) {
+        error_report("Failed to open MSR file at %s", msr_path);
+        return 0;
+    }
+
+    if (pread(msr_fd, &value, sizeof(value), msr_register) != sizeof(value)) {
+        error_report("Failed to read MSR");
+        value = 0;
+    }
+    close(msr_fd);
+
+    return value;
+}
+
+/*
+ * Print the command line help, substituting the program name and the
+ * current pidfile / socket_path values as the advertised defaults.
+ *
+ * printf is written as (printf) so that a function-like macro of that
+ * name is not expanded; presumably this is because the argument list
+ * contains an #ifdef, which is not valid inside a macro invocation.
+ *
+ * NOTE(review): the synopsis mentions a FILE operand, but main() does not
+ * appear to read any positional argument - confirm and adjust the text.
+ */
+static void usage(const char *name)
+{
+ (printf) (
+"Usage: %s [OPTIONS] FILE\n"
+"Virtual RAPL MSR helper program for QEMU\n"
+"\n"
+" -h, --help display this help and exit\n"
+" -V, --version output version information and exit\n"
+"\n"
+" -d, --daemon run in the background\n"
+" -f, --pidfile=PATH PID file when running as a daemon\n"
+" (default '%s')\n"
+" -k, --socket=PATH path to the unix socket\n"
+" (default '%s')\n"
+" -T, --trace [[enable=]<pattern>][,events=<file>][,file=<file>]\n"
+" specify tracing options\n"
+#ifdef CONFIG_LIBCAP_NG
+" -u, --user=USER user to drop privileges to\n"
+" -g, --group=GROUP group to drop privileges to\n"
+#endif
+"\n"
+QEMU_HELP_BOTTOM "\n"
+ , name, pidfile, socket_path);
+}
+
+/*
+ * Print the program name followed by the QEMU version, author and
+ * copyright/warranty notice.
+ */
+static void version(const char *name)
+{
+ printf(
+"%s " QEMU_FULL_VERSION "\n"
+"Written by Anthony Harivel.\n"
+"\n"
+QEMU_COPYRIGHT "\n"
+"This is free software; see the source for copying conditions. There is NO\n"
+"warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n"
+ , name);
+}
+
+/*
+ * Per-connection state. Allocated in accept_client() and released by
+ * vh_co_entry() when the connection is finished.
+ */
+typedef struct VMSRHelperClient {
+ QIOChannelSocket *ioc; /* accepted client channel */
+ Coroutine *co; /* coroutine running vh_co_entry() for this client */
+} VMSRHelperClient;
+
+/*
+ * Per-client coroutine: serve RAPL MSR read requests from one connection.
+ *
+ * @opaque: the VMSRHelperClient allocated by accept_client(). The channel
+ *          reference is dropped and the client freed before returning.
+ *
+ * Each request consists of three uint32_t values read from the socket:
+ *   request[0]  MSR index, must pass is_msr_allowed()
+ *   request[1]  CPU number handed to vmsr_read_msr()
+ *   request[2]  thread id, must exist under /proc/<peer pid>/task
+ * The reply is the 64-bit MSR value, or 0 when the TID check fails.
+ * The loop ends on the first I/O error or disallowed MSR.
+ */
+static void coroutine_fn vh_co_entry(void *opaque)
+{
+    VMSRHelperClient *client = opaque;
+    Error *local_err = NULL;
+    unsigned int peer_pid;
+    uint32_t request[3];
+    uint64_t vmsr;
+    int r;
+
+    qio_channel_set_blocking(QIO_CHANNEL(client->ioc),
+                             false, NULL);
+
+    qio_channel_set_follow_coroutine_ctx(QIO_CHANNEL(client->ioc), true);
+
+    /*
+     * Check peer credentials
+     */
+    r = qio_channel_get_peerpid(QIO_CHANNEL(client->ioc),
+                                &peer_pid,
+                                &local_err);
+    if (r < 0) {
+        error_report_err(local_err);
+        goto out;
+    }
+
+    /*
+     * r is known to be >= 0 here, so the loop has to run while the last
+     * channel operation succeeded; testing for r < 0 would never enter
+     * it and no request would ever be answered.
+     */
+    while (r >= 0) {
+        /*
+         * Read the requested MSR
+         * Only RAPL MSR in rapl-msr-index.h is allowed
+         */
+        r = qio_channel_read_all(QIO_CHANNEL(client->ioc),
+                                 (char *) &request, sizeof(request),
+                                 &local_err);
+        if (r < 0) {
+            error_report_err(local_err);
+            break;
+        }
+
+        if (!is_msr_allowed(request[0])) {
+            error_report("Requested unallowed msr: %d", request[0]);
+            break;
+        }
+
+        vmsr = vmsr_read_msr(request[0], request[1]);
+
+        /* Never hand a value to a thread that is not part of the peer. */
+        if (!is_tid_present(peer_pid, request[2])) {
+            error_report("Requested TID not in peer PID: %d %d",
+                         peer_pid, request[2]);
+            vmsr = 0;
+        }
+
+        r = qio_channel_write_all(QIO_CHANNEL(client->ioc),
+                                  (char *) &vmsr,
+                                  sizeof(vmsr),
+                                  &local_err);
+        if (r < 0) {
+            error_report_err(local_err);
+            break;
+        }
+    }
+out:
+    object_unref(OBJECT(client->ioc));
+    g_free(client);
+}
+
+/*
+ * Watch callback for the listening socket: accept one connection and
+ * start a vh_co_entry() coroutine for it.
+ *
+ * Always returns TRUE, including when the accept fails.
+ */
+static gboolean accept_client(QIOChannel *ioc,
+                              GIOCondition cond,
+                              gpointer opaque)
+{
+    VMSRHelperClient *client;
+    QIOChannelSocket *client_ioc =
+        qio_channel_socket_accept(QIO_CHANNEL_SOCKET(ioc), NULL);
+
+    if (client_ioc) {
+        client = g_new(VMSRHelperClient, 1);
+        client->ioc = client_ioc;
+        client->co = qemu_coroutine_create(vh_co_entry, client);
+        qemu_coroutine_enter(client->co);
+    }
+
+    return TRUE;
+}
+
+/*
+ * Handler installed by main() for SIGTERM, SIGINT and SIGHUP.
+ *
+ * Moves the global state from RUNNING to TERMINATE (no change if it is
+ * already in another state) and then calls qemu_notify_event(),
+ * presumably to wake the main loop so it notices the new state.
+ */
+static void termsig_handler(int signum)
+{
+ qatomic_cmpxchg(&state, RUNNING, TERMINATE);
+ qemu_notify_event();
+}
+
+/*
+ * Stop listening: remove the watch on the server channel, drop the
+ * reference to it and decrement num_active_sockets, which is the counter
+ * main() loops on.
+ */
+static void close_server_socket(void)
+{
+ assert(server_ioc);
+
+ g_source_remove(server_watch);
+ server_watch = -1;
+ object_unref(OBJECT(server_ioc));
+ num_active_sockets--;
+}
+
+#ifdef CONFIG_LIBCAP_NG
+/*
+ * Prepare a libcap-ng capability set that contains only CAP_SYS_RAWIO
+ * (effective and permitted).
+ *
+ * Returns 0 on success, -1 if capng_update() fails.
+ *
+ * NOTE(review): no capng_apply() or capng_change_id() call is visible
+ * here, and the uid/gid parsed from -u/-g are not used, so the prepared
+ * set looks like it is never applied to the process - confirm.
+ */
+static int drop_privileges(void)
+{
+ /* clear all capabilities */
+ capng_clear(CAPNG_SELECT_BOTH);
+
+ if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE | CAPNG_PERMITTED,
+ CAP_SYS_RAWIO) < 0) {
+ return -1;
+ }
+
+ return 0;
+}
+#endif
+
+/*
+ * Entry point of the RAPL MSR helper.
+ *
+ * Installs the termination signal handlers, checks that the host is an
+ * Intel CPU with RAPL enabled, parses the command line, sets up the
+ * listening Unix socket (either created here or inherited through socket
+ * activation), optionally daemonizes and writes the PID file, and then
+ * runs the main loop until the server socket has been closed.
+ *
+ * Exits with EXIT_SUCCESS after a clean shutdown and with a non-zero
+ * status on any set-up error.
+ *
+ * NOTE(review): -v/--verbose and -q are accepted by getopt but have no
+ * case in the switch below, so they are silently ignored.
+ */
+int main(int argc, char **argv)
+{
+    const char *sopt = "hVk:f:dT:u:g:vq";
+    struct option lopt[] = {
+        { "help", no_argument, NULL, 'h' },
+        { "version", no_argument, NULL, 'V' },
+        { "socket", required_argument, NULL, 'k' },
+        { "pidfile", required_argument, NULL, 'f' },
+        { "daemon", no_argument, NULL, 'd' },
+        { "trace", required_argument, NULL, 'T' },
+        /* Long forms advertised by usage(); handled by the 'u'/'g' cases. */
+        { "user", required_argument, NULL, 'u' },
+        { "group", required_argument, NULL, 'g' },
+        { "verbose", no_argument, NULL, 'v' },
+        { NULL, 0, NULL, 0 }
+    };
+    int opt_ind = 0;
+    int ch;
+    Error *local_err = NULL;
+    bool daemonize = false;
+    bool pidfile_specified = false;
+    bool socket_path_specified = false;
+    unsigned socket_activation;
+
+    struct sigaction sa_sigterm;
+    memset(&sa_sigterm, 0, sizeof(sa_sigterm));
+    sa_sigterm.sa_handler = termsig_handler;
+    sigaction(SIGTERM, &sa_sigterm, NULL);
+    sigaction(SIGINT, &sa_sigterm, NULL);
+    sigaction(SIGHUP, &sa_sigterm, NULL);
+
+    signal(SIGPIPE, SIG_IGN);
+
+    error_init(argv[0]);
+    module_call_init(MODULE_INIT_TRACE);
+    module_call_init(MODULE_INIT_QOM);
+    qemu_add_opts(&qemu_trace_opts);
+    qemu_init_exec_dir(argv[0]);
+
+    compute_default_paths();
+
+    /*
+     * Sanity check
+     * 1. cpu must be Intel cpu
+     * 2. RAPL must be enabled
+     */
+    if (!is_intel_processor()) {
+        error_report("error: CPU is not INTEL cpu");
+        exit(EXIT_FAILURE);
+    }
+
+    if (!is_rapl_enabled()) {
+        error_report("error: RAPL driver not enable");
+        exit(EXIT_FAILURE);
+    }
+
+    while ((ch = getopt_long(argc, argv, sopt, lopt, &opt_ind)) != -1) {
+        switch (ch) {
+        case 'k':
+            g_free(socket_path);
+            socket_path = g_strdup(optarg);
+            socket_path_specified = true;
+            if (socket_path[0] != '/') {
+                error_report("socket path must be absolute");
+                exit(EXIT_FAILURE);
+            }
+            break;
+        case 'f':
+            g_free(pidfile);
+            pidfile = g_strdup(optarg);
+            pidfile_specified = true;
+            break;
+#ifdef CONFIG_LIBCAP_NG
+        case 'u': {
+            unsigned long res;
+            struct passwd *userinfo = getpwnam(optarg);
+            if (userinfo) {
+                uid = userinfo->pw_uid;
+            } else if (qemu_strtoul(optarg, NULL, 10, &res) == 0 &&
+                       (uid_t)res == res) {
+                uid = res;
+            } else {
+                error_report("invalid user '%s'", optarg);
+                exit(EXIT_FAILURE);
+            }
+            break;
+        }
+        case 'g': {
+            unsigned long res;
+            struct group *groupinfo = getgrnam(optarg);
+            if (groupinfo) {
+                gid = groupinfo->gr_gid;
+            } else if (qemu_strtoul(optarg, NULL, 10, &res) == 0 &&
+                       (gid_t)res == res) {
+                gid = res;
+            } else {
+                error_report("invalid group '%s'", optarg);
+                exit(EXIT_FAILURE);
+            }
+            break;
+        }
+#else
+        case 'u':
+        case 'g':
+            error_report("-%c not supported by this %s", ch, argv[0]);
+            exit(1);
+#endif
+        case 'd':
+            daemonize = true;
+            break;
+        case 'T':
+            trace_opt_parse(optarg);
+            break;
+        case 'V':
+            version(argv[0]);
+            exit(EXIT_SUCCESS);
+            break;
+        case 'h':
+            usage(argv[0]);
+            exit(EXIT_SUCCESS);
+            break;
+        case '?':
+            error_report("Try `%s --help' for more information.", argv[0]);
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    if (!trace_init_backends()) {
+        exit(EXIT_FAILURE);
+    }
+    trace_init_file();
+    qemu_set_log(LOG_TRACE, &error_fatal);
+
+    socket_activation = check_socket_activation();
+    if (socket_activation == 0) {
+        SocketAddress saddr;
+        saddr = (SocketAddress){
+            .type = SOCKET_ADDRESS_TYPE_UNIX,
+            .u.q_unix.path = socket_path,
+        };
+        server_ioc = qio_channel_socket_new();
+        if (qio_channel_socket_listen_sync(server_ioc, &saddr,
+                                           1, &local_err) < 0) {
+            object_unref(OBJECT(server_ioc));
+            error_report_err(local_err);
+            return 1;
+        }
+    } else {
+        /* Using socket activation - check user didn't use -k etc. */
+        if (socket_path_specified) {
+            /* Adjacent literals are concatenated: keep the space. */
+            error_report("Unix socket can't be set when "
+                         "using socket activation");
+            exit(EXIT_FAILURE);
+        }
+
+        /* Can only listen on a single socket. */
+        if (socket_activation > 1) {
+            error_report("%s does not support socket activation "
+                         "with LISTEN_FDS > 1",
+                         argv[0]);
+            exit(EXIT_FAILURE);
+        }
+        server_ioc = qio_channel_socket_new_fd(FIRST_SOCKET_ACTIVATION_FD,
+                                               &local_err);
+        if (server_ioc == NULL) {
+            error_reportf_err(local_err,
+                              "Failed to use socket activation: ");
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    qemu_init_main_loop(&error_fatal);
+
+    server_watch = qio_channel_add_watch(QIO_CHANNEL(server_ioc),
+                                         G_IO_IN,
+                                         accept_client,
+                                         NULL, NULL);
+
+    if (daemonize) {
+        if (daemon(0, 0) < 0) {
+            error_report("Failed to daemonize: %s", strerror(errno));
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    if (daemonize || pidfile_specified) {
+        qemu_write_pidfile(pidfile, &error_fatal);
+    }
+
+#ifdef CONFIG_LIBCAP_NG
+    if (drop_privileges() < 0) {
+        error_report("Failed to drop privileges: %s", strerror(errno));
+        exit(EXIT_FAILURE);
+    }
+#endif
+
+    info_report("Listening on %s", socket_path);
+
+    state = RUNNING;
+    do {
+        main_loop_wait(false);
+        /* Set by termsig_handler(); close the listener exactly once. */
+        if (state == TERMINATE) {
+            state = TERMINATING;
+            close_server_socket();
+        }
+    } while (num_active_sockets > 0);
+
+    exit(EXIT_SUCCESS);
+}
diff --git a/tools/i386/rapl-msr-index.h b/tools/i386/rapl-msr-index.h
new file mode 100644
index 0000000..9a71186
--- /dev/null
+++ b/tools/i386/rapl-msr-index.h
@@ -0,0 +1,28 @@
+/*
+ * Allowed list of MSR for Privileged RAPL MSR helper commands for QEMU
+ *
+ * Copyright (C) 2023 Red Hat, Inc. <aharivel@redhat.com>
+ *
+ * Author: Anthony Harivel <aharivel@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Should stay in sync with the RAPL MSR
+ * in target/i386/cpu.h
+ */
+#define MSR_RAPL_POWER_UNIT 0x00000606
+#define MSR_PKG_POWER_LIMIT 0x00000610
+#define MSR_PKG_ENERGY_STATUS 0x00000611
+#define MSR_PKG_POWER_INFO 0x00000614