target/i386/kvm/xen-emu.c - qemu - Git at Google

 /*
  * Xen HVM emulation support in KVM
  *
  * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
  * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
  *
  * This work is licensed under the terms of the GNU GPL, version 2 or later.
  * See the COPYING file in the top-level directory.
  *
  */

 #include "qemu/osdep.h"
 #include "qemu/log.h"
 #include "qemu/main-loop.h"
 #include "hw/xen/xen.h"
 #include "sysemu/kvm_int.h"
 #include "sysemu/kvm_xen.h"
 #include "kvm/kvm_i386.h"
 #include "exec/address-spaces.h"
 #include "xen-emu.h"
 #include "trace.h"
 #include "sysemu/runstate.h"

 #include "hw/i386/kvm/xen_overlay.h"

 #include "hw/xen/interface/version.h"
 #include "hw/xen/interface/sched.h"
 #include "hw/xen/interface/memory.h"
 #include "hw/xen/interface/hvm/hvm_op.h"
 #include "hw/xen/interface/vcpu.h"

 #include "xen-compat.h"

 /*
  * hypercall_compat32(longmode): selects the 32-bit "compat" argument layouts
  * when decoding a hypercall. On a 64-bit target it is true whenever the
  * supplied longmode flag is zero; on a 32-bit-only target it is always false.
  */
 #ifdef TARGET_X86_64
 #define hypercall_compat32(longmode) (!(longmode))
 #else
 #define hypercall_compat32(longmode) (false)
 #endif

 /*
  * Copy @sz bytes between the host buffer @_buf and guest virtual address
  * @gva, translating one page at a time with KVM_TRANSLATE on @cs.
  *
  * @is_write selects the direction: true copies from @_buf into the guest
  * (and requires the mapping to be writeable), false copies out of it.
  *
  * Returns 0 on success, or -EFAULT as soon as a translation fails. Pages
  * handled before the failing one have already been transferred.
  */
 static int kvm_gva_rw(CPUState *cs, uint64_t gva, void *_buf, size_t sz,
                       bool is_write)
 {
     uint8_t *cursor = _buf;

     while (sz > 0) {
         struct kvm_translation xlate = {
             .linear_address = gva,
         };
         /* A single translation is only good up to the end of its page. */
         size_t chunk = TARGET_PAGE_SIZE - (gva & ~TARGET_PAGE_MASK);

         if (chunk > sz) {
             chunk = sz;
         }

         if (kvm_vcpu_ioctl(cs, KVM_TRANSLATE, &xlate) != 0 ||
             !xlate.valid ||
             (is_write && !xlate.writeable)) {
             return -EFAULT;
         }

         cpu_physical_memory_rw(xlate.physical_address, cursor, chunk,
                                is_write);

         cursor += chunk;
         gva += chunk;
         sz -= chunk;
     }

     return 0;
 }

 /*
  * Read @sz bytes from guest virtual address @gva into @buf.
  * Returns whatever kvm_gva_rw() returns for a read.
  */
 static inline int kvm_copy_from_gva(CPUState *cs, uint64_t gva, void *buf,
                                     size_t sz)
 {
     const bool is_write = false;

     return kvm_gva_rw(cs, gva, buf, sz, is_write);
 }

 /*
  * Write @sz bytes from @buf to guest virtual address @gva.
  * Returns whatever kvm_gva_rw() returns for a write.
  */
 static inline int kvm_copy_to_gva(CPUState *cs, uint64_t gva, void *buf,
                                   size_t sz)
 {
     const bool is_write = true;

     return kvm_gva_rw(cs, gva, buf, sz, is_write);
 }

 /*
  * Enable Xen HVM support in KVM for the VM described by @s, using
  * @hypercall_msr as the hypercall MSR.
  *
  * Returns 0 on success and records the kernel's capability bits in
  * s->xen_caps. Returns -ENOSYS if a required capability is missing, or the
  * negative value returned by the KVM_XEN_HVM_CONFIG ioctl if that fails.
  */
 int kvm_xen_init(KVMState *s, uint32_t hypercall_msr)
 {
     const int needed = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
                        KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
                        KVM_XEN_HVM_CONFIG_SHARED_INFO;
     struct kvm_xen_hvm_config hvm_cfg = {
         .msr = hypercall_msr,
         .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
     };
     int caps = kvm_check_extension(s, KVM_CAP_XEN_HVM);
     int rc;

     if ((caps & needed) != needed) {
         error_report("kvm: Xen HVM guest support not present or insufficient");
         return -ENOSYS;
     }

     if (caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND) {
         struct kvm_xen_hvm_attr ver_attr = {
             .type = KVM_XEN_ATTR_TYPE_XEN_VERSION,
             .u.xen_version = s->xen_version,
         };

         /* The result of setting the version is deliberately ignored. */
         (void)kvm_vm_ioctl(s, KVM_XEN_HVM_SET_ATTR, &ver_attr);

         hvm_cfg.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
     }

     rc = kvm_vm_ioctl(s, KVM_XEN_HVM_CONFIG, &hvm_cfg);
     if (rc < 0) {
         error_report("kvm: Failed to enable Xen HVM support: %s",
                      strerror(-rc));
         return rc;
     }

     s->xen_caps = caps;
     return 0;
 }

 /*
  * Per-vCPU Xen setup: optionally tell the kernel the Xen vCPU ID and reset
  * both cached vcpu_info addresses to INVALID_GPA.
  *
  * Returns 0 on success, or the error from KVM_XEN_VCPU_SET_ATTR.
  */
 int kvm_xen_init_vcpu(CPUState *cs)
 {
     CPUX86State *env = &X86_CPU(cs)->env;

     /*
      * The kernel has to be told the Xen/ACPI vCPU ID, since that is what
      * the guest passes in hypercalls such as timers. It is not the APIC ID
      * normally used when talking to the kernel about vCPUs, and if vCPU
      * threads race and create their KVM vCPUs out of order it need not
      * match the kernel's internal vCPU index either.
      */
     if (kvm_xen_has_cap(EVTCHN_SEND)) {
         struct kvm_xen_vcpu_attr id_attr = {
             .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID,
             .u.vcpu_id = cs->cpu_index,
         };
         int rc = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &id_attr);

         if (rc) {
             error_report("kvm: Failed to set Xen vCPU ID attribute: %s",
                          strerror(-rc));
             return rc;
         }
     }

     env->xen_vcpu_info_default_gpa = INVALID_GPA;
     env->xen_vcpu_info_gpa = INVALID_GPA;

     return 0;
 }

 /* Return the Xen capability bits stored in the global kvm_state. */
 uint32_t kvm_xen_get_caps(void)
 {
     uint32_t caps = kvm_state->xen_caps;

     return caps;
 }

 /*
  * Handle the xen_version hypercall. Only XENVER_get_features is handled:
  * the guest's xen_feature_info at @arg is read, its submap is filled in
  * (non-zero only for submap index 0) and the structure is written back.
  *
  * Returns false for any other @cmd; otherwise stores the result in
  * exit->u.hcall.result and returns true.
  */
 static bool kvm_xen_hcall_xen_version(struct kvm_xen_exit *exit, X86CPU *cpu,
                                      int cmd, uint64_t arg)
 {
     struct xen_feature_info fi;
     int err;

     if (cmd != XENVER_get_features) {
         return false;
     }

     /* No need for 32/64 compat handling */
     qemu_build_assert(sizeof(fi) == 8);

     err = kvm_copy_from_gva(CPU(cpu), arg, &fi, sizeof(fi));
     if (!err) {
         fi.submap = 0;
         if (fi.submap_idx == 0) {
             fi.submap = 1 << XENFEAT_writable_page_tables |
                         1 << XENFEAT_writable_descriptor_tables |
                         1 << XENFEAT_auto_translated_physmap |
                         1 << XENFEAT_supervisor_mode_kernel;
         }

         err = kvm_copy_to_gva(CPU(cpu), arg, &fi, sizeof(fi));
     }

     exit->u.hcall.result = err;
     return true;
 }

 /*
  * Set a GPA-valued Xen vCPU attribute of the given @type on @cs.
  *
  * Returns the result of the KVM_XEN_VCPU_SET_ATTR ioctl.
  */
 static int kvm_xen_set_vcpu_attr(CPUState *cs, uint16_t type, uint64_t gpa)
 {
     struct kvm_xen_vcpu_attr xhsi;

     /*
      * Only 'type' and one member of the union 'u' are assigned below. Zero
      * the whole structure first so that no uninitialised stack bytes are
      * handed to the kernel.
      */
     memset(&xhsi, 0, sizeof(xhsi));
     xhsi.type = type;
     xhsi.u.gpa = gpa;

     trace_kvm_xen_set_vcpu_attr(cs->cpu_index, type, gpa);

     return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xhsi);
 }

 /*
  * run_on_cpu callback: record data.host_ulong as the default vcpu_info
  * address for @cs and, unless an explicit vcpu_info address is already set,
  * pass that default to the kernel.
  */
 static void do_set_vcpu_info_default_gpa(CPUState *cs, run_on_cpu_data data)
 {
     CPUX86State *env = &X86_CPU(cs)->env;

     env->xen_vcpu_info_default_gpa = data.host_ulong;

     /* An explicitly registered vcpu_info takes precedence over the default. */
     if (env->xen_vcpu_info_gpa != INVALID_GPA) {
         return;
     }

     kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO,
                           env->xen_vcpu_info_default_gpa);
 }

 /*
  * run_on_cpu callback: record data.host_ulong as the explicit vcpu_info
  * address for @cs and pass it to the kernel.
  */
 static void do_set_vcpu_info_gpa(CPUState *cs, run_on_cpu_data data)
 {
     CPUX86State *env = &X86_CPU(cs)->env;

     env->xen_vcpu_info_gpa = data.host_ulong;

     kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO,
                           env->xen_vcpu_info_gpa);
 }

 /*
  * run_on_cpu callback used on soft reset: forget both the explicit and the
  * default vcpu_info address of @cs and set the kernel's vcpu_info attribute
  * to INVALID_GPA. @data is unused.
  */
 static void do_vcpu_soft_reset(CPUState *cs, run_on_cpu_data data)
 {
     CPUX86State *env = &X86_CPU(cs)->env;

     env->xen_vcpu_info_default_gpa = INVALID_GPA;
     env->xen_vcpu_info_gpa = INVALID_GPA;

     kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, INVALID_GPA);
 }

 static int xen_set_shared_info(uint64_t gfn)
 {
     uint64_t gpa = gfn << TARGET_PAGE_BITS;
     int i, err;

     QEMU_IOTHREAD_LOCK_GUARD();

     /*
      * The xen_overlay device tells KVM about it too, since it had to
      * do that on migration load anyway (unless we're going to jump
      * through lots of hoops to maintain the fiction that this isn't
      * KVM-specific.
      */
     err = xen_overlay_map_shinfo_page(gpa);
     if (err) {
             return err;
     }

     trace_kvm_xen_set_shared_info(gfn);

     for (i = 0; i < XEN_LEGACY_MAX_VCPUS; i++) {
         CPUState *cpu = qemu_get_cpu(i);
         if (cpu) {
             async_run_on_cpu(cpu, do_set_vcpu_info_default_gpa,
                              RUN_ON_CPU_HOST_ULONG(gpa));
         }
         gpa += sizeof(vcpu_info_t);
     }

     return err;
 }

 /*
  * Perform a single add-to-physmap operation.
  *
  * Only XENMAPSPACE_shared_info with @idx == 0 is acted on (it maps the
  * shared info page at @gfn). Returns -ENOTSUP for the grant_table, gmfn and
  * gmfn_range spaces, -EPERM for gmfn_foreign and dev_mmio, and -EINVAL for
  * anything else, including a non-zero @idx for shared_info.
  */
 static int add_to_physmap_one(uint32_t space, uint64_t idx, uint64_t gfn)
 {
     if (space == XENMAPSPACE_shared_info) {
         return idx > 0 ? -EINVAL : xen_set_shared_info(gfn);
     }

     if (space == XENMAPSPACE_grant_table ||
         space == XENMAPSPACE_gmfn ||
         space == XENMAPSPACE_gmfn_range) {
         return -ENOTSUP;
     }

     if (space == XENMAPSPACE_gmfn_foreign ||
         space == XENMAPSPACE_dev_mmio) {
         return -EPERM;
     }

     return -EINVAL;
 }

 /*
  * XENMEM_add_to_physmap: read the request structure from guest address
  * @arg (using the 32-bit compat layout when required), check that it
  * targets this domain, and perform the operation.
  *
  * Returns -EFAULT if the request cannot be read, -ESRCH for a foreign
  * domain ID, otherwise the result of add_to_physmap_one().
  */
 static int do_add_to_physmap(struct kvm_xen_exit *exit, X86CPU *cpu,
                              uint64_t arg)
 {
     struct xen_add_to_physmap xatp;
     CPUState *cs = CPU(cpu);

     if (!hypercall_compat32(exit->u.hcall.longmode)) {
         if (kvm_copy_from_gva(cs, arg, &xatp, sizeof(xatp)) != 0) {
             return -EFAULT;
         }
     } else {
         struct compat_xen_add_to_physmap compat;

         qemu_build_assert(sizeof(struct compat_xen_add_to_physmap) == 16);
         if (kvm_copy_from_gva(cs, arg, &compat, sizeof(compat)) != 0) {
             return -EFAULT;
         }

         /* Widen the compat request into the native structure. */
         xatp.domid = compat.domid;
         xatp.size = compat.size;
         xatp.space = compat.space;
         xatp.idx = compat.idx;
         xatp.gpfn = compat.gpfn;
     }

     if (xatp.domid != DOMID_SELF && xatp.domid != xen_domid) {
         return -ESRCH;
     }

     return add_to_physmap_one(xatp.space, xatp.idx, xatp.gpfn);
 }

 /*
  * XENMEM_add_to_physmap_batch: read the batch header from guest address
  * @arg (compat layout when required), then for each entry read one index
  * and one gpfn from the guest arrays, perform the operation and write the
  * per-entry int result to the guest's error array.
  *
  * Returns -EFAULT on any failed guest access, -ESRCH for a foreign domain
  * ID, -EINVAL for XENMAPSPACE_gmfn_range, and 0 once all entries have been
  * processed (individual failures are reported via the error array only).
  */
 static int do_add_to_physmap_batch(struct kvm_xen_exit *exit, X86CPU *cpu,
                                    uint64_t arg)
 {
     struct xen_add_to_physmap_batch xatpb;
     unsigned long idxs_gva, gpfns_gva, errs_gva;
     CPUState *cs = CPU(cpu);
     size_t elem_sz;

     if (!hypercall_compat32(exit->u.hcall.longmode)) {
         if (kvm_copy_from_gva(cs, arg, &xatpb, sizeof(xatpb)) != 0) {
             return -EFAULT;
         }
         elem_sz = sizeof(unsigned long);
         idxs_gva = (unsigned long)xatpb.idxs.p;
         gpfns_gva = (unsigned long)xatpb.gpfns.p;
         errs_gva = (unsigned long)xatpb.errs.p;
     } else {
         struct compat_xen_add_to_physmap_batch compat;

         qemu_build_assert(sizeof(struct compat_xen_add_to_physmap_batch) == 20);
         if (kvm_copy_from_gva(cs, arg, &compat, sizeof(compat)) != 0) {
             return -EFAULT;
         }
         xatpb.domid = compat.domid;
         xatpb.space = compat.space;
         xatpb.size = compat.size;

         idxs_gva = compat.idxs.c;
         gpfns_gva = compat.gpfns.c;
         errs_gva = compat.errs.c;
         elem_sz = sizeof(uint32_t);
     }

     if (xatpb.domid != DOMID_SELF && xatpb.domid != xen_domid) {
         return -ESRCH;
     }

     /* Explicitly invalid for the batch op. Not that we implement it anyway. */
     if (xatpb.space == XENMAPSPACE_gmfn_range) {
         return -EINVAL;
     }

     for (; xatpb.size != 0; xatpb.size--) {
         /* Pre-zeroed: for 32-bit compat only the low 32 bits are copied. */
         unsigned long idx = 0;
         unsigned long gpfn = 0;
         int err;

         if (kvm_copy_from_gva(cs, idxs_gva, &idx, elem_sz) != 0) {
             return -EFAULT;
         }
         if (kvm_copy_from_gva(cs, gpfns_gva, &gpfn, elem_sz) != 0) {
             return -EFAULT;
         }
         idxs_gva += elem_sz;
         gpfns_gva += elem_sz;

         err = add_to_physmap_one(xatpb.space, idx, gpfn);

         if (kvm_copy_to_gva(cs, errs_gva, &err, sizeof(err)) != 0) {
             return -EFAULT;
         }
         errs_gva += sizeof(err);
     }

     return 0;
 }

 /*
  * Dispatch the memory_op hypercall. Handles XENMEM_add_to_physmap and
  * XENMEM_add_to_physmap_batch; returns false for any other @cmd. For a
  * handled command the result is stored in exit->u.hcall.result.
  */
 static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                    int cmd, uint64_t arg)
 {
     int err;

     if (cmd == XENMEM_add_to_physmap) {
         err = do_add_to_physmap(exit, cpu, arg);
     } else if (cmd == XENMEM_add_to_physmap_batch) {
         err = do_add_to_physmap_batch(exit, cpu, arg);
     } else {
         return false;
     }

     exit->u.hcall.result = err;
     return true;
 }

 /*
  * Dispatch the hvm_op hypercall. HVMOP_pagetable_dying is answered with
  * -ENOSYS and reported as handled; every other @cmd returns false.
  */
 static bool kvm_xen_hcall_hvm_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                  int cmd, uint64_t arg)
 {
     if (cmd != HVMOP_pagetable_dying) {
         return false;
     }

     exit->u.hcall.result = -ENOSYS;
     return true;
 }

 /*
  * VCPUOP_register_vcpu_info: read the registration request from guest
  * address @arg (via @cs) and queue an update of @target's explicit
  * vcpu_info address.
  *
  * Returns -ENOENT if @target is NULL, -EFAULT if the request cannot be
  * read, -EINVAL if a vcpu_info at the given offset would not fit within
  * one page, and 0 once the update has been queued.
  */
 static int vcpuop_register_vcpu_info(CPUState *cs, CPUState *target,
                                      uint64_t arg)
 {
     struct vcpu_register_vcpu_info req;
     uint64_t gpa;

     /* No need for 32/64 compat handling */
     qemu_build_assert(sizeof(req) == 16);
     qemu_build_assert(sizeof(struct vcpu_info) == 64);

     if (target == NULL) {
         return -ENOENT;
     }

     if (kvm_copy_from_gva(cs, arg, &req, sizeof(req)) != 0) {
         return -EFAULT;
     }

     if (req.offset > TARGET_PAGE_SIZE - sizeof(struct vcpu_info)) {
         return -EINVAL;
     }

     gpa = (req.mfn << TARGET_PAGE_BITS) + req.offset;
     async_run_on_cpu(target, do_set_vcpu_info_gpa, RUN_ON_CPU_HOST_ULONG(gpa));

     return 0;
 }

 /*
  * Dispatch the vcpu_op hypercall. Only VCPUOP_register_vcpu_info is
  * handled, targeting the vCPU looked up from @vcpu_id; every other @cmd
  * returns false. For a handled command the result is stored in
  * exit->u.hcall.result.
  */
 static bool kvm_xen_hcall_vcpu_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                   int cmd, int vcpu_id, uint64_t arg)
 {
     CPUState *target = qemu_get_cpu(vcpu_id);
     CPUState *caller = CPU(cpu);

     if (cmd != VCPUOP_register_vcpu_info) {
         return false;
     }

     exit->u.hcall.result = vcpuop_register_vcpu_info(caller, target, arg);
     return true;
 }

 /*
  * Xen soft reset: queue a vcpu_info reset on every vCPU, then call
  * xen_overlay_map_shinfo_page() with INVALID_GFN.
  *
  * Must be called with the iothread mutex held (asserted). Returns the
  * result of xen_overlay_map_shinfo_page().
  */
 int kvm_xen_soft_reset(void)
 {
     CPUState *cs;

     assert(qemu_mutex_iothread_locked());

     trace_kvm_xen_soft_reset();

     CPU_FOREACH(cs) {
         async_run_on_cpu(cs, do_vcpu_soft_reset, RUN_ON_CPU_NULL);
     }

     return xen_overlay_map_shinfo_page(INVALID_GFN);
 }

 /*
  * SCHEDOP_shutdown: read the sched_shutdown request from guest address
  * @arg and act on its reason code.
  *
  * Returns -EFAULT if the request cannot be read, -EINVAL for an unknown
  * reason, the result of kvm_xen_soft_reset() for a soft reset, and 0 for
  * crash, reboot and poweroff.
  */
 static int schedop_shutdown(CPUState *cs, uint64_t arg)
 {
     struct sched_shutdown req;
     int rc = 0;

     /* No need for 32/64 compat handling */
     qemu_build_assert(sizeof(req) == 4);

     if (kvm_copy_from_gva(cs, arg, &req, sizeof(req)) != 0) {
         return -EFAULT;
     }

     if (req.reason == SHUTDOWN_crash) {
         cpu_dump_state(cs, stderr, CPU_DUMP_CODE);
         qemu_system_guest_panicked(NULL);
     } else if (req.reason == SHUTDOWN_reboot) {
         qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
     } else if (req.reason == SHUTDOWN_poweroff) {
         qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
     } else if (req.reason == SHUTDOWN_soft_reset) {
         /* kvm_xen_soft_reset() asserts that the iothread lock is held. */
         qemu_mutex_lock_iothread();
         rc = kvm_xen_soft_reset();
         qemu_mutex_unlock_iothread();
     } else {
         rc = -EINVAL;
     }

     return rc;
 }

 /*
  * Dispatch the sched_op hypercall. Handles SCHEDOP_shutdown, and treats
  * SCHEDOP_poll and SCHEDOP_yield alike by yielding the host thread; every
  * other @cmd returns false. For a handled command the result is stored in
  * exit->u.hcall.result.
  */
 static bool kvm_xen_hcall_sched_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                    int cmd, uint64_t arg)
 {
     CPUState *cs = CPU(cpu);
     int err;

     if (cmd == SCHEDOP_shutdown) {
         err = schedop_shutdown(cs, arg);
     } else if (cmd == SCHEDOP_poll || cmd == SCHEDOP_yield) {
         /*
          * For SCHEDOP_poll: Linux will panic if this doesn't work. Just
          * yield; it's not worth overthinking it because with event channel
          * handling in KVM, the kernel will intercept this and it will never
          * reach QEMU anyway. The semantics of the hypercall explicitly
          * permit spurious wakeups.
          */
         sched_yield();
         err = 0;
     } else {
         return false;
     }

     exit->u.hcall.result = err;
     return true;
 }

 /*
  * Route a Xen hypercall exit to the handler for its hypercall number.
  *
  * Hypercalls issued with CPL > 0 are answered with -EPERM and reported as
  * handled. Returns false when the hypercall number, or the sub-command
  * within it, is not handled.
  */
 static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
 {
     uint16_t nr = exit->u.hcall.input;
     uint64_t a0, a1, a2;

     if (exit->u.hcall.cpl > 0) {
         exit->u.hcall.result = -EPERM;
         return true;
     }

     a0 = exit->u.hcall.params[0];
     a1 = exit->u.hcall.params[1];
     a2 = exit->u.hcall.params[2];

     switch (nr) {
     case __HYPERVISOR_sched_op:
         return kvm_xen_hcall_sched_op(exit, cpu, a0, a1);
     case __HYPERVISOR_vcpu_op:
         return kvm_xen_hcall_vcpu_op(exit, cpu, a0, a1, a2);
     case __HYPERVISOR_hvm_op:
         return kvm_xen_hcall_hvm_op(exit, cpu, a0, a1);
     case __HYPERVISOR_memory_op:
         return kvm_xen_hcall_memory_op(exit, cpu, a0, a1);
     case __HYPERVISOR_xen_version:
         return kvm_xen_hcall_xen_version(exit, cpu, a0, a1);
     default:
         return false;
     }
 }

 /*
  * Entry point for Xen exits from KVM.
  *
  * Returns -1 if @exit is not a KVM_EXIT_XEN_HCALL exit. Otherwise the
  * hypercall is dispatched, exit->u.hcall.result is filled in (-ENOSYS for
  * anything unhandled, which is also logged under LOG_UNIMP), the call is
  * traced, and 0 is returned.
  */
 int kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
 {
     if (exit->type != KVM_EXIT_XEN_HCALL) {
         return -1;
     }

     /*
      * The kernel latches the guest 32/64 mode when the MSR is used to fill
      * the hypercall page. So if we see a hypercall in a mode that doesn't
      * match our own idea of the guest mode, fetch the kernel's idea of the
      * "long mode" to remain in sync.
      */
     if (exit->u.hcall.longmode != xen_is_long_mode()) {
         xen_sync_long_mode();
     }

     if (!do_kvm_xen_handle_exit(cpu, exit)) {
         /*
          * Some hypercalls will be deliberately "implemented" by returning
          * -ENOSYS. This case is for hypercalls which are unexpected.
          */
         exit->u.hcall.result = -ENOSYS;
         /* The arguments are cast to uint64_t, so print with PRIu64. */
         qemu_log_mask(LOG_UNIMP, "Unimplemented Xen hypercall %"
                       PRIu64 " (0x%" PRIx64 " 0x%" PRIx64 " 0x%" PRIx64 ")\n",
                       (uint64_t)exit->u.hcall.input,
                       (uint64_t)exit->u.hcall.params[0],
                       (uint64_t)exit->u.hcall.params[1],
                       (uint64_t)exit->u.hcall.params[2]);
     }

     trace_kvm_xen_hypercall(CPU(cpu)->cpu_index, exit->u.hcall.cpl,
                             exit->u.hcall.input, exit->u.hcall.params[0],
                             exit->u.hcall.params[1], exit->u.hcall.params[2],
                             exit->u.hcall.result);
     return 0;
 }

 /*
  * Push the vcpu_info address for @cs to the kernel: the explicit address
  * if one is set, otherwise the default. Does nothing if neither is set.
  *
  * Returns 0 on success or the negative error from the attribute ioctl.
  */
 int kvm_put_xen_state(CPUState *cs)
 {
     CPUX86State *env = &X86_CPU(cs)->env;
     uint64_t gpa = env->xen_vcpu_info_gpa;
     int rc;

     if (gpa == INVALID_GPA) {
         gpa = env->xen_vcpu_info_default_gpa;
     }

     if (gpa == INVALID_GPA) {
         return 0;
     }

     rc = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, gpa);

     return rc < 0 ? rc : 0;
 }

 /*
  * Mark the memory backing the vcpu_info of @cs as dirty, using the explicit
  * address if set and otherwise the default. Does nothing if neither is set
  * or no suitable memory region is found. Always returns 0.
  */
 int kvm_get_xen_state(CPUState *cs)
 {
     X86CPU *cpu = X86_CPU(cs);
     CPUX86State *env = &cpu->env;
     uint64_t gpa;

     /*
      * The kernel does not mark vcpu_info as dirty when it delivers interrupts
      * to it. It's up to userspace to *assume* that any page shared thus is
      * always considered dirty. The shared_info page is different since it's
      * an overlay and migrated separately anyway.
      */
     gpa = env->xen_vcpu_info_gpa;
     if (gpa == INVALID_GPA) {
         gpa = env->xen_vcpu_info_default_gpa;
     }
     if (gpa != INVALID_GPA) {
         MemoryRegionSection mrs = memory_region_find(get_system_memory(),
                                                      gpa,
                                                      sizeof(struct vcpu_info));
         if (mrs.mr) {
             if (!int128_lt(mrs.size,
                            int128_make64(sizeof(struct vcpu_info)))) {
                 memory_region_set_dirty(mrs.mr, mrs.offset_within_region,
                                         sizeof(struct vcpu_info));
             }
             /*
              * memory_region_find() takes a reference on the region it
              * returns; drop it so the region is not pinned forever.
              */
             memory_region_unref(mrs.mr);
         }
     }

     return 0;
 }
	/*
	* Xen HVM emulation support in KVM
	*
	* Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
	* Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
	*
	* This work is licensed under the terms of the GNU GPL, version 2 or later.
	* See the COPYING file in the top-level directory.
	*
	*/

	#include "qemu/osdep.h"
	#include "qemu/log.h"
	#include "qemu/main-loop.h"
	#include "hw/xen/xen.h"
	#include "sysemu/kvm_int.h"
	#include "sysemu/kvm_xen.h"
	#include "kvm/kvm_i386.h"
	#include "exec/address-spaces.h"
	#include "xen-emu.h"
	#include "trace.h"
	#include "sysemu/runstate.h"

	#include "hw/i386/kvm/xen_overlay.h"

	#include "hw/xen/interface/version.h"
	#include "hw/xen/interface/sched.h"
	#include "hw/xen/interface/memory.h"
	#include "hw/xen/interface/hvm/hvm_op.h"
	#include "hw/xen/interface/vcpu.h"

	#include "xen-compat.h"

	/*
	 * hypercall_compat32(longmode): selects the 32-bit "compat" argument layouts
	 * when decoding a hypercall. On a 64-bit target it is true whenever the
	 * supplied longmode flag is zero; on a 32-bit-only target it is always false.
	 */
	#ifdef TARGET_X86_64
	#define hypercall_compat32(longmode) (!(longmode))
	#else
	#define hypercall_compat32(longmode) (false)
	#endif

	/*
	 * Copy @sz bytes between the host buffer @_buf and guest virtual address
	 * @gva, translating one page at a time with KVM_TRANSLATE on @cs.
	 *
	 * @is_write selects the direction: true copies from @_buf into the guest
	 * (and requires the mapping to be writeable), false copies out of it.
	 *
	 * Returns 0 on success, or -EFAULT as soon as a translation fails. Pages
	 * handled before the failing one have already been transferred.
	 */
	static int kvm_gva_rw(CPUState *cs, uint64_t gva, void *_buf, size_t sz,
	                      bool is_write)
	{
	    uint8_t *buf = (uint8_t *)_buf;
	    int ret;

	    while (sz) {
	        struct kvm_translation tr = {
	            .linear_address = gva,
	        };

	        /* A single translation is only good up to the end of its page. */
	        size_t len = TARGET_PAGE_SIZE - (tr.linear_address & ~TARGET_PAGE_MASK);
	        if (len > sz) {
	            len = sz;
	        }

	        ret = kvm_vcpu_ioctl(cs, KVM_TRANSLATE, &tr);
	        if (ret || !tr.valid || (is_write && !tr.writeable)) {
	            return -EFAULT;
	        }

	        cpu_physical_memory_rw(tr.physical_address, buf, len, is_write);

	        buf += len;
	        sz -= len;
	        gva += len;
	    }

	    return 0;
	}

	/*
	 * Read @sz bytes from guest virtual address @gva into @buf.
	 * Returns whatever kvm_gva_rw() returns for a read.
	 */
	static inline int kvm_copy_from_gva(CPUState *cs, uint64_t gva, void *buf,
	                                    size_t sz)
	{
	    return kvm_gva_rw(cs, gva, buf, sz, false);
	}

	/*
	 * Write @sz bytes from @buf to guest virtual address @gva.
	 * Returns whatever kvm_gva_rw() returns for a write.
	 */
	static inline int kvm_copy_to_gva(CPUState *cs, uint64_t gva, void *buf,
	                                  size_t sz)
	{
	    return kvm_gva_rw(cs, gva, buf, sz, true);
	}

	/*
	 * Enable Xen HVM support in KVM for the VM described by @s, using
	 * @hypercall_msr as the hypercall MSR.
	 *
	 * Returns 0 on success and records the kernel's capability bits in
	 * s->xen_caps. Returns -ENOSYS if a required capability is missing, or the
	 * negative value returned by the KVM_XEN_HVM_CONFIG ioctl if that fails.
	 */
	int kvm_xen_init(KVMState *s, uint32_t hypercall_msr)
	{
	    const int required_caps = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
	        KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_SHARED_INFO;
	    struct kvm_xen_hvm_config cfg = {
	        .msr = hypercall_msr,
	        .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
	    };
	    int xen_caps, ret;

	    xen_caps = kvm_check_extension(s, KVM_CAP_XEN_HVM);
	    if (required_caps & ~xen_caps) {
	        error_report("kvm: Xen HVM guest support not present or insufficient");
	        return -ENOSYS;
	    }

	    if (xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND) {
	        struct kvm_xen_hvm_attr ha = {
	            .type = KVM_XEN_ATTR_TYPE_XEN_VERSION,
	            .u.xen_version = s->xen_version,
	        };
	        /* The result of setting the version is deliberately ignored. */
	        (void)kvm_vm_ioctl(s, KVM_XEN_HVM_SET_ATTR, &ha);

	        cfg.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
	    }

	    ret = kvm_vm_ioctl(s, KVM_XEN_HVM_CONFIG, &cfg);
	    if (ret < 0) {
	        error_report("kvm: Failed to enable Xen HVM support: %s",
	                     strerror(-ret));
	        return ret;
	    }

	    s->xen_caps = xen_caps;
	    return 0;
	}

	/*
	 * Per-vCPU Xen setup: optionally tell the kernel the Xen vCPU ID and reset
	 * both cached vcpu_info addresses to INVALID_GPA.
	 *
	 * Returns 0 on success, or the error from KVM_XEN_VCPU_SET_ATTR.
	 */
	int kvm_xen_init_vcpu(CPUState *cs)
	{
	    CPUX86State *env = &X86_CPU(cs)->env;

	    /*
	     * The kernel has to be told the Xen/ACPI vCPU ID, since that is what
	     * the guest passes in hypercalls such as timers. It is not the APIC ID
	     * normally used when talking to the kernel about vCPUs, and if vCPU
	     * threads race and create their KVM vCPUs out of order it need not
	     * match the kernel's internal vCPU index either.
	     */
	    if (kvm_xen_has_cap(EVTCHN_SEND)) {
	        struct kvm_xen_vcpu_attr id_attr = {
	            .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID,
	            .u.vcpu_id = cs->cpu_index,
	        };
	        int rc = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &id_attr);

	        if (rc) {
	            error_report("kvm: Failed to set Xen vCPU ID attribute: %s",
	                         strerror(-rc));
	            return rc;
	        }
	    }

	    env->xen_vcpu_info_default_gpa = INVALID_GPA;
	    env->xen_vcpu_info_gpa = INVALID_GPA;

	    return 0;
	}

	/* Return the Xen capability bits stored in the global kvm_state. */
	uint32_t kvm_xen_get_caps(void)
	{
	    uint32_t caps = kvm_state->xen_caps;

	    return caps;
	}

	/*
	 * Handle the xen_version hypercall. Only XENVER_get_features is handled:
	 * the guest's xen_feature_info at @arg is read, its submap is filled in
	 * (non-zero only for submap index 0) and the structure is written back.
	 *
	 * Returns false for any other @cmd; otherwise stores the result in
	 * exit->u.hcall.result and returns true.
	 */
	static bool kvm_xen_hcall_xen_version(struct kvm_xen_exit *exit, X86CPU *cpu,
	                                     int cmd, uint64_t arg)
	{
	    int err = 0;

	    switch (cmd) {
	    case XENVER_get_features: {
	        struct xen_feature_info fi;

	        /* No need for 32/64 compat handling */
	        qemu_build_assert(sizeof(fi) == 8);

	        err = kvm_copy_from_gva(CPU(cpu), arg, &fi, sizeof(fi));
	        if (err) {
	            break;
	        }

	        fi.submap = 0;
	        if (fi.submap_idx == 0) {
	            fi.submap |= 1 << XENFEAT_writable_page_tables |
	                         1 << XENFEAT_writable_descriptor_tables |
	                         1 << XENFEAT_auto_translated_physmap |
	                         1 << XENFEAT_supervisor_mode_kernel;
	        }

	        err = kvm_copy_to_gva(CPU(cpu), arg, &fi, sizeof(fi));
	        break;
	    }

	    default:
	        return false;
	    }

	    exit->u.hcall.result = err;
	    return true;
	}

	/*
	 * Set a GPA-valued Xen vCPU attribute of the given @type on @cs.
	 *
	 * Returns the result of the KVM_XEN_VCPU_SET_ATTR ioctl.
	 */
	static int kvm_xen_set_vcpu_attr(CPUState *cs, uint16_t type, uint64_t gpa)
	{
	    struct kvm_xen_vcpu_attr xhsi;

	    /*
	     * Only 'type' and one member of the union 'u' are assigned below. Zero
	     * the whole structure first so that no uninitialised stack bytes are
	     * handed to the kernel.
	     */
	    memset(&xhsi, 0, sizeof(xhsi));
	    xhsi.type = type;
	    xhsi.u.gpa = gpa;

	    trace_kvm_xen_set_vcpu_attr(cs->cpu_index, type, gpa);

	    return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xhsi);
	}

	/*
	 * run_on_cpu callback: record data.host_ulong as the default vcpu_info
	 * address for @cs and, unless an explicit vcpu_info address is already set,
	 * pass that default to the kernel.
	 */
	static void do_set_vcpu_info_default_gpa(CPUState *cs, run_on_cpu_data data)
	{
	    CPUX86State *env = &X86_CPU(cs)->env;

	    env->xen_vcpu_info_default_gpa = data.host_ulong;

	    /* An explicitly registered vcpu_info takes precedence over the default. */
	    if (env->xen_vcpu_info_gpa != INVALID_GPA) {
	        return;
	    }

	    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO,
	                          env->xen_vcpu_info_default_gpa);
	}

	/*
	 * run_on_cpu callback: record data.host_ulong as the explicit vcpu_info
	 * address for @cs and pass it to the kernel.
	 */
	static void do_set_vcpu_info_gpa(CPUState *cs, run_on_cpu_data data)
	{
	    CPUX86State *env = &X86_CPU(cs)->env;

	    env->xen_vcpu_info_gpa = data.host_ulong;

	    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO,
	                          env->xen_vcpu_info_gpa);
	}

	/*
	 * run_on_cpu callback used on soft reset: forget both the explicit and the
	 * default vcpu_info address of @cs and set the kernel's vcpu_info attribute
	 * to INVALID_GPA. @data is unused.
	 */
	static void do_vcpu_soft_reset(CPUState *cs, run_on_cpu_data data)
	{
	    CPUX86State *env = &X86_CPU(cs)->env;

	    env->xen_vcpu_info_default_gpa = INVALID_GPA;
	    env->xen_vcpu_info_gpa = INVALID_GPA;

	    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, INVALID_GPA);
	}

	static int xen_set_shared_info(uint64_t gfn)
	{
	uint64_t gpa = gfn << TARGET_PAGE_BITS;
	int i, err;

	QEMU_IOTHREAD_LOCK_GUARD();

	/*
	* The xen_overlay device tells KVM about it too, since it had to
	* do that on migration load anyway (unless we're going to jump
	* through lots of hoops to maintain the fiction that this isn't
	* KVM-specific.
	*/
	err = xen_overlay_map_shinfo_page(gpa);
	if (err) {
	return err;
	}

	trace_kvm_xen_set_shared_info(gfn);

	for (i = 0; i < XEN_LEGACY_MAX_VCPUS; i++) {
	CPUState *cpu = qemu_get_cpu(i);
	if (cpu) {
	async_run_on_cpu(cpu, do_set_vcpu_info_default_gpa,
	RUN_ON_CPU_HOST_ULONG(gpa));
	}
	gpa += sizeof(vcpu_info_t);
	}

	return err;
	}

	/*
	 * Perform a single add-to-physmap operation.
	 *
	 * Only XENMAPSPACE_shared_info with @idx == 0 is acted on (it maps the
	 * shared info page at @gfn). Returns -ENOTSUP for the grant_table, gmfn and
	 * gmfn_range spaces, -EPERM for gmfn_foreign and dev_mmio, and -EINVAL for
	 * anything else, including a non-zero @idx for shared_info.
	 */
	static int add_to_physmap_one(uint32_t space, uint64_t idx, uint64_t gfn)
	{
	    if (space == XENMAPSPACE_shared_info) {
	        return idx > 0 ? -EINVAL : xen_set_shared_info(gfn);
	    }

	    if (space == XENMAPSPACE_grant_table ||
	        space == XENMAPSPACE_gmfn ||
	        space == XENMAPSPACE_gmfn_range) {
	        return -ENOTSUP;
	    }

	    if (space == XENMAPSPACE_gmfn_foreign ||
	        space == XENMAPSPACE_dev_mmio) {
	        return -EPERM;
	    }

	    return -EINVAL;
	}

	/*
	 * XENMEM_add_to_physmap: read the request structure from guest address
	 * @arg (using the 32-bit compat layout when required), check that it
	 * targets this domain, and perform the operation.
	 *
	 * Returns -EFAULT if the request cannot be read, -ESRCH for a foreign
	 * domain ID, otherwise the result of add_to_physmap_one().
	 */
	static int do_add_to_physmap(struct kvm_xen_exit *exit, X86CPU *cpu,
	                             uint64_t arg)
	{
	    struct xen_add_to_physmap xatp;
	    CPUState *cs = CPU(cpu);

	    if (hypercall_compat32(exit->u.hcall.longmode)) {
	        struct compat_xen_add_to_physmap xatp32;

	        qemu_build_assert(sizeof(struct compat_xen_add_to_physmap) == 16);
	        if (kvm_copy_from_gva(cs, arg, &xatp32, sizeof(xatp32))) {
	            return -EFAULT;
	        }
	        /* Widen the compat request into the native structure. */
	        xatp.domid = xatp32.domid;
	        xatp.size = xatp32.size;
	        xatp.space = xatp32.space;
	        xatp.idx = xatp32.idx;
	        xatp.gpfn = xatp32.gpfn;
	    } else {
	        if (kvm_copy_from_gva(cs, arg, &xatp, sizeof(xatp))) {
	            return -EFAULT;
	        }
	    }

	    if (xatp.domid != DOMID_SELF && xatp.domid != xen_domid) {
	        return -ESRCH;
	    }

	    return add_to_physmap_one(xatp.space, xatp.idx, xatp.gpfn);
	}

	/*
	 * XENMEM_add_to_physmap_batch: read the batch header from guest address
	 * @arg (compat layout when required), then for each entry read one index
	 * and one gpfn from the guest arrays, perform the operation and write the
	 * per-entry int result to the guest's error array.
	 *
	 * Returns -EFAULT on any failed guest access, -ESRCH for a foreign domain
	 * ID, -EINVAL for XENMAPSPACE_gmfn_range, and 0 once all entries have been
	 * processed (individual failures are reported via the error array only).
	 */
	static int do_add_to_physmap_batch(struct kvm_xen_exit *exit, X86CPU *cpu,
	                                   uint64_t arg)
	{
	    struct xen_add_to_physmap_batch xatpb;
	    unsigned long idxs_gva, gpfns_gva, errs_gva;
	    CPUState *cs = CPU(cpu);
	    size_t op_sz;

	    if (hypercall_compat32(exit->u.hcall.longmode)) {
	        struct compat_xen_add_to_physmap_batch xatpb32;

	        qemu_build_assert(sizeof(struct compat_xen_add_to_physmap_batch) == 20);
	        if (kvm_copy_from_gva(cs, arg, &xatpb32, sizeof(xatpb32))) {
	            return -EFAULT;
	        }
	        xatpb.domid = xatpb32.domid;
	        xatpb.space = xatpb32.space;
	        xatpb.size = xatpb32.size;

	        idxs_gva = xatpb32.idxs.c;
	        gpfns_gva = xatpb32.gpfns.c;
	        errs_gva = xatpb32.errs.c;
	        op_sz = sizeof(uint32_t);
	    } else {
	        if (kvm_copy_from_gva(cs, arg, &xatpb, sizeof(xatpb))) {
	            return -EFAULT;
	        }
	        op_sz = sizeof(unsigned long);
	        idxs_gva = (unsigned long)xatpb.idxs.p;
	        gpfns_gva = (unsigned long)xatpb.gpfns.p;
	        errs_gva = (unsigned long)xatpb.errs.p;
	    }

	    if (xatpb.domid != DOMID_SELF && xatpb.domid != xen_domid) {
	        return -ESRCH;
	    }

	    /* Explicitly invalid for the batch op. Not that we implement it anyway. */
	    if (xatpb.space == XENMAPSPACE_gmfn_range) {
	        return -EINVAL;
	    }

	    while (xatpb.size--) {
	        unsigned long idx = 0;
	        unsigned long gpfn = 0;
	        int err;

	        /* For 32-bit compat this only copies the low 32 bits of each */
	        if (kvm_copy_from_gva(cs, idxs_gva, &idx, op_sz) ||
	            kvm_copy_from_gva(cs, gpfns_gva, &gpfn, op_sz)) {
	            return -EFAULT;
	        }
	        idxs_gva += op_sz;
	        gpfns_gva += op_sz;

	        err = add_to_physmap_one(xatpb.space, idx, gpfn);

	        if (kvm_copy_to_gva(cs, errs_gva, &err, sizeof(err))) {
	            return -EFAULT;
	        }
	        errs_gva += sizeof(err);
	    }
	    return 0;
	}

	/*
	 * Dispatch the memory_op hypercall. Handles XENMEM_add_to_physmap and
	 * XENMEM_add_to_physmap_batch; returns false for any other @cmd. For a
	 * handled command the result is stored in exit->u.hcall.result.
	 */
	static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit *exit, X86CPU *cpu,
	                                   int cmd, uint64_t arg)
	{
	    int err;

	    switch (cmd) {
	    case XENMEM_add_to_physmap:
	        err = do_add_to_physmap(exit, cpu, arg);
	        break;

	    case XENMEM_add_to_physmap_batch:
	        err = do_add_to_physmap_batch(exit, cpu, arg);
	        break;

	    default:
	        return false;
	    }

	    exit->u.hcall.result = err;
	    return true;
	}

	/*
	 * Dispatch the hvm_op hypercall. HVMOP_pagetable_dying is answered with
	 * -ENOSYS and reported as handled; every other @cmd returns false.
	 */
	static bool kvm_xen_hcall_hvm_op(struct kvm_xen_exit *exit, X86CPU *cpu,
	                                 int cmd, uint64_t arg)
	{
	    switch (cmd) {
	    case HVMOP_pagetable_dying:
	        exit->u.hcall.result = -ENOSYS;
	        return true;

	    default:
	        return false;
	    }
	}

	/*
	 * VCPUOP_register_vcpu_info: read the registration request from guest
	 * address @arg (via @cs) and queue an update of @target's explicit
	 * vcpu_info address.
	 *
	 * Returns -ENOENT if @target is NULL, -EFAULT if the request cannot be
	 * read, -EINVAL if a vcpu_info at the given offset would not fit within
	 * one page, and 0 once the update has been queued.
	 */
	static int vcpuop_register_vcpu_info(CPUState *cs, CPUState *target,
	                                     uint64_t arg)
	{
	    struct vcpu_register_vcpu_info rvi;
	    uint64_t gpa;

	    /* No need for 32/64 compat handling */
	    qemu_build_assert(sizeof(rvi) == 16);
	    qemu_build_assert(sizeof(struct vcpu_info) == 64);

	    if (!target) {
	        return -ENOENT;
	    }

	    if (kvm_copy_from_gva(cs, arg, &rvi, sizeof(rvi))) {
	        return -EFAULT;
	    }

	    if (rvi.offset > TARGET_PAGE_SIZE - sizeof(struct vcpu_info)) {
	        return -EINVAL;
	    }

	    gpa = ((rvi.mfn << TARGET_PAGE_BITS) + rvi.offset);
	    async_run_on_cpu(target, do_set_vcpu_info_gpa, RUN_ON_CPU_HOST_ULONG(gpa));
	    return 0;
	}

	/*
	 * Dispatch the vcpu_op hypercall. Only VCPUOP_register_vcpu_info is
	 * handled, targeting the vCPU looked up from @vcpu_id; every other @cmd
	 * returns false. For a handled command the result is stored in
	 * exit->u.hcall.result.
	 */
	static bool kvm_xen_hcall_vcpu_op(struct kvm_xen_exit *exit, X86CPU *cpu,
	                                  int cmd, int vcpu_id, uint64_t arg)
	{
	    CPUState *dest = qemu_get_cpu(vcpu_id);
	    CPUState *cs = CPU(cpu);
	    int err;

	    switch (cmd) {
	    case VCPUOP_register_vcpu_info:
	        err = vcpuop_register_vcpu_info(cs, dest, arg);
	        break;

	    default:
	        return false;
	    }

	    exit->u.hcall.result = err;
	    return true;
	}

	/*
	 * Xen soft reset: queue a vcpu_info reset on every vCPU, then call
	 * xen_overlay_map_shinfo_page() with INVALID_GFN.
	 *
	 * Must be called with the iothread mutex held (asserted). Returns the
	 * result of xen_overlay_map_shinfo_page().
	 */
	int kvm_xen_soft_reset(void)
	{
	    CPUState *cs;

	    assert(qemu_mutex_iothread_locked());

	    trace_kvm_xen_soft_reset();

	    CPU_FOREACH(cs) {
	        async_run_on_cpu(cs, do_vcpu_soft_reset, RUN_ON_CPU_NULL);
	    }

	    return xen_overlay_map_shinfo_page(INVALID_GFN);
	}

	/*
	 * SCHEDOP_shutdown: read the sched_shutdown request from guest address
	 * @arg and act on its reason code.
	 *
	 * Returns -EFAULT if the request cannot be read, -EINVAL for an unknown
	 * reason, the result of kvm_xen_soft_reset() for a soft reset, and 0 for
	 * crash, reboot and poweroff.
	 */
	static int schedop_shutdown(CPUState *cs, uint64_t arg)
	{
	    struct sched_shutdown req;
	    int rc = 0;

	    /* No need for 32/64 compat handling */
	    qemu_build_assert(sizeof(req) == 4);

	    if (kvm_copy_from_gva(cs, arg, &req, sizeof(req)) != 0) {
	        return -EFAULT;
	    }

	    if (req.reason == SHUTDOWN_crash) {
	        cpu_dump_state(cs, stderr, CPU_DUMP_CODE);
	        qemu_system_guest_panicked(NULL);
	    } else if (req.reason == SHUTDOWN_reboot) {
	        qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
	    } else if (req.reason == SHUTDOWN_poweroff) {
	        qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
	    } else if (req.reason == SHUTDOWN_soft_reset) {
	        /* kvm_xen_soft_reset() asserts that the iothread lock is held. */
	        qemu_mutex_lock_iothread();
	        rc = kvm_xen_soft_reset();
	        qemu_mutex_unlock_iothread();
	    } else {
	        rc = -EINVAL;
	    }

	    return rc;
	}

	/*
	 * Dispatch the sched_op hypercall. Handles SCHEDOP_shutdown, and treats
	 * SCHEDOP_poll and SCHEDOP_yield alike by yielding the host thread; every
	 * other @cmd returns false. For a handled command the result is stored in
	 * exit->u.hcall.result.
	 */
	static bool kvm_xen_hcall_sched_op(struct kvm_xen_exit *exit, X86CPU *cpu,
	                                   int cmd, uint64_t arg)
	{
	    CPUState *cs = CPU(cpu);
	    int err = -ENOSYS;

	    switch (cmd) {
	    case SCHEDOP_shutdown:
	        err = schedop_shutdown(cs, arg);
	        break;

	    case SCHEDOP_poll:
	        /*
	         * Linux will panic if this doesn't work. Just yield; it's not
	         * worth overthinking it because with event channel handling
	         * in KVM, the kernel will intercept this and it will never
	         * reach QEMU anyway. The semantics of the hypercall explicitly
	         * permit spurious wakeups.
	         */
	    case SCHEDOP_yield:
	        sched_yield();
	        err = 0;
	        break;

	    default:
	        return false;
	    }

	    exit->u.hcall.result = err;
	    return true;
	}

	/*
	 * Route a Xen hypercall exit to the handler for its hypercall number.
	 *
	 * Hypercalls issued with CPL > 0 are answered with -EPERM and reported as
	 * handled. Returns false when the hypercall number, or the sub-command
	 * within it, is not handled.
	 */
	static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
	{
	    uint16_t code = exit->u.hcall.input;

	    if (exit->u.hcall.cpl > 0) {
	        exit->u.hcall.result = -EPERM;
	        return true;
	    }

	    switch (code) {
	    case __HYPERVISOR_sched_op:
	        return kvm_xen_hcall_sched_op(exit, cpu, exit->u.hcall.params[0],
	                                      exit->u.hcall.params[1]);
	    case __HYPERVISOR_vcpu_op:
	        return kvm_xen_hcall_vcpu_op(exit, cpu,
	                                     exit->u.hcall.params[0],
	                                     exit->u.hcall.params[1],
	                                     exit->u.hcall.params[2]);
	    case __HYPERVISOR_hvm_op:
	        return kvm_xen_hcall_hvm_op(exit, cpu, exit->u.hcall.params[0],
	                                    exit->u.hcall.params[1]);
	    case __HYPERVISOR_memory_op:
	        return kvm_xen_hcall_memory_op(exit, cpu, exit->u.hcall.params[0],
	                                       exit->u.hcall.params[1]);
	    case __HYPERVISOR_xen_version:
	        return kvm_xen_hcall_xen_version(exit, cpu, exit->u.hcall.params[0],
	                                         exit->u.hcall.params[1]);
	    default:
	        return false;
	    }
	}

	/*
	 * Entry point for Xen exits from KVM.
	 *
	 * Returns -1 if @exit is not a KVM_EXIT_XEN_HCALL exit. Otherwise the
	 * hypercall is dispatched, exit->u.hcall.result is filled in (-ENOSYS for
	 * anything unhandled, which is also logged under LOG_UNIMP), the call is
	 * traced, and 0 is returned.
	 */
	int kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
	{
	    if (exit->type != KVM_EXIT_XEN_HCALL) {
	        return -1;
	    }

	    /*
	     * The kernel latches the guest 32/64 mode when the MSR is used to fill
	     * the hypercall page. So if we see a hypercall in a mode that doesn't
	     * match our own idea of the guest mode, fetch the kernel's idea of the
	     * "long mode" to remain in sync.
	     */
	    if (exit->u.hcall.longmode != xen_is_long_mode()) {
	        xen_sync_long_mode();
	    }

	    if (!do_kvm_xen_handle_exit(cpu, exit)) {
	        /*
	         * Some hypercalls will be deliberately "implemented" by returning
	         * -ENOSYS. This case is for hypercalls which are unexpected.
	         */
	        exit->u.hcall.result = -ENOSYS;
	        /* The arguments are cast to uint64_t, so print with PRIu64. */
	        qemu_log_mask(LOG_UNIMP, "Unimplemented Xen hypercall %"
	                      PRIu64 " (0x%" PRIx64 " 0x%" PRIx64 " 0x%" PRIx64 ")\n",
	                      (uint64_t)exit->u.hcall.input,
	                      (uint64_t)exit->u.hcall.params[0],
	                      (uint64_t)exit->u.hcall.params[1],
	                      (uint64_t)exit->u.hcall.params[2]);
	    }

	    trace_kvm_xen_hypercall(CPU(cpu)->cpu_index, exit->u.hcall.cpl,
	                            exit->u.hcall.input, exit->u.hcall.params[0],
	                            exit->u.hcall.params[1], exit->u.hcall.params[2],
	                            exit->u.hcall.result);
	    return 0;
	}

	/*
	 * Push the vcpu_info address for @cs to the kernel: the explicit address
	 * if one is set, otherwise the default. Does nothing if neither is set.
	 *
	 * Returns 0 on success or the negative error from the attribute ioctl.
	 */
	int kvm_put_xen_state(CPUState *cs)
	{
	    CPUX86State *env = &X86_CPU(cs)->env;
	    uint64_t gpa = env->xen_vcpu_info_gpa;
	    int rc;

	    if (gpa == INVALID_GPA) {
	        gpa = env->xen_vcpu_info_default_gpa;
	    }

	    if (gpa == INVALID_GPA) {
	        return 0;
	    }

	    rc = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, gpa);

	    return rc < 0 ? rc : 0;
	}

	/*
	 * Mark the memory backing the vcpu_info of @cs as dirty, using the explicit
	 * address if set and otherwise the default. Does nothing if neither is set
	 * or no suitable memory region is found. Always returns 0.
	 */
	int kvm_get_xen_state(CPUState *cs)
	{
	    X86CPU *cpu = X86_CPU(cs);
	    CPUX86State *env = &cpu->env;
	    uint64_t gpa;

	    /*
	     * The kernel does not mark vcpu_info as dirty when it delivers interrupts
	     * to it. It's up to userspace to assume that any page shared thus is
	     * always considered dirty. The shared_info page is different since it's
	     * an overlay and migrated separately anyway.
	     */
	    gpa = env->xen_vcpu_info_gpa;
	    if (gpa == INVALID_GPA) {
	        gpa = env->xen_vcpu_info_default_gpa;
	    }
	    if (gpa != INVALID_GPA) {
	        MemoryRegionSection mrs = memory_region_find(get_system_memory(),
	                                                     gpa,
	                                                     sizeof(struct vcpu_info));
	        if (mrs.mr) {
	            if (!int128_lt(mrs.size,
	                           int128_make64(sizeof(struct vcpu_info)))) {
	                memory_region_set_dirty(mrs.mr, mrs.offset_within_region,
	                                        sizeof(struct vcpu_info));
	            }
	            /*
	             * memory_region_find() takes a reference on the region it
	             * returns; drop it so the region is not pinned forever.
	             */
	            memory_region_unref(mrs.mr);
	        }
	    }

	    return 0;
	}