target/i386/whpx/whpx-all.c - qemu - Git at Google

 /*
  * QEMU Windows Hypervisor Platform accelerator (WHPX)
  *
  * Copyright Microsoft Corp. 2017
  *
  * This work is licensed under the terms of the GNU GPL, version 2 or later.
  * See the COPYING file in the top-level directory.
  *
  */

 #include "qemu/osdep.h"
 #include "cpu.h"
 #include "exec/address-spaces.h"
 #include "exec/ioport.h"
 #include "gdbstub/helpers.h"
 #include "qemu/accel.h"
 #include "sysemu/whpx.h"
 #include "sysemu/cpus.h"
 #include "sysemu/runstate.h"
 #include "qemu/main-loop.h"
 #include "hw/boards.h"
 #include "hw/intc/ioapic.h"
 #include "hw/i386/apic_internal.h"
 #include "qemu/error-report.h"
 #include "qapi/error.h"
 #include "qapi/qapi-types-common.h"
 #include "qapi/qapi-visit-common.h"
 #include "migration/blocker.h"
 #include <winerror.h>

 #include "whpx-internal.h"
 #include "whpx-accel-ops.h"

 #include <winhvplatform.h>
 #include <winhvemulation.h>

 #define HYPERV_APIC_BUS_FREQUENCY      (200000000ULL)

 /*
  * Names of the registers that whpx_set_registers() and whpx_get_registers()
  * transfer in a single WHv{Set,Get}VirtualProcessorRegisters() call.
  *
  * The order of the entries is significant: both functions walk this table
  * with a running index while filling/reading struct whpx_register_set and
  * assert that the index matches the expected register name. For the segment
  * and table registers they additionally assert that the index is equal to
  * the numeric value of the register name itself. Entries that are commented
  * out are not transferred by those functions.
  */
 static const WHV_REGISTER_NAME whpx_register_names[] = {

     /* X64 General purpose registers */
     WHvX64RegisterRax,
     WHvX64RegisterRcx,
     WHvX64RegisterRdx,
     WHvX64RegisterRbx,
     WHvX64RegisterRsp,
     WHvX64RegisterRbp,
     WHvX64RegisterRsi,
     WHvX64RegisterRdi,
     WHvX64RegisterR8,
     WHvX64RegisterR9,
     WHvX64RegisterR10,
     WHvX64RegisterR11,
     WHvX64RegisterR12,
     WHvX64RegisterR13,
     WHvX64RegisterR14,
     WHvX64RegisterR15,
     WHvX64RegisterRip,
     WHvX64RegisterRflags,

     /* X64 Segment registers */
     WHvX64RegisterEs,
     WHvX64RegisterCs,
     WHvX64RegisterSs,
     WHvX64RegisterDs,
     WHvX64RegisterFs,
     WHvX64RegisterGs,
     WHvX64RegisterLdtr,
     WHvX64RegisterTr,

     /* X64 Table registers */
     WHvX64RegisterIdtr,
     WHvX64RegisterGdtr,

     /* X64 Control Registers */
     WHvX64RegisterCr0,
     WHvX64RegisterCr2,
     WHvX64RegisterCr3,
     WHvX64RegisterCr4,
     WHvX64RegisterCr8,

     /* X64 Debug Registers */
     /*
      * WHvX64RegisterDr0,
      * WHvX64RegisterDr1,
      * WHvX64RegisterDr2,
      * WHvX64RegisterDr3,
      * WHvX64RegisterDr6,
      * WHvX64RegisterDr7,
      */

     /* X64 Floating Point and Vector Registers */
     WHvX64RegisterXmm0,
     WHvX64RegisterXmm1,
     WHvX64RegisterXmm2,
     WHvX64RegisterXmm3,
     WHvX64RegisterXmm4,
     WHvX64RegisterXmm5,
     WHvX64RegisterXmm6,
     WHvX64RegisterXmm7,
     WHvX64RegisterXmm8,
     WHvX64RegisterXmm9,
     WHvX64RegisterXmm10,
     WHvX64RegisterXmm11,
     WHvX64RegisterXmm12,
     WHvX64RegisterXmm13,
     WHvX64RegisterXmm14,
     WHvX64RegisterXmm15,
     WHvX64RegisterFpMmx0,
     WHvX64RegisterFpMmx1,
     WHvX64RegisterFpMmx2,
     WHvX64RegisterFpMmx3,
     WHvX64RegisterFpMmx4,
     WHvX64RegisterFpMmx5,
     WHvX64RegisterFpMmx6,
     WHvX64RegisterFpMmx7,
     WHvX64RegisterFpControlStatus,
     WHvX64RegisterXmmControlStatus,

     /* X64 MSRs */
     WHvX64RegisterEfer,
 #ifdef TARGET_X86_64
     WHvX64RegisterKernelGsBase,
 #endif
     WHvX64RegisterApicBase,
     /* WHvX64RegisterPat, */
     WHvX64RegisterSysenterCs,
     WHvX64RegisterSysenterEip,
     WHvX64RegisterSysenterEsp,
     WHvX64RegisterStar,
 #ifdef TARGET_X86_64
     WHvX64RegisterLstar,
     WHvX64RegisterCstar,
     WHvX64RegisterSfmask,
 #endif

     /* Interrupt / Event Registers */
     /*
      * WHvRegisterPendingInterruption,
      * WHvRegisterInterruptState,
      * WHvRegisterPendingEvent0,
      * WHvRegisterPendingEvent1,
      * WHvX64RegisterDeliverabilityNotifications,
      */
 };

 /*
  * Value buffer for a bulk register transfer: values[i] holds the value of
  * the register named by whpx_register_names[i].
  */
 struct whpx_register_set {
     WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
 };

 /*
  * The current implementation of instruction stepping sets the TF flag
  * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
  * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
  *
  * This approach has a few limitations:
  *     1. Stepping over a PUSHF/SAHF instruction will save the TF flag
  *        along with the other flags, possibly restoring it later. It would
  *        result in another INT1 when the flags are restored, triggering
  *        a stop in gdb that could be cleared by doing another step.
  *
  *        Stepping over a POPF/LAHF instruction will let it overwrite the
  *        TF flags, ending the stepping mode.
  *
  *     2. Stepping over an instruction raising an exception (e.g. INT, DIV,
  *        or anything that could result in a page fault) will save the flags
  *        to the stack, clear the TF flag, and let the guest execute the
  *        handler. Normally, the guest will restore the original flags,
  *        which will resume single-stepping.
  *
  *     3. Debuggers running on the guest may wish to set TF to do instruction
  *        stepping. INT1 events generated by it would be intercepted by us,
  *        as long as the gdb is connected to QEMU.
  *
  * In practice this means that:
  *     1. Stepping through flags-modifying instructions may cause gdb to
  *        continue or stop in unexpected places. This will be fully recoverable
  *        and will not crash the target.
  *
  *     2. Stepping over an instruction that triggers an exception will step
  *        over the exception handler, not into it.
  *
  *     3. Debugging the guest via gdb, while running debugger on the guest
  *        at the same time may lead to unexpected effects. Removing all
  *        breakpoints set via QEMU will prevent any further interference
  *        with the guest-level debuggers.
  *
  * The limitations can be addressed as shown below:
  *     1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of
  *        stepping through them. The exact semantics of the instructions are
  *        defined in the "Combined Volume Set of Intel 64 and IA-32
  *        Architectures Software Developer's Manuals", however it involves a
  *        fair amount of corner cases due to compatibility with real mode,
  *        virtual 8086 mode, and differences between 64-bit and 32-bit modes.
  *
  *     2. We could step into the guest's exception handlers using the following
  *        sequence:
  *          a. Temporarily enable catching of all exception types via
  *             whpx_set_exception_exit_bitmap().
  *          b. Once an exception is intercepted, read the IDT/GDT and locate
  *             the original handler.
  *          c. Patch the original handler, injecting an INT3 at the beginning.
  *          d. Update the exception exit bitmap to only catch the
  *             WHvX64ExceptionTypeBreakpointTrap exception.
  *          e. Let the affected CPU run in the exclusive mode.
  *          f. Restore the original handler and the exception exit bitmap.
  *        Note that handling all corner cases related to IDT/GDT is harder
  *        than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a
  *        rough idea.
  *
  *     3. In order to properly support guest-level debugging in parallel with
  *        the QEMU-level debugging, we would need to be able to pass some INT1
  *        events to the guest. This could be done via the following methods:
  *          a. Using the WHvRegisterPendingEvent register. As of Windows 21H1,
  *             it seems to only work for interrupts and not software
  *             exceptions.
  *          b. Locating and patching the original handler by parsing IDT/GDT.
  *             This involves relatively complex logic outlined in the previous
  *             paragraph.
  *          c. Emulating the exception invocation (i.e. manually updating RIP,
  *             RFLAGS, and pushing the old values to stack). This is even more
  *             complicated than the previous option, since it involves checking
  *             CPL, gate attributes, and doing various adjustments depending
  *             on the current CPU mode, whether the CPL is changing, etc.
  */
 typedef enum WhpxStepMode {
     WHPX_STEP_NONE = 0,
     /* Halt other VCPUs */
     WHPX_STEP_EXCLUSIVE,
 } WhpxStepMode;

 /* Per-vCPU state private to the WHPX accelerator, reached via cpu->accel. */
 struct AccelCPUState {
     /* Emulator handle passed to WHvEmulatorTry{Mmio,Io}Emulation() */
     WHV_EMULATOR_HANDLE emulator;
     bool window_registered;
     bool interruptable;
     bool ready_for_pic_interrupt;
     /* Last TPR exchanged with the hypervisor, kept in CR8 encoding */
     uint64_t tpr;
     /* Last APIC base value exchanged with the hypervisor */
     uint64_t apic_base;
     bool interruption_pending;
     /*
      * Cleared by whpx_emu_setreg_callback() once the emulator has written
      * registers to the hypervisor.
      * NOTE(review): presumably set while QEMU's register copy still has to
      * be pushed to the hypervisor - confirm against the code that sets it.
      */
     bool dirty;

     /* Must be the last field as it may have a tail */
     WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
 };

 /*
  * File-scope accelerator state.
  * NOTE(review): the code that sets whpx_allowed, whp_dispatch_initialized,
  * the module handles and max_vcpu_index is not visible in this part of the
  * file; their exact roles should be confirmed there.
  */
 static bool whpx_allowed;
 static bool whp_dispatch_initialized;
 static HMODULE hWinHvPlatform, hWinHvEmulation;
 static uint32_t max_vcpu_index;
 /* XSAVE capability flags consulted by whpx_has_xsave() */
 static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap;

 /* Accelerator-wide state (partition handle, exception exit bitmap, ...) */
 struct whpx_state whpx_global;
 /* Table of WHv* entry points; individual entries may be NULL */
 struct WHPDispatch whp_dispatch;

 /* Tell whether the cached XSAVE capability flags report XSAVE support. */
 static bool whpx_has_xsave(void)
 {
     const bool xsave_supported = whpx_xsave_cap.XsaveSupport;

     return xsave_supported;
 }

 /*
  * Convert a QEMU segment cache entry into the hypervisor's segment register
  * representation.
  *
  * @qs:  QEMU segment to convert.
  * @v86: non-zero when the CPU is in virtual-8086 mode; the attributes are
  *       then forced to a present, DPL 3, non-system segment of type 3
  *       instead of being taken from @qs.
  * @r86: non-zero in real mode; currently has no effect on the result.
  *
  * Returns the converted segment register by value.
  */
 static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
                                              int r86)
 {
     WHV_X64_SEGMENT_REGISTER seg;

     seg.Selector = qs->selector;
     seg.Base = qs->base;
     seg.Limit = qs->limit;

     if (!v86) {
         const unsigned descriptor_flags = qs->flags;

         /* The attribute bits start at DESC_TYPE_SHIFT in QEMU's flags */
         seg.Attributes = descriptor_flags >> DESC_TYPE_SHIFT;

         /* Real mode: masking the base (seg.Base &= 0xfffff) is disabled */
         return seg;
     }

     /* Virtual-8086 mode: fixed attributes, independent of qs->flags */
     seg.Attributes = 0;
     seg.SegmentType = 3;
     seg.Present = 1;
     seg.DescriptorPrivilegeLevel = 3;
     seg.NonSystemSegment = 1;

     return seg;
 }

 /*
  * Convert a hypervisor segment register into QEMU's segment cache layout.
  * The attribute word is moved back up to DESC_TYPE_SHIFT within the flags.
  */
 static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
 {
     SegmentCache cache;
     const uint32_t attributes = hs->Attributes;

     cache.selector = hs->Selector;
     cache.base = hs->Base;
     cache.limit = hs->Limit;
     cache.flags = attributes << DESC_TYPE_SHIFT;

     return cache;
 }

 /*
  * X64 Extended Control Registers: write QEMU's XCR0 for @cpu to the
  * hypervisor. Does nothing when XSAVE is not supported; a failed write is
  * only reported.
  */
 static void whpx_set_xcrs(CPUState *cpu)
 {
     const WHV_REGISTER_NAME name = WHvX64RegisterXCr0;
     WHV_REGISTER_VALUE value;
     HRESULT hr;

     if (whpx_has_xsave()) {
         /* Only xcr0 is supported by the hypervisor currently */
         value.Reg64 = cpu_env(cpu)->xcr0;

         hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
             whpx_global.partition, cpu->cpu_index, &name, 1, &value);
         if (FAILED(hr)) {
             error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
         }
     }
 }

 /*
  * Write QEMU's TSC value for @cpu to the hypervisor.
  *
  * Returns 0 on success and -1 when the register write failed. A failure to
  * suspend partition time beforehand only produces a warning.
  */
 static int whpx_set_tsc(CPUState *cpu)
 {
     const WHV_REGISTER_NAME name = WHvX64RegisterTsc;
     WHV_REGISTER_VALUE value;
     HRESULT hr;
     int ret = 0;

     /*
      * When the platform provides it, suspend partition time before touching
      * the TSC so that the variance in TSC across vCPUs is reduced. The
      * partition is resumed automatically when the first vCPU runs again.
      */
     if (whp_dispatch.WHvSuspendPartitionTime) {
         hr = whp_dispatch.WHvSuspendPartitionTime(whpx_global.partition);
         if (FAILED(hr)) {
             /*
              * Not fatal: it merely makes TSC variance between vCPUs more
              * likely, which some guest OS handle just fine.
              */
             warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
         }
     }

     value.Reg64 = cpu_env(cpu)->tsc;
     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
         whpx_global.partition, cpu->cpu_index, &name, 1, &value);
     if (FAILED(hr)) {
         error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
         ret = -1;
     }

     return ret;
 }

 /*
  * The CR8 register in the CPU is mapped to the TPR register of the APIC,
  * however, they use a slightly different encoding. Specifically:
  *
  *     APIC.TPR[bits 7:4] = CR8[bits 3:0]
  *
  * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
  * and IA-32 Architectures Software Developer's Manual.
  *
  * The functions below translate the value of CR8 to TPR and vice versa.
  */

 /* Translate an APIC TPR value to CR8 encoding: CR8[3:0] = TPR[7:4]. */
 static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
 {
     const unsigned int tpr_class_shift = 4;
     uint64_t cr8 = tpr;

     cr8 >>= tpr_class_shift;
     return cr8;
 }

 /* Translate a CR8 value to APIC TPR encoding: TPR[7:4] = CR8[3:0]. */
 static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
 {
     const unsigned int tpr_class_shift = 4;
     uint64_t tpr = cr8;

     tpr <<= tpr_class_shift;
     return tpr;
 }

 /*
  * Push QEMU's copy of the vCPU state (CPUX86State) to the hypervisor.
  *
  * @cpu:   vCPU to update; it must be stopped or be the caller's own vCPU.
  * @level: when at least WHPX_SET_RESET_STATE, the TSC is written as well.
  *
  * The values are stored into a struct whpx_register_set in exactly the order
  * of whpx_register_names[]; the asserts verify that the running index stays
  * in step with that table. XCR0 is written by a separate call made from
  * here (whpx_set_xcrs()). Failures are only reported via error_report().
  */
 static void whpx_set_registers(CPUState *cpu, int level)
 {
     struct whpx_state *whpx = &whpx_global;
     AccelCPUState *vcpu = cpu->accel;
     X86CPU *x86_cpu = X86_CPU(cpu);
     CPUX86State *env = &x86_cpu->env;
     struct whpx_register_set vcxt;
     HRESULT hr;
     int idx;
     int idx_next;
     int i;
     int v86, r86;

     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

     /*
      * Following MSRs have side effects on the guest or are too heavy for
      * runtime. Limit them to full state update.
      */
     if (level >= WHPX_SET_RESET_STATE) {
         whpx_set_tsc(cpu);
     }

     memset(&vcxt, 0, sizeof(struct whpx_register_set));

     v86 = (env->eflags & VM_MASK);
     r86 = !(env->cr[0] & CR0_PE_MASK);

     /* Remember what is being sent so later reads can detect guest changes */
     vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
     vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);

     /* Indexes for first 16 registers match between HV and QEMU definitions */
     idx_next = 16;
     for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
         vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];
     }
     idx = idx_next;

     /* Same goes for RIP and RFLAGS */
     assert(whpx_register_names[idx] == WHvX64RegisterRip);
     vcxt.values[idx++].Reg64 = env->eip;

     assert(whpx_register_names[idx] == WHvX64RegisterRflags);
     vcxt.values[idx++].Reg64 = env->eflags;

     /* Translate 6+4 segment registers. HV and QEMU order matches  */
     assert(idx == WHvX64RegisterEs);
     for (i = 0; i < 6; i += 1, idx += 1) {
         vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
     }

     assert(idx == WHvX64RegisterLdtr);
     vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);

     assert(idx == WHvX64RegisterTr);
     vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);

     assert(idx == WHvX64RegisterIdtr);
     vcxt.values[idx].Table.Base = env->idt.base;
     vcxt.values[idx].Table.Limit = env->idt.limit;
     idx += 1;

     assert(idx == WHvX64RegisterGdtr);
     vcxt.values[idx].Table.Base = env->gdt.base;
     vcxt.values[idx].Table.Limit = env->gdt.limit;
     idx += 1;

     /* CR0, 2, 3, 4, 8 */
     assert(whpx_register_names[idx] == WHvX64RegisterCr0);
     vcxt.values[idx++].Reg64 = env->cr[0];
     assert(whpx_register_names[idx] == WHvX64RegisterCr2);
     vcxt.values[idx++].Reg64 = env->cr[2];
     assert(whpx_register_names[idx] == WHvX64RegisterCr3);
     vcxt.values[idx++].Reg64 = env->cr[3];
     assert(whpx_register_names[idx] == WHvX64RegisterCr4);
     vcxt.values[idx++].Reg64 = env->cr[4];
     assert(whpx_register_names[idx] == WHvX64RegisterCr8);
     vcxt.values[idx++].Reg64 = vcpu->tpr;

     /* 8 Debug Registers - Skipped */

     /*
      * Extended control registers needs to be handled separately depending
      * on whether xsave is supported/enabled or not.
      */
     whpx_set_xcrs(cpu);

     /*
      * 16 XMM registers. Only the 16 XMM slots present in
      * whpx_register_names[] are filled: env->xmm_regs may hold more entries
      * than that, and copying all of them would spill into the FP slots that
      * follow, whose upper halves are not rewritten below.
      */
     assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
     idx_next = idx + 16;
     for (i = 0; i < 16 && i < sizeof(env->xmm_regs) / sizeof(ZMMReg);
          i += 1, idx += 1) {
         vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
         vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
     }
     idx = idx_next;

     /* 8 FP registers */
     assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
     for (i = 0; i < 8; i += 1, idx += 1) {
         vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
         /* vcxt.values[idx].Fp.AsUINT128.High64 =
                env->fpregs[i].mmx.MMX_Q(1);
         */
     }

     /* FP control status register */
     assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
     vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
     /* Status word bits 13:11 carry the top-of-stack index (fpstt) */
     vcxt.values[idx].FpControlStatus.FpStatus =
         (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
     vcxt.values[idx].FpControlStatus.FpTag = 0;
     /* One tag bit per register, set when QEMU's fptags[] entry is zero */
     for (i = 0; i < 8; ++i) {
         vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
     }
     vcxt.values[idx].FpControlStatus.Reserved = 0;
     vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
     vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
     idx += 1;

     /* XMM control status register */
     assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
     vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
     vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
     vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
     idx += 1;

     /* MSRs */
     assert(whpx_register_names[idx] == WHvX64RegisterEfer);
     vcxt.values[idx++].Reg64 = env->efer;
 #ifdef TARGET_X86_64
     assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
     vcxt.values[idx++].Reg64 = env->kernelgsbase;
 #endif

     assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
     vcxt.values[idx++].Reg64 = vcpu->apic_base;

     /* WHvX64RegisterPat - Skipped */

     assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
     vcxt.values[idx++].Reg64 = env->sysenter_cs;
     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
     vcxt.values[idx++].Reg64 = env->sysenter_eip;
     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
     vcxt.values[idx++].Reg64 = env->sysenter_esp;
     assert(whpx_register_names[idx] == WHvX64RegisterStar);
     vcxt.values[idx++].Reg64 = env->star;
 #ifdef TARGET_X86_64
     assert(whpx_register_names[idx] == WHvX64RegisterLstar);
     vcxt.values[idx++].Reg64 = env->lstar;
     assert(whpx_register_names[idx] == WHvX64RegisterCstar);
     vcxt.values[idx++].Reg64 = env->cstar;
     assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
     vcxt.values[idx++].Reg64 = env->fmask;
 #endif

     /* Interrupt / Event Registers - Skipped */

     /* Every slot of the table must have been visited exactly once */
     assert(idx == RTL_NUMBER_OF(whpx_register_names));

     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
         whpx->partition, cpu->cpu_index,
         whpx_register_names,
         RTL_NUMBER_OF(whpx_register_names),
         &vcxt.values[0]);

     if (FAILED(hr)) {
         error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
                      hr);
     }

     return;
 }

 /*
  * Read the TSC of @cpu from the hypervisor into QEMU's CPU state.
  *
  * Returns 0 on success; on failure reports the error, leaves the stored TSC
  * untouched and returns -1.
  */
 static int whpx_get_tsc(CPUState *cpu)
 {
     const WHV_REGISTER_NAME name = WHvX64RegisterTsc;
     WHV_REGISTER_VALUE value;
     HRESULT hr;
     int ret = -1;

     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
         whpx_global.partition, cpu->cpu_index, &name, 1, &value);
     if (FAILED(hr)) {
         error_report("WHPX: Failed to get TSC, hr=%08lx", hr);
     } else {
         cpu_env(cpu)->tsc = value.Reg64;
         ret = 0;
     }

     return ret;
 }

 /*
  * X64 Extended Control Registers: read XCR0 of @cpu from the hypervisor into
  * QEMU's CPU state. Does nothing when XSAVE is not supported; on a failed
  * read the error is reported and the stored value is left untouched.
  */
 static void whpx_get_xcrs(CPUState *cpu)
 {
     const WHV_REGISTER_NAME name = WHvX64RegisterXCr0;
     WHV_REGISTER_VALUE value;
     HRESULT hr;

     if (whpx_has_xsave()) {
         /* Only xcr0 is supported by the hypervisor currently */
         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
             whpx_global.partition, cpu->cpu_index, &name, 1, &value);
         if (FAILED(hr)) {
             error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr);
         } else {
             cpu_env(cpu)->xcr0 = value.Reg64;
         }
     }
 }

 /*
  * Pull the vCPU state from the hypervisor into QEMU's CPUX86State.
  *
  * @cpu: vCPU to read; it must be stopped or be the caller's own vCPU.
  *
  * The registers listed in whpx_register_names[] are fetched with a single
  * call and decoded in table order; the asserts verify that the running index
  * stays in step with the table. The TSC is read separately while
  * env->tsc_valid is unset, and XCR0 is read via whpx_get_xcrs().
  *
  * If the bulk read fails, the error is reported and the function returns
  * without decoding anything, leaving the previously cached state in place.
  */
 static void whpx_get_registers(CPUState *cpu)
 {
     struct whpx_state *whpx = &whpx_global;
     AccelCPUState *vcpu = cpu->accel;
     X86CPU *x86_cpu = X86_CPU(cpu);
     CPUX86State *env = &x86_cpu->env;
     struct whpx_register_set vcxt;
     uint64_t tpr, apic_base;
     HRESULT hr;
     int idx;
     int idx_next;
     int i;

     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

     if (!env->tsc_valid) {
         whpx_get_tsc(cpu);
         /* The value only stays valid while the VM is not running */
         env->tsc_valid = !runstate_is_running();
     }

     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
         whpx->partition, cpu->cpu_index,
         whpx_register_names,
         RTL_NUMBER_OF(whpx_register_names),
         &vcxt.values[0]);
     if (FAILED(hr)) {
         error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
                      hr);
         /*
          * vcxt was not filled in; decoding it would copy indeterminate
          * stack contents into the CPU state, so keep the previous state.
          */
         return;
     }

     if (whpx_apic_in_platform()) {
         /*
          * Fetch the TPR value from the emulated APIC. It may get overwritten
          * below with the value from CR8 returned by
          * WHvGetVirtualProcessorRegisters().
          */
         whpx_apic_get(x86_cpu->apic_state);
         vcpu->tpr = whpx_apic_tpr_to_cr8(
             cpu_get_apic_tpr(x86_cpu->apic_state));
     }

     /* Indexes for first 16 registers match between HV and QEMU definitions */
     idx_next = 16;
     for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
         env->regs[idx] = vcxt.values[idx].Reg64;
     }
     idx = idx_next;

     /* Same goes for RIP and RFLAGS */
     assert(whpx_register_names[idx] == WHvX64RegisterRip);
     env->eip = vcxt.values[idx++].Reg64;
     assert(whpx_register_names[idx] == WHvX64RegisterRflags);
     env->eflags = vcxt.values[idx++].Reg64;

     /* Translate 6+4 segment registers. HV and QEMU order matches  */
     assert(idx == WHvX64RegisterEs);
     for (i = 0; i < 6; i += 1, idx += 1) {
         env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
     }

     assert(idx == WHvX64RegisterLdtr);
     env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
     assert(idx == WHvX64RegisterTr);
     env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
     assert(idx == WHvX64RegisterIdtr);
     env->idt.base = vcxt.values[idx].Table.Base;
     env->idt.limit = vcxt.values[idx].Table.Limit;
     idx += 1;
     assert(idx == WHvX64RegisterGdtr);
     env->gdt.base = vcxt.values[idx].Table.Base;
     env->gdt.limit = vcxt.values[idx].Table.Limit;
     idx += 1;

     /* CR0, 2, 3, 4, 8 */
     assert(whpx_register_names[idx] == WHvX64RegisterCr0);
     env->cr[0] = vcxt.values[idx++].Reg64;
     assert(whpx_register_names[idx] == WHvX64RegisterCr2);
     env->cr[2] = vcxt.values[idx++].Reg64;
     assert(whpx_register_names[idx] == WHvX64RegisterCr3);
     env->cr[3] = vcxt.values[idx++].Reg64;
     assert(whpx_register_names[idx] == WHvX64RegisterCr4);
     env->cr[4] = vcxt.values[idx++].Reg64;
     assert(whpx_register_names[idx] == WHvX64RegisterCr8);
     tpr = vcxt.values[idx++].Reg64;
     /* Only forward the TPR to the APIC model when it actually changed */
     if (tpr != vcpu->tpr) {
         vcpu->tpr = tpr;
         cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr));
     }

     /* 8 Debug Registers - Skipped */

     /*
      * Extended control registers needs to be handled separately depending
      * on whether xsave is supported/enabled or not.
      */
     whpx_get_xcrs(cpu);

     /*
      * 16 XMM registers. Only the 16 XMM slots present in
      * whpx_register_names[] are decoded: env->xmm_regs may hold more entries
      * than that, and walking all of them would load the values of the
      * unrelated registers that follow in the table into the extra entries.
      */
     assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
     idx_next = idx + 16;
     for (i = 0; i < 16 && i < sizeof(env->xmm_regs) / sizeof(ZMMReg);
          i += 1, idx += 1) {
         env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
         env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
     }
     idx = idx_next;

     /* 8 FP registers */
     assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
     for (i = 0; i < 8; i += 1, idx += 1) {
         env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
         /* env->fpregs[i].mmx.MMX_Q(1) =
                vcxt.values[idx].Fp.AsUINT128.High64;
         */
     }

     /* FP control status register */
     assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
     env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
     /* Status word bits 13:11 carry the top-of-stack index (fpstt) */
     env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
     env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
     /* QEMU's fptags[] entry is the inverse of the per-register tag bit */
     for (i = 0; i < 8; ++i) {
         env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
     }
     env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
     env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
     idx += 1;

     /* XMM control status register */
     assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
     env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
     idx += 1;

     /* MSRs */
     assert(whpx_register_names[idx] == WHvX64RegisterEfer);
     env->efer = vcxt.values[idx++].Reg64;
 #ifdef TARGET_X86_64
     assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
     env->kernelgsbase = vcxt.values[idx++].Reg64;
 #endif

     assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
     apic_base = vcxt.values[idx++].Reg64;
     /* Only forward the APIC base to the APIC model when it changed */
     if (apic_base != vcpu->apic_base) {
         vcpu->apic_base = apic_base;
         cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
     }

     /* WHvX64RegisterPat - Skipped */

     assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
     env->sysenter_cs = vcxt.values[idx++].Reg64;
     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
     env->sysenter_eip = vcxt.values[idx++].Reg64;
     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
     env->sysenter_esp = vcxt.values[idx++].Reg64;
     assert(whpx_register_names[idx] == WHvX64RegisterStar);
     env->star = vcxt.values[idx++].Reg64;
 #ifdef TARGET_X86_64
     assert(whpx_register_names[idx] == WHvX64RegisterLstar);
     env->lstar = vcxt.values[idx++].Reg64;
     assert(whpx_register_names[idx] == WHvX64RegisterCstar);
     env->cstar = vcxt.values[idx++].Reg64;
     assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
     env->fmask = vcxt.values[idx++].Reg64;
 #endif

     /* Interrupt / Event Registers - Skipped */

     /* Every slot of the table must have been visited exactly once */
     assert(idx == RTL_NUMBER_OF(whpx_register_names));

     if (whpx_apic_in_platform()) {
         whpx_apic_get(x86_cpu->apic_state);
     }

     x86_update_hflags(env);

     return;
 }

 /*
  * Emulator callback for port I/O: performs the access described by @IoAccess
  * on QEMU's I/O address space. @ctx is unused. The result of the access is
  * not checked and S_OK is always returned.
  */
 static HRESULT CALLBACK whpx_emu_ioport_callback(
     void *ctx,
     WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
 {
     const MemTxAttrs no_attrs = { 0 };

     address_space_rw(&address_space_io,
                      IoAccess->Port,
                      no_attrs,
                      &IoAccess->Data,
                      IoAccess->AccessSize,
                      IoAccess->Direction);

     return S_OK;
 }

 /*
  * Emulator callback for memory-mapped I/O: performs the access described by
  * @ma on guest physical memory. @ctx is unused and S_OK is always returned.
  */
 static HRESULT CALLBACK whpx_emu_mmio_callback(
     void *ctx,
     WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
 {
     cpu_physical_memory_rw(ma->GpaAddress,
                            ma->Data,
                            ma->AccessSize,
                            ma->Direction);

     return S_OK;
 }

 /*
  * Emulator callback that reads registers of the vCPU passed as @ctx (a
  * CPUState pointer) from the hypervisor.
  *
  * Returns the HRESULT of the underlying read; a failure is also reported.
  */
 static HRESULT CALLBACK whpx_emu_getreg_callback(
     void *ctx,
     const WHV_REGISTER_NAME *RegisterNames,
     UINT32 RegisterCount,
     WHV_REGISTER_VALUE *RegisterValues)
 {
     CPUState *cpu = ctx;
     HRESULT hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
         whpx_global.partition, cpu->cpu_index,
         RegisterNames, RegisterCount, RegisterValues);

     if (FAILED(hr)) {
         error_report("WHPX: Failed to get virtual processor registers,"
                      " hr=%08lx", hr);
     }

     return hr;
 }

 /*
  * Emulator callback that writes registers of the vCPU passed as @ctx (a
  * CPUState pointer) to the hypervisor.
  *
  * Returns the HRESULT of the underlying write. On failure the error is
  * reported and the vCPU's dirty flag is left as it was; it is cleared only
  * when the write actually succeeded.
  */
 static HRESULT CALLBACK whpx_emu_setreg_callback(
     void *ctx,
     const WHV_REGISTER_NAME *RegisterNames,
     UINT32 RegisterCount,
     const WHV_REGISTER_VALUE *RegisterValues)
 {
     HRESULT hr;
     struct whpx_state *whpx = &whpx_global;
     CPUState *cpu = (CPUState *)ctx;

     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
         whpx->partition, cpu->cpu_index,
         RegisterNames, RegisterCount,
         RegisterValues);
     if (FAILED(hr)) {
         error_report("WHPX: Failed to set virtual processor registers,"
                      " hr=%08lx", hr);
         /* Nothing was written, so the dirty state must not be dropped */
         return hr;
     }

     /*
      * The emulator just successfully wrote the register state. We clear the
      * dirty state so we avoid the double write on resume of the VP.
      */
     cpu->accel->dirty = false;

     return hr;
 }

 /*
  * Emulator callback that translates guest virtual address @Gva for the vCPU
  * passed as @ctx (a CPUState pointer).
  *
  * On success *TranslationResult receives the result code and @Gpa is filled
  * by the hypervisor call. On failure the error is reported and
  * *TranslationResult is not written. Returns the HRESULT of the call.
  */
 static HRESULT CALLBACK whpx_emu_translate_callback(
     void *ctx,
     WHV_GUEST_VIRTUAL_ADDRESS Gva,
     WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
     WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
     WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
 {
     CPUState *cpu = ctx;
     WHV_TRANSLATE_GVA_RESULT result;
     HRESULT hr;

     hr = whp_dispatch.WHvTranslateGva(whpx_global.partition, cpu->cpu_index,
                                       Gva, TranslateFlags, &result, Gpa);
     if (FAILED(hr)) {
         error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
         return hr;
     }

     *TranslationResult = result.ResultCode;
     return hr;
 }

 /*
  * Callback table for the instruction emulator, wiring its port I/O, MMIO,
  * register access and address translation requests to the functions defined
  * in this file.
  */
 static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
     .Size = sizeof(WHV_EMULATOR_CALLBACKS),
     .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
     .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
     .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
     .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
     .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
 };

 /*
  * Emulate the MMIO access described by @ctx for @cpu using the vCPU's
  * instruction emulator.
  *
  * Returns 0 when the emulation succeeded, -1 (after reporting the error)
  * when the emulator call failed or the access could not be emulated.
  */
 static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
 {
     AccelCPUState *vcpu = cpu->accel;
     WHV_EMULATOR_STATUS status;
     HRESULT hr;
     int ret = -1;

     /* cpu is handed to the emulator as the context for its callbacks */
     hr = whp_dispatch.WHvEmulatorTryMmioEmulation(
         vcpu->emulator, cpu, &vcpu->exit_ctx.VpContext, ctx, &status);

     if (FAILED(hr)) {
         error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr);
     } else if (!status.EmulationSuccessful) {
         error_report("WHPX: Failed to emulate MMIO access with"
                      " EmulatorReturnStatus: %u", status.AsUINT32);
     } else {
         ret = 0;
     }

     return ret;
 }

 /*
  * Emulate the port I/O access described by @ctx for @cpu using the vCPU's
  * instruction emulator.
  *
  * Returns 0 when the emulation succeeded, -1 (after reporting the error)
  * when the emulator call failed or the access could not be emulated.
  */
 static int whpx_handle_portio(CPUState *cpu,
                               WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
 {
     AccelCPUState *vcpu = cpu->accel;
     WHV_EMULATOR_STATUS status;
     HRESULT hr;
     int ret = -1;

     /* cpu is handed to the emulator as the context for its callbacks */
     hr = whp_dispatch.WHvEmulatorTryIoEmulation(
         vcpu->emulator, cpu, &vcpu->exit_ctx.VpContext, ctx, &status);

     if (FAILED(hr)) {
         error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr);
     } else if (!status.EmulationSuccessful) {
         error_report("WHPX: Failed to emulate PortIO access with"
                      " EmulatorReturnStatus: %u", status.AsUINT32);
     } else {
         ret = 0;
     }

     return ret;
 }

 /*
  * Select which guest exceptions cause an exit to QEMU, namely
  * breakpoint/single-step events.
  *
  * @exceptions is a bitmask of exception types, e.g:
  * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
  *
  * The partition property is only written when the mask differs from the
  * cached one, and the cache is only updated when that write succeeded.
  * Returns S_OK when nothing had to change, otherwise the HRESULT of the
  * property update.
  */
 static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
 {
     struct whpx_state *state = &whpx_global;
     WHV_PARTITION_PROPERTY property = { 0, };
     HRESULT hr;

     if (state->exception_exit_bitmap == exceptions) {
         /* Already in effect, skip the hypervisor call */
         return S_OK;
     }

     property.ExceptionExitBitmap = exceptions;
     hr = whp_dispatch.WHvSetPartitionProperty(
         state->partition,
         WHvPartitionPropertyCodeExceptionExitBitmap,
         &property,
         sizeof(property));

     if (SUCCEEDED(hr)) {
         state->exception_exit_bitmap = exceptions;
     }

     return hr;
 }


 /*
  * This function is called before/after stepping over a single instruction.
  * It updates the CPU registers to arm (set == true) or disarm (set == false)
  * the instruction stepping accordingly:
  *
  *  - RFLAGS.TF is set when arming and cleared when disarming;
  *  - the interrupt shadow is set when arming and cleared when disarming;
  *  - when disarming, a pending single-step debug exception is dropped so
  *    that the guest does not see the INT1 raised by the stepping.
  *
  * If 'exit_context_rflags' is not NULL, it must hold the same value as the
  * RFLAGS register read back from the VCPU (this is asserted), and it is
  * updated with the RFLAGS value written by this function.
  *
  * Returns S_OK on success. On failure the error is reported and the failing
  * HRESULT is returned; registers written before the failure stay modified.
  */
 static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu,
     bool set,
     uint64_t *exit_context_rflags)
 {
     WHV_REGISTER_NAME reg_name;
     WHV_REGISTER_VALUE reg_value;
     HRESULT hr;
     struct whpx_state *whpx = &whpx_global;

     /*
      * If we are trying to step over a single instruction, we need to set the
      * TF bit in rflags. Otherwise, clear it.
      */
     reg_name = WHvX64RegisterRflags;
     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
         whpx->partition,
         cpu->cpu_index,
         &reg_name,
         1,
         &reg_value);

     if (FAILED(hr)) {
         error_report("WHPX: Failed to get rflags, hr=%08lx", hr);
         return hr;
     }

     if (exit_context_rflags) {
         /* The caller's copy of RFLAGS must not be stale. */
         assert(*exit_context_rflags == reg_value.Reg64);
     }

     if (set) {
         /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
         reg_value.Reg64 |= TF_MASK;
     } else {
         reg_value.Reg64 &= ~TF_MASK;
     }

     if (exit_context_rflags) {
         /* Keep the caller's copy in sync with what is written below. */
         *exit_context_rflags = reg_value.Reg64;
     }

     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
         whpx->partition,
         cpu->cpu_index,
         &reg_name,
         1,
         &reg_value);

     if (FAILED(hr)) {
         error_report("WHPX: Failed to set rflags, hr=%08lx", hr);
         return hr;
     }

     reg_name = WHvRegisterInterruptState;
     reg_value.Reg64 = 0;

     /* Suspend delivery of hardware interrupts during single-stepping. */
     reg_value.InterruptState.InterruptShadow = set != 0;

     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
         whpx->partition,
         cpu->cpu_index,
         &reg_name,
         1,
         &reg_value);

     if (FAILED(hr)) {
         error_report("WHPX: Failed to set InterruptState, hr=%08lx", hr);
         return hr;
     }

     if (!set) {
         /*
          * We have just finished stepping over a single instruction,
          * and intercepted the INT1 generated by it.
          * We need to now hide the INT1 from the guest,
          * as it would not be expecting it.
          */

         reg_name = WHvX64RegisterPendingDebugException;
         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
             whpx->partition,
             cpu->cpu_index,
             &reg_name,
             1,
             &reg_value);

         if (FAILED(hr)) {
             error_report("WHPX: Failed to get pending debug exceptions, "
                          "hr=%08lx", hr);
             return hr;
         }

         if (reg_value.PendingDebugException.SingleStep) {
             /* Only write the register back if there is something to clear. */
             reg_value.PendingDebugException.SingleStep = 0;

             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                 whpx->partition,
                 cpu->cpu_index,
                 &reg_name,
                 1,
                 &reg_value);

             if (FAILED(hr)) {
                 error_report("WHPX: Failed to clear pending debug exceptions, "
                              "hr=%08lx", hr);
                 return hr;
             }
         }

     }

     return S_OK;
 }

 /*
  * Searches the current list of WHPX breakpoints for an entry at 'address'.
  * Returns a pointer to the first matching entry, or NULL if there is no
  * breakpoint list or no entry matches.
  */
 static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address)
 {
     struct whpx_breakpoint_collection *collection =
         whpx_global.breakpoints.breakpoints;
     int idx;

     if (!collection) {
         return NULL;
     }

     for (idx = 0; idx < collection->used; idx++) {
         struct whpx_breakpoint *candidate = &collection->data[idx];

         if (candidate->address == address) {
             return candidate;
         }
     }

     return NULL;
 }

 /*
  * Opcode byte written into guest memory to implement a breakpoint.
  *
  * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for
  * debugging user-mode applications. Since the WHPX API does not offer
  * an easy way to pass the intercepted exception back to the guest, we
  * resort to using INT1 (0xF1) instead, and let the guest always handle INT3.
  */
 static const uint8_t whpx_breakpoint_instruction = 0xF1;

 /*
  * The WHPX QEMU backend implements breakpoints by writing the INT1
  * instruction into memory (ignoring the DRx registers). This raises a few
  * issues that need to be carefully handled:
  *
  * 1. Although unlikely, other parts of QEMU may set multiple breakpoints
  *    at the same location, and later remove them in arbitrary order.
  *    This should not cause memory corruption, and should only remove the
  *    physical breakpoint instruction when the last QEMU breakpoint is gone.
  *
  * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid
  *    physical location. Hence, physically adding/removing a breakpoint can
  *    theoretically fail at any time. We need to keep track of it.
  *
  * The function below rebuilds the list of low-level breakpoints (one per
  * address, tracking the original instruction and any errors) from the list
  * of high-level breakpoints (set via cpu_breakpoint_insert()).
  *
  * To avoid needless work, it also records the addresses of the high-level
  * breakpoints (a.k.a. CPU breakpoints) it was computed from in
  * 'original_addresses', so that the caller can skip it until those
  * breakpoints change.
  *
  * 'cpu_breakpoint_count' must be the number of entries in cpu->breakpoints.
  *
  * Note that this function only decides which breakpoints should be inserted
  * into memory; it does not touch guest memory itself. That is done by
  * whpx_apply_breakpoints().
  *
  * NOTE(review): an existing entry in state WHPX_BP_SET is moved back to
  * WHPX_BP_SET_PENDING here, which makes whpx_apply_breakpoints() read the
  * original instruction byte again - confirm that this is intended.
  */
 static void whpx_translate_cpu_breakpoints(
     struct whpx_breakpoints *breakpoints,
     CPUState *cpu,
     int cpu_breakpoint_count)
 {
     struct whpx_breakpoint_collection *old_set = breakpoints->breakpoints;
     struct whpx_breakpoint_collection *new_set;
     CPUBreakpoint *bp;
     int capacity = cpu_breakpoint_count;
     int next_original = 0;
     int i;

     breakpoints->original_addresses =
         g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count);
     breakpoints->original_address_count = cpu_breakpoint_count;

     /* Worst case: every old entry is kept and every CPU breakpoint is new. */
     if (old_set) {
         capacity += old_set->used;
     }

     new_set = g_malloc0(sizeof(struct whpx_breakpoint_collection)
                         + capacity * sizeof(struct whpx_breakpoint));
     new_set->allocated = capacity;
     new_set->used = 0;

     /*
      * 1. Carry over the old breakpoints that could not be automatically
      * cleared when the CPU got stopped.
      */
     if (old_set) {
         for (i = 0; i < old_set->used; i++) {
             if (old_set->data[i].state == WHPX_BP_CLEARED) {
                 continue;
             }

             new_set->data[new_set->used++] = old_set->data[i];
         }
     }

     /* 2. Map all CPU breakpoints to WHPX breakpoints */
     QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
         struct whpx_breakpoint *match = NULL;

         /* This will be used to detect changed CPU breakpoints later. */
         breakpoints->original_addresses[next_original++] = bp->pc;

         /*
          * WARNING: Together with the outer loop, this linear search has
          * O(N^2) complexity, where N is the number of breakpoints. It
          * should not be a bottleneck in real-world scenarios, since it only
          * needs to run once after the breakpoints have been modified.
          * If this ever becomes a concern, it can be optimized by storing
          * high-level breakpoint objects in a tree or hash map.
          */
         for (i = 0; i < new_set->used; i++) {
             if (new_set->data[i].address == bp->pc) {
                 match = &new_set->data[i];
                 break;
             }
         }

         if (match) {
             /* There was already a breakpoint at this address. */
             switch (match->state) {
             case WHPX_BP_CLEAR_PENDING:
                 match->state = WHPX_BP_SET;
                 break;
             case WHPX_BP_SET:
                 match->state = WHPX_BP_SET_PENDING;
                 break;
             default:
                 break;
             }
             continue;
         }

         if (new_set->used < new_set->allocated) {
             /* No WHPX breakpoint at this address. Create one. */
             struct whpx_breakpoint *slot = &new_set->data[new_set->used];

             slot->address = bp->pc;
             slot->state = WHPX_BP_SET_PENDING;
             new_set->used++;
         }
     }

     /*
      * Free the previous breakpoint list. This can be optimized by keeping
      * it as shadow buffer for the next computation instead of freeing
      * it immediately.
      */
     g_free(breakpoints->breakpoints);

     breakpoints->breakpoints = new_set;
 }

 /*
  * Physically inserts/removes the breakpoints by reading and writing guest
  * memory, keeping track of the failed attempts.
  *
  * Passing resuming=true  will try to set all previously unset breakpoints.
  * Passing resuming=false will remove all inserted ones.
  *
  * A NULL collection is accepted and ignored. If a memory access fails, the
  * affected entry is left in its pending state.
  */
 static void whpx_apply_breakpoints(
     struct whpx_breakpoint_collection *breakpoints,
     CPUState *cpu,
     bool resuming)
 {
     int i, rc;

     if (!breakpoints) {
         return;
     }

     for (i = 0; i < breakpoints->used; i++) {
         struct whpx_breakpoint *entry = &breakpoints->data[i];
         WhpxBreakpointState next = entry->state;

         /* Decide what to do right now based on the last known state. */
         if (resuming) {
             if (next == WHPX_BP_CLEARED) {
                 next = WHPX_BP_SET_PENDING;
             } else if (next == WHPX_BP_CLEAR_PENDING) {
                 /* The breakpoint instruction was never removed. */
                 next = WHPX_BP_SET;
             }
         } else {
             if (next == WHPX_BP_SET_PENDING) {
                 /* The breakpoint instruction was never written. */
                 next = WHPX_BP_CLEARED;
             } else if (next == WHPX_BP_SET) {
                 next = WHPX_BP_CLEAR_PENDING;
             }
         }

         if (next == WHPX_BP_SET_PENDING) {
             /* Remember the original instruction. */
             rc = cpu_memory_rw_debug(cpu,
                                      entry->address,
                                      &entry->original_instruction,
                                      1,
                                      false);

             if (rc == 0) {
                 /* Write the breakpoint instruction. */
                 rc = cpu_memory_rw_debug(cpu,
                                          entry->address,
                                          (void *)&whpx_breakpoint_instruction,
                                          1,
                                          true);
             }

             if (rc == 0) {
                 next = WHPX_BP_SET;
             }
         } else if (next == WHPX_BP_CLEAR_PENDING) {
             /* Restore the original instruction. */
             rc = cpu_memory_rw_debug(cpu,
                                      entry->address,
                                      &entry->original_instruction,
                                      1,
                                      true);

             if (rc == 0) {
                 next = WHPX_BP_CLEARED;
             }
         }

         entry->state = next;
     }
 }

 /*
  * This function is called when a VCPU is about to start and no other
  * VCPUs have been started so far. Since the VCPU start order could be
  * arbitrary, it doesn't have to be VCPU#0.
  *
  * It is used to commit the breakpoints into memory, and configure WHPX
  * to intercept debug exceptions.
  *
  * Note that whpx_set_exception_exit_bitmap() cannot be called if one or
  * more VCPUs are already running, so this is the best place to do it.
  *
  * Must be called with the BQL held. Returns 0 on success, or 1 if the
  * exception exit bitmap could not be updated.
  */
 static int whpx_first_vcpu_starting(CPUState *cpu)
 {
     struct whpx_state *whpx = &whpx_global;
     uint64_t exception_mask;
     HRESULT hr;

     g_assert(bql_locked());

     if (!QTAILQ_EMPTY(&cpu->breakpoints) ||
             (whpx->breakpoints.breakpoints &&
              whpx->breakpoints.breakpoints->used)) {
         CPUBreakpoint *bp;
         int i = 0;
         bool update_pending = false;

         /*
          * Compare the current CPU breakpoints, in order, against the
          * addresses recorded by the last translation.
          */
         QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
             if (i >= whpx->breakpoints.original_address_count ||
                 bp->pc != whpx->breakpoints.original_addresses[i]) {
                 update_pending = true;
             }

             i++;
         }

         /* A shorter list than the recorded one is a change as well. */
         if (i != whpx->breakpoints.original_address_count) {
             update_pending = true;
         }

         if (update_pending) {
             /*
              * The CPU breakpoints have changed since the last call to
              * whpx_translate_cpu_breakpoints(). WHPX breakpoints must
              * now be recomputed.
              */
             whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i);
         }

         /* Actually insert the breakpoints into the memory. */
         whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true);
     }

     if (whpx->step_pending ||
         (whpx->breakpoints.breakpoints &&
          whpx->breakpoints.breakpoints->used)) {
         /*
          * We are either attempting to single-step one or more CPUs, or
          * have one or more breakpoints enabled. Both require intercepting
          * the WHvX64ExceptionTypeDebugTrapOrFault exception.
          */

         /* Build the mask in 64 bits; 'unsigned long' is 32-bit on Windows. */
         exception_mask = 1ULL << WHvX64ExceptionTypeDebugTrapOrFault;
     } else {
         /* Let the guest handle all exceptions. */
         exception_mask = 0;
     }

     hr = whpx_set_exception_exit_bitmap(exception_mask);
     if (!SUCCEEDED(hr)) {
         error_report("WHPX: Failed to update exception exit mask, "
                      "hr=%08lx.", hr);
         return 1;
     }

     return 0;
 }

 /*
  * Called when the last running VCPU has stopped. Asks
  * whpx_apply_breakpoints() to remove every breakpoint that is currently
  * inserted into memory. Always returns 0.
  */
 static int whpx_last_vcpu_stopping(CPUState *cpu)
 {
     struct whpx_state *whpx = &whpx_global;

     /* resuming == false: take the breakpoints out of memory. */
     whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, false);

     return 0;
 }

 /*
  * Returns the address of the next instruction that is about to be executed.
  *
  * 'exit_context_valid' tells whether the RIP stored in the last exit context
  * can still be trusted. If the register cannot be read back from the
  * hypervisor, the error is reported and 0 is returned.
  */
 static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid)
 {
     WHV_REGISTER_NAME rip_name = WHvX64RegisterRip;
     WHV_REGISTER_VALUE rip_value;
     HRESULT hr;

     if (cpu->accel->dirty) {
         /* The CPU registers have been modified by other parts of QEMU. */
         return cpu_env(cpu)->eip;
     }

     if (exit_context_valid) {
         /*
          * The CPU registers have been modified neither by other parts of
          * QEMU, nor by this port calling WHvSetVirtualProcessorRegisters().
          * This is the most common case.
          */
         return cpu->accel->exit_ctx.VpContext.Rip;
     }

     /*
      * The CPU registers have been modified by a call to
      * WHvSetVirtualProcessorRegisters() and must be re-queried from
      * the target.
      */
     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
         whpx_global.partition,
         cpu->cpu_index,
         &rip_name,
         1,
         &rip_value);
     if (FAILED(hr)) {
         error_report("WHPX: Failed to get PC, hr=%08lx", hr);
         return 0;
     }

     return rip_value.Reg64;
 }

 /*
  * Handles a halt exit. Under the BQL, the CPU is marked as halted unless a
  * hard interrupt is pending with interrupts enabled, or an NMI is pending.
  *
  * Returns 1 if the CPU was halted (exception_index is set to EXCP_HLT),
  * and 0 if there is a pending event and the CPU was left as it is.
  */
 static int whpx_handle_halt(CPUState *cpu)
 {
     bool wakeup_pending;
     int halted = 0;

     bql_lock();

     wakeup_pending =
         ((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
          (cpu_env(cpu)->eflags & IF_MASK)) ||
         (cpu->interrupt_request & CPU_INTERRUPT_NMI);

     if (!wakeup_pending) {
         cpu->exception_index = EXCP_HLT;
         cpu->halted = true;
         halted = 1;
     }

     bql_unlock();

     return halted;
 }

 /*
  * Prepares the VCPU state before it is handed to the hypervisor.
  *
  * While holding the BQL, this collects the pending work into a small batch
  * of registers: an NMI or PIC interrupt to inject, a CR8 update if the APIC
  * TPR changed, and a request for an interrupt-window notification. The
  * batch is then written with one WHvSetVirtualProcessorRegisters() call
  * after the BQL has been released.
  *
  * A failure to write the registers is reported but not propagated, since
  * the function returns void.
  */
 static void whpx_vcpu_pre_run(CPUState *cpu)
 {
     HRESULT hr;
     struct whpx_state *whpx = &whpx_global;
     AccelCPUState *vcpu = cpu->accel;
     X86CPU *x86_cpu = X86_CPU(cpu);
     CPUX86State *env = &x86_cpu->env;
     int irq;
     uint8_t tpr;
     WHV_X64_PENDING_INTERRUPTION_REGISTER new_int;
     UINT32 reg_count = 0;
     /*
      * At most three registers are queued below: one pending interruption
      * or pending event, CR8, and the deliverability notifications.
      */
     WHV_REGISTER_VALUE reg_values[3];
     WHV_REGISTER_NAME reg_names[3];

     memset(&new_int, 0, sizeof(new_int));
     memset(reg_values, 0, sizeof(reg_values));

     bql_lock();

     /* Inject NMI */
     if (!vcpu->interruption_pending &&
         cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
         if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
             cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
             /* Keeps the hard interrupt path below from running this time. */
             vcpu->interruptable = false;
             new_int.InterruptionType = WHvX64PendingNmi;
             new_int.InterruptionPending = 1;
             new_int.InterruptionVector = 2;
         }
         if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
             /* The SMI request is cleared here; nothing is injected for it. */
             cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
         }
     }

     /*
      * Force the VCPU out of its inner loop to process any INIT requests or
      * commit pending TPR access.
      */
     if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
         if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
             !(env->hflags & HF_SMM_MASK)) {
             cpu->exit_request = 1;
         }
         if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
             cpu->exit_request = 1;
         }
     }

     /* Get pending hard interruption or replay one that was overwritten */
     if (!whpx_apic_in_platform()) {
         /*
          * Inject a PIC interrupt only if nothing is pending already, the
          * VCPU was marked interruptable and the guest has IF set.
          */
         if (!vcpu->interruption_pending &&
             vcpu->interruptable && (env->eflags & IF_MASK)) {
             assert(!new_int.InterruptionPending);
             if (cpu->interrupt_request & CPU_INTERRUPT_HARD) {
                 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
                 irq = cpu_get_pic_interrupt(env);
                 /* A negative value means there is no vector to inject. */
                 if (irq >= 0) {
                     new_int.InterruptionType = WHvX64PendingInterrupt;
                     new_int.InterruptionPending = 1;
                     new_int.InterruptionVector = irq;
                 }
             }
         }

         /* Setup interrupt state if new one was prepared */
         if (new_int.InterruptionPending) {
             reg_values[reg_count].PendingInterruption = new_int;
             reg_names[reg_count] = WHvRegisterPendingInterruption;
             reg_count += 1;
         }
     } else if (vcpu->ready_for_pic_interrupt &&
                (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
         /*
          * APIC is in the platform: a PIC interrupt is delivered as an
          * external interrupt event, and only after ready_for_pic_interrupt
          * has been set.
          */
         cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
         irq = cpu_get_pic_interrupt(env);
         if (irq >= 0) {
             reg_names[reg_count] = WHvRegisterPendingEvent;
             reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT)
             {
                 .EventPending = 1,
                 .EventType = WHvX64PendingEventExtInt,
                 .Vector = irq,
             };
             reg_count += 1;
         }
      }

     /* Sync the TPR to the CR8 if was modified during the intercept */
     tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
     if (tpr != vcpu->tpr) {
         vcpu->tpr = tpr;
         reg_values[reg_count].Reg64 = tpr;
         cpu->exit_request = 1;
         reg_names[reg_count] = WHvX64RegisterCr8;
         reg_count += 1;
     }

     /* Update the state of the interrupt delivery notification */
     if (!vcpu->window_registered &&
         cpu->interrupt_request & CPU_INTERRUPT_HARD) {
         /* A hard interrupt is still pending: ask for a notification. */
         reg_values[reg_count].DeliverabilityNotifications =
             (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) {
                 .InterruptNotification = 1
             };
         vcpu->window_registered = 1;
         reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications;
         reg_count += 1;
     }

     bql_unlock();
     /* The flag is consumed by this pass whether or not it was used. */
     vcpu->ready_for_pic_interrupt = false;

     /* Write everything that was queued above in one call. */
     if (reg_count) {
         hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
             whpx->partition, cpu->cpu_index,
             reg_names, reg_count, reg_values);
         if (FAILED(hr)) {
             error_report("WHPX: Failed to set interrupt state registers,"
                          " hr=%08lx", hr);
         }
     }

     return;
 }

 /*
  * Copies the state reported in the VCPU exit context back into QEMU:
  * RFLAGS, the TPR (from CR8, pushed to the APIC only when it changed),
  * and the interruption-pending / interrupt-shadow flags.
  */
 static void whpx_vcpu_post_run(CPUState *cpu)
 {
     X86CPU *x86 = X86_CPU(cpu);
     AccelCPUState *accel_state = cpu->accel;
     uint64_t guest_cr8;

     x86->env.eflags = accel_state->exit_ctx.VpContext.Rflags;

     guest_cr8 = accel_state->exit_ctx.VpContext.Cr8;
     if (guest_cr8 != accel_state->tpr) {
         /* The guest changed CR8 while running; update the APIC TPR. */
         accel_state->tpr = guest_cr8;
         bql_lock();
         cpu_set_apic_tpr(x86->apic_state,
                          whpx_cr8_to_apic_tpr(accel_state->tpr));
         bql_unlock();
     }

     accel_state->interruption_pending =
         accel_state->exit_ctx.VpContext.ExecutionState.InterruptionPending;

     /* The VCPU is interruptable unless it is in an interrupt shadow. */
     accel_state->interruptable =
         !accel_state->exit_ctx.VpContext.ExecutionState.InterruptShadow;
 }

 /*
  * Processes the interrupt requests that are handled on the QEMU side
  * before the VCPU is run: INIT (ignored while in SMM), APIC polling,
  * waking up a halted CPU, SIPI and TPR access reports.
  *
  * cpu->interrupt_request is re-read for every check on purpose, since the
  * handlers called here may change it.
  */
 static void whpx_vcpu_process_async_events(CPUState *cpu)
 {
     AccelCPUState *accel_state = cpu->accel;
     X86CPU *x86 = X86_CPU(cpu);
     CPUX86State *cpu_state = &x86->env;
     bool wakeup_pending;

     if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
         !(cpu_state->hflags & HF_SMM_MASK)) {
         whpx_cpu_synchronize_state(cpu);
         do_cpu_init(x86);
         accel_state->interruptable = true;
     }

     if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
         cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
         apic_poll_irq(x86->apic_state);
     }

     /* A deliverable hard interrupt or an NMI takes the CPU out of halt. */
     wakeup_pending =
         ((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
          (cpu_state->eflags & IF_MASK)) ||
         (cpu->interrupt_request & CPU_INTERRUPT_NMI);
     if (wakeup_pending) {
         cpu->halted = false;
     }

     if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
         whpx_cpu_synchronize_state(cpu);
         do_cpu_sipi(x86);
     }

     if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
         cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
         whpx_cpu_synchronize_state(cpu);
         apic_handle_tpr_access_report(x86->apic_state, cpu_state->eip,
                                       cpu_state->tpr_access_type);
     }
 }

 static int whpx_vcpu_run(CPUState *cpu)
 {
     HRESULT hr;
     struct whpx_state *whpx = &whpx_global;
     AccelCPUState *vcpu = cpu->accel;
     struct whpx_breakpoint *stepped_over_bp = NULL;
     WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
     int ret;

     g_assert(bql_locked());

     if (whpx->running_cpus++ == 0) {
         /* Insert breakpoints into memory, update exception exit bitmap. */
         ret = whpx_first_vcpu_starting(cpu);
         if (ret != 0) {
             return ret;
         }
     }

     if (whpx->breakpoints.breakpoints &&
         whpx->breakpoints.breakpoints->used > 0)
     {
         uint64_t pc = whpx_vcpu_get_pc(cpu, true);
         stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
         if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
             stepped_over_bp = NULL;
         }

         if (stepped_over_bp) {
             /*
              * We are trying to run the instruction overwritten by an active
              * breakpoint. We will temporarily disable the breakpoint, suspend
              * other CPUs, and step over the instruction.
              */
             exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
         }
     }

     if (exclusive_step_mode == WHPX_STEP_NONE) {
         whpx_vcpu_process_async_events(cpu);
         if (cpu->halted && !whpx_apic_in_platform()) {
             cpu->exception_index = EXCP_HLT;
             qatomic_set(&cpu->exit_request, false);
             return 0;
         }
     }

     bql_unlock();

     if (exclusive_step_mode != WHPX_STEP_NONE) {
         start_exclusive();
         g_assert(cpu == current_cpu);
         g_assert(!cpu->running);
         cpu->running = true;

         hr = whpx_set_exception_exit_bitmap(
             1UL << WHvX64ExceptionTypeDebugTrapOrFault);
         if (!SUCCEEDED(hr)) {
             error_report("WHPX: Failed to update exception exit mask, "
                          "hr=%08lx.", hr);
             return 1;
         }

         if (stepped_over_bp) {
             /* Temporarily disable the triggered breakpoint. */
             cpu_memory_rw_debug(cpu,
                 stepped_over_bp->address,
                 &stepped_over_bp->original_instruction,
                 1,
                 true);
         }
     } else {
         cpu_exec_start(cpu);
     }

     do {
         if (cpu->accel->dirty) {
             whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
             cpu->accel->dirty = false;
         }

         if (exclusive_step_mode == WHPX_STEP_NONE) {
             whpx_vcpu_pre_run(cpu);

             if (qatomic_read(&cpu->exit_request)) {
                 whpx_vcpu_kick(cpu);
             }
         }

         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
             whpx_vcpu_configure_single_stepping(cpu, true, NULL);
         }

         hr = whp_dispatch.WHvRunVirtualProcessor(
             whpx->partition, cpu->cpu_index,
             &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));

         if (FAILED(hr)) {
             error_report("WHPX: Failed to exec a virtual processor,"
                          " hr=%08lx", hr);
             ret = -1;
             break;
         }

         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
             whpx_vcpu_configure_single_stepping(cpu,
                 false,
                 &vcpu->exit_ctx.VpContext.Rflags);
         }

         whpx_vcpu_post_run(cpu);

         switch (vcpu->exit_ctx.ExitReason) {
         case WHvRunVpExitReasonMemoryAccess:
             ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
             break;

         case WHvRunVpExitReasonX64IoPortAccess:
             ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
             break;

         case WHvRunVpExitReasonX64InterruptWindow:
             vcpu->ready_for_pic_interrupt = 1;
             vcpu->window_registered = 0;
             ret = 0;
             break;

         case WHvRunVpExitReasonX64ApicEoi:
             assert(whpx_apic_in_platform());
             ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
             break;

         case WHvRunVpExitReasonX64Halt:
             /*
              * WARNING: as of build 19043.1526 (21H1), this exit reason is no
              * longer used.
              */
             ret = whpx_handle_halt(cpu);
             break;

         case WHvRunVpExitReasonX64ApicInitSipiTrap: {
             WHV_INTERRUPT_CONTROL ipi = {0};
             uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr;
             uint32_t delivery_mode =
                 (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT;
             int dest_shorthand =
                 (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT;
             bool broadcast = false;
             bool include_self = false;
             uint32_t i;

             /* We only registered for INIT and SIPI exits. */
             if ((delivery_mode != APIC_DM_INIT) &&
                 (delivery_mode != APIC_DM_SIPI)) {
                 error_report(
                     "WHPX: Unexpected APIC exit that is not a INIT or SIPI");
                 break;
             }

             if (delivery_mode == APIC_DM_INIT) {
                 ipi.Type = WHvX64InterruptTypeInit;
             } else {
                 ipi.Type = WHvX64InterruptTypeSipi;
             }

             ipi.DestinationMode =
                 ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ?
                     WHvX64InterruptDestinationModeLogical :
                     WHvX64InterruptDestinationModePhysical;

             ipi.TriggerMode =
                 ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ?
                     WHvX64InterruptTriggerModeLevel :
                     WHvX64InterruptTriggerModeEdge;

             ipi.Vector = icr & APIC_VECTOR_MASK;
             switch (dest_shorthand) {
             /* no shorthand. Bits 56-63 contain the destination. */
             case 0:
                 ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK;
                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
                         &ipi, sizeof(ipi));
                 if (FAILED(hr)) {
                     error_report("WHPX: Failed to request interrupt  hr=%08lx",
                         hr);
                 }

                 break;

             /* self */
             case 1:
                 include_self = true;
                 break;

             /* broadcast, including self */
             case 2:
                 broadcast = true;
                 include_self = true;
                 break;

             /* broadcast, excluding self */
             case 3:
                 broadcast = true;
                 break;
             }

             if (!broadcast && !include_self) {
                 break;
             }

             for (i = 0; i <= max_vcpu_index; i++) {
                 if (i == cpu->cpu_index && !include_self) {
                     continue;
                 }

                 /*
                  * Assuming that APIC Ids are identity mapped since
                  * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers
                  * are not handled yet and the hypervisor doesn't allow the
                  * guest to modify the APIC ID.
                  */
                 ipi.Destination = i;
                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
                         &ipi, sizeof(ipi));
                 if (FAILED(hr)) {
                     error_report(
                         "WHPX: Failed to request SIPI for %d,  hr=%08lx",
                         i, hr);
                 }
             }

             break;
         }

         case WHvRunVpExitReasonCanceled:
             if (exclusive_step_mode != WHPX_STEP_NONE) {
                 /*
                  * We are trying to step over a single instruction, and
                  * likely got a request to stop from another thread.
                  * Delay it until we are done stepping
                  * over.
                  */
                 ret = 0;
             } else {
                 cpu->exception_index = EXCP_INTERRUPT;
                 ret = 1;
             }
             break;
         case WHvRunVpExitReasonX64MsrAccess: {
             WHV_REGISTER_VALUE reg_values[3] = {0};
             WHV_REGISTER_NAME reg_names[3];
             UINT32 reg_count;

             reg_names[0] = WHvX64RegisterRip;
             reg_names[1] = WHvX64RegisterRax;
             reg_names[2] = WHvX64RegisterRdx;

             reg_values[0].Reg64 =
                 vcpu->exit_ctx.VpContext.Rip +
                 vcpu->exit_ctx.VpContext.InstructionLength;

             /*
              * For all unsupported MSR access we:
              *     ignore writes
              *     return 0 on read.
              */
             reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
                         1 : 3;

             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                 whpx->partition,
                 cpu->cpu_index,
                 reg_names, reg_count,
                 reg_values);

             if (FAILED(hr)) {
                 error_report("WHPX: Failed to set MsrAccess state "
                              " registers, hr=%08lx", hr);
             }
             ret = 0;
             break;
         }
         case WHvRunVpExitReasonX64Cpuid: {
             WHV_REGISTER_VALUE reg_values[5];
             WHV_REGISTER_NAME reg_names[5];
             UINT32 reg_count = 5;
             UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
             X86CPU *x86_cpu = X86_CPU(cpu);
             CPUX86State *env = &x86_cpu->env;

             memset(reg_values, 0, sizeof(reg_values));

             rip = vcpu->exit_ctx.VpContext.Rip +
                   vcpu->exit_ctx.VpContext.InstructionLength;
             cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;

             /*
              * Ideally, these should be supplied to the hypervisor during VCPU
              * initialization and it should be able to satisfy this request.
              * But, currently, WHPX doesn't support setting CPUID values in the
              * hypervisor once the partition has been setup, which is too late
              * since VCPUs are realized later. For now, use the values from
              * QEMU to satisfy these requests, until WHPX adds support for
              * being able to set these values in the hypervisor at runtime.
              */
             cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
                 (UINT32 *)&rcx, (UINT32 *)&rdx);
             switch (cpuid_fn) {
             case 0x40000000:
                 /* Expose the vmware cpu frequency cpuid leaf */
                 rax = 0x40000010;
                 rbx = rcx = rdx = 0;
                 break;

             case 0x40000010:
                 rax = env->tsc_khz;
                 rbx = env->apic_bus_freq / 1000; /* Hz to KHz */
                 rcx = rdx = 0;
                 break;

             case 0x80000001:
                 /* Remove any support of OSVW */
                 rcx &= ~CPUID_EXT3_OSVW;
                 break;
             }

             reg_names[0] = WHvX64RegisterRip;
             reg_names[1] = WHvX64RegisterRax;
             reg_names[2] = WHvX64RegisterRcx;
             reg_names[3] = WHvX64RegisterRdx;
             reg_names[4] = WHvX64RegisterRbx;

             reg_values[0].Reg64 = rip;
             reg_values[1].Reg64 = rax;
             reg_values[2].Reg64 = rcx;
             reg_values[3].Reg64 = rdx;
             reg_values[4].Reg64 = rbx;

             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                 whpx->partition, cpu->cpu_index,
                 reg_names,
                 reg_count,
                 reg_values);

             if (FAILED(hr)) {
                 error_report("WHPX: Failed to set CpuidAccess state registers,"
                              " hr=%08lx", hr);
             }
             ret = 0;
             break;
         }
         case WHvRunVpExitReasonException:
             whpx_get_registers(cpu);

             if ((vcpu->exit_ctx.VpException.ExceptionType ==
                  WHvX64ExceptionTypeDebugTrapOrFault) &&
                 (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
                 (vcpu->exit_ctx.VpException.InstructionBytes[0] ==
                  whpx_breakpoint_instruction)) {
                 /* Stopped at a software breakpoint. */
                 cpu->exception_index = EXCP_DEBUG;
             } else if ((vcpu->exit_ctx.VpException.ExceptionType ==
                         WHvX64ExceptionTypeDebugTrapOrFault) &&
                        !cpu->singlestep_enabled) {
                 /*
                  * Just finished stepping over a breakpoint, but the
                  * gdb does not expect us to do single-stepping.
                  * Don't do anything special.
                  */
                 cpu->exception_index = EXCP_INTERRUPT;
             } else {
                 /* Another exception or debug event. Report it to GDB. */
                 cpu->exception_index = EXCP_DEBUG;
             }

             ret = 1;
             break;
         case WHvRunVpExitReasonNone:
         case WHvRunVpExitReasonUnrecoverableException:
         case WHvRunVpExitReasonInvalidVpRegisterValue:
         case WHvRunVpExitReasonUnsupportedFeature:
         default:
             error_report("WHPX: Unexpected VP exit code %d",
                          vcpu->exit_ctx.ExitReason);
             whpx_get_registers(cpu);
             bql_lock();
             qemu_system_guest_panicked(cpu_get_crash_info(cpu));
             bql_unlock();
             break;
         }

     } while (!ret);

     if (stepped_over_bp) {
         /* Restore the breakpoint we stepped over */
         cpu_memory_rw_debug(cpu,
             stepped_over_bp->address,
             (void *)&whpx_breakpoint_instruction,
             1,
             true);
     }

     if (exclusive_step_mode != WHPX_STEP_NONE) {
         g_assert(cpu_in_exclusive_context(cpu));
         cpu->running = false;
         end_exclusive();

         exclusive_step_mode = WHPX_STEP_NONE;
     } else {
         cpu_exec_end(cpu);
     }

     bql_lock();
     current_cpu = cpu;

     if (--whpx->running_cpus == 0) {
         whpx_last_vcpu_stopping(cpu);
     }

     qatomic_set(&cpu->exit_request, false);

     return ret < 0;
 }

 /*
  * run_on_cpu() worker for whpx_cpu_synchronize_state().
  *
  * If the accelerator state is not already flagged dirty, calls
  * whpx_get_registers() for @cpu and then sets the dirty flag.
  * @arg is unused.
  */
 static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
 {
     if (cpu->accel->dirty) {
         /* Already flagged dirty: nothing to fetch. */
         return;
     }

     whpx_get_registers(cpu);
     cpu->accel->dirty = true;
 }

 /*
  * run_on_cpu() worker for whpx_cpu_synchronize_post_reset().
  *
  * Calls whpx_set_registers() at the WHPX_SET_RESET_STATE level and then
  * clears the dirty flag. @arg is unused.
  */
 static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
                                                run_on_cpu_data arg)
 {
     whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
     cpu->accel->dirty = false;
 }

 /*
  * run_on_cpu() worker for whpx_cpu_synchronize_post_init().
  *
  * Calls whpx_set_registers() at the WHPX_SET_FULL_STATE level and then
  * clears the dirty flag. @arg is unused.
  */
 static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
                                               run_on_cpu_data arg)
 {
     whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
     cpu->accel->dirty = false;
 }

 /*
  * run_on_cpu() worker for whpx_cpu_synchronize_pre_loadvm().
  *
  * Only sets the dirty flag; no registers are read or written here.
  * @arg is unused.
  */
 static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
                                                run_on_cpu_data arg)
 {
     cpu->accel->dirty = true;
 }

 /*
  * CPU support.
  */

 /*
  * Schedule do_whpx_cpu_synchronize_state() on @cpu via run_on_cpu(),
  * unless the accelerator state is already flagged dirty.
  */
 void whpx_cpu_synchronize_state(CPUState *cpu)
 {
     if (cpu->accel->dirty) {
         return;
     }

     run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL);
 }

 /* Run do_whpx_cpu_synchronize_post_reset() on @cpu via run_on_cpu(). */
 void whpx_cpu_synchronize_post_reset(CPUState *cpu)
 {
     run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
 }

 /* Run do_whpx_cpu_synchronize_post_init() on @cpu via run_on_cpu(). */
 void whpx_cpu_synchronize_post_init(CPUState *cpu)
 {
     run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
 }

 /* Run do_whpx_cpu_synchronize_pre_loadvm() on @cpu via run_on_cpu(). */
 void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu)
 {
     run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
 }

 /*
  * Record @step_pending in the global WHPX state (whpx_global.step_pending).
  * Unlike the other synchronize hooks this is not per-CPU and does not go
  * through run_on_cpu().
  */
 void whpx_cpu_synchronize_pre_resume(bool step_pending)
 {
     whpx_global.step_pending = step_pending;
 }

 /*
  * Vcpu support.
  */

 /*
  * Migration blocker shared by all vCPUs. whpx_init_vcpu() creates and
  * registers it the first time it finds this pointer NULL.
  */
 static Error *whpx_migration_blocker;

 /*
  * VM change-state handler (registered by whpx_init_vcpu()).
  *
  * @opaque is the CPUX86State of the vCPU. When the VM transitions to
  * running, the cached TSC is invalidated by clearing env->tsc_valid;
  * nothing happens on other transitions. @state is unused.
  */
 static void whpx_cpu_update_state(void *opaque, bool running, RunState state)
 {
     CPUX86State *cpu_env = opaque;

     if (!running) {
         return;
     }

     cpu_env->tsc_valid = false;
 }

 /*
  * Create and initialise the WHPX side of a vCPU.
  *
  * Steps, in order:
  *   - register the shared migration blocker (first call only);
  *   - allocate the AccelCPUState and create its instruction emulator;
  *   - create the hypervisor virtual processor for cpu->cpu_index;
  *   - record the TSC and APIC bus frequencies in the CPU env;
  *   - optionally extend the partition CPUID exit list with the vmware
  *     frequency leaves;
  *   - publish the new state through cpu->accel and register the
  *     VM change-state handler.
  *
  * Returns 0 on success and -EINVAL on failure. On failure the virtual
  * processor, the emulator and the AccelCPUState created here are released
  * and cpu->accel is left untouched.
  */
 int whpx_init_vcpu(CPUState *cpu)
 {
     HRESULT hr;
     struct whpx_state *whpx = &whpx_global;
     AccelCPUState *vcpu = NULL;
     Error *local_error = NULL;
     X86CPU *x86_cpu = X86_CPU(cpu);
     CPUX86State *env = &x86_cpu->env;
     UINT64 freq = 0;
     int ret;

     /*
      * Add migration blockers for all unsupported features of the
      * Windows Hypervisor Platform
      */
     if (whpx_migration_blocker == NULL) {
         error_setg(&whpx_migration_blocker,
                "State blocked due to non-migratable CPUID feature support,"
                " dirty memory tracking support, and XSAVE/XRSTOR support");

         if (migrate_add_blocker(&whpx_migration_blocker, &local_error) < 0) {
             error_report_err(local_error);
             ret = -EINVAL;
             goto error;
         }
     }

     vcpu = g_new0(AccelCPUState, 1);

     hr = whp_dispatch.WHvEmulatorCreateEmulator(
         &whpx_emu_callbacks,
         &vcpu->emulator);
     if (FAILED(hr)) {
         error_report("WHPX: Failed to setup instruction completion support,"
                      " hr=%08lx", hr);
         ret = -EINVAL;
         goto error;
     }

     hr = whp_dispatch.WHvCreateVirtualProcessor(
         whpx->partition, cpu->cpu_index, 0);
     if (FAILED(hr)) {
         error_report("WHPX: Failed to create a virtual processor,"
                      " hr=%08lx", hr);
         ret = -EINVAL;
         goto error_emulator;
     }

     /*
      * vcpu's TSC frequency is either specified by user, or use the value
      * provided by Hyper-V if the former is not present. In the latter case, we
      * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC
      * frequency can be migrated later via this field.
      */
     if (!env->tsc_khz) {
         hr = whp_dispatch.WHvGetCapability(
             WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq),
                 NULL);
         /* An unknown capability is silently tolerated; tsc_khz stays 0. */
         if (hr != WHV_E_UNKNOWN_CAPABILITY) {
             if (FAILED(hr)) {
                 printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr);
             } else {
                 env->tsc_khz = freq / 1000; /* Hz to KHz */
             }
         }
     }

     /* Start from the fixed default, then prefer the value WHPX reports. */
     env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY;
     hr = whp_dispatch.WHvGetCapability(
         WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL);
     if (hr != WHV_E_UNKNOWN_CAPABILITY) {
         if (FAILED(hr)) {
             printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr);
         } else {
             env->apic_bus_freq = freq;
         }
     }

     /*
      * If the vmware cpuid frequency leaf option is set, and we have a valid
      * tsc value, trap the corresponding cpuid's.
      */
     if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) {
         UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010};

         hr = whp_dispatch.WHvSetPartitionProperty(
                 whpx->partition,
                 WHvPartitionPropertyCodeCpuidExitList,
                 cpuidExitList,
                 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));

         if (FAILED(hr)) {
             error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
                         hr);
             ret = -EINVAL;
             /* The VP and the emulator already exist: tear both down. */
             goto error_vp;
         }
     }

     /*
      * Initialise the new state through the local pointer: cpu->accel does
      * not refer to it until the assignment below.
      */
     vcpu->interruptable = true;
     vcpu->dirty = true;
     cpu->accel = vcpu;
     max_vcpu_index = max(max_vcpu_index, cpu->cpu_index);
     qemu_add_vm_change_state_handler(whpx_cpu_update_state, env);

     return 0;

     /* Unwind in reverse order of creation. */
 error_vp:
     whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index);
 error_emulator:
     whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
 error:
     g_free(vcpu);

     return ret;
 }

 /*
  * Run @cpu until an exception index of at least EXCP_INTERRUPT is pending.
  *
  * whpx_vcpu_run() is called repeatedly while cpu->exception_index is below
  * EXCP_INTERRUPT; a non-zero (fatal) result from it aborts the process.
  * The pending exception index is returned and cpu->exception_index is
  * reset to -1.
  */
 int whpx_vcpu_exec(CPUState *cpu)
 {
     int pending;

     while (cpu->exception_index < EXCP_INTERRUPT) {
         if (whpx_vcpu_run(cpu)) {
             error_report("WHPX: Failed to exec a virtual processor");
             abort();
         }
     }

     pending = cpu->exception_index;
     cpu->exception_index = -1;

     return pending;
 }

 /*
  * Tear down the WHPX state of @cpu: delete the hypervisor virtual
  * processor, destroy the instruction emulator and free the AccelCPUState.
  * cpu->accel itself is not reset here.
  */
 void whpx_destroy_vcpu(CPUState *cpu)
 {
     AccelCPUState *accel_state = cpu->accel;

     whp_dispatch.WHvDeleteVirtualProcessor(whpx_global.partition,
                                            cpu->cpu_index);
     whp_dispatch.WHvEmulatorDestroyEmulator(accel_state->emulator);
     g_free(accel_state);
 }

 /*
  * Ask the hypervisor to cancel the current run of @cpu's virtual
  * processor (WHvCancelRunVirtualProcessor). The result is ignored.
  */
 void whpx_vcpu_kick(CPUState *cpu)
 {
     whp_dispatch.WHvCancelRunVirtualProcessor(whpx_global.partition,
                                               cpu->cpu_index,
                                               0);
 }

 /*
  * Memory support.
  */

 /*
  * Map or unmap a guest-physical range in the WHPX partition.
  *
  * @start_pa: guest-physical start address of the range
  * @size:     length of the range in bytes
  * @host_va:  host address backing the range (only used when mapping)
  * @add:      non-zero to map, zero to unmap
  * @rom:      non-zero to map without write permission
  * @name:     region name, used in the failure message only
  *
  * Mappings are always readable and executable. Failures are reported with
  * error_report() and otherwise ignored.
  */
 static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size,
                                 void *host_va, int add, int rom,
                                 const char *name)
 {
     HRESULT hr;

     if (!add) {
         hr = whp_dispatch.WHvUnmapGpaRange(whpx_global.partition,
                                            start_pa,
                                            size);
     } else {
         int access = WHvMapGpaRangeFlagRead | WHvMapGpaRangeFlagExecute;

         if (!rom) {
             access |= WHvMapGpaRangeFlagWrite;
         }

         hr = whp_dispatch.WHvMapGpaRange(whpx_global.partition,
                                          host_va,
                                          start_pa,
                                          size,
                                          access);
     }

     if (!FAILED(hr)) {
         return;
     }

     error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
                  " Host:%p, hr=%08lx",
                  (add ? "MAP" : "UNMAP"), name,
                  (void *)(uintptr_t)start_pa, (void *)size, host_va, hr);
 }

 /*
  * Map (@add != 0) or unmap (@add == 0) the RAM behind @section.
  *
  * Non-RAM sections are ignored. The range is shrunk to whole host pages:
  * the start is rounded up to the next host page boundary and the length
  * rounded down to a multiple of the host page size. Sections that become
  * empty after that are ignored as well.
  */
 static void whpx_process_section(MemoryRegionSection *section, int add)
 {
     MemoryRegion *region = section->mr;
     hwaddr gpa = section->offset_within_address_space;
     ram_addr_t len = int128_get64(section->size);
     unsigned int skip;
     uint64_t hva;

     if (!memory_region_is_ram(region)) {
         return;
     }

     /* Bytes to skip so that the start lands on a host page boundary. */
     skip = qemu_real_host_page_size() - (gpa & ~qemu_real_host_page_mask());
     skip &= ~qemu_real_host_page_mask();
     if (skip > len) {
         return;
     }

     gpa += skip;
     len -= skip;
     /* Drop the partial page at the end, if any. */
     len &= qemu_real_host_page_mask();
     if (len == 0 || (gpa & ~qemu_real_host_page_mask()) != 0) {
         return;
     }

     hva = (uintptr_t)memory_region_get_ram_ptr(region)
         + section->offset_within_region + skip;

     whpx_update_mapping(gpa, len, (void *)(uintptr_t)hva, add,
                         memory_region_is_rom(region), region->name);
 }

 /*
  * MemoryListener .region_add callback: take a reference on the section's
  * memory region, then map the section (whpx_process_section() with add=1).
  * @listener is unused.
  */
 static void whpx_region_add(MemoryListener *listener,
                            MemoryRegionSection *section)
 {
     memory_region_ref(section->mr);
     whpx_process_section(section, 1);
 }

 /*
  * MemoryListener .region_del callback: unmap the section
  * (whpx_process_section() with add=0), then drop the reference taken in
  * whpx_region_add(). @listener is unused.
  */
 static void whpx_region_del(MemoryListener *listener,
                            MemoryRegionSection *section)
 {
     whpx_process_section(section, 0);
     memory_region_unref(section->mr);
 }

 /* MemoryListener .begin callback: intentionally does nothing. */
 static void whpx_transaction_begin(MemoryListener *listener)
 {
 }

 /* MemoryListener .commit callback: intentionally does nothing. */
 static void whpx_transaction_commit(MemoryListener *listener)
 {
 }

 /*
  * MemoryListener .log_sync callback.
  *
  * For RAM sections, marks section->size bytes of the region dirty,
  * starting at region offset 0. Non-RAM sections are ignored.
  * @listener is unused.
  */
 static void whpx_log_sync(MemoryListener *listener,
                           MemoryRegionSection *section)
 {
     MemoryRegion *region = section->mr;

     if (memory_region_is_ram(region)) {
         memory_region_set_dirty(region, 0, int128_get64(section->size));
     }
 }

 /*
  * Memory listener that mirrors RAM sections into the WHPX partition.
  * Registered on address_space_memory by whpx_memory_init().
  */
 static MemoryListener whpx_memory_listener = {
     .name = "whpx",
     .begin = whpx_transaction_begin,    /* no-op */
     .commit = whpx_transaction_commit,  /* no-op */
     .region_add = whpx_region_add,
     .region_del = whpx_region_del,
     .log_sync = whpx_log_sync,
     .priority = MEMORY_LISTENER_PRIORITY_ACCEL,
 };

 /* Register whpx_memory_listener on the system memory address space. */
 static void whpx_memory_init(void)
 {
     memory_listener_register(&whpx_memory_listener, &address_space_memory);
 }

 /*
  * Load the functions from the given library, using the given handle. If a
  * handle is provided, it is used, otherwise the library is opened. The
  * handle will be updated on return with the opened one.
  */
 /*
  * @handle:        in/out library handle; NULL on entry means "open the
  *                 library that belongs to @function_list"
  * @function_list: which group of entry points to resolve into whp_dispatch
  *
  * Returns true on success, with *handle holding the library handle.
  * Returns false if the library cannot be opened or a mandatory function is
  * missing; in that case the library is released and *handle is reset to
  * NULL so the caller is never left with a stale handle. Entries of
  * whp_dispatch resolved before the failure are not cleared.
  */
 static bool load_whp_dispatch_fns(HMODULE *handle,
     WHPFunctionList function_list)
 {
     HMODULE hLib = *handle;

     #define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
     #define WINHV_EMULATION_DLL "WinHvEmulation.dll"
     #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
         whp_dispatch.function_name = \
             (function_name ## _t)GetProcAddress(hLib, #function_name); \

     #define WHP_LOAD_FIELD(return_type, function_name, signature) \
         whp_dispatch.function_name = \
             (function_name ## _t)GetProcAddress(hLib, #function_name); \
         if (!whp_dispatch.function_name) { \
             error_report("Could not load function %s", #function_name); \
             goto error; \
         } \

     #define WHP_LOAD_LIB(lib_name, handle_lib) \
     if (!handle_lib) { \
         handle_lib = LoadLibrary(lib_name); \
         if (!handle_lib) { \
             error_report("Could not load library %s.", lib_name); \
             goto error; \
         } \
     } \

     switch (function_list) {
     case WINHV_PLATFORM_FNS_DEFAULT:
         WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
         LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD)
         break;

     case WINHV_EMULATION_FNS_DEFAULT:
         WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib)
         LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD)
         break;

     case WINHV_PLATFORM_FNS_SUPPLEMENTAL:
         /* Optional entry points: a missing one is left NULL, not an error. */
         WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
         LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL)
         break;
     }

     *handle = hLib;
     return true;

 error:
     if (hLib) {
         FreeLibrary(hLib);
     }
     /*
      * The library (whether opened here or passed in) has been released;
      * make sure the caller cannot reuse or free it again.
      */
     *handle = NULL;

     return false;
 }

 /*
  * Setter for the accelerator's "kernel-irqchip" property.
  *
  * Parses an OnOffSplit value from @v and updates the global WHPX state:
  *   on    -> kernel irqchip allowed and required
  *   off   -> kernel irqchip neither allowed nor required
  *   split -> rejected with an error set in @errp
  *
  * If the visitor fails, @errp is set by the visitor and nothing changes.
  * @obj and @opaque are unused.
  */
 static void whpx_set_kernel_irqchip(Object *obj, Visitor *v,
                                    const char *name, void *opaque,
                                    Error **errp)
 {
     struct whpx_state *whpx = &whpx_global;
     OnOffSplit mode;

     if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
         return;
     }

     switch (mode) {
     case ON_OFF_SPLIT_ON:
         whpx->kernel_irqchip_allowed = true;
         whpx->kernel_irqchip_required = true;
         break;

     case ON_OFF_SPLIT_OFF:
         whpx->kernel_irqchip_allowed = false;
         whpx->kernel_irqchip_required = false;
         break;

     case ON_OFF_SPLIT_SPLIT:
         error_setg(errp, "WHPX: split irqchip currently not supported");
         /* Hints are printed verbatim, so they must end with a newline. */
         error_append_hint(errp,
             "Try without kernel-irqchip or with kernel-irqchip=on|off\n");
         break;

     default:
         /*
          * The value was checked in visit_type_OnOffSplit() above. If
          * we get here, then something is wrong in QEMU.
          */
         abort();
     }
 }

 /*
  * Partition support
  */

 /*
  * Accelerator init_machine hook: create and configure the WHPX partition.
  *
  * Order of operations:
  *   1. load the WHP entry points (init_whp_dispatch());
  *   2. check that a hypervisor is present and query its feature set;
  *   3. create the partition and query its XSAVE capability (non-fatal);
  *   4. set the processor count from ms->smp.cpus;
  *   5. enable in-hypervisor APIC emulation when allowed and available;
  *   6. enable MSR, CPUID and exception exits and set the CPUID exit list;
  *   7. clear the exception exit bitmap;
  *   8. finalise the partition with WHvSetupPartition() and register the
  *      memory listener.
  *
  * Returns 0 on success. On failure returns -ENOSYS (dispatch table could
  * not be loaded), -ENOSPC (no hypervisor present) or -EINVAL (any other
  * step failed) and deletes the partition if one was created.
  */
 static int whpx_accel_init(MachineState *ms)
 {
     struct whpx_state *whpx;
     int ret;
     HRESULT hr;
     WHV_CAPABILITY whpx_cap;
     UINT32 whpx_cap_size;
     WHV_PARTITION_PROPERTY prop;
     UINT32 cpuidExitList[] = {1, 0x80000001};
     WHV_CAPABILITY_FEATURES features = {0};

     whpx = &whpx_global;

     if (!init_whp_dispatch()) {
         ret = -ENOSYS;
         goto error;
     }

     whpx->mem_quota = ms->ram_size;

     hr = whp_dispatch.WHvGetCapability(
         WHvCapabilityCodeHypervisorPresent, &whpx_cap,
         sizeof(whpx_cap), &whpx_cap_size);
     /* whpx_cap is only read when the query itself succeeded. */
     if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
         error_report("WHPX: No accelerator found, hr=%08lx", hr);
         ret = -ENOSPC;
         goto error;
     }

     hr = whp_dispatch.WHvGetCapability(
         WHvCapabilityCodeFeatures, &features, sizeof(features), NULL);
     if (FAILED(hr)) {
         error_report("WHPX: Failed to query capabilities, hr=%08lx", hr);
         ret = -EINVAL;
         goto error;
     }

     hr = whp_dispatch.WHvCreatePartition(&whpx->partition);
     if (FAILED(hr)) {
         error_report("WHPX: Failed to create partition, hr=%08lx", hr);
         ret = -EINVAL;
         goto error;
     }

     /*
      * Query the XSAVE capability of the partition. Any error here is not
      * considered fatal.
      */
     hr = whp_dispatch.WHvGetPartitionProperty(
         whpx->partition,
         WHvPartitionPropertyCodeProcessorXsaveFeatures,
         &whpx_xsave_cap,
         sizeof(whpx_xsave_cap),
         &whpx_cap_size);

     /*
      * Windows version which don't support this property will return with the
      * specific error code.
      */
     if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) {
         error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr);
     }

     if (!whpx_has_xsave()) {
         printf("WHPX: Partition is not XSAVE capable\n");
     }

     /* prop is reused for several properties; zero it before each use. */
     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
     prop.ProcessorCount = ms->smp.cpus;
     hr = whp_dispatch.WHvSetPartitionProperty(
         whpx->partition,
         WHvPartitionPropertyCodeProcessorCount,
         &prop,
         sizeof(WHV_PARTITION_PROPERTY));

     if (FAILED(hr)) {
         error_report("WHPX: Failed to set partition processor count to %u,"
                      " hr=%08lx", prop.ProcessorCount, hr);
         ret = -EINVAL;
         goto error;
     }

     /*
      * Error out if WHP doesn't support apic emulation and user is requiring
      * it.
      */
     if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation ||
             !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) {
         error_report("WHPX: kernel irqchip requested, but unavailable. "
             "Try without kernel-irqchip or with kernel-irqchip=off");
         ret = -EINVAL;
         goto error;
     }

     /*
      * Needs both the hypervisor feature and the optional
      * WHvSetVirtualProcessorInterruptControllerState2 entry point.
      */
     if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
         whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
         WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
             WHvX64LocalApicEmulationModeXApic;
         printf("WHPX: setting APIC emulation mode in the hypervisor\n");
         hr = whp_dispatch.WHvSetPartitionProperty(
             whpx->partition,
             WHvPartitionPropertyCodeLocalApicEmulationMode,
             &mode,
             sizeof(mode));
         if (FAILED(hr)) {
             error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr);
             /* Only fatal when the user explicitly asked for it. */
             if (whpx->kernel_irqchip_required) {
                 error_report("WHPX: kernel irqchip requested, but unavailable");
                 ret = -EINVAL;
                 goto error;
             }
         } else {
             whpx->apic_in_platform = true;
         }
     }

     /* Register for MSR and CPUID exits */
     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
     prop.ExtendedVmExits.X64MsrExit = 1;
     prop.ExtendedVmExits.X64CpuidExit = 1;
     prop.ExtendedVmExits.ExceptionExit = 1;
     if (whpx_apic_in_platform()) {
         prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1;
     }

     hr = whp_dispatch.WHvSetPartitionProperty(
             whpx->partition,
             WHvPartitionPropertyCodeExtendedVmExits,
             &prop,
             sizeof(WHV_PARTITION_PROPERTY));
     if (FAILED(hr)) {
         error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr);
         ret = -EINVAL;
         goto error;
     }

     /* CPUID leaves 1 and 0x80000001 always exit to QEMU. */
     hr = whp_dispatch.WHvSetPartitionProperty(
         whpx->partition,
         WHvPartitionPropertyCodeCpuidExitList,
         cpuidExitList,
         RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));

     if (FAILED(hr)) {
         error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
                      hr);
         ret = -EINVAL;
         goto error;
     }

     /*
      * We do not want to intercept any exceptions from the guest,
      * until we actually start debugging with gdb.
      */
     /*
      * NOTE(review): the cached bitmap is set to -1 first, presumably so that
      * the call below does not see 0 as "unchanged" - confirm against
      * whpx_set_exception_exit_bitmap().
      */
     whpx->exception_exit_bitmap = -1;
     hr = whpx_set_exception_exit_bitmap(0);

     if (FAILED(hr)) {
         error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr);
         ret = -EINVAL;
         goto error;
     }

     hr = whp_dispatch.WHvSetupPartition(whpx->partition);
     if (FAILED(hr)) {
         error_report("WHPX: Failed to setup partition, hr=%08lx", hr);
         ret = -EINVAL;
         goto error;
     }

     whpx_memory_init();

     printf("Windows Hypervisor Platform accelerator is operational\n");
     return 0;

 error:

     /* Partition may not exist yet if an early step failed. */
     if (NULL != whpx->partition) {
         whp_dispatch.WHvDeletePartition(whpx->partition);
         whpx->partition = NULL;
     }

     return ret;
 }

 /* Return the value of the whpx_allowed flag. */
 int whpx_enabled(void)
 {
     return whpx_allowed;
 }

 /*
  * Report whether APIC emulation was enabled in the hypervisor
  * (whpx_global.apic_in_platform, set by whpx_accel_init()).
  */
 bool whpx_apic_in_platform(void)
 {
     return whpx_global.apic_in_platform;
 }

 /*
  * Class initialiser for the WHPX accelerator type.
  *
  * Fills in the AccelClass name, init_machine hook and "allowed" flag, and
  * adds the write-only "kernel-irqchip" class property (no getter is
  * installed). @data is unused.
  */
 static void whpx_accel_class_init(ObjectClass *oc, void *data)
 {
     AccelClass *accel_class = ACCEL_CLASS(oc);

     accel_class->name = "WHPX";
     accel_class->init_machine = whpx_accel_init;
     accel_class->allowed = &whpx_allowed;

     object_class_property_add(oc, "kernel-irqchip", "on|off|split",
                               NULL, whpx_set_kernel_irqchip,
                               NULL, NULL);
     object_class_property_set_description(oc, "kernel-irqchip",
                                           "Configure WHPX in-kernel irqchip");
 }

 /*
  * Instance initialiser: reset the global WHPX state to all-zero and then
  * allow the kernel irqchip by default. @obj is unused.
  */
 static void whpx_accel_instance_init(Object *obj)
 {
     memset(&whpx_global, 0, sizeof(whpx_global));

     /* Turn on kernel-irqchip, by default */
     whpx_global.kernel_irqchip_allowed = true;
 }

 /* QOM type description of the WHPX accelerator, derived from TYPE_ACCEL. */
 static const TypeInfo whpx_accel_type = {
     .name = ACCEL_CLASS_NAME("whpx"),
     .parent = TYPE_ACCEL,
     .instance_init = whpx_accel_instance_init,
     .class_init = whpx_accel_class_init,
 };

 /* Register the WHPX accelerator type (whpx_accel_type). */
 static void whpx_type_init(void)
 {
     type_register_static(&whpx_accel_type);
 }

 /*
  * Populate whp_dispatch with the WinHvPlatform / WinHvEmulation entry
  * points. Safe to call repeatedly: once it has succeeded, later calls
  * return true immediately.
  *
  * Returns true on success. Returns false if a library or one of its
  * mandatory functions cannot be loaded; any library handle still held is
  * then released and reset to NULL so that a later call starts from scratch.
  */
 bool init_whp_dispatch(void)
 {
     bool supplemental_ok;

     if (whp_dispatch_initialized) {
         return true;
     }

     if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) {
         goto error;
     }

     if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) {
         goto error;
     }

     /*
      * The call has side effects, so it must not live inside assert():
      * it would disappear entirely in a build with assertions disabled.
      */
     supplemental_ok = load_whp_dispatch_fns(&hWinHvPlatform,
                                             WINHV_PLATFORM_FNS_SUPPLEMENTAL);
     assert(supplemental_ok);
     (void)supplemental_ok;
     whp_dispatch_initialized = true;

     return true;
 error:
     if (hWinHvPlatform) {
         FreeLibrary(hWinHvPlatform);
         /* Do not leave a freed module handle behind for the next attempt. */
         hWinHvPlatform = NULL;
     }

     if (hWinHvEmulation) {
         FreeLibrary(hWinHvEmulation);
         hWinHvEmulation = NULL;
     }

     return false;
 }

 /* Hook whpx_type_init() into QEMU's type_init() registration. */
 type_init(whpx_type_init);