| /* |
| * QEMU Windows Hypervisor Platform accelerator (WHPX) |
| * |
| * Copyright Microsoft Corp. 2017 |
| * |
| * This work is licensed under the terms of the GNU GPL, version 2 or later. |
| * See the COPYING file in the top-level directory. |
| * |
| */ |
| |
| #include "qemu/osdep.h" |
| #include "cpu.h" |
| #include "exec/address-spaces.h" |
| #include "exec/ioport.h" |
| #include "gdbstub/helpers.h" |
| #include "qemu/accel.h" |
| #include "sysemu/whpx.h" |
| #include "sysemu/cpus.h" |
| #include "sysemu/runstate.h" |
| #include "qemu/main-loop.h" |
| #include "hw/boards.h" |
| #include "hw/intc/ioapic.h" |
| #include "hw/i386/apic_internal.h" |
| #include "qemu/error-report.h" |
| #include "qapi/error.h" |
| #include "qapi/qapi-types-common.h" |
| #include "qapi/qapi-visit-common.h" |
| #include "migration/blocker.h" |
| #include <winerror.h> |
| |
| #include "whpx-internal.h" |
| #include "whpx-accel-ops.h" |
| |
| #include <WinHvPlatform.h> |
| #include <WinHvEmulation.h> |
| |
/* APIC bus frequency constant: 200,000,000 (presumably Hz, i.e. 200 MHz). */
#define HYPERV_APIC_BUS_FREQUENCY (200ULL * 1000 * 1000)
| |
| static const WHV_REGISTER_NAME whpx_register_names[] = { |
| |
| /* X64 General purpose registers */ |
| WHvX64RegisterRax, |
| WHvX64RegisterRcx, |
| WHvX64RegisterRdx, |
| WHvX64RegisterRbx, |
| WHvX64RegisterRsp, |
| WHvX64RegisterRbp, |
| WHvX64RegisterRsi, |
| WHvX64RegisterRdi, |
| WHvX64RegisterR8, |
| WHvX64RegisterR9, |
| WHvX64RegisterR10, |
| WHvX64RegisterR11, |
| WHvX64RegisterR12, |
| WHvX64RegisterR13, |
| WHvX64RegisterR14, |
| WHvX64RegisterR15, |
| WHvX64RegisterRip, |
| WHvX64RegisterRflags, |
| |
| /* X64 Segment registers */ |
| WHvX64RegisterEs, |
| WHvX64RegisterCs, |
| WHvX64RegisterSs, |
| WHvX64RegisterDs, |
| WHvX64RegisterFs, |
| WHvX64RegisterGs, |
| WHvX64RegisterLdtr, |
| WHvX64RegisterTr, |
| |
| /* X64 Table registers */ |
| WHvX64RegisterIdtr, |
| WHvX64RegisterGdtr, |
| |
| /* X64 Control Registers */ |
| WHvX64RegisterCr0, |
| WHvX64RegisterCr2, |
| WHvX64RegisterCr3, |
| WHvX64RegisterCr4, |
| WHvX64RegisterCr8, |
| |
| /* X64 Debug Registers */ |
| /* |
| * WHvX64RegisterDr0, |
| * WHvX64RegisterDr1, |
| * WHvX64RegisterDr2, |
| * WHvX64RegisterDr3, |
| * WHvX64RegisterDr6, |
| * WHvX64RegisterDr7, |
| */ |
| |
| /* X64 Floating Point and Vector Registers */ |
| WHvX64RegisterXmm0, |
| WHvX64RegisterXmm1, |
| WHvX64RegisterXmm2, |
| WHvX64RegisterXmm3, |
| WHvX64RegisterXmm4, |
| WHvX64RegisterXmm5, |
| WHvX64RegisterXmm6, |
| WHvX64RegisterXmm7, |
| WHvX64RegisterXmm8, |
| WHvX64RegisterXmm9, |
| WHvX64RegisterXmm10, |
| WHvX64RegisterXmm11, |
| WHvX64RegisterXmm12, |
| WHvX64RegisterXmm13, |
| WHvX64RegisterXmm14, |
| WHvX64RegisterXmm15, |
| WHvX64RegisterFpMmx0, |
| WHvX64RegisterFpMmx1, |
| WHvX64RegisterFpMmx2, |
| WHvX64RegisterFpMmx3, |
| WHvX64RegisterFpMmx4, |
| WHvX64RegisterFpMmx5, |
| WHvX64RegisterFpMmx6, |
| WHvX64RegisterFpMmx7, |
| WHvX64RegisterFpControlStatus, |
| WHvX64RegisterXmmControlStatus, |
| |
| /* X64 MSRs */ |
| WHvX64RegisterEfer, |
| #ifdef TARGET_X86_64 |
| WHvX64RegisterKernelGsBase, |
| #endif |
| WHvX64RegisterApicBase, |
| /* WHvX64RegisterPat, */ |
| WHvX64RegisterSysenterCs, |
| WHvX64RegisterSysenterEip, |
| WHvX64RegisterSysenterEsp, |
| WHvX64RegisterStar, |
| #ifdef TARGET_X86_64 |
| WHvX64RegisterLstar, |
| WHvX64RegisterCstar, |
| WHvX64RegisterSfmask, |
| #endif |
| |
| /* Interrupt / Event Registers */ |
| /* |
| * WHvRegisterPendingInterruption, |
| * WHvRegisterInterruptState, |
| * WHvRegisterPendingEvent0, |
| * WHvRegisterPendingEvent1 |
| * WHvX64RegisterDeliverabilityNotifications, |
| */ |
| }; |
| |
| struct whpx_register_set { |
| WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)]; |
| }; |
| |
| /* |
| * The current implementation of instruction stepping sets the TF flag |
| * in RFLAGS, causing the CPU to raise an INT1 after each instruction. |
| * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception. |
| * |
| * This approach has a few limitations: |
| * 1. Stepping over a PUSHF/SAHF instruction will save the TF flag |
| * along with the other flags, possibly restoring it later. It would |
| * result in another INT1 when the flags are restored, triggering |
| * a stop in gdb that could be cleared by doing another step. |
| * |
| * Stepping over a POPF/LAHF instruction will let it overwrite the |
| * TF flags, ending the stepping mode. |
| * |
| * 2. Stepping over an instruction raising an exception (e.g. INT, DIV, |
| * or anything that could result in a page fault) will save the flags |
| * to the stack, clear the TF flag, and let the guest execute the |
| * handler. Normally, the guest will restore the original flags, |
| * that will continue single-stepping. |
| * |
| * 3. Debuggers running on the guest may wish to set TF to do instruction |
| * stepping. INT1 events generated by it would be intercepted by us, |
| * as long as the gdb is connected to QEMU. |
| * |
| * In practice this means that: |
| * 1. Stepping through flags-modifying instructions may cause gdb to |
| * continue or stop in unexpected places. This will be fully recoverable |
| * and will not crash the target. |
| * |
| * 2. Stepping over an instruction that triggers an exception will step |
| * over the exception handler, not into it. |
| * |
| * 3. Debugging the guest via gdb, while running debugger on the guest |
| * at the same time may lead to unexpected effects. Removing all |
| * breakpoints set via QEMU will prevent any further interference |
| * with the guest-level debuggers. |
| * |
| * The limitations can be addressed as shown below: |
| * 1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of |
| * stepping through them. The exact semantics of the instructions is |
| * defined in the "Combined Volume Set of Intel 64 and IA-32 |
| * Architectures Software Developer's Manuals", however it involves a |
| * fair amount of corner cases due to compatibility with real mode, |
| * virtual 8086 mode, and differences between 64-bit and 32-bit modes. |
| * |
| * 2. We could step into the guest's exception handlers using the following |
| * sequence: |
| * a. Temporarily enable catching of all exception types via |
| * whpx_set_exception_exit_bitmap(). |
| * b. Once an exception is intercepted, read the IDT/GDT and locate |
| * the original handler. |
| * c. Patch the original handler, injecting an INT3 at the beginning. |
| * d. Update the exception exit bitmap to only catch the |
| * WHvX64ExceptionTypeBreakpointTrap exception. |
| * e. Let the affected CPU run in the exclusive mode. |
| * f. Restore the original handler and the exception exit bitmap. |
| * Note that handling all corner cases related to IDT/GDT is harder |
| * than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a |
| * rough idea. |
| * |
| * 3. In order to properly support guest-level debugging in parallel with |
| * the QEMU-level debugging, we would need to be able to pass some INT1 |
| * events to the guest. This could be done via the following methods: |
| * a. Using the WHvRegisterPendingEvent register. As of Windows 21H1, |
| * it seems to only work for interrupts and not software |
| * exceptions. |
| * b. Locating and patching the original handler by parsing IDT/GDT. |
| * This involves relatively complex logic outlined in the previous |
| * paragraph. |
| * c. Emulating the exception invocation (i.e. manually updating RIP, |
| * RFLAGS, and pushing the old values to stack). This is even more |
| * complicated than the previous option, since it involves checking |
| * CPL, gate attributes, and doing various adjustments depending |
| * on the current CPU mode, whether the CPL is changing, etc. |
| */ |
/* Instruction stepping modes. */
typedef enum WhpxStepMode {
    /* Not stepping */
    WHPX_STEP_NONE = 0,
    /* Halt other VCPUs */
    WHPX_STEP_EXCLUSIVE = 1,
} WhpxStepMode;
| |
| struct whpx_vcpu { |
| WHV_EMULATOR_HANDLE emulator; |
| bool window_registered; |
| bool interruptable; |
| bool ready_for_pic_interrupt; |
| uint64_t tpr; |
| uint64_t apic_base; |
| bool interruption_pending; |
| |
| /* Must be the last field as it may have a tail */ |
| WHV_RUN_VP_EXIT_CONTEXT exit_ctx; |
| }; |
| |
| static bool whpx_allowed; |
| static bool whp_dispatch_initialized; |
| static HMODULE hWinHvPlatform, hWinHvEmulation; |
| static uint32_t max_vcpu_index; |
| static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap; |
| |
| struct whpx_state whpx_global; |
| struct WHPDispatch whp_dispatch; |
| |
| static bool whpx_has_xsave(void) |
| { |
| return whpx_xsave_cap.XsaveSupport; |
| } |
| |
| /* |
| * VP support |
| */ |
| |
| static struct whpx_vcpu *get_whpx_vcpu(CPUState *cpu) |
| { |
| return (struct whpx_vcpu *)cpu->hax_vcpu; |
| } |
| |
| static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86, |
| int r86) |
| { |
| WHV_X64_SEGMENT_REGISTER hs; |
| unsigned flags = qs->flags; |
| |
| hs.Base = qs->base; |
| hs.Limit = qs->limit; |
| hs.Selector = qs->selector; |
| |
| if (v86) { |
| hs.Attributes = 0; |
| hs.SegmentType = 3; |
| hs.Present = 1; |
| hs.DescriptorPrivilegeLevel = 3; |
| hs.NonSystemSegment = 1; |
| |
| } else { |
| hs.Attributes = (flags >> DESC_TYPE_SHIFT); |
| |
| if (r86) { |
| /* hs.Base &= 0xfffff; */ |
| } |
| } |
| |
| return hs; |
| } |
| |
| static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs) |
| { |
| SegmentCache qs; |
| |
| qs.base = hs->Base; |
| qs.limit = hs->Limit; |
| qs.selector = hs->Selector; |
| |
| qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT; |
| |
| return qs; |
| } |
| |
| /* X64 Extended Control Registers */ |
| static void whpx_set_xcrs(CPUState *cpu) |
| { |
| CPUX86State *env = cpu->env_ptr; |
| HRESULT hr; |
| struct whpx_state *whpx = &whpx_global; |
| WHV_REGISTER_VALUE xcr0; |
| WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0; |
| |
| if (!whpx_has_xsave()) { |
| return; |
| } |
| |
| /* Only xcr0 is supported by the hypervisor currently */ |
| xcr0.Reg64 = env->xcr0; |
| hr = whp_dispatch.WHvSetVirtualProcessorRegisters( |
| whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0); |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr); |
| } |
| } |
| |
| static int whpx_set_tsc(CPUState *cpu) |
| { |
| CPUX86State *env = cpu->env_ptr; |
| WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc; |
| WHV_REGISTER_VALUE tsc_val; |
| HRESULT hr; |
| struct whpx_state *whpx = &whpx_global; |
| |
| /* |
| * Suspend the partition prior to setting the TSC to reduce the variance |
| * in TSC across vCPUs. When the first vCPU runs post suspend, the |
| * partition is automatically resumed. |
| */ |
| if (whp_dispatch.WHvSuspendPartitionTime) { |
| |
| /* |
| * Unable to suspend partition while setting TSC is not a fatal |
| * error. It just increases the likelihood of TSC variance between |
| * vCPUs and some guest OS are able to handle that just fine. |
| */ |
| hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition); |
| if (FAILED(hr)) { |
| warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr); |
| } |
| } |
| |
| tsc_val.Reg64 = env->tsc; |
| hr = whp_dispatch.WHvSetVirtualProcessorRegisters( |
| whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val); |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to set TSC, hr=%08lx", hr); |
| return -1; |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * The CR8 register in the CPU is mapped to the TPR register of the APIC, |
| * however, they use a slightly different encoding. Specifically: |
| * |
| * APIC.TPR[bits 7:4] = CR8[bits 3:0] |
| * |
| * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64 |
| * and IA-32 Architectures Software Developer's Manual. |
| * |
| * The functions below translate the value of CR8 to TPR and vice versa. |
| */ |
| |
/* Translates an APIC TPR value to CR8 encoding: CR8 = TPR >> 4. */
static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
{
    const unsigned int tpr_to_cr8_shift = 4;

    return tpr >> tpr_to_cr8_shift;
}
| |
/* Translates a CR8 value to APIC TPR encoding: TPR = CR8 << 4. */
static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
{
    const unsigned int cr8_to_tpr_shift = 4;

    return cr8 << cr8_to_tpr_shift;
}
| |
| static void whpx_set_registers(CPUState *cpu, int level) |
| { |
| struct whpx_state *whpx = &whpx_global; |
| struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); |
| CPUX86State *env = cpu->env_ptr; |
| X86CPU *x86_cpu = X86_CPU(cpu); |
| struct whpx_register_set vcxt; |
| HRESULT hr; |
| int idx; |
| int idx_next; |
| int i; |
| int v86, r86; |
| |
| assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu)); |
| |
| /* |
| * Following MSRs have side effects on the guest or are too heavy for |
| * runtime. Limit them to full state update. |
| */ |
| if (level >= WHPX_SET_RESET_STATE) { |
| whpx_set_tsc(cpu); |
| } |
| |
| memset(&vcxt, 0, sizeof(struct whpx_register_set)); |
| |
| v86 = (env->eflags & VM_MASK); |
| r86 = !(env->cr[0] & CR0_PE_MASK); |
| |
| vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state)); |
| vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state); |
| |
| idx = 0; |
| |
| /* Indexes for first 16 registers match between HV and QEMU definitions */ |
| idx_next = 16; |
| for (idx = 0; idx < CPU_NB_REGS; idx += 1) { |
| vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx]; |
| } |
| idx = idx_next; |
| |
| /* Same goes for RIP and RFLAGS */ |
| assert(whpx_register_names[idx] == WHvX64RegisterRip); |
| vcxt.values[idx++].Reg64 = env->eip; |
| |
| assert(whpx_register_names[idx] == WHvX64RegisterRflags); |
| vcxt.values[idx++].Reg64 = env->eflags; |
| |
| /* Translate 6+4 segment registers. HV and QEMU order matches */ |
| assert(idx == WHvX64RegisterEs); |
| for (i = 0; i < 6; i += 1, idx += 1) { |
| vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86); |
| } |
| |
| assert(idx == WHvX64RegisterLdtr); |
| vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0); |
| |
| assert(idx == WHvX64RegisterTr); |
| vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0); |
| |
| assert(idx == WHvX64RegisterIdtr); |
| vcxt.values[idx].Table.Base = env->idt.base; |
| vcxt.values[idx].Table.Limit = env->idt.limit; |
| idx += 1; |
| |
| assert(idx == WHvX64RegisterGdtr); |
| vcxt.values[idx].Table.Base = env->gdt.base; |
| vcxt.values[idx].Table.Limit = env->gdt.limit; |
| idx += 1; |
| |
| /* CR0, 2, 3, 4, 8 */ |
| assert(whpx_register_names[idx] == WHvX64RegisterCr0); |
| vcxt.values[idx++].Reg64 = env->cr[0]; |
| assert(whpx_register_names[idx] == WHvX64RegisterCr2); |
| vcxt.values[idx++].Reg64 = env->cr[2]; |
| assert(whpx_register_names[idx] == WHvX64RegisterCr3); |
| vcxt.values[idx++].Reg64 = env->cr[3]; |
| assert(whpx_register_names[idx] == WHvX64RegisterCr4); |
| vcxt.values[idx++].Reg64 = env->cr[4]; |
| assert(whpx_register_names[idx] == WHvX64RegisterCr8); |
| vcxt.values[idx++].Reg64 = vcpu->tpr; |
| |
| /* 8 Debug Registers - Skipped */ |
| |
| /* |
| * Extended control registers needs to be handled separately depending |
| * on whether xsave is supported/enabled or not. |
| */ |
| whpx_set_xcrs(cpu); |
| |
| /* 16 XMM registers */ |
| assert(whpx_register_names[idx] == WHvX64RegisterXmm0); |
| idx_next = idx + 16; |
| for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) { |
| vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0); |
| vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1); |
| } |
| idx = idx_next; |
| |
| /* 8 FP registers */ |
| assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0); |
| for (i = 0; i < 8; i += 1, idx += 1) { |
| vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0); |
| /* vcxt.values[idx].Fp.AsUINT128.High64 = |
| env->fpregs[i].mmx.MMX_Q(1); |
| */ |
| } |
| |
| /* FP control status register */ |
| assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus); |
| vcxt.values[idx].FpControlStatus.FpControl = env->fpuc; |
| vcxt.values[idx].FpControlStatus.FpStatus = |
| (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11; |
| vcxt.values[idx].FpControlStatus.FpTag = 0; |
| for (i = 0; i < 8; ++i) { |
| vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i; |
| } |
| vcxt.values[idx].FpControlStatus.Reserved = 0; |
| vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop; |
| vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip; |
| idx += 1; |
| |
| /* XMM control status register */ |
| assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus); |
| vcxt.values[idx].XmmControlStatus.LastFpRdp = 0; |
| vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr; |
| vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff; |
| idx += 1; |
| |
| /* MSRs */ |
| assert(whpx_register_names[idx] == WHvX64RegisterEfer); |
| vcxt.values[idx++].Reg64 = env->efer; |
| #ifdef TARGET_X86_64 |
| assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase); |
| vcxt.values[idx++].Reg64 = env->kernelgsbase; |
| #endif |
| |
| assert(whpx_register_names[idx] == WHvX64RegisterApicBase); |
| vcxt.values[idx++].Reg64 = vcpu->apic_base; |
| |
| /* WHvX64RegisterPat - Skipped */ |
| |
| assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs); |
| vcxt.values[idx++].Reg64 = env->sysenter_cs; |
| assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip); |
| vcxt.values[idx++].Reg64 = env->sysenter_eip; |
| assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp); |
| vcxt.values[idx++].Reg64 = env->sysenter_esp; |
| assert(whpx_register_names[idx] == WHvX64RegisterStar); |
| vcxt.values[idx++].Reg64 = env->star; |
| #ifdef TARGET_X86_64 |
| assert(whpx_register_names[idx] == WHvX64RegisterLstar); |
| vcxt.values[idx++].Reg64 = env->lstar; |
| assert(whpx_register_names[idx] == WHvX64RegisterCstar); |
| vcxt.values[idx++].Reg64 = env->cstar; |
| assert(whpx_register_names[idx] == WHvX64RegisterSfmask); |
| vcxt.values[idx++].Reg64 = env->fmask; |
| #endif |
| |
| /* Interrupt / Event Registers - Skipped */ |
| |
| assert(idx == RTL_NUMBER_OF(whpx_register_names)); |
| |
| hr = whp_dispatch.WHvSetVirtualProcessorRegisters( |
| whpx->partition, cpu->cpu_index, |
| whpx_register_names, |
| RTL_NUMBER_OF(whpx_register_names), |
| &vcxt.values[0]); |
| |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to set virtual processor context, hr=%08lx", |
| hr); |
| } |
| |
| return; |
| } |
| |
| static int whpx_get_tsc(CPUState *cpu) |
| { |
| CPUX86State *env = cpu->env_ptr; |
| WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc; |
| WHV_REGISTER_VALUE tsc_val; |
| HRESULT hr; |
| struct whpx_state *whpx = &whpx_global; |
| |
| hr = whp_dispatch.WHvGetVirtualProcessorRegisters( |
| whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val); |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to get TSC, hr=%08lx", hr); |
| return -1; |
| } |
| |
| env->tsc = tsc_val.Reg64; |
| return 0; |
| } |
| |
| /* X64 Extended Control Registers */ |
| static void whpx_get_xcrs(CPUState *cpu) |
| { |
| CPUX86State *env = cpu->env_ptr; |
| HRESULT hr; |
| struct whpx_state *whpx = &whpx_global; |
| WHV_REGISTER_VALUE xcr0; |
| WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0; |
| |
| if (!whpx_has_xsave()) { |
| return; |
| } |
| |
| /* Only xcr0 is supported by the hypervisor currently */ |
| hr = whp_dispatch.WHvGetVirtualProcessorRegisters( |
| whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0); |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr); |
| return; |
| } |
| |
| env->xcr0 = xcr0.Reg64; |
| } |
| |
| static void whpx_get_registers(CPUState *cpu) |
| { |
| struct whpx_state *whpx = &whpx_global; |
| struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); |
| CPUX86State *env = cpu->env_ptr; |
| X86CPU *x86_cpu = X86_CPU(cpu); |
| struct whpx_register_set vcxt; |
| uint64_t tpr, apic_base; |
| HRESULT hr; |
| int idx; |
| int idx_next; |
| int i; |
| |
| assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu)); |
| |
| if (!env->tsc_valid) { |
| whpx_get_tsc(cpu); |
| env->tsc_valid = !runstate_is_running(); |
| } |
| |
| hr = whp_dispatch.WHvGetVirtualProcessorRegisters( |
| whpx->partition, cpu->cpu_index, |
| whpx_register_names, |
| RTL_NUMBER_OF(whpx_register_names), |
| &vcxt.values[0]); |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to get virtual processor context, hr=%08lx", |
| hr); |
| } |
| |
| if (whpx_apic_in_platform()) { |
| /* |
| * Fetch the TPR value from the emulated APIC. It may get overwritten |
| * below with the value from CR8 returned by |
| * WHvGetVirtualProcessorRegisters(). |
| */ |
| whpx_apic_get(x86_cpu->apic_state); |
| vcpu->tpr = whpx_apic_tpr_to_cr8( |
| cpu_get_apic_tpr(x86_cpu->apic_state)); |
| } |
| |
| idx = 0; |
| |
| /* Indexes for first 16 registers match between HV and QEMU definitions */ |
| idx_next = 16; |
| for (idx = 0; idx < CPU_NB_REGS; idx += 1) { |
| env->regs[idx] = vcxt.values[idx].Reg64; |
| } |
| idx = idx_next; |
| |
| /* Same goes for RIP and RFLAGS */ |
| assert(whpx_register_names[idx] == WHvX64RegisterRip); |
| env->eip = vcxt.values[idx++].Reg64; |
| assert(whpx_register_names[idx] == WHvX64RegisterRflags); |
| env->eflags = vcxt.values[idx++].Reg64; |
| |
| /* Translate 6+4 segment registers. HV and QEMU order matches */ |
| assert(idx == WHvX64RegisterEs); |
| for (i = 0; i < 6; i += 1, idx += 1) { |
| env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment); |
| } |
| |
| assert(idx == WHvX64RegisterLdtr); |
| env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment); |
| assert(idx == WHvX64RegisterTr); |
| env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment); |
| assert(idx == WHvX64RegisterIdtr); |
| env->idt.base = vcxt.values[idx].Table.Base; |
| env->idt.limit = vcxt.values[idx].Table.Limit; |
| idx += 1; |
| assert(idx == WHvX64RegisterGdtr); |
| env->gdt.base = vcxt.values[idx].Table.Base; |
| env->gdt.limit = vcxt.values[idx].Table.Limit; |
| idx += 1; |
| |
| /* CR0, 2, 3, 4, 8 */ |
| assert(whpx_register_names[idx] == WHvX64RegisterCr0); |
| env->cr[0] = vcxt.values[idx++].Reg64; |
| assert(whpx_register_names[idx] == WHvX64RegisterCr2); |
| env->cr[2] = vcxt.values[idx++].Reg64; |
| assert(whpx_register_names[idx] == WHvX64RegisterCr3); |
| env->cr[3] = vcxt.values[idx++].Reg64; |
| assert(whpx_register_names[idx] == WHvX64RegisterCr4); |
| env->cr[4] = vcxt.values[idx++].Reg64; |
| assert(whpx_register_names[idx] == WHvX64RegisterCr8); |
| tpr = vcxt.values[idx++].Reg64; |
| if (tpr != vcpu->tpr) { |
| vcpu->tpr = tpr; |
| cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr)); |
| } |
| |
| /* 8 Debug Registers - Skipped */ |
| |
| /* |
| * Extended control registers needs to be handled separately depending |
| * on whether xsave is supported/enabled or not. |
| */ |
| whpx_get_xcrs(cpu); |
| |
| /* 16 XMM registers */ |
| assert(whpx_register_names[idx] == WHvX64RegisterXmm0); |
| idx_next = idx + 16; |
| for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) { |
| env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64; |
| env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64; |
| } |
| idx = idx_next; |
| |
| /* 8 FP registers */ |
| assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0); |
| for (i = 0; i < 8; i += 1, idx += 1) { |
| env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64; |
| /* env->fpregs[i].mmx.MMX_Q(1) = |
| vcxt.values[idx].Fp.AsUINT128.High64; |
| */ |
| } |
| |
| /* FP control status register */ |
| assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus); |
| env->fpuc = vcxt.values[idx].FpControlStatus.FpControl; |
| env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7; |
| env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800; |
| for (i = 0; i < 8; ++i) { |
| env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1); |
| } |
| env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp; |
| env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip; |
| idx += 1; |
| |
| /* XMM control status register */ |
| assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus); |
| env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl; |
| idx += 1; |
| |
| /* MSRs */ |
| assert(whpx_register_names[idx] == WHvX64RegisterEfer); |
| env->efer = vcxt.values[idx++].Reg64; |
| #ifdef TARGET_X86_64 |
| assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase); |
| env->kernelgsbase = vcxt.values[idx++].Reg64; |
| #endif |
| |
| assert(whpx_register_names[idx] == WHvX64RegisterApicBase); |
| apic_base = vcxt.values[idx++].Reg64; |
| if (apic_base != vcpu->apic_base) { |
| vcpu->apic_base = apic_base; |
| cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base); |
| } |
| |
| /* WHvX64RegisterPat - Skipped */ |
| |
| assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs); |
| env->sysenter_cs = vcxt.values[idx++].Reg64; |
| assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip); |
| env->sysenter_eip = vcxt.values[idx++].Reg64; |
| assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp); |
| env->sysenter_esp = vcxt.values[idx++].Reg64; |
| assert(whpx_register_names[idx] == WHvX64RegisterStar); |
| env->star = vcxt.values[idx++].Reg64; |
| #ifdef TARGET_X86_64 |
| assert(whpx_register_names[idx] == WHvX64RegisterLstar); |
| env->lstar = vcxt.values[idx++].Reg64; |
| assert(whpx_register_names[idx] == WHvX64RegisterCstar); |
| env->cstar = vcxt.values[idx++].Reg64; |
| assert(whpx_register_names[idx] == WHvX64RegisterSfmask); |
| env->fmask = vcxt.values[idx++].Reg64; |
| #endif |
| |
| /* Interrupt / Event Registers - Skipped */ |
| |
| assert(idx == RTL_NUMBER_OF(whpx_register_names)); |
| |
| if (whpx_apic_in_platform()) { |
| whpx_apic_get(x86_cpu->apic_state); |
| } |
| |
| x86_update_hflags(env); |
| |
| return; |
| } |
| |
| static HRESULT CALLBACK whpx_emu_ioport_callback( |
| void *ctx, |
| WHV_EMULATOR_IO_ACCESS_INFO *IoAccess) |
| { |
| MemTxAttrs attrs = { 0 }; |
| address_space_rw(&address_space_io, IoAccess->Port, attrs, |
| &IoAccess->Data, IoAccess->AccessSize, |
| IoAccess->Direction); |
| return S_OK; |
| } |
| |
| static HRESULT CALLBACK whpx_emu_mmio_callback( |
| void *ctx, |
| WHV_EMULATOR_MEMORY_ACCESS_INFO *ma) |
| { |
| cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize, |
| ma->Direction); |
| return S_OK; |
| } |
| |
| static HRESULT CALLBACK whpx_emu_getreg_callback( |
| void *ctx, |
| const WHV_REGISTER_NAME *RegisterNames, |
| UINT32 RegisterCount, |
| WHV_REGISTER_VALUE *RegisterValues) |
| { |
| HRESULT hr; |
| struct whpx_state *whpx = &whpx_global; |
| CPUState *cpu = (CPUState *)ctx; |
| |
| hr = whp_dispatch.WHvGetVirtualProcessorRegisters( |
| whpx->partition, cpu->cpu_index, |
| RegisterNames, RegisterCount, |
| RegisterValues); |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to get virtual processor registers," |
| " hr=%08lx", hr); |
| } |
| |
| return hr; |
| } |
| |
| static HRESULT CALLBACK whpx_emu_setreg_callback( |
| void *ctx, |
| const WHV_REGISTER_NAME *RegisterNames, |
| UINT32 RegisterCount, |
| const WHV_REGISTER_VALUE *RegisterValues) |
| { |
| HRESULT hr; |
| struct whpx_state *whpx = &whpx_global; |
| CPUState *cpu = (CPUState *)ctx; |
| |
| hr = whp_dispatch.WHvSetVirtualProcessorRegisters( |
| whpx->partition, cpu->cpu_index, |
| RegisterNames, RegisterCount, |
| RegisterValues); |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to set virtual processor registers," |
| " hr=%08lx", hr); |
| } |
| |
| /* |
| * The emulator just successfully wrote the register state. We clear the |
| * dirty state so we avoid the double write on resume of the VP. |
| */ |
| cpu->vcpu_dirty = false; |
| |
| return hr; |
| } |
| |
| static HRESULT CALLBACK whpx_emu_translate_callback( |
| void *ctx, |
| WHV_GUEST_VIRTUAL_ADDRESS Gva, |
| WHV_TRANSLATE_GVA_FLAGS TranslateFlags, |
| WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult, |
| WHV_GUEST_PHYSICAL_ADDRESS *Gpa) |
| { |
| HRESULT hr; |
| struct whpx_state *whpx = &whpx_global; |
| CPUState *cpu = (CPUState *)ctx; |
| WHV_TRANSLATE_GVA_RESULT res; |
| |
| hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index, |
| Gva, TranslateFlags, &res, Gpa); |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to translate GVA, hr=%08lx", hr); |
| } else { |
| *TranslationResult = res.ResultCode; |
| } |
| |
| return hr; |
| } |
| |
| static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = { |
| .Size = sizeof(WHV_EMULATOR_CALLBACKS), |
| .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback, |
| .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback, |
| .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback, |
| .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback, |
| .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback, |
| }; |
| |
| static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx) |
| { |
| HRESULT hr; |
| struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); |
| WHV_EMULATOR_STATUS emu_status; |
| |
| hr = whp_dispatch.WHvEmulatorTryMmioEmulation( |
| vcpu->emulator, cpu, |
| &vcpu->exit_ctx.VpContext, ctx, |
| &emu_status); |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr); |
| return -1; |
| } |
| |
| if (!emu_status.EmulationSuccessful) { |
| error_report("WHPX: Failed to emulate MMIO access with" |
| " EmulatorReturnStatus: %u", emu_status.AsUINT32); |
| return -1; |
| } |
| |
| return 0; |
| } |
| |
| static int whpx_handle_portio(CPUState *cpu, |
| WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx) |
| { |
| HRESULT hr; |
| struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); |
| WHV_EMULATOR_STATUS emu_status; |
| |
| hr = whp_dispatch.WHvEmulatorTryIoEmulation( |
| vcpu->emulator, cpu, |
| &vcpu->exit_ctx.VpContext, ctx, |
| &emu_status); |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr); |
| return -1; |
| } |
| |
| if (!emu_status.EmulationSuccessful) { |
| error_report("WHPX: Failed to emulate PortIO access with" |
| " EmulatorReturnStatus: %u", emu_status.AsUINT32); |
| return -1; |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * Controls whether we should intercept various exceptions on the guest, |
| * namely breakpoint/single-step events. |
| * |
| * The 'exceptions' argument accepts a bitmask, e.g: |
| * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...) |
| */ |
| static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions) |
| { |
| struct whpx_state *whpx = &whpx_global; |
| WHV_PARTITION_PROPERTY prop = { 0, }; |
| HRESULT hr; |
| |
| if (exceptions == whpx->exception_exit_bitmap) { |
| return S_OK; |
| } |
| |
| prop.ExceptionExitBitmap = exceptions; |
| |
| hr = whp_dispatch.WHvSetPartitionProperty( |
| whpx->partition, |
| WHvPartitionPropertyCodeExceptionExitBitmap, |
| &prop, |
| sizeof(WHV_PARTITION_PROPERTY)); |
| |
| if (SUCCEEDED(hr)) { |
| whpx->exception_exit_bitmap = exceptions; |
| } |
| |
| return hr; |
| } |
| |
| |
| /* |
| * This function is called before/after stepping over a single instruction. |
| * It will update the CPU registers to arm/disarm the instruction stepping |
| * accordingly. |
| */ |
| static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu, |
| bool set, |
| uint64_t *exit_context_rflags) |
| { |
| WHV_REGISTER_NAME reg_name; |
| WHV_REGISTER_VALUE reg_value; |
| HRESULT hr; |
| struct whpx_state *whpx = &whpx_global; |
| |
| /* |
| * If we are trying to step over a single instruction, we need to set the |
| * TF bit in rflags. Otherwise, clear it. |
| */ |
| reg_name = WHvX64RegisterRflags; |
| hr = whp_dispatch.WHvGetVirtualProcessorRegisters( |
| whpx->partition, |
| cpu->cpu_index, |
| ®_name, |
| 1, |
| ®_value); |
| |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to get rflags, hr=%08lx", hr); |
| return hr; |
| } |
| |
| if (exit_context_rflags) { |
| assert(*exit_context_rflags == reg_value.Reg64); |
| } |
| |
| if (set) { |
| /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */ |
| reg_value.Reg64 |= TF_MASK; |
| } else { |
| reg_value.Reg64 &= ~TF_MASK; |
| } |
| |
| if (exit_context_rflags) { |
| *exit_context_rflags = reg_value.Reg64; |
| } |
| |
| hr = whp_dispatch.WHvSetVirtualProcessorRegisters( |
| whpx->partition, |
| cpu->cpu_index, |
| ®_name, |
| 1, |
| ®_value); |
| |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to set rflags," |
| " hr=%08lx", |
| hr); |
| return hr; |
| } |
| |
| reg_name = WHvRegisterInterruptState; |
| reg_value.Reg64 = 0; |
| |
| /* Suspend delivery of hardware interrupts during single-stepping. */ |
| reg_value.InterruptState.InterruptShadow = set != 0; |
| |
| hr = whp_dispatch.WHvSetVirtualProcessorRegisters( |
| whpx->partition, |
| cpu->cpu_index, |
| ®_name, |
| 1, |
| ®_value); |
| |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to set InterruptState," |
| " hr=%08lx", |
| hr); |
| return hr; |
| } |
| |
| if (!set) { |
| /* |
| * We have just finished stepping over a single instruction, |
| * and intercepted the INT1 generated by it. |
| * We need to now hide the INT1 from the guest, |
| * as it would not be expecting it. |
| */ |
| |
| reg_name = WHvX64RegisterPendingDebugException; |
| hr = whp_dispatch.WHvGetVirtualProcessorRegisters( |
| whpx->partition, |
| cpu->cpu_index, |
| ®_name, |
| 1, |
| ®_value); |
| |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to get pending debug exceptions," |
| "hr=%08lx", hr); |
| return hr; |
| } |
| |
| if (reg_value.PendingDebugException.SingleStep) { |
| reg_value.PendingDebugException.SingleStep = 0; |
| |
| hr = whp_dispatch.WHvSetVirtualProcessorRegisters( |
| whpx->partition, |
| cpu->cpu_index, |
| ®_name, |
| 1, |
| ®_value); |
| |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to clear pending debug exceptions," |
| "hr=%08lx", hr); |
| return hr; |
| } |
| } |
| |
| } |
| |
| return S_OK; |
| } |
| |
| /* Tries to find a breakpoint at the specified address. */ |
| static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address) |
| { |
| struct whpx_state *whpx = &whpx_global; |
| int i; |
| |
| if (whpx->breakpoints.breakpoints) { |
| for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) { |
| if (address == whpx->breakpoints.breakpoints->data[i].address) { |
| return &whpx->breakpoints.breakpoints->data[i]; |
| } |
| } |
| } |
| |
| return NULL; |
| } |
| |
/*
 * Opcode byte written into guest memory to implement a software breakpoint.
 *
 * INT1 (0xF1) is used rather than INT3 (0xCC): Linux executes int3 on its
 * own during startup (see int3_selftest()) and for debugging user-mode
 * applications, and the WHPX API does not offer an easy way to pass an
 * intercepted exception back to the guest. Using INT1 lets the guest always
 * handle INT3 by itself.
 */
static const uint8_t whpx_breakpoint_instruction = 0xF1;
| |
| /* |
| * The WHPX QEMU backend implements breakpoints by writing the INT1 |
| * instruction into memory (ignoring the DRx registers). This raises a few |
| * issues that need to be carefully handled: |
| * |
| * 1. Although unlikely, other parts of QEMU may set multiple breakpoints |
| * at the same location, and later remove them in arbitrary order. |
| * This should not cause memory corruption, and should only remove the |
| * physical breakpoint instruction when the last QEMU breakpoint is gone. |
| * |
| * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid |
| * physical location. Hence, physically adding/removing a breakpoint can |
| * theoretically fail at any time. We need to keep track of it. |
| * |
| * The function below rebuilds a list of low-level breakpoints (one per |
| * address, tracking the original instruction and any errors) from the list of |
| * high-level breakpoints (set via cpu_breakpoint_insert()). |
| * |
| * In order to optimize performance, this function stores the list of |
| * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the |
| * low-level ones, so that it won't be re-invoked until these breakpoints |
| * change. |
| * |
| * Note that this function decides which breakpoints should be inserted into, |
| * memory, but doesn't actually do it. The memory accessing is done in |
| * whpx_apply_breakpoints(). |
| */ |
| static void whpx_translate_cpu_breakpoints( |
| struct whpx_breakpoints *breakpoints, |
| CPUState *cpu, |
| int cpu_breakpoint_count) |
| { |
| CPUBreakpoint *bp; |
| int cpu_bp_index = 0; |
| |
| breakpoints->original_addresses = |
| g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count); |
| |
| breakpoints->original_address_count = cpu_breakpoint_count; |
| |
| int max_breakpoints = cpu_breakpoint_count + |
| (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0); |
| |
| struct whpx_breakpoint_collection *new_breakpoints = |
| g_malloc0(sizeof(struct whpx_breakpoint_collection) |
| + max_breakpoints * sizeof(struct whpx_breakpoint)); |
| |
| new_breakpoints->allocated = max_breakpoints; |
| new_breakpoints->used = 0; |
| |
| /* |
| * 1. Preserve all old breakpoints that could not be automatically |
| * cleared when the CPU got stopped. |
| */ |
| if (breakpoints->breakpoints) { |
| int i; |
| for (i = 0; i < breakpoints->breakpoints->used; i++) { |
| if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) { |
| new_breakpoints->data[new_breakpoints->used++] = |
| breakpoints->breakpoints->data[i]; |
| } |
| } |
| } |
| |
| /* 2. Map all CPU breakpoints to WHPX breakpoints */ |
| QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) { |
| int i; |
| bool found = false; |
| |
| /* This will be used to detect changed CPU breakpoints later. */ |
| breakpoints->original_addresses[cpu_bp_index++] = bp->pc; |
| |
| for (i = 0; i < new_breakpoints->used; i++) { |
| /* |
| * WARNING: This loop has O(N^2) complexity, where N is the |
| * number of breakpoints. It should not be a bottleneck in |
| * real-world scenarios, since it only needs to run once after |
| * the breakpoints have been modified. |
| * If this ever becomes a concern, it can be optimized by storing |
| * high-level breakpoint objects in a tree or hash map. |
| */ |
| |
| if (new_breakpoints->data[i].address == bp->pc) { |
| /* There was already a breakpoint at this address. */ |
| if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) { |
| new_breakpoints->data[i].state = WHPX_BP_SET; |
| } else if (new_breakpoints->data[i].state == WHPX_BP_SET) { |
| new_breakpoints->data[i].state = WHPX_BP_SET_PENDING; |
| } |
| |
| found = true; |
| break; |
| } |
| } |
| |
| if (!found && new_breakpoints->used < new_breakpoints->allocated) { |
| /* No WHPX breakpoint at this address. Create one. */ |
| new_breakpoints->data[new_breakpoints->used].address = bp->pc; |
| new_breakpoints->data[new_breakpoints->used].state = |
| WHPX_BP_SET_PENDING; |
| new_breakpoints->used++; |
| } |
| } |
| |
| /* |
| * Free the previous breakpoint list. This can be optimized by keeping |
| * it as shadow buffer for the next computation instead of freeing |
| * it immediately. |
| */ |
| g_free(breakpoints->breakpoints); |
| |
| breakpoints->breakpoints = new_breakpoints; |
| } |
| |
| /* |
| * Physically inserts/removes the breakpoints by reading and writing the |
| * physical memory, keeping a track of the failed attempts. |
| * |
| * Passing resuming=true will try to set all previously unset breakpoints. |
| * Passing resuming=false will remove all inserted ones. |
| */ |
| static void whpx_apply_breakpoints( |
| struct whpx_breakpoint_collection *breakpoints, |
| CPUState *cpu, |
| bool resuming) |
| { |
| int i, rc; |
| if (!breakpoints) { |
| return; |
| } |
| |
| for (i = 0; i < breakpoints->used; i++) { |
| /* Decide what to do right now based on the last known state. */ |
| WhpxBreakpointState state = breakpoints->data[i].state; |
| switch (state) { |
| case WHPX_BP_CLEARED: |
| if (resuming) { |
| state = WHPX_BP_SET_PENDING; |
| } |
| break; |
| case WHPX_BP_SET_PENDING: |
| if (!resuming) { |
| state = WHPX_BP_CLEARED; |
| } |
| break; |
| case WHPX_BP_SET: |
| if (!resuming) { |
| state = WHPX_BP_CLEAR_PENDING; |
| } |
| break; |
| case WHPX_BP_CLEAR_PENDING: |
| if (resuming) { |
| state = WHPX_BP_SET; |
| } |
| break; |
| } |
| |
| if (state == WHPX_BP_SET_PENDING) { |
| /* Remember the original instruction. */ |
| rc = cpu_memory_rw_debug(cpu, |
| breakpoints->data[i].address, |
| &breakpoints->data[i].original_instruction, |
| 1, |
| false); |
| |
| if (!rc) { |
| /* Write the breakpoint instruction. */ |
| rc = cpu_memory_rw_debug(cpu, |
| breakpoints->data[i].address, |
| (void *)&whpx_breakpoint_instruction, |
| 1, |
| true); |
| } |
| |
| if (!rc) { |
| state = WHPX_BP_SET; |
| } |
| |
| } |
| |
| if (state == WHPX_BP_CLEAR_PENDING) { |
| /* Restore the original instruction. */ |
| rc = cpu_memory_rw_debug(cpu, |
| breakpoints->data[i].address, |
| &breakpoints->data[i].original_instruction, |
| 1, |
| true); |
| |
| if (!rc) { |
| state = WHPX_BP_CLEARED; |
| } |
| } |
| |
| breakpoints->data[i].state = state; |
| } |
| } |
| |
| /* |
| * This function is called when the a VCPU is about to start and no other |
| * VCPUs have been started so far. Since the VCPU start order could be |
| * arbitrary, it doesn't have to be VCPU#0. |
| * |
| * It is used to commit the breakpoints into memory, and configure WHPX |
| * to intercept debug exceptions. |
| * |
| * Note that whpx_set_exception_exit_bitmap() cannot be called if one or |
| * more VCPUs are already running, so this is the best place to do it. |
| */ |
| static int whpx_first_vcpu_starting(CPUState *cpu) |
| { |
| struct whpx_state *whpx = &whpx_global; |
| HRESULT hr; |
| |
| g_assert(qemu_mutex_iothread_locked()); |
| |
| if (!QTAILQ_EMPTY(&cpu->breakpoints) || |
| (whpx->breakpoints.breakpoints && |
| whpx->breakpoints.breakpoints->used)) { |
| CPUBreakpoint *bp; |
| int i = 0; |
| bool update_pending = false; |
| |
| QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) { |
| if (i >= whpx->breakpoints.original_address_count || |
| bp->pc != whpx->breakpoints.original_addresses[i]) { |
| update_pending = true; |
| } |
| |
| i++; |
| } |
| |
| if (i != whpx->breakpoints.original_address_count) { |
| update_pending = true; |
| } |
| |
| if (update_pending) { |
| /* |
| * The CPU breakpoints have changed since the last call to |
| * whpx_translate_cpu_breakpoints(). WHPX breakpoints must |
| * now be recomputed. |
| */ |
| whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i); |
| } |
| |
| /* Actually insert the breakpoints into the memory. */ |
| whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true); |
| } |
| |
| uint64_t exception_mask; |
| if (whpx->step_pending || |
| (whpx->breakpoints.breakpoints && |
| whpx->breakpoints.breakpoints->used)) { |
| /* |
| * We are either attempting to single-step one or more CPUs, or |
| * have one or more breakpoints enabled. Both require intercepting |
| * the WHvX64ExceptionTypeBreakpointTrap exception. |
| */ |
| |
| exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault; |
| } else { |
| /* Let the guest handle all exceptions. */ |
| exception_mask = 0; |
| } |
| |
| hr = whpx_set_exception_exit_bitmap(exception_mask); |
| if (!SUCCEEDED(hr)) { |
| error_report("WHPX: Failed to update exception exit mask," |
| "hr=%08lx.", hr); |
| return 1; |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * This function is called when the last VCPU has finished running. |
| * It is used to remove any previously set breakpoints from memory. |
| */ |
| static int whpx_last_vcpu_stopping(CPUState *cpu) |
| { |
| whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false); |
| return 0; |
| } |
| |
| /* Returns the address of the next instruction that is about to be executed. */ |
| static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid) |
| { |
| if (cpu->vcpu_dirty) { |
| /* The CPU registers have been modified by other parts of QEMU. */ |
| CPUArchState *env = (CPUArchState *)(cpu->env_ptr); |
| return env->eip; |
| } else if (exit_context_valid) { |
| /* |
| * The CPU registers have not been modified by neither other parts |
| * of QEMU, nor this port by calling WHvSetVirtualProcessorRegisters(). |
| * This is the most common case. |
| */ |
| struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); |
| return vcpu->exit_ctx.VpContext.Rip; |
| } else { |
| /* |
| * The CPU registers have been modified by a call to |
| * WHvSetVirtualProcessorRegisters() and must be re-queried from |
| * the target. |
| */ |
| WHV_REGISTER_VALUE reg_value; |
| WHV_REGISTER_NAME reg_name = WHvX64RegisterRip; |
| HRESULT hr; |
| struct whpx_state *whpx = &whpx_global; |
| |
| hr = whp_dispatch.WHvGetVirtualProcessorRegisters( |
| whpx->partition, |
| cpu->cpu_index, |
| ®_name, |
| 1, |
| ®_value); |
| |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to get PC, hr=%08lx", hr); |
| return 0; |
| } |
| |
| return reg_value.Reg64; |
| } |
| } |
| |
| static int whpx_handle_halt(CPUState *cpu) |
| { |
| CPUX86State *env = cpu->env_ptr; |
| int ret = 0; |
| |
| qemu_mutex_lock_iothread(); |
| if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) && |
| (env->eflags & IF_MASK)) && |
| !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) { |
| cpu->exception_index = EXCP_HLT; |
| cpu->halted = true; |
| ret = 1; |
| } |
| qemu_mutex_unlock_iothread(); |
| |
| return ret; |
| } |
| |
| static void whpx_vcpu_pre_run(CPUState *cpu) |
| { |
| HRESULT hr; |
| struct whpx_state *whpx = &whpx_global; |
| struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); |
| CPUX86State *env = cpu->env_ptr; |
| X86CPU *x86_cpu = X86_CPU(cpu); |
| int irq; |
| uint8_t tpr; |
| WHV_X64_PENDING_INTERRUPTION_REGISTER new_int; |
| UINT32 reg_count = 0; |
| WHV_REGISTER_VALUE reg_values[3]; |
| WHV_REGISTER_NAME reg_names[3]; |
| |
| memset(&new_int, 0, sizeof(new_int)); |
| memset(reg_values, 0, sizeof(reg_values)); |
| |
| qemu_mutex_lock_iothread(); |
| |
| /* Inject NMI */ |
| if (!vcpu->interruption_pending && |
| cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) { |
| if (cpu->interrupt_request & CPU_INTERRUPT_NMI) { |
| cpu->interrupt_request &= ~CPU_INTERRUPT_NMI; |
| vcpu->interruptable = false; |
| new_int.InterruptionType = WHvX64PendingNmi; |
| new_int.InterruptionPending = 1; |
| new_int.InterruptionVector = 2; |
| } |
| if (cpu->interrupt_request & CPU_INTERRUPT_SMI) { |
| cpu->interrupt_request &= ~CPU_INTERRUPT_SMI; |
| } |
| } |
| |
| /* |
| * Force the VCPU out of its inner loop to process any INIT requests or |
| * commit pending TPR access. |
| */ |
| if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) { |
| if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) && |
| !(env->hflags & HF_SMM_MASK)) { |
| cpu->exit_request = 1; |
| } |
| if (cpu->interrupt_request & CPU_INTERRUPT_TPR) { |
| cpu->exit_request = 1; |
| } |
| } |
| |
| /* Get pending hard interruption or replay one that was overwritten */ |
| if (!whpx_apic_in_platform()) { |
| if (!vcpu->interruption_pending && |
| vcpu->interruptable && (env->eflags & IF_MASK)) { |
| assert(!new_int.InterruptionPending); |
| if (cpu->interrupt_request & CPU_INTERRUPT_HARD) { |
| cpu->interrupt_request &= ~CPU_INTERRUPT_HARD; |
| irq = cpu_get_pic_interrupt(env); |
| if (irq >= 0) { |
| new_int.InterruptionType = WHvX64PendingInterrupt; |
| new_int.InterruptionPending = 1; |
| new_int.InterruptionVector = irq; |
| } |
| } |
| } |
| |
| /* Setup interrupt state if new one was prepared */ |
| if (new_int.InterruptionPending) { |
| reg_values[reg_count].PendingInterruption = new_int; |
| reg_names[reg_count] = WHvRegisterPendingInterruption; |
| reg_count += 1; |
| } |
| } else if (vcpu->ready_for_pic_interrupt && |
| (cpu->interrupt_request & CPU_INTERRUPT_HARD)) { |
| cpu->interrupt_request &= ~CPU_INTERRUPT_HARD; |
| irq = cpu_get_pic_interrupt(env); |
| if (irq >= 0) { |
| reg_names[reg_count] = WHvRegisterPendingEvent; |
| reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT) |
| { |
| .EventPending = 1, |
| .EventType = WHvX64PendingEventExtInt, |
| .Vector = irq, |
| }; |
| reg_count += 1; |
| } |
| } |
| |
| /* Sync the TPR to the CR8 if was modified during the intercept */ |
| tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state)); |
| if (tpr != vcpu->tpr) { |
| vcpu->tpr = tpr; |
| reg_values[reg_count].Reg64 = tpr; |
| cpu->exit_request = 1; |
| reg_names[reg_count] = WHvX64RegisterCr8; |
| reg_count += 1; |
| } |
| |
| /* Update the state of the interrupt delivery notification */ |
| if (!vcpu->window_registered && |
| cpu->interrupt_request & CPU_INTERRUPT_HARD) { |
| reg_values[reg_count].DeliverabilityNotifications = |
| (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) { |
| .InterruptNotification = 1 |
| }; |
| vcpu->window_registered = 1; |
| reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications; |
| reg_count += 1; |
| } |
| |
| qemu_mutex_unlock_iothread(); |
| vcpu->ready_for_pic_interrupt = false; |
| |
| if (reg_count) { |
| hr = whp_dispatch.WHvSetVirtualProcessorRegisters( |
| whpx->partition, cpu->cpu_index, |
| reg_names, reg_count, reg_values); |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to set interrupt state registers," |
| " hr=%08lx", hr); |
| } |
| } |
| |
| return; |
| } |
| |
| static void whpx_vcpu_post_run(CPUState *cpu) |
| { |
| struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); |
| CPUX86State *env = cpu->env_ptr; |
| X86CPU *x86_cpu = X86_CPU(cpu); |
| |
| env->eflags = vcpu->exit_ctx.VpContext.Rflags; |
| |
| uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8; |
| if (vcpu->tpr != tpr) { |
| vcpu->tpr = tpr; |
| qemu_mutex_lock_iothread(); |
| cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr)); |
| qemu_mutex_unlock_iothread(); |
| } |
| |
| vcpu->interruption_pending = |
| vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending; |
| |
| vcpu->interruptable = |
| !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow; |
| |
| return; |
| } |
| |
| static void whpx_vcpu_process_async_events(CPUState *cpu) |
| { |
| CPUX86State *env = cpu->env_ptr; |
| X86CPU *x86_cpu = X86_CPU(cpu); |
| struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); |
| |
| if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) && |
| !(env->hflags & HF_SMM_MASK)) { |
| whpx_cpu_synchronize_state(cpu); |
| do_cpu_init(x86_cpu); |
| vcpu->interruptable = true; |
| } |
| |
| if (cpu->interrupt_request & CPU_INTERRUPT_POLL) { |
| cpu->interrupt_request &= ~CPU_INTERRUPT_POLL; |
| apic_poll_irq(x86_cpu->apic_state); |
| } |
| |
| if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) && |
| (env->eflags & IF_MASK)) || |
| (cpu->interrupt_request & CPU_INTERRUPT_NMI)) { |
| cpu->halted = false; |
| } |
| |
| if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) { |
| whpx_cpu_synchronize_state(cpu); |
| do_cpu_sipi(x86_cpu); |
| } |
| |
| if (cpu->interrupt_request & CPU_INTERRUPT_TPR) { |
| cpu->interrupt_request &= ~CPU_INTERRUPT_TPR; |
| whpx_cpu_synchronize_state(cpu); |
| apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip, |
| env->tpr_access_type); |
| } |
| |
| return; |
| } |
| |
| static int whpx_vcpu_run(CPUState *cpu) |
| { |
| HRESULT hr; |
| struct whpx_state *whpx = &whpx_global; |
| struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); |
| struct whpx_breakpoint *stepped_over_bp = NULL; |
| WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE; |
| int ret; |
| |
| g_assert(qemu_mutex_iothread_locked()); |
| |
| if (whpx->running_cpus++ == 0) { |
| /* Insert breakpoints into memory, update exception exit bitmap. */ |
| ret = whpx_first_vcpu_starting(cpu); |
| if (ret != 0) { |
| return ret; |
| } |
| } |
| |
| if (whpx->breakpoints.breakpoints && |
| whpx->breakpoints.breakpoints->used > 0) |
| { |
| uint64_t pc = whpx_vcpu_get_pc(cpu, true); |
| stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc); |
| if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) { |
| stepped_over_bp = NULL; |
| } |
| |
| if (stepped_over_bp) { |
| /* |
| * We are trying to run the instruction overwritten by an active |
| * breakpoint. We will temporarily disable the breakpoint, suspend |
| * other CPUs, and step over the instruction. |
| */ |
| exclusive_step_mode = WHPX_STEP_EXCLUSIVE; |
| } |
| } |
| |
| if (exclusive_step_mode == WHPX_STEP_NONE) { |
| whpx_vcpu_process_async_events(cpu); |
| if (cpu->halted && !whpx_apic_in_platform()) { |
| cpu->exception_index = EXCP_HLT; |
| qatomic_set(&cpu->exit_request, false); |
| return 0; |
| } |
| } |
| |
| qemu_mutex_unlock_iothread(); |
| |
| if (exclusive_step_mode != WHPX_STEP_NONE) { |
| start_exclusive(); |
| g_assert(cpu == current_cpu); |
| g_assert(!cpu->running); |
| cpu->running = true; |
| |
| hr = whpx_set_exception_exit_bitmap( |
| 1UL << WHvX64ExceptionTypeDebugTrapOrFault); |
| if (!SUCCEEDED(hr)) { |
| error_report("WHPX: Failed to update exception exit mask, " |
| "hr=%08lx.", hr); |
| return 1; |
| } |
| |
| if (stepped_over_bp) { |
| /* Temporarily disable the triggered breakpoint. */ |
| cpu_memory_rw_debug(cpu, |
| stepped_over_bp->address, |
| &stepped_over_bp->original_instruction, |
| 1, |
| true); |
| } |
| } else { |
| cpu_exec_start(cpu); |
| } |
| |
| do { |
| if (cpu->vcpu_dirty) { |
| whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE); |
| cpu->vcpu_dirty = false; |
| } |
| |
| if (exclusive_step_mode == WHPX_STEP_NONE) { |
| whpx_vcpu_pre_run(cpu); |
| |
| if (qatomic_read(&cpu->exit_request)) { |
| whpx_vcpu_kick(cpu); |
| } |
| } |
| |
| if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) { |
| whpx_vcpu_configure_single_stepping(cpu, true, NULL); |
| } |
| |
| hr = whp_dispatch.WHvRunVirtualProcessor( |
| whpx->partition, cpu->cpu_index, |
| &vcpu->exit_ctx, sizeof(vcpu->exit_ctx)); |
| |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to exec a virtual processor," |
| " hr=%08lx", hr); |
| ret = -1; |
| break; |
| } |
| |
| if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) { |
| whpx_vcpu_configure_single_stepping(cpu, |
| false, |
| &vcpu->exit_ctx.VpContext.Rflags); |
| } |
| |
| whpx_vcpu_post_run(cpu); |
| |
| switch (vcpu->exit_ctx.ExitReason) { |
| case WHvRunVpExitReasonMemoryAccess: |
| ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess); |
| break; |
| |
| case WHvRunVpExitReasonX64IoPortAccess: |
| ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess); |
| break; |
| |
| case WHvRunVpExitReasonX64InterruptWindow: |
| vcpu->ready_for_pic_interrupt = 1; |
| vcpu->window_registered = 0; |
| ret = 0; |
| break; |
| |
| case WHvRunVpExitReasonX64ApicEoi: |
| assert(whpx_apic_in_platform()); |
| ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector); |
| break; |
| |
| case WHvRunVpExitReasonX64Halt: |
| /* |
| * WARNING: as of build 19043.1526 (21H1), this exit reason is no |
| * longer used. |
| */ |
| ret = whpx_handle_halt(cpu); |
| break; |
| |
| case WHvRunVpExitReasonX64ApicInitSipiTrap: { |
| WHV_INTERRUPT_CONTROL ipi = {0}; |
| uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr; |
| uint32_t delivery_mode = |
| (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT; |
| int dest_shorthand = |
| (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT; |
| bool broadcast = false; |
| bool include_self = false; |
| uint32_t i; |
| |
| /* We only registered for INIT and SIPI exits. */ |
| if ((delivery_mode != APIC_DM_INIT) && |
| (delivery_mode != APIC_DM_SIPI)) { |
| error_report( |
| "WHPX: Unexpected APIC exit that is not a INIT or SIPI"); |
| break; |
| } |
| |
| if (delivery_mode == APIC_DM_INIT) { |
| ipi.Type = WHvX64InterruptTypeInit; |
| } else { |
| ipi.Type = WHvX64InterruptTypeSipi; |
| } |
| |
| ipi.DestinationMode = |
| ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ? |
| WHvX64InterruptDestinationModeLogical : |
| WHvX64InterruptDestinationModePhysical; |
| |
| ipi.TriggerMode = |
| ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ? |
| WHvX64InterruptTriggerModeLevel : |
| WHvX64InterruptTriggerModeEdge; |
| |
| ipi.Vector = icr & APIC_VECTOR_MASK; |
| switch (dest_shorthand) { |
| /* no shorthand. Bits 56-63 contain the destination. */ |
| case 0: |
| ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK; |
| hr = whp_dispatch.WHvRequestInterrupt(whpx->partition, |
| &ipi, sizeof(ipi)); |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to request interrupt hr=%08lx", |
| hr); |
| } |
| |
| break; |
| |
| /* self */ |
| case 1: |
| include_self = true; |
| break; |
| |
| /* broadcast, including self */ |
| case 2: |
| broadcast = true; |
| include_self = true; |
| break; |
| |
| /* broadcast, excluding self */ |
| case 3: |
| broadcast = true; |
| break; |
| } |
| |
| if (!broadcast && !include_self) { |
| break; |
| } |
| |
| for (i = 0; i <= max_vcpu_index; i++) { |
| if (i == cpu->cpu_index && !include_self) { |
| continue; |
| } |
| |
| /* |
| * Assuming that APIC Ids are identity mapped since |
| * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers |
| * are not handled yet and the hypervisor doesn't allow the |
| * guest to modify the APIC ID. |
| */ |
| ipi.Destination = i; |
| hr = whp_dispatch.WHvRequestInterrupt(whpx->partition, |
| &ipi, sizeof(ipi)); |
| if (FAILED(hr)) { |
| error_report( |
| "WHPX: Failed to request SIPI for %d, hr=%08lx", |
| i, hr); |
| } |
| } |
| |
| break; |
| } |
| |
| case WHvRunVpExitReasonCanceled: |
| if (exclusive_step_mode != WHPX_STEP_NONE) { |
| /* |
| * We are trying to step over a single instruction, and |
| * likely got a request to stop from another thread. |
| * Delay it until we are done stepping |
| * over. |
| */ |
| ret = 0; |
| } else { |
| cpu->exception_index = EXCP_INTERRUPT; |
| ret = 1; |
| } |
| break; |
| case WHvRunVpExitReasonX64MsrAccess: { |
| WHV_REGISTER_VALUE reg_values[3] = {0}; |
| WHV_REGISTER_NAME reg_names[3]; |
| UINT32 reg_count; |
| |
| reg_names[0] = WHvX64RegisterRip; |
| reg_names[1] = WHvX64RegisterRax; |
| reg_names[2] = WHvX64RegisterRdx; |
| |
| reg_values[0].Reg64 = |
| vcpu->exit_ctx.VpContext.Rip + |
| vcpu->exit_ctx.VpContext.InstructionLength; |
| |
| /* |
| * For all unsupported MSR access we: |
| * ignore writes |
| * return 0 on read. |
| */ |
| reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ? |
| 1 : 3; |
| |
| hr = whp_dispatch.WHvSetVirtualProcessorRegisters( |
| whpx->partition, |
| cpu->cpu_index, |
| reg_names, reg_count, |
| reg_values); |
| |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to set MsrAccess state " |
| " registers, hr=%08lx", hr); |
| } |
| ret = 0; |
| break; |
| } |
| case WHvRunVpExitReasonX64Cpuid: { |
| WHV_REGISTER_VALUE reg_values[5]; |
| WHV_REGISTER_NAME reg_names[5]; |
| UINT32 reg_count = 5; |
| UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0; |
| X86CPU *x86_cpu = X86_CPU(cpu); |
| CPUX86State *env = &x86_cpu->env; |
| |
| memset(reg_values, 0, sizeof(reg_values)); |
| |
| rip = vcpu->exit_ctx.VpContext.Rip + |
| vcpu->exit_ctx.VpContext.InstructionLength; |
| cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax; |
| |
| /* |
| * Ideally, these should be supplied to the hypervisor during VCPU |
| * initialization and it should be able to satisfy this request. |
| * But, currently, WHPX doesn't support setting CPUID values in the |
| * hypervisor once the partition has been setup, which is too late |
| * since VCPUs are realized later. For now, use the values from |
| * QEMU to satisfy these requests, until WHPX adds support for |
| * being able to set these values in the hypervisor at runtime. |
| */ |
| cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx, |
| (UINT32 *)&rcx, (UINT32 *)&rdx); |
| switch (cpuid_fn) { |
| case 0x40000000: |
| /* Expose the vmware cpu frequency cpuid leaf */ |
| rax = 0x40000010; |
| rbx = rcx = rdx = 0; |
| break; |
| |
| case 0x40000010: |
| rax = env->tsc_khz; |
| rbx = env->apic_bus_freq / 1000; /* Hz to KHz */ |
| rcx = rdx = 0; |
| break; |
| |
| case 0x80000001: |
| /* Remove any support of OSVW */ |
| rcx &= ~CPUID_EXT3_OSVW; |
| break; |
| } |
| |
| reg_names[0] = WHvX64RegisterRip; |
| reg_names[1] = WHvX64RegisterRax; |
| reg_names[2] = WHvX64RegisterRcx; |
| reg_names[3] = WHvX64RegisterRdx; |
| reg_names[4] = WHvX64RegisterRbx; |
| |
| reg_values[0].Reg64 = rip; |
| reg_values[1].Reg64 = rax; |
| reg_values[2].Reg64 = rcx; |
| reg_values[3].Reg64 = rdx; |
| reg_values[4].Reg64 = rbx; |
| |
| hr = whp_dispatch.WHvSetVirtualProcessorRegisters( |
| whpx->partition, cpu->cpu_index, |
| reg_names, |
| reg_count, |
| reg_values); |
| |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to set CpuidAccess state registers," |
| " hr=%08lx", hr); |
| } |
| ret = 0; |
| break; |
| } |
| case WHvRunVpExitReasonException: |
| whpx_get_registers(cpu); |
| |
| if ((vcpu->exit_ctx.VpException.ExceptionType == |
| WHvX64ExceptionTypeDebugTrapOrFault) && |
| (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) && |
| (vcpu->exit_ctx.VpException.InstructionBytes[0] == |
| whpx_breakpoint_instruction)) { |
| /* Stopped at a software breakpoint. */ |
| cpu->exception_index = EXCP_DEBUG; |
| } else if ((vcpu->exit_ctx.VpException.ExceptionType == |
| WHvX64ExceptionTypeDebugTrapOrFault) && |
| !cpu->singlestep_enabled) { |
| /* |
| * Just finished stepping over a breakpoint, but the |
| * gdb does not expect us to do single-stepping. |
| * Don't do anything special. |
| */ |
| cpu->exception_index = EXCP_INTERRUPT; |
| } else { |
| /* Another exception or debug event. Report it to GDB. */ |
| cpu->exception_index = EXCP_DEBUG; |
| } |
| |
| ret = 1; |
| break; |
| case WHvRunVpExitReasonNone: |
| case WHvRunVpExitReasonUnrecoverableException: |
| case WHvRunVpExitReasonInvalidVpRegisterValue: |
| case WHvRunVpExitReasonUnsupportedFeature: |
| default: |
| error_report("WHPX: Unexpected VP exit code %d", |
| vcpu->exit_ctx.ExitReason); |
| whpx_get_registers(cpu); |
| qemu_mutex_lock_iothread(); |
| qemu_system_guest_panicked(cpu_get_crash_info(cpu)); |
| qemu_mutex_unlock_iothread(); |
| break; |
| } |
| |
| } while (!ret); |
| |
| if (stepped_over_bp) { |
| /* Restore the breakpoint we stepped over */ |
| cpu_memory_rw_debug(cpu, |
| stepped_over_bp->address, |
| (void *)&whpx_breakpoint_instruction, |
| 1, |
| true); |
| } |
| |
| if (exclusive_step_mode != WHPX_STEP_NONE) { |
| g_assert(cpu_in_exclusive_context(cpu)); |
| cpu->running = false; |
| end_exclusive(); |
| |
| exclusive_step_mode = WHPX_STEP_NONE; |
| } else { |
| cpu_exec_end(cpu); |
| } |
| |
| qemu_mutex_lock_iothread(); |
| current_cpu = cpu; |
| |
| if (--whpx->running_cpus == 0) { |
| whpx_last_vcpu_stopping(cpu); |
| } |
| |
| qatomic_set(&cpu->exit_request, false); |
| |
| return ret < 0; |
| } |
| |
| static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg) |
| { |
| if (!cpu->vcpu_dirty) { |
| whpx_get_registers(cpu); |
| cpu->vcpu_dirty = true; |
| } |
| } |
| |
| static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu, |
| run_on_cpu_data arg) |
| { |
| whpx_set_registers(cpu, WHPX_SET_RESET_STATE); |
| cpu->vcpu_dirty = false; |
| } |
| |
| static void do_whpx_cpu_synchronize_post_init(CPUState *cpu, |
| run_on_cpu_data arg) |
| { |
| whpx_set_registers(cpu, WHPX_SET_FULL_STATE); |
| cpu->vcpu_dirty = false; |
| } |
| |
| static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu, |
| run_on_cpu_data arg) |
| { |
| cpu->vcpu_dirty = true; |
| } |
| |
| /* |
| * CPU support. |
| */ |
| |
| void whpx_cpu_synchronize_state(CPUState *cpu) |
| { |
| if (!cpu->vcpu_dirty) { |
| run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL); |
| } |
| } |
| |
| void whpx_cpu_synchronize_post_reset(CPUState *cpu) |
| { |
| run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL); |
| } |
| |
| void whpx_cpu_synchronize_post_init(CPUState *cpu) |
| { |
| run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL); |
| } |
| |
| void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu) |
| { |
| run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); |
| } |
| |
| void whpx_cpu_synchronize_pre_resume(bool step_pending) |
| { |
| whpx_global.step_pending = step_pending; |
| } |
| |
| /* |
| * Vcpu support. |
| */ |
| |
| static Error *whpx_migration_blocker; |
| |
| static void whpx_cpu_update_state(void *opaque, bool running, RunState state) |
| { |
| CPUX86State *env = opaque; |
| |
| if (running) { |
| env->tsc_valid = false; |
| } |
| } |
| |
| int whpx_init_vcpu(CPUState *cpu) |
| { |
| HRESULT hr; |
| struct whpx_state *whpx = &whpx_global; |
| struct whpx_vcpu *vcpu = NULL; |
| Error *local_error = NULL; |
| CPUX86State *env = cpu->env_ptr; |
| X86CPU *x86_cpu = X86_CPU(cpu); |
| UINT64 freq = 0; |
| int ret; |
| |
| /* Add migration blockers for all unsupported features of the |
| * Windows Hypervisor Platform |
| */ |
| if (whpx_migration_blocker == NULL) { |
| error_setg(&whpx_migration_blocker, |
| "State blocked due to non-migratable CPUID feature support," |
| "dirty memory tracking support, and XSAVE/XRSTOR support"); |
| |
| if (migrate_add_blocker(whpx_migration_blocker, &local_error) < 0) { |
| error_report_err(local_error); |
| error_free(whpx_migration_blocker); |
| ret = -EINVAL; |
| goto error; |
| } |
| } |
| |
| vcpu = g_new0(struct whpx_vcpu, 1); |
| |
| if (!vcpu) { |
| error_report("WHPX: Failed to allocte VCPU context."); |
| ret = -ENOMEM; |
| goto error; |
| } |
| |
| hr = whp_dispatch.WHvEmulatorCreateEmulator( |
| &whpx_emu_callbacks, |
| &vcpu->emulator); |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to setup instruction completion support," |
| " hr=%08lx", hr); |
| ret = -EINVAL; |
| goto error; |
| } |
| |
| hr = whp_dispatch.WHvCreateVirtualProcessor( |
| whpx->partition, cpu->cpu_index, 0); |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to create a virtual processor," |
| " hr=%08lx", hr); |
| whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator); |
| ret = -EINVAL; |
| goto error; |
| } |
| |
| /* |
| * vcpu's TSC frequency is either specified by user, or use the value |
| * provided by Hyper-V if the former is not present. In the latter case, we |
| * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC |
| * frequency can be migrated later via this field. |
| */ |
| if (!env->tsc_khz) { |
| hr = whp_dispatch.WHvGetCapability( |
| WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq), |
| NULL); |
| if (hr != WHV_E_UNKNOWN_CAPABILITY) { |
| if (FAILED(hr)) { |
| printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr); |
| } else { |
| env->tsc_khz = freq / 1000; /* Hz to KHz */ |
| } |
| } |
| } |
| |
| env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY; |
| hr = whp_dispatch.WHvGetCapability( |
| WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL); |
| if (hr != WHV_E_UNKNOWN_CAPABILITY) { |
| if (FAILED(hr)) { |
| printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr); |
| } else { |
| env->apic_bus_freq = freq; |
| } |
| } |
| |
| /* |
| * If the vmware cpuid frequency leaf option is set, and we have a valid |
| * tsc value, trap the corresponding cpuid's. |
| */ |
| if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) { |
| UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010}; |
| |
| hr = whp_dispatch.WHvSetPartitionProperty( |
| whpx->partition, |
| WHvPartitionPropertyCodeCpuidExitList, |
| cpuidExitList, |
| RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32)); |
| |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx", |
| hr); |
| ret = -EINVAL; |
| goto error; |
| } |
| } |
| |
| vcpu->interruptable = true; |
| cpu->vcpu_dirty = true; |
| cpu->hax_vcpu = (struct hax_vcpu_state *)vcpu; |
| max_vcpu_index = max(max_vcpu_index, cpu->cpu_index); |
| qemu_add_vm_change_state_handler(whpx_cpu_update_state, cpu->env_ptr); |
| |
| return 0; |
| |
| error: |
| g_free(vcpu); |
| |
| return ret; |
| } |
| |
| int whpx_vcpu_exec(CPUState *cpu) |
| { |
| int ret; |
| int fatal; |
| |
| for (;;) { |
| if (cpu->exception_index >= EXCP_INTERRUPT) { |
| ret = cpu->exception_index; |
| cpu->exception_index = -1; |
| break; |
| } |
| |
| fatal = whpx_vcpu_run(cpu); |
| |
| if (fatal) { |
| error_report("WHPX: Failed to exec a virtual processor"); |
| abort(); |
| } |
| } |
| |
| return ret; |
| } |
| |
| void whpx_destroy_vcpu(CPUState *cpu) |
| { |
| struct whpx_state *whpx = &whpx_global; |
| struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu); |
| |
| whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index); |
| whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator); |
| g_free(cpu->hax_vcpu); |
| return; |
| } |
| |
| void whpx_vcpu_kick(CPUState *cpu) |
| { |
| struct whpx_state *whpx = &whpx_global; |
| whp_dispatch.WHvCancelRunVirtualProcessor( |
| whpx->partition, cpu->cpu_index, 0); |
| } |
| |
| /* |
| * Memory support. |
| */ |
| |
| static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size, |
| void *host_va, int add, int rom, |
| const char *name) |
| { |
| struct whpx_state *whpx = &whpx_global; |
| HRESULT hr; |
| |
| /* |
| if (add) { |
| printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n", |
| (void*)start_pa, (void*)size, host_va, |
| (rom ? "ROM" : "RAM"), name); |
| } else { |
| printf("WHPX: DEL PA:%p Size:%p, Host:%p, '%s'\n", |
| (void*)start_pa, (void*)size, host_va, name); |
| } |
| */ |
| |
| if (add) { |
| hr = whp_dispatch.WHvMapGpaRange(whpx->partition, |
| host_va, |
| start_pa, |
| size, |
| (WHvMapGpaRangeFlagRead | |
| WHvMapGpaRangeFlagExecute | |
| (rom ? 0 : WHvMapGpaRangeFlagWrite))); |
| } else { |
| hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition, |
| start_pa, |
| size); |
| } |
| |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes," |
| " Host:%p, hr=%08lx", |
| (add ? "MAP" : "UNMAP"), name, |
| (void *)(uintptr_t)start_pa, (void *)size, host_va, hr); |
| } |
| } |
| |
| static void whpx_process_section(MemoryRegionSection *section, int add) |
| { |
| MemoryRegion *mr = section->mr; |
| hwaddr start_pa = section->offset_within_address_space; |
| ram_addr_t size = int128_get64(section->size); |
| unsigned int delta; |
| uint64_t host_va; |
| |
| if (!memory_region_is_ram(mr)) { |
| return; |
| } |
| |
| delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask()); |
| delta &= ~qemu_real_host_page_mask(); |
| if (delta > size) { |
| return; |
| } |
| start_pa += delta; |
| size -= delta; |
| size &= qemu_real_host_page_mask(); |
| if (!size || (start_pa & ~qemu_real_host_page_mask())) { |
| return; |
| } |
| |
| host_va = (uintptr_t)memory_region_get_ram_ptr(mr) |
| + section->offset_within_region + delta; |
| |
| whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add, |
| memory_region_is_rom(mr), mr->name); |
| } |
| |
| static void whpx_region_add(MemoryListener *listener, |
| MemoryRegionSection *section) |
| { |
| memory_region_ref(section->mr); |
| whpx_process_section(section, 1); |
| } |
| |
| static void whpx_region_del(MemoryListener *listener, |
| MemoryRegionSection *section) |
| { |
| whpx_process_section(section, 0); |
| memory_region_unref(section->mr); |
| } |
| |
| static void whpx_transaction_begin(MemoryListener *listener) |
| { |
| } |
| |
| static void whpx_transaction_commit(MemoryListener *listener) |
| { |
| } |
| |
| static void whpx_log_sync(MemoryListener *listener, |
| MemoryRegionSection *section) |
| { |
| MemoryRegion *mr = section->mr; |
| |
| if (!memory_region_is_ram(mr)) { |
| return; |
| } |
| |
| memory_region_set_dirty(mr, 0, int128_get64(section->size)); |
| } |
| |
| static MemoryListener whpx_memory_listener = { |
| .name = "whpx", |
| .begin = whpx_transaction_begin, |
| .commit = whpx_transaction_commit, |
| .region_add = whpx_region_add, |
| .region_del = whpx_region_del, |
| .log_sync = whpx_log_sync, |
| .priority = 10, |
| }; |
| |
| static void whpx_memory_init(void) |
| { |
| memory_listener_register(&whpx_memory_listener, &address_space_memory); |
| } |
| |
| /* |
| * Load the functions from the given library, using the given handle. If a |
| * handle is provided, it is used, otherwise the library is opened. The |
| * handle will be updated on return with the opened one. |
| */ |
| static bool load_whp_dispatch_fns(HMODULE *handle, |
| WHPFunctionList function_list) |
| { |
| HMODULE hLib = *handle; |
| |
| #define WINHV_PLATFORM_DLL "WinHvPlatform.dll" |
| #define WINHV_EMULATION_DLL "WinHvEmulation.dll" |
| #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \ |
| whp_dispatch.function_name = \ |
| (function_name ## _t)GetProcAddress(hLib, #function_name); \ |
| |
| #define WHP_LOAD_FIELD(return_type, function_name, signature) \ |
| whp_dispatch.function_name = \ |
| (function_name ## _t)GetProcAddress(hLib, #function_name); \ |
| if (!whp_dispatch.function_name) { \ |
| error_report("Could not load function %s", #function_name); \ |
| goto error; \ |
| } \ |
| |
| #define WHP_LOAD_LIB(lib_name, handle_lib) \ |
| if (!handle_lib) { \ |
| handle_lib = LoadLibrary(lib_name); \ |
| if (!handle_lib) { \ |
| error_report("Could not load library %s.", lib_name); \ |
| goto error; \ |
| } \ |
| } \ |
| |
| switch (function_list) { |
| case WINHV_PLATFORM_FNS_DEFAULT: |
| WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib) |
| LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD) |
| break; |
| |
| case WINHV_EMULATION_FNS_DEFAULT: |
| WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib) |
| LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD) |
| break; |
| |
| case WINHV_PLATFORM_FNS_SUPPLEMENTAL: |
| WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib) |
| LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL) |
| break; |
| } |
| |
| *handle = hLib; |
| return true; |
| |
| error: |
| if (hLib) { |
| FreeLibrary(hLib); |
| } |
| |
| return false; |
| } |
| |
| static void whpx_set_kernel_irqchip(Object *obj, Visitor *v, |
| const char *name, void *opaque, |
| Error **errp) |
| { |
| struct whpx_state *whpx = &whpx_global; |
| OnOffSplit mode; |
| |
| if (!visit_type_OnOffSplit(v, name, &mode, errp)) { |
| return; |
| } |
| |
| switch (mode) { |
| case ON_OFF_SPLIT_ON: |
| whpx->kernel_irqchip_allowed = true; |
| whpx->kernel_irqchip_required = true; |
| break; |
| |
| case ON_OFF_SPLIT_OFF: |
| whpx->kernel_irqchip_allowed = false; |
| whpx->kernel_irqchip_required = false; |
| break; |
| |
| case ON_OFF_SPLIT_SPLIT: |
| error_setg(errp, "WHPX: split irqchip currently not supported"); |
| error_append_hint(errp, |
| "Try without kernel-irqchip or with kernel-irqchip=on|off"); |
| break; |
| |
| default: |
| /* |
| * The value was checked in visit_type_OnOffSplit() above. If |
| * we get here, then something is wrong in QEMU. |
| */ |
| abort(); |
| } |
| } |
| |
| /* |
| * Partition support |
| */ |
| |
| static int whpx_accel_init(MachineState *ms) |
| { |
| struct whpx_state *whpx; |
| int ret; |
| HRESULT hr; |
| WHV_CAPABILITY whpx_cap; |
| UINT32 whpx_cap_size; |
| WHV_PARTITION_PROPERTY prop; |
| UINT32 cpuidExitList[] = {1, 0x80000001}; |
| WHV_CAPABILITY_FEATURES features = {0}; |
| |
| whpx = &whpx_global; |
| |
| if (!init_whp_dispatch()) { |
| ret = -ENOSYS; |
| goto error; |
| } |
| |
| whpx->mem_quota = ms->ram_size; |
| |
| hr = whp_dispatch.WHvGetCapability( |
| WHvCapabilityCodeHypervisorPresent, &whpx_cap, |
| sizeof(whpx_cap), &whpx_cap_size); |
| if (FAILED(hr) || !whpx_cap.HypervisorPresent) { |
| error_report("WHPX: No accelerator found, hr=%08lx", hr); |
| ret = -ENOSPC; |
| goto error; |
| } |
| |
| hr = whp_dispatch.WHvGetCapability( |
| WHvCapabilityCodeFeatures, &features, sizeof(features), NULL); |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to query capabilities, hr=%08lx", hr); |
| ret = -EINVAL; |
| goto error; |
| } |
| |
| hr = whp_dispatch.WHvCreatePartition(&whpx->partition); |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to create partition, hr=%08lx", hr); |
| ret = -EINVAL; |
| goto error; |
| } |
| |
| /* |
| * Query the XSAVE capability of the partition. Any error here is not |
| * considered fatal. |
| */ |
| hr = whp_dispatch.WHvGetPartitionProperty( |
| whpx->partition, |
| WHvPartitionPropertyCodeProcessorXsaveFeatures, |
| &whpx_xsave_cap, |
| sizeof(whpx_xsave_cap), |
| &whpx_cap_size); |
| |
| /* |
| * Windows version which don't support this property will return with the |
| * specific error code. |
| */ |
| if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) { |
| error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr); |
| } |
| |
| if (!whpx_has_xsave()) { |
| printf("WHPX: Partition is not XSAVE capable\n"); |
| } |
| |
| memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY)); |
| prop.ProcessorCount = ms->smp.cpus; |
| hr = whp_dispatch.WHvSetPartitionProperty( |
| whpx->partition, |
| WHvPartitionPropertyCodeProcessorCount, |
| &prop, |
| sizeof(WHV_PARTITION_PROPERTY)); |
| |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to set partition core count to %d," |
| " hr=%08lx", ms->smp.cores, hr); |
| ret = -EINVAL; |
| goto error; |
| } |
| |
| /* |
| * Error out if WHP doesn't support apic emulation and user is requiring |
| * it. |
| */ |
| if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation || |
| !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) { |
| error_report("WHPX: kernel irqchip requested, but unavailable. " |
| "Try without kernel-irqchip or with kernel-irqchip=off"); |
| ret = -EINVAL; |
| goto error; |
| } |
| |
| if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation && |
| whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) { |
| WHV_X64_LOCAL_APIC_EMULATION_MODE mode = |
| WHvX64LocalApicEmulationModeXApic; |
| printf("WHPX: setting APIC emulation mode in the hypervisor\n"); |
| hr = whp_dispatch.WHvSetPartitionProperty( |
| whpx->partition, |
| WHvPartitionPropertyCodeLocalApicEmulationMode, |
| &mode, |
| sizeof(mode)); |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr); |
| if (whpx->kernel_irqchip_required) { |
| error_report("WHPX: kernel irqchip requested, but unavailable"); |
| ret = -EINVAL; |
| goto error; |
| } |
| } else { |
| whpx->apic_in_platform = true; |
| } |
| } |
| |
| /* Register for MSR and CPUID exits */ |
| memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY)); |
| prop.ExtendedVmExits.X64MsrExit = 1; |
| prop.ExtendedVmExits.X64CpuidExit = 1; |
| prop.ExtendedVmExits.ExceptionExit = 1; |
| if (whpx_apic_in_platform()) { |
| prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1; |
| } |
| |
| hr = whp_dispatch.WHvSetPartitionProperty( |
| whpx->partition, |
| WHvPartitionPropertyCodeExtendedVmExits, |
| &prop, |
| sizeof(WHV_PARTITION_PROPERTY)); |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr); |
| ret = -EINVAL; |
| goto error; |
| } |
| |
| hr = whp_dispatch.WHvSetPartitionProperty( |
| whpx->partition, |
| WHvPartitionPropertyCodeCpuidExitList, |
| cpuidExitList, |
| RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32)); |
| |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx", |
| hr); |
| ret = -EINVAL; |
| goto error; |
| } |
| |
| /* |
| * We do not want to intercept any exceptions from the guest, |
| * until we actually start debugging with gdb. |
| */ |
| whpx->exception_exit_bitmap = -1; |
| hr = whpx_set_exception_exit_bitmap(0); |
| |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr); |
| ret = -EINVAL; |
| goto error; |
| } |
| |
| hr = whp_dispatch.WHvSetupPartition(whpx->partition); |
| if (FAILED(hr)) { |
| error_report("WHPX: Failed to setup partition, hr=%08lx", hr); |
| ret = -EINVAL; |
| goto error; |
| } |
| |
| whpx_memory_init(); |
| |
| printf("Windows Hypervisor Platform accelerator is operational\n"); |
| return 0; |
| |
| error: |
| |
| if (NULL != whpx->partition) { |
| whp_dispatch.WHvDeletePartition(whpx->partition); |
| whpx->partition = NULL; |
| } |
| |
| return ret; |
| } |
| |
| int whpx_enabled(void) |
| { |
| return whpx_allowed; |
| } |
| |
| bool whpx_apic_in_platform(void) { |
| return whpx_global.apic_in_platform; |
| } |
| |
| static void whpx_accel_class_init(ObjectClass *oc, void *data) |
| { |
| AccelClass *ac = ACCEL_CLASS(oc); |
| ac->name = "WHPX"; |
| ac->init_machine = whpx_accel_init; |
| ac->allowed = &whpx_allowed; |
| |
| object_class_property_add(oc, "kernel-irqchip", "on|off|split", |
| NULL, whpx_set_kernel_irqchip, |
| NULL, NULL); |
| object_class_property_set_description(oc, "kernel-irqchip", |
| "Configure WHPX in-kernel irqchip"); |
| } |
| |
| static void whpx_accel_instance_init(Object *obj) |
| { |
| struct whpx_state *whpx = &whpx_global; |
| |
| memset(whpx, 0, sizeof(struct whpx_state)); |
| /* Turn on kernel-irqchip, by default */ |
| whpx->kernel_irqchip_allowed = true; |
| } |
| |
| static const TypeInfo whpx_accel_type = { |
| .name = ACCEL_CLASS_NAME("whpx"), |
| .parent = TYPE_ACCEL, |
| .instance_init = whpx_accel_instance_init, |
| .class_init = whpx_accel_class_init, |
| }; |
| |
| static void whpx_type_init(void) |
| { |
| type_register_static(&whpx_accel_type); |
| } |
| |
| bool init_whp_dispatch(void) |
| { |
| if (whp_dispatch_initialized) { |
| return true; |
| } |
| |
| if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) { |
| goto error; |
| } |
| |
| if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) { |
| goto error; |
| } |
| |
| assert(load_whp_dispatch_fns(&hWinHvPlatform, |
| WINHV_PLATFORM_FNS_SUPPLEMENTAL)); |
| whp_dispatch_initialized = true; |
| |
| return true; |
| error: |
| if (hWinHvPlatform) { |
| FreeLibrary(hWinHvPlatform); |
| } |
| |
| if (hWinHvEmulation) { |
| FreeLibrary(hWinHvEmulation); |
| } |
| |
| return false; |
| } |
| |
| type_init(whpx_type_init); |