QEMU: MCE: Add MCE simulation to qemu/tcg

- MCE features are initialized when VCPU is intialized according to CPUID.
- A monitor command "mce" is added to inject a MCE.
- A new interrupt mask: CPU_INTERRUPT_MCE is added to inject the MCE.

aliguori: fix build for linux-user

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
diff --git a/cpu-all.h b/cpu-all.h
index 97a224d..fda15ce 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -770,6 +770,7 @@
 #define CPU_INTERRUPT_NMI    0x200 /* NMI pending. */
 #define CPU_INTERRUPT_INIT   0x400 /* INIT pending. */
 #define CPU_INTERRUPT_SIPI   0x800 /* SIPI pending. */
+#define CPU_INTERRUPT_MCE    0x1000 /* (x86 only) MCE pending. */
 
 void cpu_interrupt(CPUState *s, int mask);
 void cpu_reset_interrupt(CPUState *env, int mask);
@@ -1071,4 +1072,7 @@
 extern int64_t kqemu_ret_intr_count;
 #endif
 
+void cpu_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc);
+
 #endif /* CPU_ALL_H */
diff --git a/cpu-exec.c b/cpu-exec.c
index db5cb57..38335f8 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -400,6 +400,10 @@
                             env->hflags2 |= HF2_NMI_MASK;
                             do_interrupt(EXCP02_NMI, 0, 0, 0, 1);
                             next_tb = 0;
+			} else if (interrupt_request & CPU_INTERRUPT_MCE) {
+                            env->interrupt_request &= ~CPU_INTERRUPT_MCE;
+                            do_interrupt(EXCP12_MCHK, 0, 0, 0, 0);
+                            next_tb = 0;
                         } else if ((interrupt_request & CPU_INTERRUPT_HARD) &&
                                    (((env->hflags2 & HF2_VINTR_MASK) && 
                                      (env->hflags2 & HF2_HIF_MASK)) ||
diff --git a/monitor.c b/monitor.c
index bad79fe..301bde0 100644
--- a/monitor.c
+++ b/monitor.c
@@ -1677,6 +1677,28 @@
     }
 }
 
+#if defined(TARGET_I386)
+static void do_inject_mce(Monitor *mon,
+                          int cpu_index, int bank,
+                          unsigned status_hi, unsigned status_lo,
+                          unsigned mcg_status_hi, unsigned mcg_status_lo,
+                          unsigned addr_hi, unsigned addr_lo,
+                          unsigned misc_hi, unsigned misc_lo)
+{
+    CPUState *cenv;
+    uint64_t status = ((uint64_t)status_hi << 32) | status_lo;
+    uint64_t mcg_status = ((uint64_t)mcg_status_hi << 32) | mcg_status_lo;
+    uint64_t addr = ((uint64_t)addr_hi << 32) | addr_lo;
+    uint64_t misc = ((uint64_t)misc_hi << 32) | misc_lo;
+
+    for (cenv = first_cpu; cenv != NULL; cenv = cenv->next_cpu)
+        if (cenv->cpu_index == cpu_index && cenv->mcg_cap) {
+            cpu_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc);
+            break;
+        }
+}
+#endif
+
 static const mon_cmd_t mon_cmds[] = {
 #include "qemu-monitor.h"
     { NULL, NULL, },
@@ -2451,6 +2473,15 @@
                       void *arg3, void *arg4, void *arg5);
     void (*handler_7)(Monitor *mon, void *arg0, void *arg1, void *arg2,
                       void *arg3, void *arg4, void *arg5, void *arg6);
+    void (*handler_8)(Monitor *mon, void *arg0, void *arg1, void *arg2,
+                      void *arg3, void *arg4, void *arg5, void *arg6,
+                      void *arg7);
+    void (*handler_9)(Monitor *mon, void *arg0, void *arg1, void *arg2,
+                      void *arg3, void *arg4, void *arg5, void *arg6,
+                      void *arg7, void *arg8);
+    void (*handler_10)(Monitor *mon, void *arg0, void *arg1, void *arg2,
+                       void *arg3, void *arg4, void *arg5, void *arg6,
+                       void *arg7, void *arg8, void *arg9);
 
 #ifdef DEBUG
     monitor_printf(mon, "command='%s'\n", cmdline);
@@ -2739,6 +2770,21 @@
         handler_7(mon, args[0], args[1], args[2], args[3], args[4], args[5],
                   args[6]);
         break;
+    case 8:
+        handler_8 = cmd->handler;
+        handler_8(mon, args[0], args[1], args[2], args[3], args[4], args[5],
+                  args[6], args[7]);
+        break;
+    case 9:
+        handler_9 = cmd->handler;
+        handler_9(mon, args[0], args[1], args[2], args[3], args[4], args[5],
+                  args[6], args[7], args[8]);
+        break;
+    case 10:
+        handler_10 = cmd->handler;
+        handler_10(mon, args[0], args[1], args[2], args[3], args[4], args[5],
+                   args[6], args[7], args[8], args[9]);
+        break;
     default:
         monitor_printf(mon, "unsupported number of arguments: %d\n", nb_args);
         goto fail;
diff --git a/qemu-monitor.hx b/qemu-monitor.hx
index dc10b75..62edbcd 100644
--- a/qemu-monitor.hx
+++ b/qemu-monitor.hx
@@ -615,6 +615,14 @@
 policy back to @code{deny}.
 ETEXI
 
+#if defined(TARGET_I386)
+    { "mce", "iillll", do_inject_mce, "cpu bank status mcgstatus addr misc", "inject a MCE on the given CPU"},
+#endif
+STEXI
+@item mce @var{cpu} @var{bank} @var{status} @var{mcgstatus} @var{addr} @var{misc}
+Inject an MCE on the given CPU (x86 only).
+ETEXI
+
 STEXI
 @end table
 ETEXI
diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 4a8608e..6f7478a 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -204,6 +204,7 @@
 #define CR4_DE_MASK   (1 << 3)
 #define CR4_PSE_MASK  (1 << 4)
 #define CR4_PAE_MASK  (1 << 5)
+#define CR4_MCE_MASK  (1 << 6)
 #define CR4_PGE_MASK  (1 << 7)
 #define CR4_PCE_MASK  (1 << 8)
 #define CR4_OSFXSR_SHIFT 9
@@ -250,6 +251,17 @@
 #define PG_ERROR_RSVD_MASK 0x08
 #define PG_ERROR_I_D_MASK  0x10
 
+#define MCG_CTL_P	(1UL<<8)   /* MCG_CAP register available */
+
+#define MCE_CAP_DEF	MCG_CTL_P
+#define MCE_BANKS_DEF	10
+
+#define MCG_STATUS_MCIP	(1UL<<2)   /* machine check in progress */
+
+#define MCI_STATUS_VAL	(1UL<<63)  /* valid error */
+#define MCI_STATUS_OVER	(1UL<<62)  /* previous errors lost */
+#define MCI_STATUS_UC	(1UL<<61)  /* uncorrected error */
+
 #define MSR_IA32_TSC                    0x10
 #define MSR_IA32_APICBASE               0x1b
 #define MSR_IA32_APICBASE_BSP           (1<<8)
@@ -290,6 +302,11 @@
 
 #define MSR_MTRRdefType			0x2ff
 
+#define MSR_MC0_CTL			0x400
+#define MSR_MC0_STATUS			0x401
+#define MSR_MC0_ADDR			0x402
+#define MSR_MC0_MISC			0x403
+
 #define MSR_EFER                        0xc0000080
 
 #define MSR_EFER_SCE   (1 << 0)
@@ -678,6 +695,11 @@
     /* in order to simplify APIC support, we leave this pointer to the
        user */
     struct APICState *apic_state;
+
+    uint64 mcg_cap;
+    uint64 mcg_status;
+    uint64 mcg_ctl;
+    uint64 *mce_banks;
 } CPUX86State;
 
 CPUX86State *cpu_x86_init(const char *cpu_model);
@@ -842,7 +864,7 @@
 #define cpu_signal_handler cpu_x86_signal_handler
 #define cpu_list x86_cpu_list
 
-#define CPU_SAVE_VERSION 9
+#define CPU_SAVE_VERSION 10
 
 /* MMU modes definitions */
 #define MMU_MODE0_SUFFIX _kernel
diff --git a/target-i386/helper.c b/target-i386/helper.c
index 82e1ff1..ce5346c 100644
--- a/target-i386/helper.c
+++ b/target-i386/helper.c
@@ -1496,8 +1496,77 @@
     if (prev_debug_excp_handler)
         prev_debug_excp_handler(env);
 }
+
+/* This should come from sysemu.h - if we could include it here... */
+void qemu_system_reset_request(void);
+
+void cpu_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc)
+{
+    uint64_t mcg_cap = cenv->mcg_cap;
+    unsigned bank_num = mcg_cap & 0xff;
+    uint64_t *banks = cenv->mce_banks;
+
+    if (bank >= bank_num || !(status & MCI_STATUS_VAL))
+        return;
+
+    /*
+     * if MSR_MCG_CTL is not all 1s, the uncorrected error
+     * reporting is disabled
+     */
+    if ((status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
+        cenv->mcg_ctl != ~(uint64_t)0)
+        return;
+    banks += 4 * bank;
+    /*
+     * if MSR_MCi_CTL is not all 1s, the uncorrected error
+     * reporting is disabled for the bank
+     */
+    if ((status & MCI_STATUS_UC) && banks[0] != ~(uint64_t)0)
+        return;
+    if (status & MCI_STATUS_UC) {
+        if ((cenv->mcg_status & MCG_STATUS_MCIP) ||
+            !(cenv->cr[4] & CR4_MCE_MASK)) {
+            fprintf(stderr, "injects mce exception while previous "
+                    "one is in progress!\n");
+            qemu_log_mask(CPU_LOG_RESET, "Triple fault\n");
+            qemu_system_reset_request();
+            return;
+        }
+        if (banks[1] & MCI_STATUS_VAL)
+            status |= MCI_STATUS_OVER;
+        banks[2] = addr;
+        banks[3] = misc;
+        cenv->mcg_status = mcg_status;
+        banks[1] = status;
+        cpu_interrupt(cenv, CPU_INTERRUPT_MCE);
+    } else if (!(banks[1] & MCI_STATUS_VAL)
+               || !(banks[1] & MCI_STATUS_UC)) {
+        if (banks[1] & MCI_STATUS_VAL)
+            status |= MCI_STATUS_OVER;
+        banks[2] = addr;
+        banks[3] = misc;
+        banks[1] = status;
+    } else
+        banks[1] |= MCI_STATUS_OVER;
+}
 #endif /* !CONFIG_USER_ONLY */
 
+static void mce_init(CPUX86State *cenv)
+{
+    unsigned int bank, bank_num;
+
+    if (((cenv->cpuid_version >> 8)&0xf) >= 6
+        && (cenv->cpuid_features&(CPUID_MCE|CPUID_MCA)) == (CPUID_MCE|CPUID_MCA)) {
+        cenv->mcg_cap = MCE_CAP_DEF | MCE_BANKS_DEF;
+        cenv->mcg_ctl = ~(uint64_t)0;
+        bank_num = cenv->mcg_cap & 0xff;
+        cenv->mce_banks = qemu_mallocz(bank_num * sizeof(uint64_t) * 4);
+        for (bank = 0; bank < bank_num; bank++)
+            cenv->mce_banks[bank*4] = ~(uint64_t)0;
+    }
+}
+
 static void host_cpuid(uint32_t function, uint32_t count,
                        uint32_t *eax, uint32_t *ebx,
                        uint32_t *ecx, uint32_t *edx)
@@ -1735,6 +1804,7 @@
         cpu_x86_close(env);
         return NULL;
     }
+    mce_init(env);
     cpu_reset(env);
 #ifdef CONFIG_KQEMU
     kqemu_init(env);
diff --git a/target-i386/machine.c b/target-i386/machine.c
index 2a72b01..8bf13cc 100644
--- a/target-i386/machine.c
+++ b/target-i386/machine.c
@@ -158,7 +158,20 @@
     qemu_put_sbe32s(f, &pending_irq);
     qemu_put_be32s(f, &env->mp_state);
     qemu_put_be64s(f, &env->tsc);
-}
+
+    /* MCE */
+    qemu_put_be64s(f, &env->mcg_cap);
+    if (env->mcg_cap) {
+        qemu_put_be64s(f, &env->mcg_status);
+        qemu_put_be64s(f, &env->mcg_ctl);
+        for (i = 0; i < (env->mcg_cap & 0xff); i++) {
+            qemu_put_be64s(f, &env->mce_banks[4*i]);
+            qemu_put_be64s(f, &env->mce_banks[4*i + 1]);
+            qemu_put_be64s(f, &env->mce_banks[4*i + 2]);
+            qemu_put_be64s(f, &env->mce_banks[4*i + 3]);
+        }
+    }
+ }
 
 #ifdef USE_X86LDOUBLE
 /* XXX: add that in a FPU generic layer */
@@ -349,6 +362,20 @@
         qemu_get_be64s(f, &env->tsc);
     }
 
+    if (version_id >= 10) {
+        qemu_get_be64s(f, &env->mcg_cap);
+        if (env->mcg_cap) {
+            qemu_get_be64s(f, &env->mcg_status);
+            qemu_get_be64s(f, &env->mcg_ctl);
+            for (i = 0; i < (env->mcg_cap & 0xff); i++) {
+                qemu_get_be64s(f, &env->mce_banks[4*i]);
+                qemu_get_be64s(f, &env->mce_banks[4*i + 1]);
+                qemu_get_be64s(f, &env->mce_banks[4*i + 2]);
+                qemu_get_be64s(f, &env->mce_banks[4*i + 3]);
+            }
+        }
+    }
+
     /* XXX: ensure compatiblity for halted bit ? */
     /* XXX: compute redundant hflags bits */
     env->hflags = hflags;
diff --git a/target-i386/op_helper.c b/target-i386/op_helper.c
index bd1769c..ed22c7a 100644
--- a/target-i386/op_helper.c
+++ b/target-i386/op_helper.c
@@ -3133,7 +3133,23 @@
     case MSR_MTRRdefType:
         env->mtrr_deftype = val;
         break;
+    case MSR_MCG_STATUS:
+        env->mcg_status = val;
+        break;
+    case MSR_MCG_CTL:
+        if ((env->mcg_cap & MCG_CTL_P)
+            && (val == 0 || val == ~(uint64_t)0))
+            env->mcg_ctl = val;
+        break;
     default:
+        if ((uint32_t)ECX >= MSR_MC0_CTL
+            && (uint32_t)ECX < MSR_MC0_CTL + (4 * env->mcg_cap & 0xff)) {
+            uint32_t offset = (uint32_t)ECX - MSR_MC0_CTL;
+            if ((offset & 0x3) != 0
+                || (val == 0 || val == ~(uint64_t)0))
+                env->mce_banks[offset] = val;
+            break;
+        }
         /* XXX: exception ? */
         break;
     }
@@ -3252,7 +3268,25 @@
             /* XXX: exception ? */
             val = 0;
         break;
+    case MSR_MCG_CAP:
+        val = env->mcg_cap;
+        break;
+    case MSR_MCG_CTL:
+        if (env->mcg_cap & MCG_CTL_P)
+            val = env->mcg_ctl;
+        else
+            val = 0;
+        break;
+    case MSR_MCG_STATUS:
+        val = env->mcg_status;
+        break;
     default:
+        if ((uint32_t)ECX >= MSR_MC0_CTL
+            && (uint32_t)ECX < MSR_MC0_CTL + (4 * env->mcg_cap & 0xff)) {
+            uint32_t offset = (uint32_t)ECX - MSR_MC0_CTL;
+            val = env->mce_banks[offset];
+            break;
+        }
         /* XXX: exception ? */
         val = 0;
         break;