Merge tag 'for-upstream' of https://gitlab.com/bonzini/qemu into staging

* i386: fix issue with cache topology passthrough
* scsi-disk: migrate emulated requests
* i386/sev: fix Coverity issues
* i386/tcg: more conversions to new decoder

# -----BEGIN PGP SIGNATURE-----
#
# iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmZv6kMUHHBib256aW5p
# QHJlZGhhdC5jb20ACgkQv/vSX3jHroOn4Af/evnpsae1fm8may1NQmmezKiks/4X
# cR0GaQ7w75Oas05jKsG7Xnrq3Vn6p5wllf3Wf00p7F1iJX18azY9rQgIsUVUgVem
# /EIZk1eM6+mDxuIG0taPxc5Aw3cfIBWAjUmzsXrSr55e/wyiIxZCeUo2zk8Il+iL
# Z4ceNzY5PZzc2Fl10D3cGs/+ynfiDM53ucwe3ve2T6NrxEVfKQPp5jkIUkBUba6z
# zM5O4Q5KTEZYVth1gbDTB/uUJLUFjQ12kCQfRCNX+bEPDHwARr0UWr/Oxtz0jZSd
# FvXohz7tI+v+ph0xHyE4tEFqryvLCII1td2ohTAYZZXNGkjK6XZildngBw==
# =m4BE
# -----END PGP SIGNATURE-----
# gpg: Signature made Mon 17 Jun 2024 12:48:19 AM PDT
# gpg:                using RSA key F13338574B662389866C7682BFFBD25F78C7AE83
# gpg:                issuer "pbonzini@redhat.com"
# gpg: Good signature from "Paolo Bonzini <bonzini@gnu.org>" [full]
# gpg:                 aka "Paolo Bonzini <pbonzini@redhat.com>" [full]

* tag 'for-upstream' of https://gitlab.com/bonzini/qemu: (25 commits)
  target/i386: SEV: do not assume machine->cgs is SEV
  target/i386: convert CMPXCHG to new decoder
  target/i386: convert XADD to new decoder
  target/i386: convert LZCNT/TZCNT/BSF/BSR/POPCNT to new decoder
  target/i386: convert SHLD/SHRD to new decoder
  target/i386: adapt gen_shift_count for SHLD/SHRD
  target/i386: pull load/writeback out of gen_shiftd_rm_T1
  target/i386: convert non-grouped, helper-based 2-byte opcodes
  target/i386: split X86_CHECK_prot into PE and VM86 checks
  target/i386: finish converting 0F AE to the new decoder
  target/i386: fix bad sorting of entries in the 0F table
  target/i386: replace read_crN helper with read_cr8
  target/i386: convert MOV from/to CR and DR to new decoder
  target/i386: fix processing of intercept 0 (read CR0)
  target/i386: replace NoSeg special with NoLoadEA
  target/i386: change X86_ENTRYwr to use T0, use it for moves
  target/i386: change X86_ENTRYr to use T0
  target/i386: put BLS* input in T1, use generic flag writeback
  target/i386: rewrite flags writeback for ADCX/ADOX
  target/i386: remove CPUX86State argument from generator functions
  ...

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
diff --git a/hw/core/machine.c b/hw/core/machine.c
index c93d249..655d75c 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -36,6 +36,7 @@
 
 GlobalProperty hw_compat_9_0[] = {
     {"arm-cpu", "backcompat-cntfrq", "true" },
+    {"scsi-disk-base", "migrate-emulated-scsi-request", "false" },
     {"vfio-pci", "skip-vsc-check", "false" },
 };
 const size_t hw_compat_9_0_len = G_N_ELEMENTS(hw_compat_9_0);
diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index 0812d39..a67092d 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -114,6 +114,7 @@
      * 0xffff        - reserved
      */
     uint16_t rotation_rate;
+    bool migrate_emulated_scsi_request;
 };
 
 static void scsi_free_request(SCSIRequest *req)
@@ -162,6 +163,15 @@
     }
 }
 
+static void scsi_disk_emulate_save_request(QEMUFile *f, SCSIRequest *req)
+{
+    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, req->dev);
+
+    if (s->migrate_emulated_scsi_request) {
+        scsi_disk_save_request(f, req);
+    }
+}
+
 static void scsi_disk_load_request(QEMUFile *f, SCSIRequest *req)
 {
     SCSIDiskReq *r = DO_UPCAST(SCSIDiskReq, req, req);
@@ -185,6 +195,15 @@
     qemu_iovec_init_external(&r->qiov, &r->iov, 1);
 }
 
+static void scsi_disk_emulate_load_request(QEMUFile *f, SCSIRequest *req)
+{
+    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, req->dev);
+
+    if (s->migrate_emulated_scsi_request) {
+        scsi_disk_load_request(f, req);
+    }
+}
+
 /*
  * scsi_handle_rw_error has two return values.  False means that the error
  * must be ignored, true means that the error has been processed and the
@@ -2606,6 +2625,8 @@
     .read_data    = scsi_disk_emulate_read_data,
     .write_data   = scsi_disk_emulate_write_data,
     .get_buf      = scsi_get_buf,
+    .load_request = scsi_disk_emulate_load_request,
+    .save_request = scsi_disk_emulate_save_request,
 };
 
 static const SCSIReqOps scsi_disk_dma_reqops = {
@@ -3114,7 +3135,8 @@
     DEFINE_PROP_STRING("serial", SCSIDiskState, serial),                \
     DEFINE_PROP_STRING("vendor", SCSIDiskState, vendor),                \
     DEFINE_PROP_STRING("product", SCSIDiskState, product),              \
-    DEFINE_PROP_STRING("device_id", SCSIDiskState, device_id)
+    DEFINE_PROP_STRING("device_id", SCSIDiskState, device_id),          \
+    DEFINE_PROP_BOOL("migrate-emulated-scsi-request", SCSIDiskState, migrate_emulated_scsi_request, true)
 
 
 static Property scsi_hd_properties[] = {
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 7466217..365852c 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -6455,10 +6455,8 @@
             if (*eax & 31) {
                 int host_vcpus_per_cache = 1 + ((*eax & 0x3FFC000) >> 14);
 
-                if (cores_per_pkg > 1) {
-                    *eax &= ~0xFC000000;
-                    *eax |= max_core_ids_in_package(&topo_info) << 26;
-                }
+                *eax &= ~0xFC000000;
+                *eax |= max_core_ids_in_package(&topo_info) << 26;
                 if (host_vcpus_per_cache > threads_per_pkg) {
                     *eax &= ~0x3FFC000;
 
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 8fe28b6..7e2a9b5 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1260,6 +1260,8 @@
 /* Use a clearer name for this.  */
 #define CPU_INTERRUPT_INIT      CPU_INTERRUPT_RESET
 
+#define CC_OP_HAS_EFLAGS(op) ((op) >= CC_OP_EFLAGS && (op) <= CC_OP_ADCOX)
+
 /* Instead of computing the condition codes after each x86 instruction,
  * QEMU just stores one operand (called CC_SRC), the result
  * (called CC_DST) and the type of operation (called CC_OP). When the
@@ -1270,6 +1272,9 @@
 typedef enum {
     CC_OP_DYNAMIC, /* must use dynamic code to get cc_op */
     CC_OP_EFLAGS,  /* all cc are explicitly computed, CC_SRC = flags */
+    CC_OP_ADCX, /* CC_DST = C, CC_SRC = rest.  */
+    CC_OP_ADOX, /* CC_SRC2 = O, CC_SRC = rest.  */
+    CC_OP_ADCOX, /* CC_DST = C, CC_SRC2 = O, CC_SRC = rest.  */
 
     CC_OP_MULB, /* modify all flags, C, O = (CC_SRC != 0) */
     CC_OP_MULW,
@@ -1326,10 +1331,6 @@
     CC_OP_BMILGL,
     CC_OP_BMILGQ,
 
-    CC_OP_ADCX, /* CC_DST = C, CC_SRC = rest.  */
-    CC_OP_ADOX, /* CC_DST = O, CC_SRC = rest.  */
-    CC_OP_ADCOX, /* CC_DST = C, CC_SRC2 = O, CC_SRC = rest.  */
-
     CC_OP_CLR, /* Z set, all other flags clear.  */
     CC_OP_POPCNT, /* Z via CC_SRC, all other flags clear.  */
 
diff --git a/target/i386/helper.h b/target/i386/helper.h
index 2f46cff..eeb8df5 100644
--- a/target/i386/helper.h
+++ b/target/i386/helper.h
@@ -95,7 +95,7 @@
 DEF_HELPER_FLAGS_2(mwait, TCG_CALL_NO_WG, noreturn, env, int)
 DEF_HELPER_1(rdmsr, void, env)
 DEF_HELPER_1(wrmsr, void, env)
-DEF_HELPER_FLAGS_2(read_crN, TCG_CALL_NO_RWG, tl, env, int)
+DEF_HELPER_FLAGS_1(read_cr8, TCG_CALL_NO_RWG, tl, env)
 DEF_HELPER_FLAGS_3(write_crN, TCG_CALL_NO_RWG, void, env, int, tl)
 #endif /* !CONFIG_USER_ONLY */
 
diff --git a/target/i386/sev.c b/target/i386/sev.c
index 004c667..30b83f1 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -587,6 +587,7 @@
     sev_common = SEV_COMMON(MACHINE(qdev_get_machine())->cgs);
     if (!sev_common) {
         error_setg(errp, "SEV is not configured");
+        return NULL;
     }
 
     sev_device = object_property_get_str(OBJECT(sev_common), "sev-device",
@@ -1529,11 +1530,12 @@
 sev_encrypt_flash(hwaddr gpa, uint8_t *ptr, uint64_t len, Error **errp)
 {
     SevCommonState *sev_common = SEV_COMMON(MACHINE(qdev_get_machine())->cgs);
-    SevCommonStateClass *klass = SEV_COMMON_GET_CLASS(sev_common);
+    SevCommonStateClass *klass;
 
     if (!sev_common) {
         return 0;
     }
+    klass = SEV_COMMON_GET_CLASS(sev_common);
 
     /* if SEV is in update state then encrypt the data else do nothing */
     if (sev_check_state(sev_common, SEV_STATE_LAUNCH_UPDATE)) {
@@ -1710,7 +1712,9 @@
 {
     X86CPU *x86;
     CPUX86State *env;
-    SevCommonState *sev_common = SEV_COMMON(MACHINE(qdev_get_machine())->cgs);
+    ConfidentialGuestSupport *cgs = MACHINE(qdev_get_machine())->cgs;
+    SevCommonState *sev_common = SEV_COMMON(
+        object_dynamic_cast(OBJECT(cgs), TYPE_SEV_COMMON));
 
     /* Only update if we have valid reset information */
     if (!sev_common || !sev_common->reset_data_valid) {
@@ -2165,6 +2169,7 @@
     struct kvm_sev_snp_launch_finish *finish = &sev_snp_guest->kvm_finish_conf;
     gsize len;
 
+    finish->id_block_en = 0;
     g_free(sev_snp_guest->id_block);
     g_free((guchar *)finish->id_block_uaddr);
 
@@ -2184,7 +2189,7 @@
         return;
     }
 
-    finish->id_block_en = (len) ? 1 : 0;
+    finish->id_block_en = 1;
 }
 
 static char *
diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index c2d8da8..0d846c3 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -151,6 +151,8 @@
     X86_OP_GROUP3(op, op0, s0, 2op, s0, op1, s1, ## __VA_ARGS__)
 #define X86_OP_GROUPw(op, op0, s0, ...)                           \
     X86_OP_GROUP3(op, op0, s0, None, None, None, None, ## __VA_ARGS__)
+#define X86_OP_GROUPwr(op, op0, s0, op1, s1, ...)                 \
+    X86_OP_GROUP3(op, op0, s0, op1, s1, None, None, ## __VA_ARGS__)
 #define X86_OP_GROUP0(op, ...)                                    \
     X86_OP_GROUP3(op, None, None, None, None, None, None, ## __VA_ARGS__)
 
@@ -180,20 +182,20 @@
 #define X86_OP_ENTRYrr(op, op0, s0, op1, s1, ...)                 \
     X86_OP_ENTRY3(op, None, None, op0, s0, op1, s1, ## __VA_ARGS__)
 #define X86_OP_ENTRYwr(op, op0, s0, op1, s1, ...)                 \
-    X86_OP_ENTRY3(op, op0, s0, None, None, op1, s1, ## __VA_ARGS__)
+    X86_OP_ENTRY3(op, op0, s0, op1, s1, None, None, ## __VA_ARGS__)
 #define X86_OP_ENTRY2(op, op0, s0, op1, s1, ...)                  \
     X86_OP_ENTRY3(op, op0, s0, 2op, s0, op1, s1, ## __VA_ARGS__)
 #define X86_OP_ENTRYw(op, op0, s0, ...)                           \
     X86_OP_ENTRY3(op, op0, s0, None, None, None, None, ## __VA_ARGS__)
 #define X86_OP_ENTRYr(op, op0, s0, ...)                           \
-    X86_OP_ENTRY3(op, None, None, None, None, op0, s0, ## __VA_ARGS__)
+    X86_OP_ENTRY3(op, None, None, op0, s0, None, None, ## __VA_ARGS__)
 #define X86_OP_ENTRY1(op, op0, s0, ...)                           \
     X86_OP_ENTRY3(op, op0, s0, 2op, s0, None, None, ## __VA_ARGS__)
 #define X86_OP_ENTRY0(op, ...)                                    \
     X86_OP_ENTRY3(op, None, None, None, None, None, None, ## __VA_ARGS__)
 
 #define cpuid(feat) .cpuid = X86_FEAT_##feat,
-#define noseg .special = X86_SPECIAL_NoSeg,
+#define nolea .special = X86_SPECIAL_NoLoadEA,
 #define xchg .special = X86_SPECIAL_Locked,
 #define lock .special = X86_SPECIAL_HasLock,
 #define mmx .special = X86_SPECIAL_MMX,
@@ -221,7 +223,9 @@
 #define vex13 .vex_class = 13,
 
 #define chk(a) .check = X86_CHECK_##a,
-#define svm(a) .intercept = SVM_EXIT_##a,
+#define chk2(a, b) .check = X86_CHECK_##a | X86_CHECK_##b,
+#define chk3(a, b, c) .check = X86_CHECK_##a | X86_CHECK_##b | X86_CHECK_##c,
+#define svm(a) .intercept = SVM_EXIT_##a, .has_intercept = true,
 
 #define avx2_256 .vex_special = X86_VEX_AVX2_256,
 
@@ -267,20 +271,41 @@
 
 static void decode_group15(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
 {
-    /* only includes ldmxcsr and stmxcsr, because they have AVX variants.  */
     static const X86OpEntry group15_reg[8] = {
+        [0] = X86_OP_ENTRYw(RDxxBASE,   R,y, cpuid(FSGSBASE) chk(o64) p_f3),
+        [1] = X86_OP_ENTRYw(RDxxBASE,   R,y, cpuid(FSGSBASE) chk(o64) p_f3),
+        [2] = X86_OP_ENTRYr(WRxxBASE,   R,y, cpuid(FSGSBASE) chk(o64) p_f3 zextT0),
+        [3] = X86_OP_ENTRYr(WRxxBASE,   R,y, cpuid(FSGSBASE) chk(o64) p_f3 zextT0),
+        [5] = X86_OP_ENTRY0(LFENCE,          cpuid(SSE2) p_00),
+        [6] = X86_OP_ENTRY0(MFENCE,          cpuid(SSE2) p_00),
+        [7] = X86_OP_ENTRY0(SFENCE,          cpuid(SSE2) p_00),
     };
 
     static const X86OpEntry group15_mem[8] = {
-        [2] = X86_OP_ENTRYr(LDMXCSR,    E,d, vex5 chk(VEX128)),
-        [3] = X86_OP_ENTRYw(STMXCSR,    E,d, vex5 chk(VEX128)),
+        [0] = X86_OP_ENTRYw(FXSAVE,     M,y, cpuid(FXSR) p_00),
+        [1] = X86_OP_ENTRYr(FXRSTOR,    M,y, cpuid(FXSR) p_00),
+        [2] = X86_OP_ENTRYr(LDMXCSR,    E,d, vex5 chk(VEX128) p_00),
+        [3] = X86_OP_ENTRYw(STMXCSR,    E,d, vex5 chk(VEX128) p_00),
+        [4] = X86_OP_ENTRYw(XSAVE,      M,y, cpuid(XSAVE) p_00),
+        [5] = X86_OP_ENTRYr(XRSTOR,     M,y, cpuid(XSAVE) p_00),
+        [6] = X86_OP_ENTRYw(XSAVEOPT,   M,b, cpuid(XSAVEOPT) p_00),
+        [7] = X86_OP_ENTRYw(NOP,        M,b, cpuid(CLFLUSH) p_00),
+    };
+
+    static const X86OpEntry group15_mem_66[8] = {
+        [6] = X86_OP_ENTRYw(NOP,        M,b, cpuid(CLWB)),
+        [7] = X86_OP_ENTRYw(NOP,        M,b, cpuid(CLFLUSHOPT)),
     };
 
     uint8_t modrm = get_modrm(s, env);
+    int op = (modrm >> 3) & 7;
+
     if ((modrm >> 6) == 3) {
-        *entry = group15_reg[(modrm >> 3) & 7];
+        *entry = group15_reg[op];
+    } else if (s->prefix & PREFIX_DATA) {
+        *entry = group15_mem_66[op];
     } else {
-        *entry = group15_mem[(modrm >> 3) & 7];
+        *entry = group15_mem[op];
     }
 }
 
@@ -425,6 +450,50 @@
     *entry = *decode_by_prefix(s, opcodes_0F7F);
 }
 
+static void decode_0FB8(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry popcnt =
+        X86_OP_ENTRYwr(POPCNT,    G,v, E,v,  cpuid(POPCNT) zextT0);
+
+    if (s->prefix & PREFIX_REPZ) {
+        *entry = popcnt;
+    } else {
+        memset(entry, 0, sizeof(*entry));
+    }
+}
+
+static void decode_0FBC(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    /* For BSF, pass 2op as the third operand so that we can use zextT0 */
+    static const X86OpEntry opcodes_0FBC[4] = {
+        X86_OP_ENTRY3(BSF,    G,v, E,v, 2op,v, zextT0),
+        X86_OP_ENTRY3(BSF,    G,v, E,v, 2op,v, zextT0), /* 0x66 */
+        X86_OP_ENTRYwr(TZCNT, G,v, E,v,        zextT0), /* 0xf3 */
+        X86_OP_ENTRY3(BSF,    G,v, E,v, 2op,v, zextT0), /* 0xf2 */
+    };
+    if (!(s->cpuid_ext3_features & CPUID_EXT3_ABM)) {
+        *entry = opcodes_0FBC[0];
+    } else {
+        *entry = *decode_by_prefix(s, opcodes_0FBC);
+    }
+}
+
+static void decode_0FBD(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    /* For BSR, pass 2op as the third operand so that we can use zextT0 */
+    static const X86OpEntry opcodes_0FBD[4] = {
+        X86_OP_ENTRY3(BSR,    G,v, E,v, 2op,v, zextT0),
+        X86_OP_ENTRY3(BSR,    G,v, E,v, 2op,v, zextT0), /* 0x66 */
+        X86_OP_ENTRYwr(LZCNT, G,v, E,v,        zextT0), /* 0xf3 */
+        X86_OP_ENTRY3(BSR,    G,v, E,v, 2op,v, zextT0), /* 0xf2 */
+    };
+    if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)) {
+        *entry = opcodes_0FBD[0];
+    } else {
+        *entry = *decode_by_prefix(s, opcodes_0FBD);
+    }
+}
+
 static void decode_0FD6(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
 {
     static const X86OpEntry movq[4] = {
@@ -612,15 +681,15 @@
 /* five rows for no prefix, 66, F3, F2, 66+F2  */
 static const X86OpEntry opcodes_0F38_F0toFF[16][5] = {
     [0] = {
-        X86_OP_ENTRY3(MOVBE, G,y, M,y, None,None, cpuid(MOVBE)),
-        X86_OP_ENTRY3(MOVBE, G,w, M,w, None,None, cpuid(MOVBE)),
+        X86_OP_ENTRYwr(MOVBE, G,y, M,y, cpuid(MOVBE)),
+        X86_OP_ENTRYwr(MOVBE, G,w, M,w, cpuid(MOVBE)),
         {},
         X86_OP_ENTRY2(CRC32, G,d, E,b, cpuid(SSE42)),
         X86_OP_ENTRY2(CRC32, G,d, E,b, cpuid(SSE42)),
     },
     [1] = {
-        X86_OP_ENTRY3(MOVBE, M,y, G,y, None,None, cpuid(MOVBE)),
-        X86_OP_ENTRY3(MOVBE, M,w, G,w, None,None, cpuid(MOVBE)),
+        X86_OP_ENTRYwr(MOVBE, M,y, G,y, cpuid(MOVBE)),
+        X86_OP_ENTRYwr(MOVBE, M,w, G,w, cpuid(MOVBE)),
         {},
         X86_OP_ENTRY2(CRC32, G,d, E,y, cpuid(SSE42)),
         X86_OP_ENTRY2(CRC32, G,d, E,w, cpuid(SSE42)),
@@ -633,7 +702,7 @@
         {},
     },
     [3] = {
-        X86_OP_GROUP3(group17, B,y, E,y, None,None, vex13 cpuid(BMI1)),
+        X86_OP_GROUP3(group17, B,y, None,None, E,y, vex13 cpuid(BMI1)),
         {},
         {},
         {},
@@ -985,14 +1054,30 @@
     *entry = *decode_by_prefix(s, opcodes_0FE6);
 }
 
-static const X86OpEntry opcodes_0F[256] = {
-    [0x0E] = X86_OP_ENTRY0(EMMS,                              cpuid(3DNOW)), /* femms */
+/*
+ * These ignore the mod bits (assume (modrm&0xc0)==0xc0), so group the
+ * pre-decode tweak here for all MOVs from/to CR and DR.
+ *
+ * AMD documentation (24594.pdf) and testing of Intel 386 and 486
+ * processors all show that the mod bits are assumed to be 1's,
+ * regardless of actual values.
+ */
+static void decode_MOV_CR_DR(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
     /*
-     * 3DNow!'s opcode byte comes *after* modrm and displacements, making it
-     * more like an Ib operand.  Dispatch to the right helper in a single gen_*
-     * function.
      */
-    [0x0F] = X86_OP_ENTRY3(3dnow,       P,q, Q,q, I,b,        cpuid(3DNOW)),
+    get_modrm(s, env);
+    s->modrm |= 0xC0;
+
+    entry->gen = gen_MOV;
+}
+
+static const X86OpEntry opcodes_0F[256] = {
+    [0x02] = X86_OP_ENTRYwr(LAR,        G,v, E,w,             chk(prot)),
+    [0x03] = X86_OP_ENTRYwr(LSL,        G,v, E,w,             chk(prot)),
+    [0x05] = X86_OP_ENTRY0(SYSCALL,                           chk(o64_intel)),
+    [0x06] = X86_OP_ENTRY0(CLTS,                              chk(cpl0) svm(WRITE_CR0)),
+    [0x07] = X86_OP_ENTRY0(SYSRET,                            chk3(o64_intel, prot, cpl0)),
 
     [0x10] = X86_OP_GROUP0(0F10),
     [0x11] = X86_OP_GROUP0(0F11),
@@ -1004,6 +1089,22 @@
     /* Incorrectly listed as Mq,Vq in the manual */
     [0x17] = X86_OP_ENTRY3(VMOVHPx_st,  M,q, None,None, V,dq, vex5 p_00_66),
 
+    /*
+     * Incorrectly listed as using "d" operand type in the manual.  In reality
+     * there's no 16-bit version (like y) and it does not use REX.W (like d64).
+     */
+    [0x20] = X86_OP_GROUPwr(MOV_CR_DR,   R,y_d64, C,y_d64, chk(cpl0) svm(READ_CR0)),
+    [0x21] = X86_OP_GROUPwr(MOV_CR_DR,   R,y_d64, D,y_d64, chk(cpl0) svm(READ_DR0)),
+    [0x22] = X86_OP_GROUPwr(MOV_CR_DR,   C,y_d64, R,y_d64, zextT0 chk(cpl0) svm(WRITE_CR0)),
+    [0x23] = X86_OP_GROUPwr(MOV_CR_DR,   D,y_d64, R,y_d64, zextT0 chk(cpl0) svm(WRITE_DR0)),
+
+    [0x30] = X86_OP_ENTRY0(WRMSR,                             chk(cpl0)),
+    [0x31] = X86_OP_ENTRY0(RDTSC),
+    [0x32] = X86_OP_ENTRY0(RDMSR,                             chk(cpl0)),
+    [0x33] = X86_OP_ENTRY0(RDPMC),
+    [0x34] = X86_OP_ENTRY0(SYSENTER,                          chk2(i64_amd, prot_or_vm86)),
+    [0x35] = X86_OP_ENTRY0(SYSEXIT,                           chk3(i64_amd, prot, cpl0)),
+
     [0x40] = X86_OP_ENTRY2(CMOVcc,     G,v, E,v, cpuid(CMOV)),
     [0x41] = X86_OP_ENTRY2(CMOVcc,     G,v, E,v, cpuid(CMOV)),
     [0x42] = X86_OP_ENTRY2(CMOVcc,     G,v, E,v, cpuid(CMOV)),
@@ -1060,9 +1161,64 @@
 
     [0xa0] = X86_OP_ENTRYr(PUSH, FS, w),
     [0xa1] = X86_OP_ENTRYw(POP, FS, w),
+    [0xa2] = X86_OP_ENTRY0(CPUID),
+    [0xa4] = X86_OP_ENTRY4(SHLD,  E,v, 2op,v, G,v),
+    [0xa5] = X86_OP_ENTRY3(SHLD,  E,v, 2op,v, G,v),
 
+    [0xb0] = X86_OP_ENTRY2(CMPXCHG,E,b, G,b, lock),
+    [0xb1] = X86_OP_ENTRY2(CMPXCHG,E,v, G,v, lock),
+    [0xb2] = X86_OP_ENTRY3(LSS,    G,v, EM,p, None, None),
+    [0xb4] = X86_OP_ENTRY3(LFS,    G,v, EM,p, None, None),
+    [0xb5] = X86_OP_ENTRY3(LGS,    G,v, EM,p, None, None),
+    [0xb6] = X86_OP_ENTRY3(MOV,    G,v, E,b, None, None, zextT0), /* MOVZX */
+    [0xb7] = X86_OP_ENTRY3(MOV,    G,v, E,w, None, None, zextT0), /* MOVZX */
+
+    [0xc0] = X86_OP_ENTRY2(XADD,       E,b, G,b,            lock),
+    [0xc1] = X86_OP_ENTRY2(XADD,       E,v, G,v,            lock),
+    [0xc2] = X86_OP_ENTRY4(VCMP,       V,x, H,x, W,x,       vex2_rep3 p_00_66_f3_f2),
+    [0xc3] = X86_OP_ENTRY3(MOV,        EM,y,G,y, None,None, cpuid(SSE2)), /* MOVNTI */
+    [0xc4] = X86_OP_ENTRY4(PINSRW,     V,dq,H,dq,E,w,       vex5 mmx p_00_66),
+    [0xc5] = X86_OP_ENTRY3(PEXTRW,     G,d, U,dq,I,b,       vex5 mmx p_00_66),
+    [0xc6] = X86_OP_ENTRY4(VSHUF,      V,x, H,x, W,x,       vex4 p_00_66),
+
+    [0xd0] = X86_OP_ENTRY3(VADDSUB,   V,x, H,x, W,x,        vex2 cpuid(SSE3) p_66_f2),
+    [0xd1] = X86_OP_ENTRY3(PSRLW_r,   V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
+    [0xd2] = X86_OP_ENTRY3(PSRLD_r,   V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
+    [0xd3] = X86_OP_ENTRY3(PSRLQ_r,   V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
+    [0xd4] = X86_OP_ENTRY3(PADDQ,     V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
+    [0xd5] = X86_OP_ENTRY3(PMULLW,    V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
+    [0xd6] = X86_OP_GROUP0(0FD6),
+    [0xd7] = X86_OP_ENTRY3(PMOVMSKB,  G,d, None,None, U,x,  vex7 mmx avx2_256 p_00_66),
+
+    [0xe0] = X86_OP_ENTRY3(PAVGB,     V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
+    [0xe1] = X86_OP_ENTRY3(PSRAW_r,   V,x, H,x, W,x,        vex7 mmx avx2_256 p_00_66),
+    [0xe2] = X86_OP_ENTRY3(PSRAD_r,   V,x, H,x, W,x,        vex7 mmx avx2_256 p_00_66),
+    [0xe3] = X86_OP_ENTRY3(PAVGW,     V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
+    [0xe4] = X86_OP_ENTRY3(PMULHUW,   V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
+    [0xe5] = X86_OP_ENTRY3(PMULHW,    V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
+    [0xe6] = X86_OP_GROUP0(0FE6),
+    [0xe7] = X86_OP_ENTRY3(MOVDQ,     W,x, None,None, V,x,  vex1 mmx p_00_66), /* MOVNTQ/MOVNTDQ */
+
+    [0xf0] = X86_OP_ENTRY3(MOVDQ,    V,x, None,None, WM,x,  vex4_unal cpuid(SSE3) p_f2), /* LDDQU */
+    [0xf1] = X86_OP_ENTRY3(PSLLW_r,  V,x, H,x, W,x,         vex7 mmx avx2_256 p_00_66),
+    [0xf2] = X86_OP_ENTRY3(PSLLD_r,  V,x, H,x, W,x,         vex7 mmx avx2_256 p_00_66),
+    [0xf3] = X86_OP_ENTRY3(PSLLQ_r,  V,x, H,x, W,x,         vex7 mmx avx2_256 p_00_66),
+    [0xf4] = X86_OP_ENTRY3(PMULUDQ,  V,x, H,x, W,x,         vex4 mmx avx2_256 p_00_66),
+    [0xf5] = X86_OP_ENTRY3(PMADDWD,  V,x, H,x, W,x,         vex4 mmx avx2_256 p_00_66),
+    [0xf6] = X86_OP_ENTRY3(PSADBW,   V,x, H,x, W,x,         vex4 mmx avx2_256 p_00_66),
+    [0xf7] = X86_OP_ENTRY3(MASKMOV,  None,None, V,dq, U,dq, vex4_unal avx2_256 mmx p_00_66),
+
+    [0x08] = X86_OP_ENTRY0(NOP,           svm(INVD)),
+    [0x09] = X86_OP_ENTRY0(NOP,           svm(WBINVD)),
     [0x0b] = X86_OP_ENTRY0(UD),           /* UD2 */
     [0x0d] = X86_OP_ENTRY1(NOP,  M,v),    /* 3DNow! prefetch */
+    [0x0e] = X86_OP_ENTRY0(EMMS,                              cpuid(3DNOW)), /* femms */
+    /*
+     * 3DNow!'s opcode byte comes *after* modrm and displacements, making it
+     * more like an Ib operand.  Dispatch to the right helper in a single gen_*
+     * function.
+     */
+    [0x0f] = X86_OP_ENTRY3(3dnow,       P,q, Q,q, I,b,        cpuid(3DNOW)),
 
     [0x18] = X86_OP_ENTRY1(NOP,  nop,v),  /* prefetch/reserved NOP */
     [0x19] = X86_OP_ENTRY1(NOP,  nop,v),  /* reserved NOP */
@@ -1137,6 +1293,9 @@
 
     [0xa8] = X86_OP_ENTRYr(PUSH,   GS, w),
     [0xa9] = X86_OP_ENTRYw(POP,    GS, w),
+    [0xaa] = X86_OP_ENTRY0(RSM,             chk(smm) svm(RSM)),
+    [0xac] = X86_OP_ENTRY4(SHRD,   E,v, 2op,v, G,v),
+    [0xad] = X86_OP_ENTRY3(SHRD,   E,v, 2op,v, G,v),
     [0xae] = X86_OP_GROUP0(group15),
     /*
      * It's slightly more efficient to put Ev operand in T0 and allow gen_IMUL3
@@ -1144,23 +1303,14 @@
      */
     [0xaf] = X86_OP_ENTRY3(IMUL3,  G,v, E,v, 2op,v, sextT0),
 
-    [0xb2] = X86_OP_ENTRY3(LSS,    G,v, EM,p, None, None),
-    [0xb4] = X86_OP_ENTRY3(LFS,    G,v, EM,p, None, None),
-    [0xb5] = X86_OP_ENTRY3(LGS,    G,v, EM,p, None, None),
-    [0xb6] = X86_OP_ENTRY3(MOV,    G,v, E,b, None, None, zextT0), /* MOVZX */
-    [0xb7] = X86_OP_ENTRY3(MOV,    G,v, E,w, None, None, zextT0), /* MOVZX */
-
+    [0xb8] = X86_OP_GROUP0(0FB8),
     /* decoded as modrm, which is visible as a difference between page fault and #UD */
     [0xb9] = X86_OP_ENTRYr(UD,     nop,v),                        /* UD1 */
+    [0xbc] = X86_OP_GROUP0(0FBC),
+    [0xbd] = X86_OP_GROUP0(0FBD),
     [0xbe] = X86_OP_ENTRY3(MOV,    G,v, E,b, None, None, sextT0), /* MOVSX */
     [0xbf] = X86_OP_ENTRY3(MOV,    G,v, E,w, None, None, sextT0), /* MOVSX */
 
-    [0xc2] = X86_OP_ENTRY4(VCMP,       V,x, H,x, W,x,       vex2_rep3 p_00_66_f3_f2),
-    [0xc3] = X86_OP_ENTRY3(MOV,        EM,y,G,y, None,None, cpuid(SSE2)), /* MOVNTI */
-    [0xc4] = X86_OP_ENTRY4(PINSRW,     V,dq,H,dq,E,w,       vex5 mmx p_00_66),
-    [0xc5] = X86_OP_ENTRY3(PEXTRW,     G,d, U,dq,I,b,       vex5 mmx p_00_66),
-    [0xc6] = X86_OP_ENTRY4(VSHUF,      V,x, H,x, W,x,       vex4 p_00_66),
-
     [0xc8] = X86_OP_ENTRY1(BSWAP,     LoBits,y),
     [0xc9] = X86_OP_ENTRY1(BSWAP,     LoBits,y),
     [0xca] = X86_OP_ENTRY1(BSWAP,     LoBits,y),
@@ -1170,33 +1320,6 @@
     [0xce] = X86_OP_ENTRY1(BSWAP,     LoBits,y),
     [0xcf] = X86_OP_ENTRY1(BSWAP,     LoBits,y),
 
-    [0xd0] = X86_OP_ENTRY3(VADDSUB,   V,x, H,x, W,x,        vex2 cpuid(SSE3) p_66_f2),
-    [0xd1] = X86_OP_ENTRY3(PSRLW_r,   V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
-    [0xd2] = X86_OP_ENTRY3(PSRLD_r,   V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
-    [0xd3] = X86_OP_ENTRY3(PSRLQ_r,   V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
-    [0xd4] = X86_OP_ENTRY3(PADDQ,     V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
-    [0xd5] = X86_OP_ENTRY3(PMULLW,    V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
-    [0xd6] = X86_OP_GROUP0(0FD6),
-    [0xd7] = X86_OP_ENTRY3(PMOVMSKB,  G,d, None,None, U,x,  vex7 mmx avx2_256 p_00_66),
-
-    [0xe0] = X86_OP_ENTRY3(PAVGB,     V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
-    [0xe1] = X86_OP_ENTRY3(PSRAW_r,   V,x, H,x, W,x,        vex7 mmx avx2_256 p_00_66),
-    [0xe2] = X86_OP_ENTRY3(PSRAD_r,   V,x, H,x, W,x,        vex7 mmx avx2_256 p_00_66),
-    [0xe3] = X86_OP_ENTRY3(PAVGW,     V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
-    [0xe4] = X86_OP_ENTRY3(PMULHUW,   V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
-    [0xe5] = X86_OP_ENTRY3(PMULHW,    V,x, H,x, W,x,        vex4 mmx avx2_256 p_00_66),
-    [0xe6] = X86_OP_GROUP0(0FE6),
-    [0xe7] = X86_OP_ENTRY3(MOVDQ,     W,x, None,None, V,x,  vex1 mmx p_00_66), /* MOVNTQ/MOVNTDQ */
-
-    [0xf0] = X86_OP_ENTRY3(MOVDQ,    V,x, None,None, WM,x,  vex4_unal cpuid(SSE3) p_f2), /* LDDQU */
-    [0xf1] = X86_OP_ENTRY3(PSLLW_r,  V,x, H,x, W,x,         vex7 mmx avx2_256 p_00_66),
-    [0xf2] = X86_OP_ENTRY3(PSLLD_r,  V,x, H,x, W,x,         vex7 mmx avx2_256 p_00_66),
-    [0xf3] = X86_OP_ENTRY3(PSLLQ_r,  V,x, H,x, W,x,         vex7 mmx avx2_256 p_00_66),
-    [0xf4] = X86_OP_ENTRY3(PMULUDQ,  V,x, H,x, W,x,         vex4 mmx avx2_256 p_00_66),
-    [0xf5] = X86_OP_ENTRY3(PMADDWD,  V,x, H,x, W,x,         vex4 mmx avx2_256 p_00_66),
-    [0xf6] = X86_OP_ENTRY3(PSADBW,   V,x, H,x, W,x,         vex4 mmx avx2_256 p_00_66),
-    [0xf7] = X86_OP_ENTRY3(MASKMOV,  None,None, V,dq, U,dq, vex4_unal avx2_256 mmx p_00_66),
-
     /* Incorrectly missing from 2-17 */
     [0xd8] = X86_OP_ENTRY3(PSUBUSB,  V,x, H,x, W,x,  vex4 mmx avx2_256 p_00_66),
     [0xd9] = X86_OP_ENTRY3(PSUBUSW,  V,x, H,x, W,x,  vex4 mmx avx2_256 p_00_66),
@@ -1335,9 +1458,9 @@
         /* 0xff */
         [0x08] = X86_OP_ENTRY1(INC,     E,v,                           lock),
         [0x09] = X86_OP_ENTRY1(DEC,     E,v,                           lock),
-        [0x0a] = X86_OP_ENTRY3(CALL_m,  None, None, E,f64, None, None, zextT0),
+        [0x0a] = X86_OP_ENTRYr(CALL_m,  E,f64,                         zextT0),
         [0x0b] = X86_OP_ENTRYr(CALLF_m, M,p),
-        [0x0c] = X86_OP_ENTRY3(JMP_m,   None, None, E,f64, None, None, zextT0),
+        [0x0c] = X86_OP_ENTRYr(JMP_m,   E,f64,                         zextT0),
         [0x0d] = X86_OP_ENTRYr(JMPF_m,  M,p),
         [0x0e] = X86_OP_ENTRYr(PUSH,    E,f64),
     };
@@ -1586,18 +1709,18 @@
     [0x7E] = X86_OP_ENTRYr(Jcc, J,b),
     [0x7F] = X86_OP_ENTRYr(Jcc, J,b),
 
-    [0x88] = X86_OP_ENTRY3(MOV, E,b, G,b, None, None),
-    [0x89] = X86_OP_ENTRY3(MOV, E,v, G,v, None, None),
-    [0x8A] = X86_OP_ENTRY3(MOV, G,b, E,b, None, None),
-    [0x8B] = X86_OP_ENTRY3(MOV, G,v, E,v, None, None),
-    /* Missing in Table A-2: memory destination is always 16-bit.  */
-    [0x8C] = X86_OP_ENTRY3(MOV, E,v, S,w, None, None, op0_Mw),
-    [0x8D] = X86_OP_ENTRY3(LEA, G,v, M,v, None, None, noseg),
-    [0x8E] = X86_OP_ENTRY3(MOV, S,w, E,w, None, None),
+    [0x88] = X86_OP_ENTRYwr(MOV, E,b, G,b),
+    [0x89] = X86_OP_ENTRYwr(MOV, E,v, G,v),
+    [0x8A] = X86_OP_ENTRYwr(MOV, G,b, E,b),
+    [0x8B] = X86_OP_ENTRYwr(MOV, G,v, E,v),
+     /* Missing in Table A-2: memory destination is always 16-bit.  */
+    [0x8C] = X86_OP_ENTRYwr(MOV, E,v, S,w, op0_Mw),
+    [0x8D] = X86_OP_ENTRYwr(LEA, G,v, M,v, nolea),
+    [0x8E] = X86_OP_ENTRYwr(MOV, S,w, E,w),
     [0x8F] = X86_OP_GROUPw(group1A, E,v),
 
     [0x98] = X86_OP_ENTRY1(CBW,    0,v), /* rAX */
-    [0x99] = X86_OP_ENTRY3(CWD,    2,v, 0,v, None, None), /* rDX, rAX */
+    [0x99] = X86_OP_ENTRYwr(CWD,   2,v, 0,v), /* rDX, rAX */
     [0x9A] = X86_OP_ENTRYrr(CALLF, I_unsigned,p, I_unsigned,w, chk(i64)),
     [0x9B] = X86_OP_ENTRY0(WAIT),
     [0x9C] = X86_OP_ENTRY0(PUSHF,  chk(vm86_iopl) svm(PUSHF)),
@@ -1607,22 +1730,22 @@
 
     [0xA8] = X86_OP_ENTRYrr(AND, 0,b, I,b),   /* AL, Ib */
     [0xA9] = X86_OP_ENTRYrr(AND, 0,v, I,z),   /* rAX, Iz */
-    [0xAA] = X86_OP_ENTRY3(STOS, Y,b, 0,b, None, None),
-    [0xAB] = X86_OP_ENTRY3(STOS, Y,v, 0,v, None, None),
+    [0xAA] = X86_OP_ENTRYwr(STOS, Y,b, 0,b),
+    [0xAB] = X86_OP_ENTRYwr(STOS, Y,v, 0,v),
     /* Manual writeback because REP LODS (!) has to write EAX/RAX after every LODS.  */
     [0xAC] = X86_OP_ENTRYr(LODS, X,b),
     [0xAD] = X86_OP_ENTRYr(LODS, X,v),
     [0xAE] = X86_OP_ENTRYrr(SCAS, 0,b, Y,b),
     [0xAF] = X86_OP_ENTRYrr(SCAS, 0,v, Y,v),
 
-    [0xB8] = X86_OP_ENTRY3(MOV, LoBits,v, I,v, None, None),
-    [0xB9] = X86_OP_ENTRY3(MOV, LoBits,v, I,v, None, None),
-    [0xBA] = X86_OP_ENTRY3(MOV, LoBits,v, I,v, None, None),
-    [0xBB] = X86_OP_ENTRY3(MOV, LoBits,v, I,v, None, None),
-    [0xBC] = X86_OP_ENTRY3(MOV, LoBits,v, I,v, None, None),
-    [0xBD] = X86_OP_ENTRY3(MOV, LoBits,v, I,v, None, None),
-    [0xBE] = X86_OP_ENTRY3(MOV, LoBits,v, I,v, None, None),
-    [0xBF] = X86_OP_ENTRY3(MOV, LoBits,v, I,v, None, None),
+    [0xB8] = X86_OP_ENTRYwr(MOV, LoBits,v, I,v),
+    [0xB9] = X86_OP_ENTRYwr(MOV, LoBits,v, I,v),
+    [0xBA] = X86_OP_ENTRYwr(MOV, LoBits,v, I,v),
+    [0xBB] = X86_OP_ENTRYwr(MOV, LoBits,v, I,v),
+    [0xBC] = X86_OP_ENTRYwr(MOV, LoBits,v, I,v),
+    [0xBD] = X86_OP_ENTRYwr(MOV, LoBits,v, I,v),
+    [0xBE] = X86_OP_ENTRYwr(MOV, LoBits,v, I,v),
+    [0xBF] = X86_OP_ENTRYwr(MOV, LoBits,v, I,v),
 
     [0xC8] = X86_OP_ENTRYrr(ENTER, I,w, I,b),
     [0xC9] = X86_OP_ENTRY1(LEAVE, A,d64),
@@ -1725,6 +1848,10 @@
         *ot = s->dflag == MO_16 ? MO_32 : s->dflag;
         return true;
 
+    case X86_SIZE_y_d64:  /* Full (not 16-bit) register access */
+        *ot = CODE64(s) ? MO_64 : MO_32;
+        return true;
+
     case X86_SIZE_z:  /* 16-bit for 16-bit operand size, else 32-bit */
         *ot = s->dflag == MO_16 ? MO_16 : MO_32;
         return true;
@@ -1802,11 +1929,34 @@
 
     case X86_TYPE_C:  /* REG in the modrm byte selects a control register */
         op->unit = X86_OP_CR;
-        goto get_reg;
+        op->n = ((get_modrm(s, env) >> 3) & 7) | REX_R(s);
+        if (op->n == 0 && (s->prefix & PREFIX_LOCK) &&
+            (s->cpuid_ext3_features & CPUID_EXT3_CR8LEG)) {
+            op->n = 8;
+            s->prefix &= ~PREFIX_LOCK;
+        }
+        if (op->n != 0 && op->n != 2 && op->n != 3 && op->n != 4 && op->n != 8) {
+            return false;
+        }
+        if (decode->e.intercept) {
+            decode->e.intercept += op->n;
+        }
+        break;
 
     case X86_TYPE_D:  /* REG in the modrm byte selects a debug register */
         op->unit = X86_OP_DR;
-        goto get_reg;
+        op->n = ((get_modrm(s, env) >> 3) & 7) | REX_R(s);
+        if (op->n >= 8) {
+            /*
+             * illegal opcode.  The DR4 and DR5 case is checked in the generated
+             * code instead, to save on hflags bits.
+             */
+            return false;
+        }
+        if (decode->e.intercept) {
+            decode->e.intercept += op->n;
+        }
+        break;
 
     case X86_TYPE_G:  /* REG in the modrm byte selects a GPR */
         op->unit = X86_OP_INT;
@@ -2047,6 +2197,10 @@
         return true;
     case X86_FEAT_CMOV:
         return (s->cpuid_features & CPUID_CMOV);
+    case X86_FEAT_CLFLUSH:
+        return (s->cpuid_features & CPUID_CLFLUSH);
+    case X86_FEAT_FXSR:
+        return (s->cpuid_features & CPUID_FXSR);
     case X86_FEAT_F16C:
         return (s->cpuid_ext_features & CPUID_EXT_F16C);
     case X86_FEAT_FMA:
@@ -2055,6 +2209,8 @@
         return (s->cpuid_ext_features & CPUID_EXT_MOVBE);
     case X86_FEAT_PCLMULQDQ:
         return (s->cpuid_ext_features & CPUID_EXT_PCLMULQDQ);
+    case X86_FEAT_POPCNT:
+        return (s->cpuid_ext_features & CPUID_EXT_POPCNT);
     case X86_FEAT_SSE:
         return (s->cpuid_features & CPUID_SSE);
     case X86_FEAT_SSE2:
@@ -2080,6 +2236,8 @@
 
     case X86_FEAT_AVX:
         return (s->cpuid_ext_features & CPUID_EXT_AVX);
+    case X86_FEAT_XSAVE:
+        return (s->cpuid_ext_features & CPUID_EXT_XSAVE);
 
     case X86_FEAT_3DNOW:
         return (s->cpuid_ext2_features & CPUID_EXT2_3DNOW);
@@ -2094,11 +2252,20 @@
         return (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2);
     case X86_FEAT_AVX2:
         return (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_AVX2);
+    case X86_FEAT_CLFLUSHOPT:
+        return (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_CLFLUSHOPT);
+    case X86_FEAT_CLWB:
+        return (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_CLWB);
+    case X86_FEAT_FSGSBASE:
+        return (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_FSGSBASE);
     case X86_FEAT_SHA_NI:
         return (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_SHA_NI);
 
     case X86_FEAT_CMPCCXADD:
         return (s->cpuid_7_1_eax_features & CPUID_7_1_EAX_CMPCCXADD);
+
+    case X86_FEAT_XSAVEOPT:
+        return (s->cpuid_xsave_features & CPUID_XSAVE_XSAVEOPT);
     }
     g_assert_not_reached();
 }
@@ -2428,18 +2595,12 @@
         if (b == 0x0f) {
             b = x86_ldub_code(env, s);
             switch (b) {
-            case 0x00 ... 0x03: /* mostly privileged instructions */
-            case 0x05 ... 0x09:
+            case 0x00 ... 0x01: /* mostly privileged instructions */
             case 0x1a ... 0x1b: /* MPX */
-            case 0x20 ... 0x23: /* mov from/to CR and DR */
-            case 0x30 ... 0x35: /* more privileged instructions */
-            case 0xa2 ... 0xa5: /* CPUID, BT, SHLD */
-            case 0xaa ... 0xae: /* RSM, SHRD, grp15 */
-            case 0xb0 ... 0xb1: /* cmpxchg */
+            case 0xa3:          /* bt */
+            case 0xab:          /* bts */
             case 0xb3:          /* btr */
-            case 0xb8:          /* integer ops */
-            case 0xba ... 0xbd: /* integer ops */
-            case 0xc0 ... 0xc1: /* xadd */
+            case 0xba ... 0xbb: /* grp8, btc */
             case 0xc7:          /* grp9 */
                 disas_insn_old(s, cpu, b + 0x100);
                 return;
@@ -2466,18 +2627,28 @@
 
     /* Checks that result in #UD come first.  */
     if (decode.e.check) {
-        if (decode.e.check & X86_CHECK_i64) {
-            if (CODE64(s)) {
+        if (CODE64(s)) {
+            if (decode.e.check & X86_CHECK_i64) {
+                goto illegal_op;
+            }
+            if ((decode.e.check & X86_CHECK_i64_amd) && env->cpuid_vendor1 != CPUID_VENDOR_INTEL_1) {
+                goto illegal_op;
+            }
+        } else {
+            if (decode.e.check & X86_CHECK_o64) {
+                goto illegal_op;
+            }
+            if ((decode.e.check & X86_CHECK_o64_intel) && env->cpuid_vendor1 == CPUID_VENDOR_INTEL_1) {
                 goto illegal_op;
             }
         }
-        if (decode.e.check & X86_CHECK_o64) {
-            if (!CODE64(s)) {
+        if (decode.e.check & X86_CHECK_prot_or_vm86) {
+            if (!PE(s)) {
                 goto illegal_op;
             }
         }
-        if (decode.e.check & X86_CHECK_prot) {
-            if (!PE(s) || VM86(s)) {
+        if (decode.e.check & X86_CHECK_no_vm86) {
+            if (VM86(s)) {
                 goto illegal_op;
             }
         }
@@ -2524,11 +2695,6 @@
         assert(decode.op[1].unit == X86_OP_INT);
         break;
 
-    case X86_SPECIAL_NoSeg:
-        decode.mem.def_seg = -1;
-        s->override = -1;
-        break;
-
     case X86_SPECIAL_Op0_Mw:
         assert(decode.op[0].unit == X86_OP_INT);
         if (decode.op[0].has_ea) {
@@ -2556,19 +2722,21 @@
      * exceptions if there is no memory operand).  Exceptions are
      * vm86 checks (INTn, IRET, PUSHF/POPF), RSM and XSETBV (!).
      *
-     * RSM and XSETBV will be handled in the gen_* functions
-     * instead of using chk().
+     * XSETBV will check for CPL0 in the gen_* function instead of using chk().
      */
     if (decode.e.check & X86_CHECK_cpl0) {
         if (CPL(s) != 0) {
             goto gp_fault;
         }
     }
-    if (decode.e.intercept && unlikely(GUEST(s))) {
+    if (decode.e.has_intercept && unlikely(GUEST(s))) {
         gen_helper_svm_check_intercept(tcg_env,
                                        tcg_constant_i32(decode.e.intercept));
     }
     if (decode.e.check) {
+        if ((decode.e.check & X86_CHECK_smm) && !(s->flags & HF_SMM_MASK)) {
+            goto illegal_op;
+        }
         if ((decode.e.check & X86_CHECK_vm86_iopl) && VM86(s)) {
             if (IOPL(s) < 3) {
                 goto gp_fault;
@@ -2585,12 +2753,13 @@
         gen_helper_enter_mmx(tcg_env);
     }
 
-    if (decode.op[0].has_ea || decode.op[1].has_ea || decode.op[2].has_ea) {
+    if (decode.e.special != X86_SPECIAL_NoLoadEA &&
+        (decode.op[0].has_ea || decode.op[1].has_ea || decode.op[2].has_ea)) {
         gen_load_ea(s, &decode.mem, decode.e.vex_class == 12);
     }
     if (s->prefix & PREFIX_LOCK) {
         gen_load(s, &decode, 2, s->T1);
-        decode.e.gen(s, env, &decode);
+        decode.e.gen(s, &decode);
     } else {
         if (decode.op[0].unit == X86_OP_MMX) {
             compute_mmx_offset(&decode.op[0]);
@@ -2599,12 +2768,12 @@
         }
         gen_load(s, &decode, 1, s->T0);
         gen_load(s, &decode, 2, s->T1);
-        decode.e.gen(s, env, &decode);
+        decode.e.gen(s, &decode);
         gen_writeback(s, &decode, 0, s->T0);
     }
 
     /*
-     * Write back flags after last memory access.  Some newer ALU instructions, as
+     * Write back flags after last memory access.  Some older ALU instructions, as
      * well as SSE instructions, write flags in the gen_* function, but that can
      * cause incorrect tracking of CC_OP for instructions that write to both memory
      * and flags.
diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h
index 1f90cf9..f9bf9a6 100644
--- a/target/i386/tcg/decode-new.h
+++ b/target/i386/tcg/decode-new.h
@@ -90,6 +90,7 @@
     X86_SIZE_w,  /* 16-bit */
     X86_SIZE_x,  /* 128/256-bit, based on operand size */
     X86_SIZE_y,  /* 32/64-bit, based on operand size */
+    X86_SIZE_y_d64,  /* 32/64-bit, based on 64-bit mode */
     X86_SIZE_z,  /* 16-bit for 16-bit operand size, else 32-bit */
     X86_SIZE_z_f64,  /* 32-bit for 32-bit operand size or 64-bit mode, else 16-bit */
 
@@ -108,12 +109,18 @@
     X86_FEAT_AVX2,
     X86_FEAT_BMI1,
     X86_FEAT_BMI2,
+    X86_FEAT_CLFLUSH,
+    X86_FEAT_CLFLUSHOPT,
+    X86_FEAT_CLWB,
     X86_FEAT_CMOV,
     X86_FEAT_CMPCCXADD,
     X86_FEAT_F16C,
     X86_FEAT_FMA,
+    X86_FEAT_FSGSBASE,
+    X86_FEAT_FXSR,
     X86_FEAT_MOVBE,
     X86_FEAT_PCLMULQDQ,
+    X86_FEAT_POPCNT,
     X86_FEAT_SHA_NI,
     X86_FEAT_SSE,
     X86_FEAT_SSE2,
@@ -122,6 +129,8 @@
     X86_FEAT_SSE41,
     X86_FEAT_SSE42,
     X86_FEAT_SSE4A,
+    X86_FEAT_XSAVE,
+    X86_FEAT_XSAVEOPT,
 } X86CPUIDFeature;
 
 /* Execution flags */
@@ -142,8 +151,8 @@
     X86_CHECK_i64 = 1,
     X86_CHECK_o64 = 2,
 
-    /* Fault outside protected mode */
-    X86_CHECK_prot = 4,
+    /* Fault in vm86 mode */
+    X86_CHECK_no_vm86 = 4,
 
     /* Privileged instruction checks */
     X86_CHECK_cpl0 = 8,
@@ -159,6 +168,17 @@
 
     /* Fault if VEX.W=0 */
     X86_CHECK_W1 = 256,
+
+    /* Fault outside protected mode, possibly including vm86 mode */
+    X86_CHECK_prot_or_vm86 = 512,
+    X86_CHECK_prot = X86_CHECK_prot_or_vm86 | X86_CHECK_no_vm86,
+
+    /* Fault outside SMM */
+    X86_CHECK_smm = 1024,
+
+    /* Vendor-specific checks for Intel/AMD differences */
+    X86_CHECK_i64_amd = 2048,
+    X86_CHECK_o64_intel = 4096,
 } X86InsnCheck;
 
 typedef enum X86InsnSpecial {
@@ -170,8 +190,9 @@
     /* Always locked if it has a memory operand (XCHG) */
     X86_SPECIAL_Locked,
 
-    /* Do not apply segment base to effective address */
-    X86_SPECIAL_NoSeg,
+    /* Do not load effective address in s->A0 */
+    X86_SPECIAL_NoLoadEA,
+
     /*
      * Rd/Mb or Rd/Mw in the manual: register operand 0 is treated as 32 bits
      * (and writeback zero-extends it to 64 bits if applicable).  PREFIX_DATA
@@ -245,7 +266,7 @@
 typedef void (*X86DecodeFunc)(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b);
 
 /* Code generation function.  */
-typedef void (*X86GenFunc)(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode);
+typedef void (*X86GenFunc)(DisasContext *s, X86DecodedInsn *decode);
 
 struct X86OpEntry {
     /* Based on the is_decode flags.  */
@@ -271,6 +292,7 @@
     unsigned     valid_prefix:16;
     unsigned     check:16;
     unsigned     intercept:8;
+    bool         has_intercept:1;
     bool         is_decode:1;
 };
 
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index 4be3d9a..11faa70 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -60,8 +60,8 @@
                                   TCGv_ptr reg_c, TCGv_ptr reg_d, TCGv_i32 even,
                                   TCGv_i32 odd);
 
-static void gen_JMP_m(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode);
-static void gen_JMP(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode);
+static void gen_JMP_m(DisasContext *s, X86DecodedInsn *decode);
+static void gen_JMP(DisasContext *s, X86DecodedInsn *decode);
 
 static inline TCGv_i32 tcg_constant8u_i32(uint8_t val)
 {
@@ -242,12 +242,20 @@
         tcg_gen_ld32u_tl(v, tcg_env,
                          offsetof(CPUX86State,segs[op->n].selector));
         break;
+#ifndef CONFIG_USER_ONLY
     case X86_OP_CR:
-        tcg_gen_ld_tl(v, tcg_env, offsetof(CPUX86State, cr[op->n]));
+        if (op->n == 8) {
+            translator_io_start(&s->base);
+            gen_helper_read_cr8(v, tcg_env);
+        } else {
+            tcg_gen_ld_tl(v, tcg_env, offsetof(CPUX86State, cr[op->n]));
+        }
         break;
     case X86_OP_DR:
-        tcg_gen_ld_tl(v, tcg_env, offsetof(CPUX86State, dr[op->n]));
+        /* CR4.DE tested in the helper.  */
+        gen_helper_get_dr(v, tcg_env, tcg_constant_i32(op->n));
         break;
+#endif
     case X86_OP_INT:
         if (op->has_ea) {
             if (v == s->T0 && decode->e.special == X86_SPECIAL_SExtT0) {
@@ -343,8 +351,20 @@
                                  16, 16, 0);
         }
         break;
+#ifndef CONFIG_USER_ONLY
     case X86_OP_CR:
+        if (op->n == 8) {
+            translator_io_start(&s->base);
+        }
+        gen_helper_write_crN(tcg_env, tcg_constant_i32(op->n), v);
+        s->base.is_jmp = DISAS_EOB_NEXT;
+        break;
     case X86_OP_DR:
+        /* CR4.DE tested in the helper.  */
+        gen_helper_set_dr(tcg_env, tcg_constant_i32(op->n), v);
+        s->base.is_jmp = DISAS_EOB_NEXT;
+        break;
+#endif
     default:
         g_assert_not_reached();
     }
@@ -446,7 +466,7 @@
     [0xbf] = gen_helper_pavgusb,
 };
 
-static void gen_3dnow(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_3dnow(DisasContext *s, X86DecodedInsn *decode)
 {
     uint8_t b = decode->immediate;
     SSEFunc_0_epp fn = b < ARRAY_SIZE(fns_3dnow) ? fns_3dnow[b] : NULL;
@@ -479,7 +499,7 @@
  * f3 = v*ss Vss, Hss, Wps
  * f2 = v*sd Vsd, Hsd, Wps
  */
-static inline void gen_unary_fp_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+static inline void gen_unary_fp_sse(DisasContext *s, X86DecodedInsn *decode,
                               SSEFunc_0_epp pd_xmm, SSEFunc_0_epp ps_xmm,
                               SSEFunc_0_epp pd_ymm, SSEFunc_0_epp ps_ymm,
                               SSEFunc_0_eppp sd, SSEFunc_0_eppp ss)
@@ -504,9 +524,9 @@
     }
 }
 #define UNARY_FP_SSE(uname, lname)                                                 \
-static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
 {                                                                                  \
-    gen_unary_fp_sse(s, env, decode,                                               \
+    gen_unary_fp_sse(s, decode,                                                    \
                      gen_helper_##lname##pd_xmm,                                   \
                      gen_helper_##lname##ps_xmm,                                   \
                      gen_helper_##lname##pd_ymm,                                   \
@@ -522,7 +542,7 @@
  * f3 = v*ss Vss, Hss, Wps
  * f2 = v*sd Vsd, Hsd, Wps
  */
-static inline void gen_fp_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+static inline void gen_fp_sse(DisasContext *s, X86DecodedInsn *decode,
                               SSEFunc_0_eppp pd_xmm, SSEFunc_0_eppp ps_xmm,
                               SSEFunc_0_eppp pd_ymm, SSEFunc_0_eppp ps_ymm,
                               SSEFunc_0_eppp sd, SSEFunc_0_eppp ss)
@@ -543,9 +563,9 @@
 }
 
 #define FP_SSE(uname, lname)                                                       \
-static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
 {                                                                                  \
-    gen_fp_sse(s, env, decode,                                                     \
+    gen_fp_sse(s, decode,                                                          \
                gen_helper_##lname##pd_xmm,                                         \
                gen_helper_##lname##ps_xmm,                                         \
                gen_helper_##lname##pd_ymm,                                         \
@@ -561,7 +581,7 @@
 FP_SSE(VMAX, max)
 
 #define FMA_SSE_PACKED(uname, ptr0, ptr1, ptr2, even, odd)                         \
-static void gen_##uname##Px(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+static void gen_##uname##Px(DisasContext *s, X86DecodedInsn *decode)               \
 {                                                                                  \
     SSEFunc_0_eppppii xmm = s->vex_w ? gen_helper_fma4pd_xmm : gen_helper_fma4ps_xmm; \
     SSEFunc_0_eppppii ymm = s->vex_w ? gen_helper_fma4pd_ymm : gen_helper_fma4ps_ymm; \
@@ -574,7 +594,7 @@
 
 #define FMA_SSE(uname, ptr0, ptr1, ptr2, flags)                                    \
 FMA_SSE_PACKED(uname, ptr0, ptr1, ptr2, flags, flags)                              \
-static void gen_##uname##Sx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+static void gen_##uname##Sx(DisasContext *s, X86DecodedInsn *decode)               \
 {                                                                                  \
     SSEFunc_0_eppppi fn = s->vex_w ? gen_helper_fma4sd : gen_helper_fma4ss;        \
                                                                                    \
@@ -607,10 +627,10 @@
 FMA_SSE_PACKED(VFMSUBADD132, OP_PTR0, OP_PTR2, OP_PTR1, 0, float_muladd_negate_c)
 
 #define FP_UNPACK_SSE(uname, lname)                                                \
-static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
 {                                                                                  \
     /* PS maps to the DQ integer instruction, PD maps to QDQ.  */                  \
-    gen_fp_sse(s, env, decode,                                                     \
+    gen_fp_sse(s, decode,                                                          \
                gen_helper_##lname##qdq_xmm,                                        \
                gen_helper_##lname##dq_xmm,                                         \
                gen_helper_##lname##qdq_ymm,                                        \
@@ -624,7 +644,7 @@
  * 00 = v*ps Vps, Wpd
  * f3 = v*ss Vss, Wps
  */
-static inline void gen_unary_fp32_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+static inline void gen_unary_fp32_sse(DisasContext *s, X86DecodedInsn *decode,
                                       SSEFunc_0_epp ps_xmm,
                                       SSEFunc_0_epp ps_ymm,
                                       SSEFunc_0_eppp ss)
@@ -649,9 +669,9 @@
     gen_illegal_opcode(s);
 }
 #define UNARY_FP32_SSE(uname, lname)                                               \
-static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
 {                                                                                  \
-    gen_unary_fp32_sse(s, env, decode,                                             \
+    gen_unary_fp32_sse(s, decode,                                                  \
                        gen_helper_##lname##ps_xmm,                                 \
                        gen_helper_##lname##ps_ymm,                                 \
                        gen_helper_##lname##ss);                                    \
@@ -663,7 +683,7 @@
  * 66 = v*pd Vpd, Hpd, Wpd
  * f2 = v*ps Vps, Hps, Wps
  */
-static inline void gen_horizontal_fp_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+static inline void gen_horizontal_fp_sse(DisasContext *s, X86DecodedInsn *decode,
                                          SSEFunc_0_eppp pd_xmm, SSEFunc_0_eppp ps_xmm,
                                          SSEFunc_0_eppp pd_ymm, SSEFunc_0_eppp ps_ymm)
 {
@@ -674,9 +694,9 @@
     fn(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
 }
 #define HORIZONTAL_FP_SSE(uname, lname)                                            \
-static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
 {                                                                                  \
-    gen_horizontal_fp_sse(s, env, decode,                                          \
+    gen_horizontal_fp_sse(s, decode,                                               \
                           gen_helper_##lname##pd_xmm, gen_helper_##lname##ps_xmm,  \
                           gen_helper_##lname##pd_ymm, gen_helper_##lname##ps_ymm); \
 }
@@ -684,7 +704,7 @@
 HORIZONTAL_FP_SSE(VHSUB, hsub)
 HORIZONTAL_FP_SSE(VADDSUB, addsub)
 
-static inline void gen_ternary_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+static inline void gen_ternary_sse(DisasContext *s, X86DecodedInsn *decode,
                                    int op3, SSEFunc_0_epppp xmm, SSEFunc_0_epppp ymm)
 {
     SSEFunc_0_epppp fn = s->vex_l ? ymm : xmm;
@@ -695,21 +715,21 @@
     fn(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, ptr3);
 }
 #define TERNARY_SSE(uname, uvname, lname)                                          \
-static void gen_##uvname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+static void gen_##uvname(DisasContext *s, X86DecodedInsn *decode)                  \
 {                                                                                  \
-    gen_ternary_sse(s, env, decode, (uint8_t)decode->immediate >> 4,               \
+    gen_ternary_sse(s, decode, (uint8_t)decode->immediate >> 4,                    \
                     gen_helper_##lname##_xmm, gen_helper_##lname##_ymm);           \
 }                                                                                  \
-static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
 {                                                                                  \
-    gen_ternary_sse(s, env, decode, 0,                                             \
+    gen_ternary_sse(s, decode, 0,                                                  \
                   gen_helper_##lname##_xmm, gen_helper_##lname##_ymm);             \
 }
 TERNARY_SSE(BLENDVPS, VBLENDVPS, blendvps)
 TERNARY_SSE(BLENDVPD, VBLENDVPD, blendvpd)
 TERNARY_SSE(PBLENDVB, VPBLENDVB, pblendvb)
 
-static inline void gen_binary_imm_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+static inline void gen_binary_imm_sse(DisasContext *s, X86DecodedInsn *decode,
                                       SSEFunc_0_epppi xmm, SSEFunc_0_epppi ymm)
 {
     TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
@@ -721,9 +741,9 @@
 }
 
 #define BINARY_IMM_SSE(uname, lname)                                               \
-static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
 {                                                                                  \
-    gen_binary_imm_sse(s, env, decode,                                             \
+    gen_binary_imm_sse(s, decode,                                                  \
                        gen_helper_##lname##_xmm,                                   \
                        gen_helper_##lname##_ymm);                                  \
 }
@@ -739,7 +759,7 @@
 
 
 #define UNARY_INT_GVEC(uname, func, ...)                                           \
-static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
 {                                                                                  \
     int vec_len = vector_len(s, decode);                                          \
                                                                                    \
@@ -757,7 +777,7 @@
 
 
 #define BINARY_INT_GVEC(uname, func, ...)                                          \
-static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
 {                                                                                  \
     int vec_len = vector_len(s, decode);                                          \
                                                                                    \
@@ -816,7 +836,7 @@
  * These are really the same encoding, because 1) V is the same as P when VEX.V
  * is not present 2) P and Q are the same as H and W apart from MM/XMM
  */
-static inline void gen_binary_int_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+static inline void gen_binary_int_sse(DisasContext *s, X86DecodedInsn *decode,
                                       SSEFunc_0_eppp mmx, SSEFunc_0_eppp xmm, SSEFunc_0_eppp ymm)
 {
     assert(!!mmx == !!(decode->e.special == X86_SPECIAL_MMX));
@@ -837,9 +857,9 @@
 
 
 #define BINARY_INT_MMX(uname, lname)                                               \
-static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
 {                                                                                  \
-    gen_binary_int_sse(s, env, decode,                                             \
+    gen_binary_int_sse(s, decode,                                                  \
                           gen_helper_##lname##_mmx,                                \
                           gen_helper_##lname##_xmm,                                \
                           gen_helper_##lname##_ymm);                               \
@@ -886,9 +906,9 @@
 
 /* Instructions with no MMX equivalent.  */
 #define BINARY_INT_SSE(uname, lname)                                               \
-static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
 {                                                                                  \
-    gen_binary_int_sse(s, env, decode,                                             \
+    gen_binary_int_sse(s, decode,                                                  \
                           NULL,                                                    \
                           gen_helper_##lname##_xmm,                                \
                           gen_helper_##lname##_ymm);                               \
@@ -911,7 +931,7 @@
 BINARY_INT_SSE(VAESENCLAST, aesenclast)
 
 #define UNARY_CMP_SSE(uname, lname)                                                \
-static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
 {                                                                                  \
     if (!s->vex_l) {                                                               \
         gen_helper_##lname##_xmm(tcg_env, OP_PTR1, OP_PTR2);                       \
@@ -924,7 +944,7 @@
 UNARY_CMP_SSE(VTESTPS,    vtestps)
 UNARY_CMP_SSE(VTESTPD,    vtestpd)
 
-static inline void gen_unary_int_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+static inline void gen_unary_int_sse(DisasContext *s, X86DecodedInsn *decode,
                                      SSEFunc_0_epp xmm, SSEFunc_0_epp ymm)
 {
     if (!s->vex_l) {
@@ -935,9 +955,9 @@
 }
 
 #define UNARY_INT_SSE(uname, lname)                                                \
-static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
 {                                                                                  \
-    gen_unary_int_sse(s, env, decode,                                              \
+    gen_unary_int_sse(s, decode,                                                   \
                       gen_helper_##lname##_xmm,                                    \
                       gen_helper_##lname##_ymm);                                   \
 }
@@ -969,7 +989,7 @@
 UNARY_INT_SSE(VCVTPH2PS, cvtph2ps)
 
 
-static inline void gen_unary_imm_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+static inline void gen_unary_imm_sse(DisasContext *s, X86DecodedInsn *decode,
                                      SSEFunc_0_ppi xmm, SSEFunc_0_ppi ymm)
 {
     TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
@@ -981,9 +1001,9 @@
 }
 
 #define UNARY_IMM_SSE(uname, lname)                                                \
-static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
 {                                                                                  \
-    gen_unary_imm_sse(s, env, decode,                                              \
+    gen_unary_imm_sse(s, decode,                                                   \
                       gen_helper_##lname##_xmm,                                    \
                       gen_helper_##lname##_ymm);                                   \
 }
@@ -996,7 +1016,7 @@
 UNARY_IMM_SSE(VPERMILPS_i, vpermilps_imm)
 UNARY_IMM_SSE(VPERMILPD_i, vpermilpd_imm)
 
-static inline void gen_unary_imm_fp_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+static inline void gen_unary_imm_fp_sse(DisasContext *s, X86DecodedInsn *decode,
                                         SSEFunc_0_eppi xmm, SSEFunc_0_eppi ymm)
 {
     TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
@@ -1008,9 +1028,9 @@
 }
 
 #define UNARY_IMM_FP_SSE(uname, lname)                                             \
-static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
 {                                                                                  \
-    gen_unary_imm_fp_sse(s, env, decode,                                           \
+    gen_unary_imm_fp_sse(s, decode,                                                \
                       gen_helper_##lname##_xmm,                                    \
                       gen_helper_##lname##_ymm);                                   \
 }
@@ -1018,7 +1038,7 @@
 UNARY_IMM_FP_SSE(VROUNDPS,    roundps)
 UNARY_IMM_FP_SSE(VROUNDPD,    roundpd)
 
-static inline void gen_vexw_avx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+static inline void gen_vexw_avx(DisasContext *s, X86DecodedInsn *decode,
                                 SSEFunc_0_eppp d_xmm, SSEFunc_0_eppp q_xmm,
                                 SSEFunc_0_eppp d_ymm, SSEFunc_0_eppp q_ymm)
 {
@@ -1030,9 +1050,9 @@
 
 /* VEX.W affects whether to operate on 32- or 64-bit elements.  */
 #define VEXW_AVX(uname, lname)                                                     \
-static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
 {                                                                                  \
-    gen_vexw_avx(s, env, decode,                                                   \
+    gen_vexw_avx(s, decode,                                                        \
                  gen_helper_##lname##d_xmm, gen_helper_##lname##q_xmm,             \
                  gen_helper_##lname##d_ymm, gen_helper_##lname##q_ymm);            \
 }
@@ -1042,7 +1062,7 @@
 VEXW_AVX(VPMASKMOV, vpmaskmov)
 
 /* Same as above, but with extra arguments to the helper.  */
-static inline void gen_vsib_avx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+static inline void gen_vsib_avx(DisasContext *s, X86DecodedInsn *decode,
                                 SSEFunc_0_epppti d_xmm, SSEFunc_0_epppti q_xmm,
                                 SSEFunc_0_epppti d_ymm, SSEFunc_0_epppti q_ymm)
 {
@@ -1066,29 +1086,29 @@
     }
 }
 #define VSIB_AVX(uname, lname)                                                     \
-static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+static void gen_##uname(DisasContext *s, X86DecodedInsn *decode)                   \
 {                                                                                  \
-    gen_vsib_avx(s, env, decode,                                                   \
+    gen_vsib_avx(s, decode,                                                        \
                  gen_helper_##lname##d_xmm, gen_helper_##lname##q_xmm,             \
                  gen_helper_##lname##d_ymm, gen_helper_##lname##q_ymm);            \
 }
 VSIB_AVX(VPGATHERD, vpgatherd)
 VSIB_AVX(VPGATHERQ, vpgatherq)
 
-static void gen_AAA(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_AAA(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_update_cc_op(s);
     gen_helper_aaa(tcg_env);
     assume_cc_op(s, CC_OP_EFLAGS);
 }
 
-static void gen_AAD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_AAD(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_helper_aad(s->T0, s->T0, s->T1);
     prepare_update1_cc(decode, s, CC_OP_LOGICB);
 }
 
-static void gen_AAM(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_AAM(DisasContext *s, X86DecodedInsn *decode)
 {
     if (decode->immediate == 0) {
         gen_exception(s, EXCP00_DIVZ);
@@ -1098,14 +1118,14 @@
     }
 }
 
-static void gen_AAS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_AAS(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_update_cc_op(s);
     gen_helper_aas(tcg_env);
     assume_cc_op(s, CC_OP_EFLAGS);
 }
 
-static void gen_ADC(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_ADC(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[1].ot;
     TCGv c_in = tcg_temp_new();
@@ -1122,24 +1142,41 @@
     prepare_update3_cc(decode, s, CC_OP_ADCB + ot, c_in);
 }
 
-/* ADCX/ADOX do not have memory operands and can use set_cc_op.  */
-static void gen_ADCOX(DisasContext *s, CPUX86State *env, MemOp ot, int cc_op)
+static void gen_ADCOX(DisasContext *s, X86DecodedInsn *decode, int cc_op)
 {
-    int opposite_cc_op;
+    MemOp ot = decode->op[0].ot;
     TCGv carry_in = NULL;
-    TCGv carry_out = (cc_op == CC_OP_ADCX ? cpu_cc_dst : cpu_cc_src2);
+    TCGv *carry_out = (cc_op == CC_OP_ADCX ? &decode->cc_dst : &decode->cc_src2);
     TCGv zero;
 
-    if (cc_op == s->cc_op || s->cc_op == CC_OP_ADCOX) {
-        /* Re-use the carry-out from a previous round.  */
-        carry_in = carry_out;
-    } else {
-        /* We don't have a carry-in, get it out of EFLAGS.  */
-        if (s->cc_op != CC_OP_ADCX && s->cc_op != CC_OP_ADOX) {
-            gen_compute_eflags(s);
+    decode->cc_op = cc_op;
+    *carry_out = tcg_temp_new();
+    if (CC_OP_HAS_EFLAGS(s->cc_op)) {
+        decode->cc_src = cpu_cc_src;
+
+        /* Re-use the carry-out from a previous round?  */
+        if (s->cc_op == cc_op || s->cc_op == CC_OP_ADCOX) {
+            carry_in = (cc_op == CC_OP_ADCX ? cpu_cc_dst : cpu_cc_src2);
         }
-        carry_in = s->tmp0;
-        tcg_gen_extract_tl(carry_in, cpu_cc_src,
+
+        /* Preserve the opposite carry from previous rounds?  */
+        if (s->cc_op != cc_op && s->cc_op != CC_OP_EFLAGS) {
+            decode->cc_op = CC_OP_ADCOX;
+            if (carry_out == &decode->cc_dst) {
+                decode->cc_src2 = cpu_cc_src2;
+            } else {
+                decode->cc_dst = cpu_cc_dst;
+            }
+        }
+    } else {
+        decode->cc_src = tcg_temp_new();
+        gen_mov_eflags(s, decode->cc_src);
+    }
+
+    if (!carry_in) {
+        /* Get carry_in out of EFLAGS.  */
+        carry_in = tcg_temp_new();
+        tcg_gen_extract_tl(carry_in, decode->cc_src,
             ctz32(cc_op == CC_OP_ADCX ? CC_C : CC_O), 1);
     }
 
@@ -1151,31 +1188,23 @@
         tcg_gen_ext32u_tl(s->T1, s->T1);
         tcg_gen_add_i64(s->T0, s->T0, s->T1);
         tcg_gen_add_i64(s->T0, s->T0, carry_in);
-        tcg_gen_shri_i64(carry_out, s->T0, 32);
+        tcg_gen_shri_i64(*carry_out, s->T0, 32);
         break;
 #endif
     default:
         zero = tcg_constant_tl(0);
-        tcg_gen_add2_tl(s->T0, carry_out, s->T0, zero, carry_in, zero);
-        tcg_gen_add2_tl(s->T0, carry_out, s->T0, carry_out, s->T1, zero);
+        tcg_gen_add2_tl(s->T0, *carry_out, s->T0, zero, carry_in, zero);
+        tcg_gen_add2_tl(s->T0, *carry_out, s->T0, *carry_out, s->T1, zero);
         break;
     }
-
-    opposite_cc_op = cc_op == CC_OP_ADCX ? CC_OP_ADOX : CC_OP_ADCX;
-    if (s->cc_op == CC_OP_ADCOX || s->cc_op == opposite_cc_op) {
-        /* Merge with the carry-out from the opposite instruction.  */
-        set_cc_op(s, CC_OP_ADCOX);
-    } else {
-        set_cc_op(s, cc_op);
-    }
 }
 
-static void gen_ADCX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_ADCX(DisasContext *s, X86DecodedInsn *decode)
 {
-    gen_ADCOX(s, env, decode->op[0].ot, CC_OP_ADCX);
+    gen_ADCOX(s, decode, CC_OP_ADCX);
 }
 
-static void gen_ADD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_ADD(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[1].ot;
 
@@ -1188,12 +1217,12 @@
     prepare_update2_cc(decode, s, CC_OP_ADDB + ot);
 }
 
-static void gen_ADOX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_ADOX(DisasContext *s, X86DecodedInsn *decode)
 {
-    gen_ADCOX(s, env, decode->op[0].ot, CC_OP_ADOX);
+    gen_ADCOX(s, decode, CC_OP_ADOX);
 }
 
-static void gen_AND(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_AND(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[1].ot;
 
@@ -1206,7 +1235,7 @@
     prepare_update1_cc(decode, s, CC_OP_LOGICB + ot);
 }
 
-static void gen_ANDN(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_ANDN(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
 
@@ -1214,7 +1243,7 @@
     prepare_update1_cc(decode, s, CC_OP_LOGICB + ot);
 }
 
-static void gen_ARPL(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_ARPL(DisasContext *s, X86DecodedInsn *decode)
 {
     TCGv zf = tcg_temp_new();
     TCGv flags = tcg_temp_new();
@@ -1235,7 +1264,7 @@
     decode->cc_op = CC_OP_EFLAGS;
 }
 
-static void gen_BEXTR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_BEXTR(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
     TCGv bound = tcg_constant_tl(ot == MO_64 ? 63 : 31);
@@ -1263,43 +1292,37 @@
     prepare_update1_cc(decode, s, CC_OP_LOGICB + ot);
 }
 
-/* BLSI do not have memory operands and can use set_cc_op.  */
-static void gen_BLSI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_BLSI(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
 
-    tcg_gen_mov_tl(cpu_cc_src, s->T0);
-    tcg_gen_neg_tl(s->T1, s->T0);
+    /* input in T1, which is ready for prepare_update2_cc  */
+    tcg_gen_neg_tl(s->T0, s->T1);
     tcg_gen_and_tl(s->T0, s->T0, s->T1);
-    tcg_gen_mov_tl(cpu_cc_dst, s->T0);
-    set_cc_op(s, CC_OP_BMILGB + ot);
+    prepare_update2_cc(decode, s, CC_OP_BMILGB + ot);
 }
 
-/* BLSMSK do not have memory operands and can use set_cc_op.  */
-static void gen_BLSMSK(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_BLSMSK(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
 
-    tcg_gen_mov_tl(cpu_cc_src, s->T0);
-    tcg_gen_subi_tl(s->T1, s->T0, 1);
+    /* input in T1, which is ready for prepare_update2_cc  */
+    tcg_gen_subi_tl(s->T0, s->T1, 1);
     tcg_gen_xor_tl(s->T0, s->T0, s->T1);
-    tcg_gen_mov_tl(cpu_cc_dst, s->T0);
-    set_cc_op(s, CC_OP_BMILGB + ot);
+    prepare_update2_cc(decode, s, CC_OP_BMILGB + ot);
 }
 
-/* BLSR do not have memory operands and can use set_cc_op.  */
-static void gen_BLSR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_BLSR(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
 
-    tcg_gen_mov_tl(cpu_cc_src, s->T0);
-    tcg_gen_subi_tl(s->T1, s->T0, 1);
+    /* input in T1, which is ready for prepare_update2_cc  */
+    tcg_gen_subi_tl(s->T0, s->T1, 1);
     tcg_gen_and_tl(s->T0, s->T0, s->T1);
-    tcg_gen_mov_tl(cpu_cc_dst, s->T0);
-    set_cc_op(s, CC_OP_BMILGB + ot);
+    prepare_update2_cc(decode, s, CC_OP_BMILGB + ot);
 }
 
-static void gen_BOUND(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_BOUND(DisasContext *s, X86DecodedInsn *decode)
 {
     TCGv_i32 op = tcg_temp_new_i32();
     tcg_gen_trunc_tl_i32(op, s->T0);
@@ -1310,7 +1333,48 @@
     }
 }
 
-static void gen_BSWAP(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+/* Non-standard convention - on entry T0 is zero-extended input, T1 is the output.  */
+static void gen_BSF(DisasContext *s, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[0].ot;
+
+    /* Only the Z bit is defined and it is related to the input.  */
+    decode->cc_dst = tcg_temp_new();
+    decode->cc_op = CC_OP_LOGICB + ot;
+    tcg_gen_mov_tl(decode->cc_dst, s->T0);
+
+    /*
+     * The manual says that the output is undefined when the
+     * input is zero, but real hardware leaves it unchanged, and
+     * real programs appear to depend on that.  Accomplish this
+     * by passing the output as the value to return upon zero.
+     */
+    tcg_gen_ctz_tl(s->T0, s->T0, s->T1);
+}
+
+/* Non-standard convention - on entry T0 is zero-extended input, T1 is the output.  */
+static void gen_BSR(DisasContext *s, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[0].ot;
+
+    /* Only the Z bit is defined and it is related to the input.  */
+    decode->cc_dst = tcg_temp_new();
+    decode->cc_op = CC_OP_LOGICB + ot;
+    tcg_gen_mov_tl(decode->cc_dst, s->T0);
+
+    /*
+     * The manual says that the output is undefined when the
+     * input is zero, but real hardware leaves it unchanged, and
+     * real programs appear to depend on that.  Accomplish this
+     * by passing the output as the value to return upon zero.
+     * Plus, return the bit index of the first 1 bit.
+     */
+    tcg_gen_xori_tl(s->T1, s->T1, TARGET_LONG_BITS - 1);
+    tcg_gen_clz_tl(s->T0, s->T0, s->T1);
+    tcg_gen_xori_tl(s->T0, s->T0, TARGET_LONG_BITS - 1);
+}
+
+static void gen_BSWAP(DisasContext *s, X86DecodedInsn *decode)
 {
 #ifdef TARGET_X86_64
     if (s->dflag == MO_64) {
@@ -1321,7 +1385,7 @@
     tcg_gen_bswap32_tl(s->T0, s->T0, TCG_BSWAP_OZ);
 }
 
-static void gen_BZHI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_BZHI(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
     TCGv bound = tcg_constant_tl(ot == MO_64 ? 63 : 31);
@@ -1341,26 +1405,26 @@
     prepare_update2_cc(decode, s, CC_OP_BMILGB + ot);
 }
 
-static void gen_CALL(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_CALL(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_push_v(s, eip_next_tl(s));
-    gen_JMP(s, env, decode);
+    gen_JMP(s, decode);
 }
 
-static void gen_CALL_m(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_CALL_m(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_push_v(s, eip_next_tl(s));
-    gen_JMP_m(s, env, decode);
+    gen_JMP_m(s, decode);
 }
 
-static void gen_CALLF(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_CALLF(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_far_call(s);
 }
 
-static void gen_CALLF_m(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_CALLF_m(DisasContext *s, X86DecodedInsn *decode)
 {
-    MemOp ot = decode->op[2].ot;
+    MemOp ot = decode->op[1].ot;
 
     gen_op_ld_v(s, ot, s->T0, s->A0);
     gen_add_A0_im(s, 1 << ot);
@@ -1368,41 +1432,48 @@
     gen_far_call(s);
 }
 
-static void gen_CBW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_CBW(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp src_ot = decode->op[0].ot - 1;
 
     tcg_gen_ext_tl(s->T0, s->T0, src_ot | MO_SIGN);
 }
 
-static void gen_CLC(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_CLC(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_compute_eflags(s);
     tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, ~CC_C);
 }
 
-static void gen_CLD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_CLD(DisasContext *s, X86DecodedInsn *decode)
 {
     tcg_gen_st_i32(tcg_constant_i32(1), tcg_env, offsetof(CPUX86State, df));
 }
 
-static void gen_CLI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_CLI(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_reset_eflags(s, IF_MASK);
 }
 
-static void gen_CMC(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_CLTS(DisasContext *s, X86DecodedInsn *decode)
+{
+    gen_helper_clts(tcg_env);
+    /* abort block because static cpu state changed */
+    s->base.is_jmp = DISAS_EOB_NEXT;
+}
+
+static void gen_CMC(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_compute_eflags(s);
     tcg_gen_xori_tl(cpu_cc_src, cpu_cc_src, CC_C);
 }
 
-static void gen_CMOVcc(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_CMOVcc(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_cmovcc1(s, decode->b & 0xf, s->T0, s->T1);
 }
 
-static void gen_CMPccXADD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_CMPccXADD(DisasContext *s, X86DecodedInsn *decode)
 {
     TCGLabel *label_top = gen_new_label();
     TCGLabel *label_bottom = gen_new_label();
@@ -1505,7 +1576,7 @@
     decode->cc_op = CC_OP_SUBB + ot;
 }
 
-static void gen_CMPS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_CMPS(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[2].ot;
     if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) {
@@ -1515,7 +1586,65 @@
     }
 }
 
-static void gen_CRC32(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_CMPXCHG(DisasContext *s, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[2].ot;
+    TCGv cmpv = tcg_temp_new();
+    TCGv oldv = tcg_temp_new();
+    TCGv newv = tcg_temp_new();
+    TCGv dest;
+
+    tcg_gen_ext_tl(cmpv, cpu_regs[R_EAX], ot);
+    tcg_gen_ext_tl(newv, s->T1, ot);
+    if (s->prefix & PREFIX_LOCK) {
+        tcg_gen_atomic_cmpxchg_tl(oldv, s->A0, cmpv, newv,
+                                  s->mem_index, ot | MO_LE);
+    } else {
+        tcg_gen_ext_tl(oldv, s->T0, ot);
+        if (decode->op[0].has_ea) {
+            /*
+             * Perform an unconditional store cycle like physical cpu;
+             * must be before changing accumulator to ensure
+             * idempotency if the store faults and the instruction
+             * is restarted
+             */
+            tcg_gen_movcond_tl(TCG_COND_EQ, newv, oldv, cmpv, newv, oldv);
+            gen_op_st_v(s, ot, newv, s->A0);
+        } else {
+            /*
+             * Unlike the memory case, where "the destination operand receives
+             * a write cycle without regard to the result of the comparison",
+             * rm must not be touched altogether if the write fails, including
+             * not zero-extending it on 64-bit processors.  So, precompute
+             * the result of a successful writeback and perform the movcond
+             * directly on cpu_regs.  In case rm is part of RAX, note that this
+             * movcond and the one below are mutually exclusive is executed.
+             */
+            dest = gen_op_deposit_reg_v(s, ot, decode->op[0].n, newv, newv);
+            tcg_gen_movcond_tl(TCG_COND_EQ, dest, oldv, cmpv, newv, dest);
+        }
+        decode->op[0].unit = X86_OP_SKIP;
+    }
+
+    /* Write RAX only if the cmpxchg fails.  */
+    dest = gen_op_deposit_reg_v(s, ot, R_EAX, s->T0, oldv);
+    tcg_gen_movcond_tl(TCG_COND_NE, dest, oldv, cmpv, s->T0, dest);
+
+    tcg_gen_mov_tl(s->cc_srcT, cmpv);
+    tcg_gen_sub_tl(cmpv, cmpv, oldv);
+    decode->cc_dst = cmpv;
+    decode->cc_src = oldv;
+    decode->cc_op = CC_OP_SUBB + ot;
+}
+
+static void gen_CPUID(DisasContext *s, X86DecodedInsn *decode)
+{
+    gen_update_cc_op(s);
+    gen_update_eip_cur(s);
+    gen_helper_cpuid(tcg_env);
+}
+
+static void gen_CRC32(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[2].ot;
 
@@ -1523,7 +1652,7 @@
     gen_helper_crc32(s->T0, s->tmp2_i32, s->T1, tcg_constant_i32(8 << ot));
 }
 
-static void gen_CVTPI2Px(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_CVTPI2Px(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_helper_enter_mmx(tcg_env);
     if (s->prefix & PREFIX_DATA) {
@@ -1533,7 +1662,7 @@
     }
 }
 
-static void gen_CVTPx2PI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_CVTPx2PI(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_helper_enter_mmx(tcg_env);
     if (s->prefix & PREFIX_DATA) {
@@ -1543,7 +1672,7 @@
     }
 }
 
-static void gen_CVTTPx2PI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_CVTTPx2PI(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_helper_enter_mmx(tcg_env);
     if (s->prefix & PREFIX_DATA) {
@@ -1553,28 +1682,28 @@
     }
 }
 
-static void gen_CWD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_CWD(DisasContext *s, X86DecodedInsn *decode)
 {
     int shift = 8 << decode->op[0].ot;
 
     tcg_gen_sextract_tl(s->T0, s->T0, shift - 1, 1);
 }
 
-static void gen_DAA(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_DAA(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_update_cc_op(s);
     gen_helper_daa(tcg_env);
     assume_cc_op(s, CC_OP_EFLAGS);
 }
 
-static void gen_DAS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_DAS(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_update_cc_op(s);
     gen_helper_das(tcg_env);
     assume_cc_op(s, CC_OP_EFLAGS);
 }
 
-static void gen_DEC(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_DEC(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[1].ot;
 
@@ -1588,40 +1717,40 @@
     prepare_update_cc_incdec(decode, s, CC_OP_DECB + ot);
 }
 
-static void gen_DIV(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_DIV(DisasContext *s, X86DecodedInsn *decode)
 {
-    MemOp ot = decode->op[2].ot;
+    MemOp ot = decode->op[1].ot;
 
     switch(ot) {
     case MO_8:
-        gen_helper_divb_AL(tcg_env, s->T1);
+        gen_helper_divb_AL(tcg_env, s->T0);
         break;
     case MO_16:
-        gen_helper_divw_AX(tcg_env, s->T1);
+        gen_helper_divw_AX(tcg_env, s->T0);
         break;
     default:
     case MO_32:
-        gen_helper_divl_EAX(tcg_env, s->T1);
+        gen_helper_divl_EAX(tcg_env, s->T0);
         break;
 #ifdef TARGET_X86_64
     case MO_64:
-        gen_helper_divq_EAX(tcg_env, s->T1);
+        gen_helper_divq_EAX(tcg_env, s->T0);
         break;
 #endif
     }
 }
 
-static void gen_EMMS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_EMMS(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_helper_emms(tcg_env);
 }
 
-static void gen_ENTER(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_ENTER(DisasContext *s, X86DecodedInsn *decode)
 {
    gen_enter(s, decode->op[1].imm, decode->op[2].imm);
 }
 
-static void gen_EXTRQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_EXTRQ_i(DisasContext *s, X86DecodedInsn *decode)
 {
     TCGv_i32 length = tcg_constant_i32(decode->immediate & 63);
     TCGv_i32 index = tcg_constant_i32((decode->immediate >> 8) & 63);
@@ -1629,12 +1758,30 @@
     gen_helper_extrq_i(tcg_env, OP_PTR0, index, length);
 }
 
-static void gen_EXTRQ_r(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_EXTRQ_r(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_helper_extrq_r(tcg_env, OP_PTR0, OP_PTR2);
 }
 
-static void gen_HLT(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_FXRSTOR(DisasContext *s, X86DecodedInsn *decode)
+{
+    if ((s->flags & HF_EM_MASK) || (s->flags & HF_TS_MASK)) {
+        gen_NM_exception(s);
+    } else {
+        gen_helper_fxrstor(tcg_env, s->A0);
+    }
+}
+
+static void gen_FXSAVE(DisasContext *s, X86DecodedInsn *decode)
+{
+    if ((s->flags & HF_EM_MASK) || (s->flags & HF_TS_MASK)) {
+        gen_NM_exception(s);
+    } else {
+        gen_helper_fxsave(tcg_env, s->A0);
+    }
+}
+
+static void gen_HLT(DisasContext *s, X86DecodedInsn *decode)
 {
 #ifdef CONFIG_SYSTEM_ONLY
     gen_update_cc_op(s);
@@ -1644,30 +1791,30 @@
 #endif
 }
 
-static void gen_IDIV(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_IDIV(DisasContext *s, X86DecodedInsn *decode)
 {
-    MemOp ot = decode->op[2].ot;
+    MemOp ot = decode->op[1].ot;
 
     switch(ot) {
     case MO_8:
-        gen_helper_idivb_AL(tcg_env, s->T1);
+        gen_helper_idivb_AL(tcg_env, s->T0);
         break;
     case MO_16:
-        gen_helper_idivw_AX(tcg_env, s->T1);
+        gen_helper_idivw_AX(tcg_env, s->T0);
         break;
     default:
     case MO_32:
-        gen_helper_idivl_EAX(tcg_env, s->T1);
+        gen_helper_idivl_EAX(tcg_env, s->T0);
         break;
 #ifdef TARGET_X86_64
     case MO_64:
-        gen_helper_idivq_EAX(tcg_env, s->T1);
+        gen_helper_idivq_EAX(tcg_env, s->T0);
         break;
 #endif
     }
 }
 
-static void gen_IMUL3(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_IMUL3(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
     TCGv cc_src_rhs;
@@ -1730,7 +1877,7 @@
     prepare_update2_cc(decode, s, CC_OP_MULB + ot);
 }
 
-static void gen_IMUL(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_IMUL(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[1].ot;
     TCGv cc_src_rhs;
@@ -1788,12 +1935,12 @@
     prepare_update2_cc(decode, s, CC_OP_MULB + ot);
 }
 
-static void gen_IN(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_IN(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
     TCGv_i32 port = tcg_temp_new_i32();
 
-    tcg_gen_trunc_tl_i32(port, s->T1);
+    tcg_gen_trunc_tl_i32(port, s->T0);
     tcg_gen_ext16u_i32(port, port);
     if (!gen_check_io(s, ot, port, SVM_IOIO_TYPE_MASK)) {
         return;
@@ -1804,7 +1951,7 @@
     gen_bpt_io(s, port, ot);
 }
 
-static void gen_INC(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_INC(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[1].ot;
 
@@ -1818,7 +1965,7 @@
     prepare_update_cc_incdec(decode, s, CC_OP_INCB + ot);
 }
 
-static void gen_INS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_INS(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[1].ot;
     TCGv_i32 port = tcg_temp_new_i32();
@@ -1838,7 +1985,7 @@
     }
 }
 
-static void gen_INSERTQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_INSERTQ_i(DisasContext *s, X86DecodedInsn *decode)
 {
     TCGv_i32 length = tcg_constant_i32(decode->immediate & 63);
     TCGv_i32 index = tcg_constant_i32((decode->immediate >> 8) & 63);
@@ -1846,17 +1993,17 @@
     gen_helper_insertq_i(tcg_env, OP_PTR0, OP_PTR1, index, length);
 }
 
-static void gen_INSERTQ_r(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_INSERTQ_r(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_helper_insertq_r(tcg_env, OP_PTR0, OP_PTR2);
 }
 
-static void gen_INT(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_INT(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_interrupt(s, decode->immediate);
 }
 
-static void gen_INT1(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_INT1(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_update_cc_op(s);
     gen_update_eip_next(s);
@@ -1864,19 +2011,19 @@
     s->base.is_jmp = DISAS_NORETURN;
 }
 
-static void gen_INT3(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_INT3(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_interrupt(s, EXCP03_INT3);
 }
 
-static void gen_INTO(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_INTO(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_update_cc_op(s);
     gen_update_eip_cur(s);
     gen_helper_into(tcg_env, cur_insn_len_i32(s));
 }
 
-static void gen_IRET(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_IRET(DisasContext *s, X86DecodedInsn *decode)
 {
     if (!PE(s) || VM86(s)) {
         gen_helper_iret_real(tcg_env, tcg_constant_i32(s->dflag - 1));
@@ -1888,13 +2035,13 @@
     s->base.is_jmp = DISAS_EOB_ONLY;
 }
 
-static void gen_Jcc(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_Jcc(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_bnd_jmp(s);
     gen_jcc(s, decode->b & 0xf, decode->immediate);
 }
 
-static void gen_JCXZ(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_JCXZ(DisasContext *s, X86DecodedInsn *decode)
 {
     TCGLabel *taken = gen_new_label();
 
@@ -1903,27 +2050,27 @@
     gen_conditional_jump_labels(s, decode->immediate, NULL, taken);
 }
 
-static void gen_JMP(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_JMP(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_update_cc_op(s);
     gen_jmp_rel(s, s->dflag, decode->immediate, 0);
 }
 
-static void gen_JMP_m(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_JMP_m(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_op_jmp_v(s, s->T0);
     gen_bnd_jmp(s);
     s->base.is_jmp = DISAS_JUMP;
 }
 
-static void gen_JMPF(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_JMPF(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_far_jmp(s);
 }
 
-static void gen_JMPF_m(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_JMPF_m(DisasContext *s, X86DecodedInsn *decode)
 {
-    MemOp ot = decode->op[2].ot;
+    MemOp ot = decode->op[1].ot;
 
     gen_op_ld_v(s, ot, s->T0, s->A0);
     gen_add_A0_im(s, 1 << ot);
@@ -1931,7 +2078,7 @@
     gen_far_jmp(s);
 }
 
-static void gen_LAHF(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_LAHF(DisasContext *s, X86DecodedInsn *decode)
 {
     if (CODE64(s) && !(s->cpuid_ext3_features & CPUID_EXT3_LAHF_LM)) {
         return gen_illegal_opcode(s);
@@ -1942,13 +2089,30 @@
     tcg_gen_deposit_tl(cpu_regs[R_EAX], cpu_regs[R_EAX], s->T0, 8, 8);
 }
 
-static void gen_LDMXCSR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_LAR(DisasContext *s, X86DecodedInsn *decode)
 {
-    tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T1);
+    MemOp ot = decode->op[0].ot;
+    TCGv result = tcg_temp_new();
+    TCGv dest;
+
+    gen_compute_eflags(s);
+    gen_update_cc_op(s);
+    gen_helper_lar(result, tcg_env, s->T0);
+
+    /* Perform writeback here to skip it if ZF=0.  */
+    decode->op[0].unit = X86_OP_SKIP;
+    dest = gen_op_deposit_reg_v(s, ot, decode->op[0].n, result, result);
+    tcg_gen_movcond_tl(TCG_COND_TSTNE, dest, cpu_cc_src, tcg_constant_tl(CC_Z),
+                       result, dest);
+}
+
+static void gen_LDMXCSR(DisasContext *s, X86DecodedInsn *decode)
+{
+    tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0);
     gen_helper_ldmxcsr(tcg_env, s->tmp2_i32);
 }
 
-static void gen_lxx_seg(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, int seg)
+static void gen_lxx_seg(DisasContext *s, X86DecodedInsn *decode, int seg)
 {
     MemOp ot = decode->op[0].ot;
 
@@ -1960,39 +2124,45 @@
     gen_movl_seg(s, seg, s->T1);
 }
 
-static void gen_LDS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_LDS(DisasContext *s, X86DecodedInsn *decode)
 {
-    gen_lxx_seg(s, env, decode, R_DS);
+    gen_lxx_seg(s, decode, R_DS);
 }
 
-static void gen_LEA(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_LEA(DisasContext *s, X86DecodedInsn *decode)
 {
-    tcg_gen_mov_tl(s->T0, s->A0);
+    TCGv ea = gen_lea_modrm_1(s, decode->mem, false);
+    gen_lea_v_seg_dest(s, s->aflag, s->T0, ea, -1, -1);
 }
 
-static void gen_LEAVE(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_LEAVE(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_leave(s);
 }
 
-static void gen_LES(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_LES(DisasContext *s, X86DecodedInsn *decode)
 {
-    gen_lxx_seg(s, env, decode, R_ES);
+    gen_lxx_seg(s, decode, R_ES);
 }
 
-static void gen_LFS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_LFENCE(DisasContext *s, X86DecodedInsn *decode)
 {
-    gen_lxx_seg(s, env, decode, R_FS);
+    tcg_gen_mb(TCG_MO_LD_LD | TCG_BAR_SC);
 }
 
-static void gen_LGS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_LFS(DisasContext *s, X86DecodedInsn *decode)
 {
-    gen_lxx_seg(s, env, decode, R_GS);
+    gen_lxx_seg(s, decode, R_FS);
 }
 
-static void gen_LODS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_LGS(DisasContext *s, X86DecodedInsn *decode)
 {
-    MemOp ot = decode->op[2].ot;
+    gen_lxx_seg(s, decode, R_GS);
+}
+
+static void gen_LODS(DisasContext *s, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[1].ot;
     if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) {
         gen_repz(s, ot, gen_lods);
     } else {
@@ -2000,7 +2170,7 @@
     }
 }
 
-static void gen_LOOP(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_LOOP(DisasContext *s, X86DecodedInsn *decode)
 {
     TCGLabel *taken = gen_new_label();
 
@@ -2010,7 +2180,7 @@
     gen_conditional_jump_labels(s, decode->immediate, NULL, taken);
 }
 
-static void gen_LOOPE(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_LOOPE(DisasContext *s, X86DecodedInsn *decode)
 {
     TCGLabel *taken = gen_new_label();
     TCGLabel *not_taken = gen_new_label();
@@ -2022,7 +2192,7 @@
     gen_conditional_jump_labels(s, decode->immediate, not_taken, taken);
 }
 
-static void gen_LOOPNE(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_LOOPNE(DisasContext *s, X86DecodedInsn *decode)
 {
     TCGLabel *taken = gen_new_label();
     TCGLabel *not_taken = gen_new_label();
@@ -2034,18 +2204,58 @@
     gen_conditional_jump_labels(s, decode->immediate, not_taken, taken);
 }
 
-static void gen_LSS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_LSL(DisasContext *s, X86DecodedInsn *decode)
 {
-    gen_lxx_seg(s, env, decode, R_SS);
+    MemOp ot = decode->op[0].ot;
+    TCGv result = tcg_temp_new();
+    TCGv dest;
+
+    gen_compute_eflags(s);
+    gen_update_cc_op(s);
+    gen_helper_lsl(result, tcg_env, s->T0);
+
+    /* Perform writeback here to skip it if ZF=0.  */
+    decode->op[0].unit = X86_OP_SKIP;
+    dest = gen_op_deposit_reg_v(s, ot, decode->op[0].n, result, result);
+    tcg_gen_movcond_tl(TCG_COND_TSTNE, dest, cpu_cc_src, tcg_constant_tl(CC_Z),
+                       result, dest);
 }
 
-static void gen_MOV(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_LSS(DisasContext *s, X86DecodedInsn *decode)
+{
+    gen_lxx_seg(s, decode, R_SS);
+}
+
+static void gen_LZCNT(DisasContext *s, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[0].ot;
+
+    /* C bit (cc_src) is defined related to the input.  */
+    decode->cc_src = tcg_temp_new();
+    decode->cc_dst = s->T0;
+    decode->cc_op = CC_OP_BMILGB + ot;
+    tcg_gen_mov_tl(decode->cc_src, s->T0);
+
+    /*
+     * Reduce the target_ulong result by the number of zeros that
+     * we expect to find at the top.
+     */
+    tcg_gen_clzi_tl(s->T0, s->T0, TARGET_LONG_BITS);
+    tcg_gen_subi_tl(s->T0, s->T0, TARGET_LONG_BITS - (8 << ot));
+}
+
+static void gen_MFENCE(DisasContext *s, X86DecodedInsn *decode)
+{
+    tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
+}
+
+static void gen_MOV(DisasContext *s, X86DecodedInsn *decode)
 {
     /* nothing to do! */
 }
 #define gen_NOP gen_MOV
 
-static void gen_MASKMOV(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_MASKMOV(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_lea_v_seg(s, cpu_regs[R_EDI], R_DS, s->override);
 
@@ -2056,7 +2266,7 @@
     }
 }
 
-static void gen_MOVBE(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_MOVBE(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
 
@@ -2068,7 +2278,7 @@
     }
 }
 
-static void gen_MOVD_from(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_MOVD_from(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[2].ot;
 
@@ -2086,7 +2296,7 @@
     }
 }
 
-static void gen_MOVD_to(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_MOVD_to(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[2].ot;
     int vec_len = vector_len(s, decode);
@@ -2108,12 +2318,12 @@
     }
 }
 
-static void gen_MOVDQ(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_MOVDQ(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_store_sse(s, decode, decode->op[2].offset);
 }
 
-static void gen_MOVMSK(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_MOVMSK(DisasContext *s, X86DecodedInsn *decode)
 {
     typeof(gen_helper_movmskps_ymm) *ps, *pd, *fn;
     ps = s->vex_l ? gen_helper_movmskps_ymm : gen_helper_movmskps_xmm;
@@ -2123,7 +2333,7 @@
     tcg_gen_extu_i32_tl(s->T0, s->tmp2_i32);
 }
 
-static void gen_MOVQ(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_MOVQ(DisasContext *s, X86DecodedInsn *decode)
 {
     int vec_len = vector_len(s, decode);
     int lo_ofs = vector_elem_offset(&decode->op[0], MO_64, 0);
@@ -2145,14 +2355,14 @@
     }
 }
 
-static void gen_MOVq_dq(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_MOVq_dq(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_helper_enter_mmx(tcg_env);
     /* Otherwise the same as any other movq.  */
-    return gen_MOVQ(s, env, decode);
+    return gen_MOVQ(s, decode);
 }
 
-static void gen_MOVS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_MOVS(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[2].ot;
     if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) {
@@ -2162,7 +2372,7 @@
     }
 }
 
-static void gen_MUL(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_MUL(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[1].ot;
 
@@ -2213,7 +2423,7 @@
     decode->cc_op = CC_OP_MULB + ot;
 }
 
-static void gen_MULX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_MULX(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
 
@@ -2239,7 +2449,7 @@
     }
 }
 
-static void gen_NEG(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_NEG(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
     TCGv oldv = tcg_temp_new();
@@ -2266,7 +2476,7 @@
     decode->cc_op = CC_OP_SUBB + ot;
 }
 
-static void gen_NOT(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_NOT(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
 
@@ -2279,7 +2489,7 @@
     }
 }
 
-static void gen_OR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_OR(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[1].ot;
 
@@ -2292,7 +2502,7 @@
     prepare_update1_cc(decode, s, CC_OP_LOGICB + ot);
 }
 
-static void gen_OUT(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_OUT(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[1].ot;
     TCGv_i32 port = tcg_temp_new_i32();
@@ -2309,7 +2519,7 @@
     gen_bpt_io(s, port, ot);
 }
 
-static void gen_OUTS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_OUTS(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[1].ot;
     TCGv_i32 port = tcg_temp_new_i32();
@@ -2328,7 +2538,7 @@
     }
 }
 
-static void gen_PALIGNR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PALIGNR(DisasContext *s, X86DecodedInsn *decode)
 {
     TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
     if (!(s->prefix & PREFIX_DATA)) {
@@ -2340,7 +2550,7 @@
     }
 }
 
-static void gen_PANDN(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PANDN(DisasContext *s, X86DecodedInsn *decode)
 {
     int vec_len = vector_len(s, decode);
 
@@ -2350,7 +2560,7 @@
                       decode->op[1].offset, vec_len, vec_len);
 }
 
-static void gen_PAUSE(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PAUSE(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_update_cc_op(s);
     gen_update_eip_next(s);
@@ -2358,14 +2568,14 @@
     s->base.is_jmp = DISAS_NORETURN;
 }
 
-static void gen_PCMPESTRI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PCMPESTRI(DisasContext *s, X86DecodedInsn *decode)
 {
     TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
     gen_helper_pcmpestri_xmm(tcg_env, OP_PTR1, OP_PTR2, imm);
     assume_cc_op(s, CC_OP_EFLAGS);
 }
 
-static void gen_PCMPESTRM(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PCMPESTRM(DisasContext *s, X86DecodedInsn *decode)
 {
     TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
     gen_helper_pcmpestrm_xmm(tcg_env, OP_PTR1, OP_PTR2, imm);
@@ -2376,14 +2586,14 @@
     }
 }
 
-static void gen_PCMPISTRI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PCMPISTRI(DisasContext *s, X86DecodedInsn *decode)
 {
     TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
     gen_helper_pcmpistri_xmm(tcg_env, OP_PTR1, OP_PTR2, imm);
     assume_cc_op(s, CC_OP_EFLAGS);
 }
 
-static void gen_PCMPISTRM(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PCMPISTRM(DisasContext *s, X86DecodedInsn *decode)
 {
     TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
     gen_helper_pcmpistrm_xmm(tcg_env, OP_PTR1, OP_PTR2, imm);
@@ -2394,17 +2604,17 @@
     }
 }
 
-static void gen_PDEP(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PDEP(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_helper_pdep(s->T0, s->T0, s->T1);
 }
 
-static void gen_PEXT(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PEXT(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_helper_pext(s->T0, s->T0, s->T1);
 }
 
-static inline void gen_pextr(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, MemOp ot)
+static inline void gen_pextr(DisasContext *s, X86DecodedInsn *decode, MemOp ot)
 {
     int vec_len = vector_len(s, decode);
     int mask = (vec_len >> ot) - 1;
@@ -2430,23 +2640,23 @@
     }
 }
 
-static void gen_PEXTRB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PEXTRB(DisasContext *s, X86DecodedInsn *decode)
 {
-    gen_pextr(s, env, decode, MO_8);
+    gen_pextr(s, decode, MO_8);
 }
 
-static void gen_PEXTRW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PEXTRW(DisasContext *s, X86DecodedInsn *decode)
 {
-    gen_pextr(s, env, decode, MO_16);
+    gen_pextr(s, decode, MO_16);
 }
 
-static void gen_PEXTR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PEXTR(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
-    gen_pextr(s, env, decode, ot);
+    gen_pextr(s, decode, ot);
 }
 
-static inline void gen_pinsr(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, MemOp ot)
+static inline void gen_pinsr(DisasContext *s, X86DecodedInsn *decode, MemOp ot)
 {
     int vec_len = vector_len(s, decode);
     int mask = (vec_len >> ot) - 1;
@@ -2477,19 +2687,19 @@
     }
 }
 
-static void gen_PINSRB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PINSRB(DisasContext *s, X86DecodedInsn *decode)
 {
-    gen_pinsr(s, env, decode, MO_8);
+    gen_pinsr(s, decode, MO_8);
 }
 
-static void gen_PINSRW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PINSRW(DisasContext *s, X86DecodedInsn *decode)
 {
-    gen_pinsr(s, env, decode, MO_16);
+    gen_pinsr(s, decode, MO_16);
 }
 
-static void gen_PINSR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PINSR(DisasContext *s, X86DecodedInsn *decode)
 {
-    gen_pinsr(s, env, decode, decode->op[2].ot);
+    gen_pinsr(s, decode, decode->op[2].ot);
 }
 
 static void gen_pmovmskb_i64(TCGv_i64 d, TCGv_i64 s)
@@ -2529,7 +2739,7 @@
     tcg_gen_or_vec(vece, d, d, t);
 }
 
-static void gen_PMOVMSKB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PMOVMSKB(DisasContext *s, X86DecodedInsn *decode)
 {
     static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
     static const GVecGen2 g = {
@@ -2573,7 +2783,7 @@
     }
 }
 
-static void gen_POP(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_POP(DisasContext *s, X86DecodedInsn *decode)
 {
     X86DecodedOp *op = &decode->op[0];
     MemOp ot = gen_pop_T0(s);
@@ -2587,12 +2797,21 @@
     gen_pop_update(s, ot);
 }
 
-static void gen_POPA(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_POPA(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_popa(s);
 }
 
-static void gen_POPF(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_POPCNT(DisasContext *s, X86DecodedInsn *decode)
+{
+    decode->cc_src = tcg_temp_new();
+    decode->cc_op = CC_OP_POPCNT;
+
+    tcg_gen_mov_tl(decode->cc_src, s->T0);
+    tcg_gen_ctpop_tl(s->T0, s->T0);
+}
+
+static void gen_POPF(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot;
     int mask = TF_MASK | AC_MASK | ID_MASK | NT_MASK;
@@ -2614,13 +2833,13 @@
     s->base.is_jmp = DISAS_EOB_NEXT;
 }
 
-static void gen_PSHUFW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PSHUFW(DisasContext *s, X86DecodedInsn *decode)
 {
     TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
     gen_helper_pshufw_mmx(OP_PTR0, OP_PTR1, imm);
 }
 
-static void gen_PSRLW_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PSRLW_i(DisasContext *s, X86DecodedInsn *decode)
 {
     int vec_len = vector_len(s, decode);
 
@@ -2633,7 +2852,7 @@
     }
 }
 
-static void gen_PSLLW_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PSLLW_i(DisasContext *s, X86DecodedInsn *decode)
 {
     int vec_len = vector_len(s, decode);
 
@@ -2646,7 +2865,7 @@
     }
 }
 
-static void gen_PSRAW_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PSRAW_i(DisasContext *s, X86DecodedInsn *decode)
 {
     int vec_len = vector_len(s, decode);
 
@@ -2658,7 +2877,7 @@
                       decode->immediate, vec_len, vec_len);
 }
 
-static void gen_PSRLD_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PSRLD_i(DisasContext *s, X86DecodedInsn *decode)
 {
     int vec_len = vector_len(s, decode);
 
@@ -2671,7 +2890,7 @@
     }
 }
 
-static void gen_PSLLD_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PSLLD_i(DisasContext *s, X86DecodedInsn *decode)
 {
     int vec_len = vector_len(s, decode);
 
@@ -2684,7 +2903,7 @@
     }
 }
 
-static void gen_PSRAD_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PSRAD_i(DisasContext *s, X86DecodedInsn *decode)
 {
     int vec_len = vector_len(s, decode);
 
@@ -2696,7 +2915,7 @@
                       decode->immediate, vec_len, vec_len);
 }
 
-static void gen_PSRLQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PSRLQ_i(DisasContext *s, X86DecodedInsn *decode)
 {
     int vec_len = vector_len(s, decode);
 
@@ -2709,7 +2928,7 @@
     }
 }
 
-static void gen_PSLLQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PSLLQ_i(DisasContext *s, X86DecodedInsn *decode)
 {
     int vec_len = vector_len(s, decode);
 
@@ -2736,7 +2955,7 @@
     return ptr;
 }
 
-static void gen_PSRLDQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PSRLDQ_i(DisasContext *s, X86DecodedInsn *decode)
 {
     int vec_len = vector_len(s, decode);
     TCGv_ptr imm_vec = make_imm8u_xmm_vec(decode->immediate, vec_len);
@@ -2748,7 +2967,7 @@
     }
 }
 
-static void gen_PSLLDQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PSLLDQ_i(DisasContext *s, X86DecodedInsn *decode)
 {
     int vec_len = vector_len(s, decode);
     TCGv_ptr imm_vec = make_imm8u_xmm_vec(decode->immediate, vec_len);
@@ -2760,17 +2979,17 @@
     }
 }
 
-static void gen_PUSH(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PUSH(DisasContext *s, X86DecodedInsn *decode)
 {
-    gen_push_v(s, s->T1);
+    gen_push_v(s, s->T0);
 }
 
-static void gen_PUSHA(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PUSHA(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_pusha(s);
 }
 
-static void gen_PUSHF(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_PUSHF(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_update_cc_op(s);
     gen_helper_read_eflags(s->T0, tcg_env);
@@ -2778,16 +2997,16 @@
 }
 
 static MemOp gen_shift_count(DisasContext *s, X86DecodedInsn *decode,
-                             bool *can_be_zero, TCGv *count)
+                             bool *can_be_zero, TCGv *count, int unit)
 {
     MemOp ot = decode->op[0].ot;
     int mask = (ot <= MO_32 ? 0x1f : 0x3f);
 
     *can_be_zero = false;
-    switch (decode->op[2].unit) {
+    switch (unit) {
     case X86_OP_INT:
         *count = tcg_temp_new();
-        tcg_gen_andi_tl(*count, s->T1, mask);
+        tcg_gen_andi_tl(*count, cpu_regs[R_ECX], mask);
         *can_be_zero = true;
         break;
 
@@ -2967,12 +3186,12 @@
  * length - count, because (length-1) - (count-1) can be computed with
  * a XOR, and that is commutative unlike subtraction.
  */
-static void gen_RCL(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_RCL(DisasContext *s, X86DecodedInsn *decode)
 {
     bool have_1bit_cin, can_be_zero;
     TCGv count;
     TCGLabel *zero_label = NULL;
-    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count);
+    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, decode->op[2].unit);
     TCGv low, high, low_count;
 
     if (!count) {
@@ -3019,12 +3238,12 @@
     }
 }
 
-static void gen_RCR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_RCR(DisasContext *s, X86DecodedInsn *decode)
 {
     bool have_1bit_cin, can_be_zero;
     TCGv count;
     TCGLabel *zero_label = NULL;
-    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count);
+    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, decode->op[2].unit);
     TCGv low, high, high_count;
 
     if (!count) {
@@ -3072,9 +3291,53 @@
     }
 }
 
-static void gen_RET(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+#ifdef CONFIG_USER_ONLY
+static void gen_unreachable(DisasContext *s, X86DecodedInsn *decode)
 {
-    int16_t adjust = decode->e.op2 == X86_TYPE_I ? decode->immediate : 0;
+    g_assert_not_reached();
+}
+#endif
+
+#ifndef CONFIG_USER_ONLY
+static void gen_RDMSR(DisasContext *s, X86DecodedInsn *decode)
+{
+    gen_update_cc_op(s);
+    gen_update_eip_cur(s);
+    gen_helper_rdmsr(tcg_env);
+}
+#else
+#define gen_RDMSR gen_unreachable
+#endif
+
+static void gen_RDPMC(DisasContext *s, X86DecodedInsn *decode)
+{
+    gen_update_cc_op(s);
+    gen_update_eip_cur(s);
+    translator_io_start(&s->base);
+    gen_helper_rdpmc(tcg_env);
+    s->base.is_jmp = DISAS_NORETURN;
+}
+
+static void gen_RDTSC(DisasContext *s, X86DecodedInsn *decode)
+{
+    gen_update_cc_op(s);
+    gen_update_eip_cur(s);
+    translator_io_start(&s->base);
+    gen_helper_rdtsc(tcg_env);
+}
+
+static void gen_RDxxBASE(DisasContext *s, X86DecodedInsn *decode)
+{
+    TCGv base = cpu_seg_base[s->modrm & 8 ? R_GS : R_FS];
+
+    /* Preserve hflags bits by testing CR4 at runtime.  */
+    gen_helper_cr4_testbit(tcg_env, tcg_constant_i32(CR4_FSGSBASE_MASK));
+    tcg_gen_mov_tl(s->T0, base);
+}
+
+static void gen_RET(DisasContext *s, X86DecodedInsn *decode)
+{
+    int16_t adjust = decode->e.op1 == X86_TYPE_I ? decode->immediate : 0;
 
     MemOp ot = gen_pop_T0(s);
     gen_stack_update(s, adjust + (1 << ot));
@@ -3083,9 +3346,9 @@
     s->base.is_jmp = DISAS_JUMP;
 }
 
-static void gen_RETF(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_RETF(DisasContext *s, X86DecodedInsn *decode)
 {
-    int16_t adjust = decode->e.op2 == X86_TYPE_I ? decode->immediate : 0;
+    int16_t adjust = decode->e.op1 == X86_TYPE_I ? decode->immediate : 0;
 
     if (!PE(s) || VM86(s)) {
         gen_lea_ss_ofs(s, s->A0, cpu_regs[R_ESP], 0);
@@ -3154,11 +3417,11 @@
     }
 }
 
-static void gen_ROL(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_ROL(DisasContext *s, X86DecodedInsn *decode)
 {
     bool can_be_zero;
     TCGv count;
-    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count);
+    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, decode->op[2].unit);
     TCGv_i32 temp32, count32;
     TCGv old = tcg_temp_new();
 
@@ -3182,11 +3445,11 @@
     gen_rot_overflow(decode, s->T0, old, can_be_zero, count);
 }
 
-static void gen_ROR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_ROR(DisasContext *s, X86DecodedInsn *decode)
 {
     bool can_be_zero;
     TCGv count;
-    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count);
+    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, decode->op[2].unit);
     TCGv_i32 temp32, count32;
     TCGv old = tcg_temp_new();
 
@@ -3211,7 +3474,7 @@
     gen_rot_overflow(decode, s->T0, old, can_be_zero, count);
 }
 
-static void gen_RORX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_RORX(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
     int mask = ot == MO_64 ? 63 : 31;
@@ -3235,7 +3498,18 @@
     }
 }
 
-static void gen_SAHF(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+#ifndef CONFIG_USER_ONLY
+static void gen_RSM(DisasContext *s, X86DecodedInsn *decode)
+{
+    gen_helper_rsm(tcg_env);
+    assume_cc_op(s, CC_OP_EFLAGS);
+    s->base.is_jmp = DISAS_EOB_ONLY;
+}
+#else
+#define gen_RSM gen_UD
+#endif
+
+static void gen_SAHF(DisasContext *s, X86DecodedInsn *decode)
 {
     if (CODE64(s) && !(s->cpuid_ext3_features & CPUID_EXT3_LAHF_LM)) {
         return gen_illegal_opcode(s);
@@ -3247,7 +3521,7 @@
     tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, s->T0);
 }
 
-static void gen_SALC(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_SALC(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_compute_eflags_c(s, s->T0);
     tcg_gen_neg_tl(s->T0, s->T0);
@@ -3283,11 +3557,11 @@
                         old_cc_op, tcg_constant_i32(cc_op));
 }
 
-static void gen_SAR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_SAR(DisasContext *s, X86DecodedInsn *decode)
 {
     bool can_be_zero;
     TCGv count;
-    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count);
+    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, decode->op[2].unit);
 
     if (!count) {
         return;
@@ -3305,7 +3579,7 @@
     }
 }
 
-static void gen_SARX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_SARX(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
     int mask;
@@ -3315,7 +3589,7 @@
     tcg_gen_sar_tl(s->T0, s->T0, s->T1);
 }
 
-static void gen_SBB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_SBB(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
     TCGv c_in = tcg_temp_new();
@@ -3337,7 +3611,7 @@
     prepare_update3_cc(decode, s, CC_OP_SBBB + ot, c_in);
 }
 
-static void gen_SCAS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_SCAS(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[2].ot;
     if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) {
@@ -3347,27 +3621,32 @@
     }
 }
 
-static void gen_SETcc(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_SETcc(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_setcc1(s, decode->b & 0xf, s->T0);
 }
 
-static void gen_SHA1NEXTE(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_SFENCE(DisasContext *s, X86DecodedInsn *decode)
+{
+    tcg_gen_mb(TCG_MO_ST_ST | TCG_BAR_SC);
+}
+
+static void gen_SHA1NEXTE(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_helper_sha1nexte(OP_PTR0, OP_PTR1, OP_PTR2);
 }
 
-static void gen_SHA1MSG1(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_SHA1MSG1(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_helper_sha1msg1(OP_PTR0, OP_PTR1, OP_PTR2);
 }
 
-static void gen_SHA1MSG2(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_SHA1MSG2(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_helper_sha1msg2(OP_PTR0, OP_PTR1, OP_PTR2);
 }
 
-static void gen_SHA1RNDS4(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_SHA1RNDS4(DisasContext *s, X86DecodedInsn *decode)
 {
     switch(decode->immediate & 3) {
     case 0:
@@ -3385,17 +3664,17 @@
     }
 }
 
-static void gen_SHA256MSG1(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_SHA256MSG1(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_helper_sha256msg1(OP_PTR0, OP_PTR1, OP_PTR2);
 }
 
-static void gen_SHA256MSG2(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_SHA256MSG2(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_helper_sha256msg2(OP_PTR0, OP_PTR1, OP_PTR2);
 }
 
-static void gen_SHA256RNDS2(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_SHA256RNDS2(DisasContext *s, X86DecodedInsn *decode)
 {
     TCGv_i32 wk0 = tcg_temp_new_i32();
     TCGv_i32 wk1 = tcg_temp_new_i32();
@@ -3406,11 +3685,11 @@
     gen_helper_sha256rnds2(OP_PTR0, OP_PTR1, OP_PTR2, wk0, wk1);
 }
 
-static void gen_SHL(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_SHL(DisasContext *s, X86DecodedInsn *decode)
 {
     bool can_be_zero;
     TCGv count;
-    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count);
+    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, decode->op[2].unit);
 
     if (!count) {
         return;
@@ -3428,7 +3707,28 @@
     }
 }
 
-static void gen_SHLX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_SHLD(DisasContext *s, X86DecodedInsn *decode)
+{
+    bool can_be_zero;
+    TCGv count;
+    int unit = decode->e.op3 == X86_TYPE_I ? X86_OP_IMM : X86_OP_INT;
+    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, unit);
+
+    if (!count) {
+        return;
+    }
+
+    decode->cc_dst = s->T0;
+    decode->cc_src = s->tmp0;
+    gen_shiftd_rm_T1(s, ot, false, count);
+    if (can_be_zero) {
+        gen_shift_dynamic_flags(s, decode, count, CC_OP_SHLB + ot);
+    } else {
+        decode->cc_op = CC_OP_SHLB + ot;
+    }
+}
+
+static void gen_SHLX(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
     int mask;
@@ -3438,11 +3738,11 @@
     tcg_gen_shl_tl(s->T0, s->T0, s->T1);
 }
 
-static void gen_SHR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_SHR(DisasContext *s, X86DecodedInsn *decode)
 {
     bool can_be_zero;
     TCGv count;
-    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count);
+    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, decode->op[2].unit);
 
     if (!count) {
         return;
@@ -3460,7 +3760,28 @@
     }
 }
 
-static void gen_SHRX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_SHRD(DisasContext *s, X86DecodedInsn *decode)
+{
+    bool can_be_zero;
+    TCGv count;
+    int unit = decode->e.op3 == X86_TYPE_I ? X86_OP_IMM : X86_OP_INT;
+    MemOp ot = gen_shift_count(s, decode, &can_be_zero, &count, unit);
+
+    if (!count) {
+        return;
+    }
+
+    decode->cc_dst = s->T0;
+    decode->cc_src = s->tmp0;
+    gen_shiftd_rm_T1(s, ot, true, count);
+    if (can_be_zero) {
+        gen_shift_dynamic_flags(s, decode, count, CC_OP_SARB + ot);
+    } else {
+        decode->cc_op = CC_OP_SARB + ot;
+    }
+}
+
+static void gen_SHRX(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
     int mask;
@@ -3470,37 +3791,37 @@
     tcg_gen_shr_tl(s->T0, s->T0, s->T1);
 }
 
-static void gen_STC(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_STC(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_compute_eflags(s);
     tcg_gen_ori_tl(cpu_cc_src, cpu_cc_src, CC_C);
 }
 
-static void gen_STD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_STD(DisasContext *s, X86DecodedInsn *decode)
 {
     tcg_gen_st_i32(tcg_constant_i32(-1), tcg_env, offsetof(CPUX86State, df));
 }
 
-static void gen_STI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_STI(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_set_eflags(s, IF_MASK);
     s->base.is_jmp = DISAS_EOB_INHIBIT_IRQ;
 }
 
-static void gen_VAESKEYGEN(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VAESKEYGEN(DisasContext *s, X86DecodedInsn *decode)
 {
     TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
     assert(!s->vex_l);
     gen_helper_aeskeygenassist_xmm(tcg_env, OP_PTR0, OP_PTR1, imm);
 }
 
-static void gen_STMXCSR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_STMXCSR(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_helper_update_mxcsr(tcg_env);
     tcg_gen_ld32u_tl(s->T0, tcg_env, offsetof(CPUX86State, mxcsr));
 }
 
-static void gen_STOS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_STOS(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[1].ot;
     if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) {
@@ -3510,7 +3831,7 @@
     }
 }
 
-static void gen_SUB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_SUB(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[1].ot;
 
@@ -3526,12 +3847,71 @@
     prepare_update2_cc(decode, s, CC_OP_SUBB + ot);
 }
 
-static void gen_UD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_SYSCALL(DisasContext *s, X86DecodedInsn *decode)
+{
+    gen_update_cc_op(s);
+    gen_update_eip_cur(s);
+    gen_helper_syscall(tcg_env, cur_insn_len_i32(s));
+    if (LMA(s)) {
+        assume_cc_op(s, CC_OP_EFLAGS);
+    }
+
+    /*
+     * TF handling for the syscall insn is different. The TF bit is checked
+     * after the syscall insn completes. This allows #DB to not be
+     * generated after one has entered CPL0 if TF is set in FMASK.
+     */
+    s->base.is_jmp = DISAS_EOB_RECHECK_TF;
+}
+
+static void gen_SYSENTER(DisasContext *s, X86DecodedInsn *decode)
+{
+    gen_helper_sysenter(tcg_env);
+    s->base.is_jmp = DISAS_EOB_ONLY;
+}
+
+static void gen_SYSEXIT(DisasContext *s, X86DecodedInsn *decode)
+{
+    gen_helper_sysexit(tcg_env, tcg_constant_i32(s->dflag - 1));
+    s->base.is_jmp = DISAS_EOB_ONLY;
+}
+
+static void gen_SYSRET(DisasContext *s, X86DecodedInsn *decode)
+{
+    gen_helper_sysret(tcg_env, tcg_constant_i32(s->dflag - 1));
+    if (LMA(s)) {
+        assume_cc_op(s, CC_OP_EFLAGS);
+    }
+
+    /*
+     * TF handling for the sysret insn is different. The TF bit is checked
+     * after the sysret insn completes. This allows #DB to be
+     * generated "as if" the syscall insn in userspace has just
+     * completed.
+     */
+    s->base.is_jmp = DISAS_EOB_RECHECK_TF;
+}
+
+static void gen_TZCNT(DisasContext *s, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[0].ot;
+
+    /* C bit (cc_src) is defined related to the input.  */
+    decode->cc_src = tcg_temp_new();
+    decode->cc_dst = s->T0;
+    decode->cc_op = CC_OP_BMILGB + ot;
+    tcg_gen_mov_tl(decode->cc_src, s->T0);
+
+    /* A zero input returns the operand size.  */
+    tcg_gen_ctzi_tl(s->T0, s->T0, 8 << ot);
+}
+
+static void gen_UD(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_illegal_opcode(s);
 }
 
-static void gen_VAESIMC(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VAESIMC(DisasContext *s, X86DecodedInsn *decode)
 {
     assert(!s->vex_l);
     gen_helper_aesimc_xmm(tcg_env, OP_PTR0, OP_PTR2);
@@ -3586,7 +3966,7 @@
 };
 #undef SSE_CMP
 
-static void gen_VCMP(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VCMP(DisasContext *s, X86DecodedInsn *decode)
 {
     int index = decode->immediate & (s->prefix & PREFIX_VEX ? 31 : 7);
     int b =
@@ -3597,7 +3977,7 @@
     gen_helper_cmp_funcs[index][b](tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
 }
 
-static void gen_VCOMI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VCOMI(DisasContext *s, X86DecodedInsn *decode)
 {
     SSEFunc_0_epp fn;
     fn = s->prefix & PREFIX_DATA ? gen_helper_comisd : gen_helper_comiss;
@@ -3605,7 +3985,7 @@
     assume_cc_op(s, CC_OP_EFLAGS);
 }
 
-static void gen_VCVTPD2PS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VCVTPD2PS(DisasContext *s, X86DecodedInsn *decode)
 {
     if (s->vex_l) {
         gen_helper_cvtpd2ps_ymm(tcg_env, OP_PTR0, OP_PTR2);
@@ -3614,7 +3994,7 @@
     }
 }
 
-static void gen_VCVTPS2PD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VCVTPS2PD(DisasContext *s, X86DecodedInsn *decode)
 {
     if (s->vex_l) {
         gen_helper_cvtps2pd_ymm(tcg_env, OP_PTR0, OP_PTR2);
@@ -3623,9 +4003,9 @@
     }
 }
 
-static void gen_VCVTPS2PH(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VCVTPS2PH(DisasContext *s, X86DecodedInsn *decode)
 {
-    gen_unary_imm_fp_sse(s, env, decode,
+    gen_unary_imm_fp_sse(s, decode,
                       gen_helper_cvtps2ph_xmm,
                       gen_helper_cvtps2ph_ymm);
     /*
@@ -3637,17 +4017,17 @@
     }
 }
 
-static void gen_VCVTSD2SS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VCVTSD2SS(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_helper_cvtsd2ss(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
 }
 
-static void gen_VCVTSS2SD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VCVTSS2SD(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_helper_cvtss2sd(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
 }
 
-static void gen_VCVTSI2Sx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VCVTSI2Sx(DisasContext *s, X86DecodedInsn *decode)
 {
     int vec_len = vector_len(s, decode);
     TCGv_i32 in;
@@ -3677,7 +4057,7 @@
     }
 }
 
-static inline void gen_VCVTtSx2SI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+static inline void gen_VCVTtSx2SI(DisasContext *s, X86DecodedInsn *decode,
                                   SSEFunc_i_ep ss2si, SSEFunc_l_ep ss2sq,
                                   SSEFunc_i_ep sd2si, SSEFunc_l_ep sd2sq)
 {
@@ -3715,21 +4095,21 @@
 #define gen_helper_cvttsd2sq NULL
 #endif
 
-static void gen_VCVTSx2SI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VCVTSx2SI(DisasContext *s, X86DecodedInsn *decode)
 {
-    gen_VCVTtSx2SI(s, env, decode,
+    gen_VCVTtSx2SI(s, decode,
                    gen_helper_cvtss2si, gen_helper_cvtss2sq,
                    gen_helper_cvtsd2si, gen_helper_cvtsd2sq);
 }
 
-static void gen_VCVTTSx2SI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VCVTTSx2SI(DisasContext *s, X86DecodedInsn *decode)
 {
-    gen_VCVTtSx2SI(s, env, decode,
+    gen_VCVTtSx2SI(s, decode,
                    gen_helper_cvttss2si, gen_helper_cvttss2sq,
                    gen_helper_cvttsd2si, gen_helper_cvttsd2sq);
 }
 
-static void gen_VEXTRACTx128(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VEXTRACTx128(DisasContext *s, X86DecodedInsn *decode)
 {
     int mask = decode->immediate & 1;
     int src_ofs = vector_elem_offset(&decode->op[1], MO_128, mask);
@@ -3741,12 +4121,12 @@
     }
 }
 
-static void gen_VEXTRACTPS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VEXTRACTPS(DisasContext *s, X86DecodedInsn *decode)
 {
-    gen_pextr(s, env, decode, MO_32);
+    gen_pextr(s, decode, MO_32);
 }
 
-static void gen_vinsertps(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_vinsertps(DisasContext *s, X86DecodedInsn *decode)
 {
     int val = decode->immediate;
     int dest_word = (val >> 4) & 3;
@@ -3779,21 +4159,21 @@
     }
 }
 
-static void gen_VINSERTPS_r(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VINSERTPS_r(DisasContext *s, X86DecodedInsn *decode)
 {
     int val = decode->immediate;
     tcg_gen_ld_i32(s->tmp2_i32, tcg_env,
                    vector_elem_offset(&decode->op[2], MO_32, (val >> 6) & 3));
-    gen_vinsertps(s, env, decode);
+    gen_vinsertps(s, decode);
 }
 
-static void gen_VINSERTPS_m(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VINSERTPS_m(DisasContext *s, X86DecodedInsn *decode)
 {
     tcg_gen_qemu_ld_i32(s->tmp2_i32, s->A0, s->mem_index, MO_LEUL);
-    gen_vinsertps(s, env, decode);
+    gen_vinsertps(s, decode);
 }
 
-static void gen_VINSERTx128(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VINSERTx128(DisasContext *s, X86DecodedInsn *decode)
 {
     int mask = decode->immediate & 1;
     tcg_gen_gvec_mov(MO_64,
@@ -3804,7 +4184,7 @@
                      decode->op[1].offset + offsetof(YMMReg, YMM_X(!mask)), 16, 16);
 }
 
-static inline void gen_maskmov(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+static inline void gen_maskmov(DisasContext *s, X86DecodedInsn *decode,
                                SSEFunc_0_eppt xmm, SSEFunc_0_eppt ymm)
 {
     if (!s->vex_l) {
@@ -3814,17 +4194,17 @@
     }
 }
 
-static void gen_VMASKMOVPD_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VMASKMOVPD_st(DisasContext *s, X86DecodedInsn *decode)
 {
-    gen_maskmov(s, env, decode, gen_helper_vpmaskmovq_st_xmm, gen_helper_vpmaskmovq_st_ymm);
+    gen_maskmov(s, decode, gen_helper_vpmaskmovq_st_xmm, gen_helper_vpmaskmovq_st_ymm);
 }
 
-static void gen_VMASKMOVPS_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VMASKMOVPS_st(DisasContext *s, X86DecodedInsn *decode)
 {
-    gen_maskmov(s, env, decode, gen_helper_vpmaskmovd_st_xmm, gen_helper_vpmaskmovd_st_ymm);
+    gen_maskmov(s, decode, gen_helper_vpmaskmovd_st_xmm, gen_helper_vpmaskmovd_st_ymm);
 }
 
-static void gen_VMOVHPx_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VMOVHPx_ld(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_ldq_env_A0(s, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
     if (decode->op[0].offset != decode->op[1].offset) {
@@ -3833,12 +4213,12 @@
     }
 }
 
-static void gen_VMOVHPx_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VMOVHPx_st(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_stq_env_A0(s, decode->op[2].offset + offsetof(XMMReg, XMM_Q(1)));
 }
 
-static void gen_VMOVHPx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VMOVHPx(DisasContext *s, X86DecodedInsn *decode)
 {
     if (decode->op[0].offset != decode->op[2].offset) {
         tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[2].offset + offsetof(XMMReg, XMM_Q(1)));
@@ -3850,7 +4230,7 @@
     }
 }
 
-static void gen_VMOVHLPS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VMOVHLPS(DisasContext *s, X86DecodedInsn *decode)
 {
     tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[2].offset + offsetof(XMMReg, XMM_Q(1)));
     tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
@@ -3860,7 +4240,7 @@
     }
 }
 
-static void gen_VMOVLHPS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VMOVLHPS(DisasContext *s, X86DecodedInsn *decode)
 {
     tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[2].offset);
     tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
@@ -3875,7 +4255,7 @@
  * Use a gvec move to move everything above the bottom 64 bits.
  */
 
-static void gen_VMOVLPx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VMOVLPx(DisasContext *s, X86DecodedInsn *decode)
 {
     int vec_len = vector_len(s, decode);
 
@@ -3884,7 +4264,7 @@
     tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
 }
 
-static void gen_VMOVLPx_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VMOVLPx_ld(DisasContext *s, X86DecodedInsn *decode)
 {
     int vec_len = vector_len(s, decode);
 
@@ -3893,13 +4273,13 @@
     tcg_gen_st_i64(s->tmp1_i64, OP_PTR0, offsetof(ZMMReg, ZMM_Q(0)));
 }
 
-static void gen_VMOVLPx_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VMOVLPx_st(DisasContext *s, X86DecodedInsn *decode)
 {
     tcg_gen_ld_i64(s->tmp1_i64, OP_PTR2, offsetof(ZMMReg, ZMM_Q(0)));
     tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ);
 }
 
-static void gen_VMOVSD_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VMOVSD_ld(DisasContext *s, X86DecodedInsn *decode)
 {
     TCGv_i64 zero = tcg_constant_i64(0);
 
@@ -3908,7 +4288,7 @@
     tcg_gen_st_i64(s->tmp1_i64, OP_PTR0, offsetof(ZMMReg, ZMM_Q(0)));
 }
 
-static void gen_VMOVSS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VMOVSS(DisasContext *s, X86DecodedInsn *decode)
 {
     int vec_len = vector_len(s, decode);
 
@@ -3917,7 +4297,7 @@
     tcg_gen_st_i32(s->tmp2_i32, OP_PTR0, offsetof(ZMMReg, ZMM_L(0)));
 }
 
-static void gen_VMOVSS_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VMOVSS_ld(DisasContext *s, X86DecodedInsn *decode)
 {
     int vec_len = vector_len(s, decode);
 
@@ -3926,55 +4306,55 @@
     tcg_gen_st_i32(s->tmp2_i32, OP_PTR0, offsetof(ZMMReg, ZMM_L(0)));
 }
 
-static void gen_VMOVSS_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VMOVSS_st(DisasContext *s, X86DecodedInsn *decode)
 {
     tcg_gen_ld_i32(s->tmp2_i32, OP_PTR2, offsetof(ZMMReg, ZMM_L(0)));
     tcg_gen_qemu_st_i32(s->tmp2_i32, s->A0, s->mem_index, MO_LEUL);
 }
 
-static void gen_VPMASKMOV_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VPMASKMOV_st(DisasContext *s, X86DecodedInsn *decode)
 {
     if (s->vex_w) {
-        gen_VMASKMOVPD_st(s, env, decode);
+        gen_VMASKMOVPD_st(s, decode);
     } else {
-        gen_VMASKMOVPS_st(s, env, decode);
+        gen_VMASKMOVPS_st(s, decode);
     }
 }
 
-static void gen_VPERMD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VPERMD(DisasContext *s, X86DecodedInsn *decode)
 {
     assert(s->vex_l);
     gen_helper_vpermd_ymm(OP_PTR0, OP_PTR1, OP_PTR2);
 }
 
-static void gen_VPERM2x128(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VPERM2x128(DisasContext *s, X86DecodedInsn *decode)
 {
     TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
     assert(s->vex_l);
     gen_helper_vpermdq_ymm(OP_PTR0, OP_PTR1, OP_PTR2, imm);
 }
 
-static void gen_VPHMINPOSUW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VPHMINPOSUW(DisasContext *s, X86DecodedInsn *decode)
 {
     assert(!s->vex_l);
     gen_helper_phminposuw_xmm(tcg_env, OP_PTR0, OP_PTR2);
 }
 
-static void gen_VROUNDSD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VROUNDSD(DisasContext *s, X86DecodedInsn *decode)
 {
     TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
     assert(!s->vex_l);
     gen_helper_roundsd_xmm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm);
 }
 
-static void gen_VROUNDSS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VROUNDSS(DisasContext *s, X86DecodedInsn *decode)
 {
     TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);
     assert(!s->vex_l);
     gen_helper_roundss_xmm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm);
 }
 
-static void gen_VSHUF(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VSHUF(DisasContext *s, X86DecodedInsn *decode)
 {
     TCGv_i32 imm = tcg_constant_i32(decode->immediate);
     SSEFunc_0_pppi ps, pd, fn;
@@ -3984,7 +4364,7 @@
     fn(OP_PTR0, OP_PTR1, OP_PTR2, imm);
 }
 
-static void gen_VUCOMI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VUCOMI(DisasContext *s, X86DecodedInsn *decode)
 {
     SSEFunc_0_epp fn;
     fn = s->prefix & PREFIX_DATA ? gen_helper_ucomisd : gen_helper_ucomiss;
@@ -3992,7 +4372,7 @@
     assume_cc_op(s, CC_OP_EFLAGS);
 }
 
-static void gen_VZEROALL(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VZEROALL(DisasContext *s, X86DecodedInsn *decode)
 {
     TCGv_ptr ptr = tcg_temp_new_ptr();
 
@@ -4001,7 +4381,7 @@
                       tcg_constant_ptr(CPU_NB_REGS * sizeof(ZMMReg)));
 }
 
-static void gen_VZEROUPPER(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_VZEROUPPER(DisasContext *s, X86DecodedInsn *decode)
 {
     int i;
 
@@ -4011,7 +4391,7 @@
     }
 }
 
-static void gen_WAIT(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_WAIT(DisasContext *s, X86DecodedInsn *decode)
 {
     if ((s->flags & (HF_MP_MASK | HF_TS_MASK)) == (HF_MP_MASK | HF_TS_MASK)) {
         gen_NM_exception(s);
@@ -4022,7 +4402,52 @@
     }
 }
 
-static void gen_XCHG(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+#ifndef CONFIG_USER_ONLY
+static void gen_WRMSR(DisasContext *s, X86DecodedInsn *decode)
+{
+    gen_update_cc_op(s);
+    gen_update_eip_cur(s);
+    gen_helper_wrmsr(tcg_env);
+    s->base.is_jmp = DISAS_EOB_NEXT;
+}
+#else
+#define gen_WRMSR gen_unreachable
+#endif
+
+static void gen_WRxxBASE(DisasContext *s, X86DecodedInsn *decode)
+{
+    TCGv base = cpu_seg_base[s->modrm & 8 ? R_GS : R_FS];
+
+    /* Preserve hflags bits by testing CR4 at runtime.  */
+    gen_helper_cr4_testbit(tcg_env, tcg_constant_i32(CR4_FSGSBASE_MASK));
+    tcg_gen_mov_tl(base, s->T0);
+}
+
+static void gen_XADD(DisasContext *s, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[1].ot;
+
+    decode->cc_dst = tcg_temp_new();
+    decode->cc_src = s->T1;
+    decode->cc_op = CC_OP_ADDB + ot;
+
+    if (s->prefix & PREFIX_LOCK) {
+        tcg_gen_atomic_fetch_add_tl(s->T0, s->A0, s->T1, s->mem_index, ot | MO_LE);
+        tcg_gen_add_tl(decode->cc_dst, s->T0, s->T1);
+    } else {
+        tcg_gen_add_tl(decode->cc_dst, s->T0, s->T1);
+        /*
+         * NOTE: writing memory first is important for MMU exceptions,
+         * but "new result" wins for XADD AX, AX.
+         */
+        gen_writeback(s, decode, 0, decode->cc_dst);
+    }
+    if (decode->op[0].has_ea || decode->op[2].n != decode->op[0].n) {
+        gen_writeback(s, decode, 2, s->T0);
+    }
+}
+
+static void gen_XCHG(DisasContext *s, X86DecodedInsn *decode)
 {
     if (s->prefix & PREFIX_LOCK) {
         tcg_gen_atomic_xchg_tl(s->T0, s->A0, s->T1,
@@ -4036,7 +4461,7 @@
     }
 }
 
-static void gen_XLAT(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_XLAT(DisasContext *s, X86DecodedInsn *decode)
 {
     /* AL is already zero-extended into s->T0.  */
     tcg_gen_add_tl(s->A0, cpu_regs[R_EBX], s->T0);
@@ -4044,7 +4469,7 @@
     gen_op_ld_v(s, MO_8, s->T0, s->A0);
 }
 
-static void gen_XOR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+static void gen_XOR(DisasContext *s, X86DecodedInsn *decode)
 {
     /* special case XOR reg, reg */
     if (decode->op[1].unit == X86_OP_INT &&
@@ -4064,3 +4489,34 @@
         prepare_update1_cc(decode, s, CC_OP_LOGICB + ot);
     }
 }
+
+static void gen_XRSTOR(DisasContext *s, X86DecodedInsn *decode)
+{
+    TCGv_i64 features = tcg_temp_new_i64();
+
+    tcg_gen_concat_tl_i64(features, cpu_regs[R_EAX], cpu_regs[R_EDX]);
+    gen_helper_xrstor(tcg_env, s->A0, features);
+    if (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_MPX) {
+        /*
+         * XRSTOR is how MPX is enabled, which changes how
+         * we translate.  Thus we need to end the TB.
+         */
+        s->base.is_jmp = DISAS_EOB_NEXT;
+    }
+}
+
+static void gen_XSAVE(DisasContext *s, X86DecodedInsn *decode)
+{
+    TCGv_i64 features = tcg_temp_new_i64();
+
+    tcg_gen_concat_tl_i64(features, cpu_regs[R_EAX], cpu_regs[R_EDX]);
+    gen_helper_xsave(tcg_env, s->A0, features);
+}
+
+static void gen_XSAVEOPT(DisasContext *s, X86DecodedInsn *decode)
+{
+    TCGv_i64 features = tcg_temp_new_i64();
+
+    tcg_gen_concat_tl_i64(features, cpu_regs[R_EAX], cpu_regs[R_EDX]);
+    gen_helper_xsave(tcg_env, s->A0, features);
+}
diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c
index 715db1f..aee3d19 100644
--- a/target/i386/tcg/seg_helper.c
+++ b/target/i386/tcg/seg_helper.c
@@ -2265,11 +2265,11 @@
 target_ulong helper_lsl(CPUX86State *env, target_ulong selector1)
 {
     unsigned int limit;
-    uint32_t e1, e2, eflags, selector;
+    uint32_t e1, e2, selector;
     int rpl, dpl, cpl, type;
 
     selector = selector1 & 0xffff;
-    eflags = cpu_cc_compute_all(env);
+    assert(CC_OP == CC_OP_EFLAGS);
     if ((selector & 0xfffc) == 0) {
         goto fail;
     }
@@ -2301,22 +2301,22 @@
         }
         if (dpl < cpl || dpl < rpl) {
         fail:
-            CC_SRC = eflags & ~CC_Z;
+            CC_SRC &= ~CC_Z;
             return 0;
         }
     }
     limit = get_seg_limit(e1, e2);
-    CC_SRC = eflags | CC_Z;
+    CC_SRC |= CC_Z;
     return limit;
 }
 
 target_ulong helper_lar(CPUX86State *env, target_ulong selector1)
 {
-    uint32_t e1, e2, eflags, selector;
+    uint32_t e1, e2, selector;
     int rpl, dpl, cpl, type;
 
     selector = selector1 & 0xffff;
-    eflags = cpu_cc_compute_all(env);
+    assert(CC_OP == CC_OP_EFLAGS);
     if ((selector & 0xfffc) == 0) {
         goto fail;
     }
@@ -2351,11 +2351,11 @@
         }
         if (dpl < cpl || dpl < rpl) {
         fail:
-            CC_SRC = eflags & ~CC_Z;
+            CC_SRC &= ~CC_Z;
             return 0;
         }
     }
-    CC_SRC = eflags | CC_Z;
+    CC_SRC |= CC_Z;
     return e2 & 0x00f0ff00;
 }
 
diff --git a/target/i386/tcg/sysemu/misc_helper.c b/target/i386/tcg/sysemu/misc_helper.c
index 7fa0c5a..094aa56 100644
--- a/target/i386/tcg/sysemu/misc_helper.c
+++ b/target/i386/tcg/sysemu/misc_helper.c
@@ -63,23 +63,13 @@
                              cpu_get_mem_attrs(env), NULL);
 }
 
-target_ulong helper_read_crN(CPUX86State *env, int reg)
+target_ulong helper_read_cr8(CPUX86State *env)
 {
-    target_ulong val;
-
-    switch (reg) {
-    default:
-        val = env->cr[reg];
-        break;
-    case 8:
-        if (!(env->hflags2 & HF2_VINTR_MASK)) {
-            val = cpu_get_apic_tpr(env_archcpu(env)->apic_state);
-        } else {
-            val = env->int_ctl & V_TPR_MASK;
-        }
-        break;
+    if (!(env->hflags2 & HF2_VINTR_MASK)) {
+        return cpu_get_apic_tpr(env_archcpu(env)->apic_state);
+    } else {
+        return env->int_ctl & V_TPR_MASK;
     }
-    return val;
 }
 
 void helper_write_crN(CPUX86State *env, int reg, target_ulong t0)
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index fcba9c1..ad18198 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -246,10 +246,6 @@
 STUB_HELPER(outb, TCGv_env env, TCGv_i32 port, TCGv_i32 val)
 STUB_HELPER(outw, TCGv_env env, TCGv_i32 port, TCGv_i32 val)
 STUB_HELPER(outl, TCGv_env env, TCGv_i32 port, TCGv_i32 val)
-STUB_HELPER(rdmsr, TCGv_env env)
-STUB_HELPER(read_crN, TCGv ret, TCGv_env env, TCGv_i32 reg)
-STUB_HELPER(get_dr, TCGv ret, TCGv_env env, TCGv_i32 reg)
-STUB_HELPER(set_dr, TCGv_env env, TCGv_i32 reg, TCGv val)
 STUB_HELPER(stgi, TCGv_env env)
 STUB_HELPER(svm_check_intercept, TCGv_env env, TCGv_i32 type)
 STUB_HELPER(vmload, TCGv_env env, TCGv_i32 aflag)
@@ -257,7 +253,6 @@
 STUB_HELPER(vmrun, TCGv_env env, TCGv_i32 aflag, TCGv_i32 pc_ofs)
 STUB_HELPER(vmsave, TCGv_env env, TCGv_i32 aflag)
 STUB_HELPER(write_crN, TCGv_env env, TCGv_i32 reg, TCGv val)
-STUB_HELPER(wrmsr, TCGv_env env)
 #endif
 
 static void gen_jmp_rel(DisasContext *s, MemOp ot, int diff, int tb_num);
@@ -439,13 +434,6 @@
     return CODE64(s) ? MO_64 : SS32(s) ? MO_32 : MO_16;
 }
 
-/* Select size 8 if lsb of B is clear, else OT.  Used for decoding
-   byte vs word opcodes.  */
-static inline MemOp mo_b_d(int b, MemOp ot)
-{
-    return b & 1 ? ot : MO_8;
-}
-
 /* Compute the result of writing t0 to the OT-sized register REG.
  *
  * If DEST is NULL, store the result into the register and return the
@@ -540,15 +528,6 @@
     tcg_gen_qemu_st_tl(t0, a0, s->mem_index, idx | MO_LE);
 }
 
-static inline void gen_op_st_rm_T0_A0(DisasContext *s, int idx, int d)
-{
-    if (d == OR_TMP0) {
-        gen_op_st_v(s, idx, s->T0, s->A0);
-    } else {
-        gen_op_mov_reg_v(s, idx, d, s->T0);
-    }
-}
-
 static void gen_update_eip_next(DisasContext *s)
 {
     assert(s->pc_save != -1);
@@ -729,11 +708,6 @@
     return dst;
 }
 
-static void gen_extu(MemOp ot, TCGv reg)
-{
-    gen_ext_tl(reg, reg, ot, false);
-}
-
 static void gen_exts(MemOp ot, TCGv reg)
 {
     gen_ext_tl(reg, reg, ot, true);
@@ -837,17 +811,6 @@
     gen_op_add_reg(s, s->aflag, R_EDI, dshift);
 }
 
-static void gen_op_update1_cc(DisasContext *s)
-{
-    tcg_gen_mov_tl(cpu_cc_dst, s->T0);
-}
-
-static void gen_op_update2_cc(DisasContext *s)
-{
-    tcg_gen_mov_tl(cpu_cc_src, s->T1);
-    tcg_gen_mov_tl(cpu_cc_dst, s->T0);
-}
-
 /* compute all eflags to reg */
 static void gen_mov_eflags(DisasContext *s, TCGv reg)
 {
@@ -1448,64 +1411,11 @@
     return false;
 }
 
-static void gen_shift_flags(DisasContext *s, MemOp ot, TCGv result,
-                            TCGv shm1, TCGv count, bool is_right)
-{
-    TCGv_i32 z32, s32, oldop;
-    TCGv z_tl;
-
-    /* Store the results into the CC variables.  If we know that the
-       variable must be dead, store unconditionally.  Otherwise we'll
-       need to not disrupt the current contents.  */
-    z_tl = tcg_constant_tl(0);
-    if (cc_op_live[s->cc_op] & USES_CC_DST) {
-        tcg_gen_movcond_tl(TCG_COND_NE, cpu_cc_dst, count, z_tl,
-                           result, cpu_cc_dst);
-    } else {
-        tcg_gen_mov_tl(cpu_cc_dst, result);
-    }
-    if (cc_op_live[s->cc_op] & USES_CC_SRC) {
-        tcg_gen_movcond_tl(TCG_COND_NE, cpu_cc_src, count, z_tl,
-                           shm1, cpu_cc_src);
-    } else {
-        tcg_gen_mov_tl(cpu_cc_src, shm1);
-    }
-
-    /* Get the two potential CC_OP values into temporaries.  */
-    tcg_gen_movi_i32(s->tmp2_i32, (is_right ? CC_OP_SARB : CC_OP_SHLB) + ot);
-    if (s->cc_op == CC_OP_DYNAMIC) {
-        oldop = cpu_cc_op;
-    } else {
-        tcg_gen_movi_i32(s->tmp3_i32, s->cc_op);
-        oldop = s->tmp3_i32;
-    }
-
-    /* Conditionally store the CC_OP value.  */
-    z32 = tcg_constant_i32(0);
-    s32 = tcg_temp_new_i32();
-    tcg_gen_trunc_tl_i32(s32, count);
-    tcg_gen_movcond_i32(TCG_COND_NE, cpu_cc_op, s32, z32, s->tmp2_i32, oldop);
-
-    /* The CC_OP value is no longer predictable.  */
-    set_cc_op(s, CC_OP_DYNAMIC);
-}
-
 /* XXX: add faster immediate case */
-static void gen_shiftd_rm_T1(DisasContext *s, MemOp ot, int op1,
-                             bool is_right, TCGv count_in)
+static void gen_shiftd_rm_T1(DisasContext *s, MemOp ot,
+                             bool is_right, TCGv count)
 {
     target_ulong mask = (ot == MO_64 ? 63 : 31);
-    TCGv count;
-
-    /* load */
-    if (op1 == OR_TMP0) {
-        gen_op_ld_v(s, ot, s->T0, s->A0);
-    } else {
-        gen_op_mov_v_reg(s, ot, s->T0, op1);
-    }
-
-    count = tcg_temp_new();
-    tcg_gen_andi_tl(count, count_in, mask);
 
     switch (ot) {
     case MO_16:
@@ -1567,11 +1477,6 @@
         tcg_gen_or_tl(s->T0, s->T0, s->T1);
         break;
     }
-
-    /* store */
-    gen_op_st_rm_T0_A0(s, ot, op1);
-
-    gen_shift_flags(s, ot, s->T0, s->tmp0, count, is_right);
 }
 
 #define X86_MAX_INSN_LENGTH 15
@@ -3081,108 +2986,11 @@
     CPUX86State *env = cpu_env(cpu);
     int prefixes = s->prefix;
     MemOp dflag = s->dflag;
-    int shift;
     MemOp ot;
-    int modrm, reg, rm, mod, op, opreg, val;
+    int modrm, reg, rm, mod, op, val;
 
     /* now check op code */
     switch (b) {
-        /**************************/
-        /* arith & logic */
-    case 0x1c0:
-    case 0x1c1: /* xadd Ev, Gv */
-        ot = mo_b_d(b, dflag);
-        modrm = x86_ldub_code(env, s);
-        reg = ((modrm >> 3) & 7) | REX_R(s);
-        mod = (modrm >> 6) & 3;
-        gen_op_mov_v_reg(s, ot, s->T0, reg);
-        if (mod == 3) {
-            rm = (modrm & 7) | REX_B(s);
-            gen_op_mov_v_reg(s, ot, s->T1, rm);
-            tcg_gen_add_tl(s->T0, s->T0, s->T1);
-            gen_op_mov_reg_v(s, ot, reg, s->T1);
-            gen_op_mov_reg_v(s, ot, rm, s->T0);
-        } else {
-            gen_lea_modrm(env, s, modrm);
-            if (s->prefix & PREFIX_LOCK) {
-                tcg_gen_atomic_fetch_add_tl(s->T1, s->A0, s->T0,
-                                            s->mem_index, ot | MO_LE);
-                tcg_gen_add_tl(s->T0, s->T0, s->T1);
-            } else {
-                gen_op_ld_v(s, ot, s->T1, s->A0);
-                tcg_gen_add_tl(s->T0, s->T0, s->T1);
-                gen_op_st_v(s, ot, s->T0, s->A0);
-            }
-            gen_op_mov_reg_v(s, ot, reg, s->T1);
-        }
-        gen_op_update2_cc(s);
-        set_cc_op(s, CC_OP_ADDB + ot);
-        break;
-    case 0x1b0:
-    case 0x1b1: /* cmpxchg Ev, Gv */
-        {
-            TCGv oldv, newv, cmpv, dest;
-
-            ot = mo_b_d(b, dflag);
-            modrm = x86_ldub_code(env, s);
-            reg = ((modrm >> 3) & 7) | REX_R(s);
-            mod = (modrm >> 6) & 3;
-            oldv = tcg_temp_new();
-            newv = tcg_temp_new();
-            cmpv = tcg_temp_new();
-            gen_op_mov_v_reg(s, ot, newv, reg);
-            tcg_gen_mov_tl(cmpv, cpu_regs[R_EAX]);
-            gen_extu(ot, cmpv);
-            if (s->prefix & PREFIX_LOCK) {
-                if (mod == 3) {
-                    goto illegal_op;
-                }
-                gen_lea_modrm(env, s, modrm);
-                tcg_gen_atomic_cmpxchg_tl(oldv, s->A0, cmpv, newv,
-                                          s->mem_index, ot | MO_LE);
-            } else {
-                if (mod == 3) {
-                    rm = (modrm & 7) | REX_B(s);
-                    gen_op_mov_v_reg(s, ot, oldv, rm);
-                    gen_extu(ot, oldv);
-
-                    /*
-                     * Unlike the memory case, where "the destination operand receives
-                     * a write cycle without regard to the result of the comparison",
-                     * rm must not be touched altogether if the write fails, including
-                     * not zero-extending it on 64-bit processors.  So, precompute
-                     * the result of a successful writeback and perform the movcond
-                     * directly on cpu_regs.  Also need to write accumulator first, in
-                     * case rm is part of RAX too.
-                     */
-                    dest = gen_op_deposit_reg_v(s, ot, rm, newv, newv);
-                    tcg_gen_movcond_tl(TCG_COND_EQ, dest, oldv, cmpv, newv, dest);
-                } else {
-                    gen_lea_modrm(env, s, modrm);
-                    gen_op_ld_v(s, ot, oldv, s->A0);
-
-                    /*
-                     * Perform an unconditional store cycle like physical cpu;
-                     * must be before changing accumulator to ensure
-                     * idempotency if the store faults and the instruction
-                     * is restarted
-                     */
-                    tcg_gen_movcond_tl(TCG_COND_EQ, newv, oldv, cmpv, newv, oldv);
-                    gen_op_st_v(s, ot, newv, s->A0);
-                }
-            }
-	    /*
-	     * Write EAX only if the cmpxchg fails; reuse newv as the destination,
-	     * since it's dead here.
-	     */
-            dest = gen_op_deposit_reg_v(s, ot, R_EAX, newv, oldv);
-            tcg_gen_movcond_tl(TCG_COND_EQ, dest, oldv, cmpv, dest, newv);
-            tcg_gen_mov_tl(cpu_cc_src, oldv);
-            tcg_gen_mov_tl(s->cc_srcT, cmpv);
-            tcg_gen_sub_tl(cpu_cc_dst, cmpv, oldv);
-            set_cc_op(s, CC_OP_SUBB + ot);
-        }
-        break;
     case 0x1c7: /* cmpxchg8b */
         modrm = x86_ldub_code(env, s);
         mod = (modrm >> 6) & 3;
@@ -3245,45 +3053,6 @@
         }
         break;
 
-        /**************************/
-        /* shifts */
-    case 0x1a4: /* shld imm */
-        op = 0;
-        shift = 1;
-        goto do_shiftd;
-    case 0x1a5: /* shld cl */
-        op = 0;
-        shift = 0;
-        goto do_shiftd;
-    case 0x1ac: /* shrd imm */
-        op = 1;
-        shift = 1;
-        goto do_shiftd;
-    case 0x1ad: /* shrd cl */
-        op = 1;
-        shift = 0;
-    do_shiftd:
-        ot = dflag;
-        modrm = x86_ldub_code(env, s);
-        mod = (modrm >> 6) & 3;
-        rm = (modrm & 7) | REX_B(s);
-        reg = ((modrm >> 3) & 7) | REX_R(s);
-        if (mod != 3) {
-            gen_lea_modrm(env, s, modrm);
-            opreg = OR_TMP0;
-        } else {
-            opreg = rm;
-        }
-        gen_op_mov_v_reg(s, ot, s->T1, reg);
-
-        if (shift) {
-            TCGv imm = tcg_constant_tl(x86_ldub_code(env, s));
-            gen_shiftd_rm_T1(s, ot, opreg, op, imm);
-        } else {
-            gen_shiftd_rm_T1(s, ot, opreg, op, cpu_regs[R_ECX]);
-        }
-        break;
-
         /************************/
         /* bit operations */
     case 0x1ba: /* bt/bts/btr/btc Gv, im */
@@ -3423,147 +3192,6 @@
             break;
         }
         break;
-    case 0x1bc: /* bsf / tzcnt */
-    case 0x1bd: /* bsr / lzcnt */
-        ot = dflag;
-        modrm = x86_ldub_code(env, s);
-        reg = ((modrm >> 3) & 7) | REX_R(s);
-        gen_ld_modrm(env, s, modrm, ot);
-        gen_extu(ot, s->T0);
-
-        /* Note that lzcnt and tzcnt are in different extensions.  */
-        if ((prefixes & PREFIX_REPZ)
-            && (b & 1
-                ? s->cpuid_ext3_features & CPUID_EXT3_ABM
-                : s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)) {
-            int size = 8 << ot;
-            /* For lzcnt/tzcnt, C bit is defined related to the input. */
-            tcg_gen_mov_tl(cpu_cc_src, s->T0);
-            if (b & 1) {
-                /* For lzcnt, reduce the target_ulong result by the
-                   number of zeros that we expect to find at the top.  */
-                tcg_gen_clzi_tl(s->T0, s->T0, TARGET_LONG_BITS);
-                tcg_gen_subi_tl(s->T0, s->T0, TARGET_LONG_BITS - size);
-            } else {
-                /* For tzcnt, a zero input must return the operand size.  */
-                tcg_gen_ctzi_tl(s->T0, s->T0, size);
-            }
-            /* For lzcnt/tzcnt, Z bit is defined related to the result.  */
-            gen_op_update1_cc(s);
-            set_cc_op(s, CC_OP_BMILGB + ot);
-        } else {
-            /* For bsr/bsf, only the Z bit is defined and it is related
-               to the input and not the result.  */
-            tcg_gen_mov_tl(cpu_cc_dst, s->T0);
-            set_cc_op(s, CC_OP_LOGICB + ot);
-
-            /* ??? The manual says that the output is undefined when the
-               input is zero, but real hardware leaves it unchanged, and
-               real programs appear to depend on that.  Accomplish this
-               by passing the output as the value to return upon zero.  */
-            if (b & 1) {
-                /* For bsr, return the bit index of the first 1 bit,
-                   not the count of leading zeros.  */
-                tcg_gen_xori_tl(s->T1, cpu_regs[reg], TARGET_LONG_BITS - 1);
-                tcg_gen_clz_tl(s->T0, s->T0, s->T1);
-                tcg_gen_xori_tl(s->T0, s->T0, TARGET_LONG_BITS - 1);
-            } else {
-                tcg_gen_ctz_tl(s->T0, s->T0, cpu_regs[reg]);
-            }
-        }
-        gen_op_mov_reg_v(s, ot, reg, s->T0);
-        break;
-    case 0x130: /* wrmsr */
-    case 0x132: /* rdmsr */
-        if (check_cpl0(s)) {
-            gen_update_cc_op(s);
-            gen_update_eip_cur(s);
-            if (b & 2) {
-                gen_helper_rdmsr(tcg_env);
-            } else {
-                gen_helper_wrmsr(tcg_env);
-                s->base.is_jmp = DISAS_EOB_NEXT;
-            }
-        }
-        break;
-    case 0x131: /* rdtsc */
-        gen_update_cc_op(s);
-        gen_update_eip_cur(s);
-        translator_io_start(&s->base);
-        gen_helper_rdtsc(tcg_env);
-        break;
-    case 0x133: /* rdpmc */
-        gen_update_cc_op(s);
-        gen_update_eip_cur(s);
-        gen_helper_rdpmc(tcg_env);
-        s->base.is_jmp = DISAS_NORETURN;
-        break;
-    case 0x134: /* sysenter */
-        /* For AMD SYSENTER is not valid in long mode */
-        if (LMA(s) && env->cpuid_vendor1 != CPUID_VENDOR_INTEL_1) {
-            goto illegal_op;
-        }
-        if (!PE(s)) {
-            gen_exception_gpf(s);
-        } else {
-            gen_helper_sysenter(tcg_env);
-            s->base.is_jmp = DISAS_EOB_ONLY;
-        }
-        break;
-    case 0x135: /* sysexit */
-        /* For AMD SYSEXIT is not valid in long mode */
-        if (LMA(s) && env->cpuid_vendor1 != CPUID_VENDOR_INTEL_1) {
-            goto illegal_op;
-        }
-        if (!PE(s) || CPL(s) != 0) {
-            gen_exception_gpf(s);
-        } else {
-            gen_helper_sysexit(tcg_env, tcg_constant_i32(dflag - 1));
-            s->base.is_jmp = DISAS_EOB_ONLY;
-        }
-        break;
-    case 0x105: /* syscall */
-        /* For Intel SYSCALL is only valid in long mode */
-        if (!LMA(s) && env->cpuid_vendor1 == CPUID_VENDOR_INTEL_1) {
-            goto illegal_op;
-        }
-        gen_update_cc_op(s);
-        gen_update_eip_cur(s);
-        gen_helper_syscall(tcg_env, cur_insn_len_i32(s));
-        /* condition codes are modified only in long mode */
-        if (LMA(s)) {
-            assume_cc_op(s, CC_OP_EFLAGS);
-        }
-        /* TF handling for the syscall insn is different. The TF bit is  checked
-           after the syscall insn completes. This allows #DB to not be
-           generated after one has entered CPL0 if TF is set in FMASK.  */
-        s->base.is_jmp = DISAS_EOB_RECHECK_TF;
-        break;
-    case 0x107: /* sysret */
-        /* For Intel SYSRET is only valid in long mode */
-        if (!LMA(s) && env->cpuid_vendor1 == CPUID_VENDOR_INTEL_1) {
-            goto illegal_op;
-        }
-        if (!PE(s) || CPL(s) != 0) {
-            gen_exception_gpf(s);
-        } else {
-            gen_helper_sysret(tcg_env, tcg_constant_i32(dflag - 1));
-            /* condition codes are modified only in long mode */
-            if (LMA(s)) {
-                assume_cc_op(s, CC_OP_EFLAGS);
-            }
-            /* TF handling for the sysret insn is different. The TF bit is
-               checked after the sysret insn completes. This allows #DB to be
-               generated "as if" the syscall insn in userspace has just
-               completed.  */
-            s->base.is_jmp = DISAS_EOB_RECHECK_TF;
-        }
-        break;
-    case 0x1a2: /* cpuid */
-        gen_update_cc_op(s);
-        gen_update_eip_cur(s);
-        gen_helper_cpuid(tcg_env);
-        break;
     case 0x100:
         modrm = x86_ldub_code(env, s);
         mod = (modrm >> 6) & 3;
@@ -3967,39 +3595,6 @@
         }
         break;
 
-    case 0x108: /* invd */
-    case 0x109: /* wbinvd; wbnoinvd with REPZ prefix */
-        if (check_cpl0(s)) {
-            gen_svm_check_intercept(s, (b & 1) ? SVM_EXIT_WBINVD : SVM_EXIT_INVD);
-            /* nothing to do */
-        }
-        break;
-    case 0x102: /* lar */
-    case 0x103: /* lsl */
-        {
-            TCGLabel *label1;
-            TCGv t0;
-            if (!PE(s) || VM86(s))
-                goto illegal_op;
-            ot = dflag != MO_16 ? MO_32 : MO_16;
-            modrm = x86_ldub_code(env, s);
-            reg = ((modrm >> 3) & 7) | REX_R(s);
-            gen_ld_modrm(env, s, modrm, MO_16);
-            t0 = tcg_temp_new();
-            gen_update_cc_op(s);
-            if (b == 0x102) {
-                gen_helper_lar(t0, tcg_env, s->T0);
-            } else {
-                gen_helper_lsl(t0, tcg_env, s->T0);
-            }
-            tcg_gen_andi_tl(s->tmp0, cpu_cc_src, CC_Z);
-            label1 = gen_new_label();
-            tcg_gen_brcondi_tl(TCG_COND_EQ, s->tmp0, 0, label1);
-            gen_op_mov_reg_v(s, ot, reg, t0);
-            gen_set_label(label1);
-            set_cc_op(s, CC_OP_EFLAGS);
-        }
-        break;
     case 0x11a:
         modrm = x86_ldub_code(env, s);
         if (s->flags & HF_MPX_EN_MASK) {
@@ -4191,311 +3786,6 @@
         }
         gen_nop_modrm(env, s, modrm);
         break;
-
-    case 0x120: /* mov reg, crN */
-    case 0x122: /* mov crN, reg */
-        if (!check_cpl0(s)) {
-            break;
-        }
-        modrm = x86_ldub_code(env, s);
-        /*
-         * Ignore the mod bits (assume (modrm&0xc0)==0xc0).
-         * AMD documentation (24594.pdf) and testing of Intel 386 and 486
-         * processors all show that the mod bits are assumed to be 1's,
-         * regardless of actual values.
-         */
-        rm = (modrm & 7) | REX_B(s);
-        reg = ((modrm >> 3) & 7) | REX_R(s);
-        switch (reg) {
-        case 0:
-            if ((prefixes & PREFIX_LOCK) &&
-                (s->cpuid_ext3_features & CPUID_EXT3_CR8LEG)) {
-                reg = 8;
-            }
-            break;
-        case 2:
-        case 3:
-        case 4:
-        case 8:
-            break;
-        default:
-            goto unknown_op;
-        }
-        ot  = (CODE64(s) ? MO_64 : MO_32);
-
-        translator_io_start(&s->base);
-        if (b & 2) {
-            gen_svm_check_intercept(s, SVM_EXIT_WRITE_CR0 + reg);
-            gen_op_mov_v_reg(s, ot, s->T0, rm);
-            gen_helper_write_crN(tcg_env, tcg_constant_i32(reg), s->T0);
-            s->base.is_jmp = DISAS_EOB_NEXT;
-        } else {
-            gen_svm_check_intercept(s, SVM_EXIT_READ_CR0 + reg);
-            gen_helper_read_crN(s->T0, tcg_env, tcg_constant_i32(reg));
-            gen_op_mov_reg_v(s, ot, rm, s->T0);
-        }
-        break;
-
-    case 0x121: /* mov reg, drN */
-    case 0x123: /* mov drN, reg */
-        if (check_cpl0(s)) {
-            modrm = x86_ldub_code(env, s);
-            /* Ignore the mod bits (assume (modrm&0xc0)==0xc0).
-             * AMD documentation (24594.pdf) and testing of
-             * intel 386 and 486 processors all show that the mod bits
-             * are assumed to be 1's, regardless of actual values.
-             */
-            rm = (modrm & 7) | REX_B(s);
-            reg = ((modrm >> 3) & 7) | REX_R(s);
-            if (CODE64(s))
-                ot = MO_64;
-            else
-                ot = MO_32;
-            if (reg >= 8) {
-                goto illegal_op;
-            }
-            if (b & 2) {
-                gen_svm_check_intercept(s, SVM_EXIT_WRITE_DR0 + reg);
-                gen_op_mov_v_reg(s, ot, s->T0, rm);
-                tcg_gen_movi_i32(s->tmp2_i32, reg);
-                gen_helper_set_dr(tcg_env, s->tmp2_i32, s->T0);
-                s->base.is_jmp = DISAS_EOB_NEXT;
-            } else {
-                gen_svm_check_intercept(s, SVM_EXIT_READ_DR0 + reg);
-                tcg_gen_movi_i32(s->tmp2_i32, reg);
-                gen_helper_get_dr(s->T0, tcg_env, s->tmp2_i32);
-                gen_op_mov_reg_v(s, ot, rm, s->T0);
-            }
-        }
-        break;
-    case 0x106: /* clts */
-        if (check_cpl0(s)) {
-            gen_svm_check_intercept(s, SVM_EXIT_WRITE_CR0);
-            gen_helper_clts(tcg_env);
-            /* abort block because static cpu state changed */
-            s->base.is_jmp = DISAS_EOB_NEXT;
-        }
-        break;
-    /* MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4 support */
-    case 0x1ae:
-        modrm = x86_ldub_code(env, s);
-        switch (modrm) {
-        CASE_MODRM_MEM_OP(0): /* fxsave */
-            if (!(s->cpuid_features & CPUID_FXSR)
-                || (prefixes & PREFIX_LOCK)) {
-                goto illegal_op;
-            }
-            if ((s->flags & HF_EM_MASK) || (s->flags & HF_TS_MASK)) {
-                gen_exception(s, EXCP07_PREX);
-                break;
-            }
-            gen_lea_modrm(env, s, modrm);
-            gen_helper_fxsave(tcg_env, s->A0);
-            break;
-
-        CASE_MODRM_MEM_OP(1): /* fxrstor */
-            if (!(s->cpuid_features & CPUID_FXSR)
-                || (prefixes & PREFIX_LOCK)) {
-                goto illegal_op;
-            }
-            if ((s->flags & HF_EM_MASK) || (s->flags & HF_TS_MASK)) {
-                gen_exception(s, EXCP07_PREX);
-                break;
-            }
-            gen_lea_modrm(env, s, modrm);
-            gen_helper_fxrstor(tcg_env, s->A0);
-            break;
-
-        CASE_MODRM_MEM_OP(2): /* ldmxcsr */
-            if ((s->flags & HF_EM_MASK) || !(s->flags & HF_OSFXSR_MASK)) {
-                goto illegal_op;
-            }
-            if (s->flags & HF_TS_MASK) {
-                gen_exception(s, EXCP07_PREX);
-                break;
-            }
-            gen_lea_modrm(env, s, modrm);
-            tcg_gen_qemu_ld_i32(s->tmp2_i32, s->A0, s->mem_index, MO_LEUL);
-            gen_helper_ldmxcsr(tcg_env, s->tmp2_i32);
-            break;
-
-        CASE_MODRM_MEM_OP(3): /* stmxcsr */
-            if ((s->flags & HF_EM_MASK) || !(s->flags & HF_OSFXSR_MASK)) {
-                goto illegal_op;
-            }
-            if (s->flags & HF_TS_MASK) {
-                gen_exception(s, EXCP07_PREX);
-                break;
-            }
-            gen_helper_update_mxcsr(tcg_env);
-            gen_lea_modrm(env, s, modrm);
-            tcg_gen_ld32u_tl(s->T0, tcg_env, offsetof(CPUX86State, mxcsr));
-            gen_op_st_v(s, MO_32, s->T0, s->A0);
-            break;
-
-        CASE_MODRM_MEM_OP(4): /* xsave */
-            if ((s->cpuid_ext_features & CPUID_EXT_XSAVE) == 0
-                || (prefixes & (PREFIX_LOCK | PREFIX_DATA
-                                | PREFIX_REPZ | PREFIX_REPNZ))) {
-                goto illegal_op;
-            }
-            gen_lea_modrm(env, s, modrm);
-            tcg_gen_concat_tl_i64(s->tmp1_i64, cpu_regs[R_EAX],
-                                  cpu_regs[R_EDX]);
-            gen_helper_xsave(tcg_env, s->A0, s->tmp1_i64);
-            break;
-
-        CASE_MODRM_MEM_OP(5): /* xrstor */
-            if ((s->cpuid_ext_features & CPUID_EXT_XSAVE) == 0
-                || (prefixes & (PREFIX_LOCK | PREFIX_DATA
-                                | PREFIX_REPZ | PREFIX_REPNZ))) {
-                goto illegal_op;
-            }
-            gen_lea_modrm(env, s, modrm);
-            tcg_gen_concat_tl_i64(s->tmp1_i64, cpu_regs[R_EAX],
-                                  cpu_regs[R_EDX]);
-            gen_helper_xrstor(tcg_env, s->A0, s->tmp1_i64);
-            /* XRSTOR is how MPX is enabled, which changes how
-               we translate.  Thus we need to end the TB.  */
-            s->base.is_jmp = DISAS_EOB_NEXT;
-            break;
-
-        CASE_MODRM_MEM_OP(6): /* xsaveopt / clwb */
-            if (prefixes & PREFIX_LOCK) {
-                goto illegal_op;
-            }
-            if (prefixes & PREFIX_DATA) {
-                /* clwb */
-                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_CLWB)) {
-                    goto illegal_op;
-                }
-                gen_nop_modrm(env, s, modrm);
-            } else {
-                /* xsaveopt */
-                if ((s->cpuid_ext_features & CPUID_EXT_XSAVE) == 0
-                    || (s->cpuid_xsave_features & CPUID_XSAVE_XSAVEOPT) == 0
-                    || (prefixes & (PREFIX_REPZ | PREFIX_REPNZ))) {
-                    goto illegal_op;
-                }
-                gen_lea_modrm(env, s, modrm);
-                tcg_gen_concat_tl_i64(s->tmp1_i64, cpu_regs[R_EAX],
-                                      cpu_regs[R_EDX]);
-                gen_helper_xsaveopt(tcg_env, s->A0, s->tmp1_i64);
-            }
-            break;
-
-        CASE_MODRM_MEM_OP(7): /* clflush / clflushopt */
-            if (prefixes & PREFIX_LOCK) {
-                goto illegal_op;
-            }
-            if (prefixes & PREFIX_DATA) {
-                /* clflushopt */
-                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_CLFLUSHOPT)) {
-                    goto illegal_op;
-                }
-            } else {
-                /* clflush */
-                if ((s->prefix & (PREFIX_REPZ | PREFIX_REPNZ))
-                    || !(s->cpuid_features & CPUID_CLFLUSH)) {
-                    goto illegal_op;
-                }
-            }
-            gen_nop_modrm(env, s, modrm);
-            break;
-
-        case 0xc0 ... 0xc7: /* rdfsbase (f3 0f ae /0) */
-        case 0xc8 ... 0xcf: /* rdgsbase (f3 0f ae /1) */
-        case 0xd0 ... 0xd7: /* wrfsbase (f3 0f ae /2) */
-        case 0xd8 ... 0xdf: /* wrgsbase (f3 0f ae /3) */
-            if (CODE64(s)
-                && (prefixes & PREFIX_REPZ)
-                && !(prefixes & PREFIX_LOCK)
-                && (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_FSGSBASE)) {
-                TCGv base, treg, src, dst;
-
-                /* Preserve hflags bits by testing CR4 at runtime.  */
-                tcg_gen_movi_i32(s->tmp2_i32, CR4_FSGSBASE_MASK);
-                gen_helper_cr4_testbit(tcg_env, s->tmp2_i32);
-
-                base = cpu_seg_base[modrm & 8 ? R_GS : R_FS];
-                treg = cpu_regs[(modrm & 7) | REX_B(s)];
-
-                if (modrm & 0x10) {
-                    /* wr*base */
-                    dst = base, src = treg;
-                } else {
-                    /* rd*base */
-                    dst = treg, src = base;
-                }
-
-                if (s->dflag == MO_32) {
-                    tcg_gen_ext32u_tl(dst, src);
-                } else {
-                    tcg_gen_mov_tl(dst, src);
-                }
-                break;
-            }
-            goto unknown_op;
-
-        case 0xf8 ... 0xff: /* sfence */
-            if (!(s->cpuid_features & CPUID_SSE)
-                || (prefixes & PREFIX_LOCK)) {
-                goto illegal_op;
-            }
-            tcg_gen_mb(TCG_MO_ST_ST | TCG_BAR_SC);
-            break;
-        case 0xe8 ... 0xef: /* lfence */
-            if (!(s->cpuid_features & CPUID_SSE)
-                || (prefixes & PREFIX_LOCK)) {
-                goto illegal_op;
-            }
-            tcg_gen_mb(TCG_MO_LD_LD | TCG_BAR_SC);
-            break;
-        case 0xf0 ... 0xf7: /* mfence */
-            if (!(s->cpuid_features & CPUID_SSE2)
-                || (prefixes & PREFIX_LOCK)) {
-                goto illegal_op;
-            }
-            tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
-            break;
-
-        default:
-            goto unknown_op;
-        }
-        break;
-
-    case 0x1aa: /* rsm */
-        gen_svm_check_intercept(s, SVM_EXIT_RSM);
-        if (!(s->flags & HF_SMM_MASK))
-            goto illegal_op;
-#ifdef CONFIG_USER_ONLY
-        /* we should not be in SMM mode */
-        g_assert_not_reached();
-#else
-        gen_helper_rsm(tcg_env);
-        assume_cc_op(s, CC_OP_EFLAGS);
-#endif /* CONFIG_USER_ONLY */
-        s->base.is_jmp = DISAS_EOB_ONLY;
-        break;
-    case 0x1b8: /* SSE4.2 popcnt */
-        if ((prefixes & (PREFIX_REPZ | PREFIX_LOCK | PREFIX_REPNZ)) !=
-             PREFIX_REPZ)
-            goto illegal_op;
-        if (!(s->cpuid_ext_features & CPUID_EXT_POPCNT))
-            goto illegal_op;
-
-        modrm = x86_ldub_code(env, s);
-        reg = ((modrm >> 3) & 7) | REX_R(s);
-
-        ot = dflag;
-        gen_ld_modrm(env, s, modrm, ot);
-        gen_extu(ot, s->T0);
-        tcg_gen_mov_tl(cpu_cc_src, s->T0);
-        tcg_gen_ctpop_tl(s->T0, s->T0);
-        gen_op_mov_reg_v(s, ot, reg, s->T0);
-
-        set_cc_op(s, CC_OP_POPCNT);
-        break;
     default:
         g_assert_not_reached();
     }