Merge tag 'for-upstream' of https://gitlab.com/bonzini/qemu into staging

* target/i386: Introduce SapphireRapids-v3 to add missing features
* switch boards to "default y"
* allow building emulators without any board
* configs: list "implied" device groups in the default configs
* remove unnecessary declarations from typedefs.h
* target/i386: Give IRQs a chance when resetting HF_INHIBIT_IRQ_MASK

# -----BEGIN PGP SIGNATURE-----
#
# iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmY1ILsUHHBib256aW5p
# QHJlZGhhdC5jb20ACgkQv/vSX3jHroNtIwf+MEehq2HudZvsK1M8FrvNmkB/AssO
# x4tqL8DlTus23mQDBu9+rANTB93ManJdK9ybtf6NfjEwK+R8RJslLVnuy/qT+aQX
# PD208L88fjZg17G8uyawwvD1VmqWzHFSN14ShmKzqB2yPXXo/1cJ30w78DbD50yC
# 6rw/xbC5j195CwE2u8eBcIyY4Hh2PUYEE4uyHbYVr57cMjfmmA5Pg4I4FJrpLrF3
# eM2Avl/4pIbsW3zxXVB8QbAkgypxZErk3teDK1AkPJnlnBYM1jGKbt/GdKe7vcHR
# V/o+7NlcbS3oHVItQ2gP3m91stjFq+NhixaZpa0VlmuqayBa3xNGl0G6OQ==
# =ZbNW
# -----END PGP SIGNATURE-----
# gpg: Signature made Fri 03 May 2024 10:36:59 AM PDT
# gpg:                using RSA key F13338574B662389866C7682BFFBD25F78C7AE83
# gpg:                issuer "pbonzini@redhat.com"
# gpg: Good signature from "Paolo Bonzini <bonzini@gnu.org>" [full]
# gpg:                 aka "Paolo Bonzini <pbonzini@redhat.com>" [full]

* tag 'for-upstream' of https://gitlab.com/bonzini/qemu: (46 commits)
  qga/commands-posix: fix typo in qmp_guest_set_user_password
  migration: do not include coroutine_int.h
  kvm: move target-dependent interrupt routing out of kvm-all.c
  pci: remove some types from typedefs.h
  tcg: remove CPU* types from typedefs.h
  display: remove GraphicHwOps from typedefs.h
  qapi/machine: remove types from typedefs.h
  monitor: remove MonitorDef from typedefs.h
  migration: remove PostcopyDiscardState from typedefs.h
  lockable: remove QemuLockable from typedefs.h
  intc: remove PICCommonState from typedefs.h
  qemu-option: remove QemuOpt from typedefs.h
  net: remove AnnounceTimer from typedefs.h
  numa: remove types from typedefs.h
  qdev-core: remove DeviceListener from typedefs.h
  fw_cfg: remove useless declarations from typedefs.h
  build: do not build virtio-vga-gl if virgl/opengl not available
  bitmap: Use g_try_new0/g_new0/g_renew
  target/i386: Introduce SapphireRapids-v3 to add missing features
  docs: document new convention for Kconfig board symbols
  ...

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
diff --git a/MAINTAINERS b/MAINTAINERS
index 302b6fd..2f08cc5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -532,7 +532,7 @@
 ---------------------
 X86 Xen CPUs
 M: Stefano Stabellini <sstabellini@kernel.org>
-M: Anthony Perard <anthony.perard@citrix.com>
+M: Anthony PERARD <anthony@xenproject.org>
 M: Paul Durrant <paul@xen.org>
 L: xen-devel@lists.xenproject.org
 S: Supported
@@ -3692,7 +3692,6 @@
 M: Riku Voipio <riku.voipio@iki.fi>
 S: Maintained
 F: accel/tcg/user-exec*.c
-F: include/exec/user/
 F: include/user/
 F: common-user/
 
diff --git a/bsd-user/qemu.h b/bsd-user/qemu.h
index 8629f0d..a916724 100644
--- a/bsd-user/qemu.h
+++ b/bsd-user/qemu.h
@@ -22,11 +22,11 @@
 #include "exec/cpu_ldst.h"
 #include "exec/exec-all.h"
 
-#include "exec/user/abitypes.h"
+#include "user/abitypes.h"
 
 extern char **environ;
 
-#include "exec/user/thunk.h"
+#include "user/thunk.h"
 #include "target_arch.h"
 #include "syscall_defs.h"
 #include "target_syscall.h"
diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
index e75ec13..032c6d9 100644
--- a/include/exec/cpu-all.h
+++ b/include/exec/cpu-all.h
@@ -64,7 +64,7 @@
 /* MMU memory access macros */
 
 #if defined(CONFIG_USER_ONLY)
-#include "exec/user/abitypes.h"
+#include "user/abitypes.h"
 
 /*
  * If non-zero, the guest virtual address space is a contiguous subset
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 6d53188..8812ba7 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -1,8 +1,13 @@
+/*
+ * CPU interfaces that are target independent.
+ *
+ *  Copyright (c) 2003 Fabrice Bellard
+ *
+ * SPDX-License-Identifier: LGPL-2.1+
+ */
 #ifndef CPU_COMMON_H
 #define CPU_COMMON_H
 
-/* CPU interfaces that are target independent.  */
-
 #include "exec/vaddr.h"
 #ifndef CONFIG_USER_ONLY
 #include "exec/hwaddr.h"
diff --git a/include/qemu/cutils.h b/include/qemu/cutils.h
index 92c927a..741dade 100644
--- a/include/qemu/cutils.h
+++ b/include/qemu/cutils.h
@@ -187,9 +187,39 @@
 /* used to print char* safely */
 #define STR_OR_NULL(str) ((str) ? (str) : "null")
 
-bool buffer_is_zero(const void *buf, size_t len);
+/*
+ * Check if a buffer is all zeroes.
+ */
+
+bool buffer_is_zero_ool(const void *vbuf, size_t len);
+bool buffer_is_zero_ge256(const void *vbuf, size_t len);
 bool test_buffer_is_zero_next_accel(void);
 
+static inline bool buffer_is_zero_sample3(const char *buf, size_t len)
+{
+    /*
+     * For any reasonably sized buffer, these three samples come from
+     * three different cachelines.  In qemu-img usage, we find that
+     * each byte eliminates more than half of all buffer testing.
+     * It is therefore critical to performance that the byte tests
+     * short-circuit, so that we do not pull in additional cache lines.
+     * Do not "optimize" this to !(a | b | c).
+     */
+    return !buf[0] && !buf[len - 1] && !buf[len / 2];
+}
+
+#ifdef __OPTIMIZE__
+static inline bool buffer_is_zero(const void *buf, size_t len)
+{
+    return (__builtin_constant_p(len) && len >= 256
+            ? buffer_is_zero_sample3(buf, len) &&
+              buffer_is_zero_ge256(buf, len)
+            : buffer_is_zero_ool(buf, len));
+}
+#else
+#define buffer_is_zero  buffer_is_zero_ool
+#endif
+
 /*
  * Implementation of ULEB128 (http://en.wikipedia.org/wiki/LEB128)
  * Input is limited to 14-bit numbers
diff --git a/include/exec/user/abitypes.h b/include/user/abitypes.h
similarity index 97%
rename from include/exec/user/abitypes.h
rename to include/user/abitypes.h
index 3ec1969..5c9a955 100644
--- a/include/exec/user/abitypes.h
+++ b/include/user/abitypes.h
@@ -1,5 +1,5 @@
-#ifndef EXEC_USER_ABITYPES_H
-#define EXEC_USER_ABITYPES_H
+#ifndef USER_ABITYPES_H
+#define USER_ABITYPES_H
 
 #ifndef CONFIG_USER_ONLY
 #error Cannot include this header from system emulation
diff --git a/include/user/syscall-trace.h b/include/user/syscall-trace.h
index b48b2b2..9bd7ca1 100644
--- a/include/user/syscall-trace.h
+++ b/include/user/syscall-trace.h
@@ -10,7 +10,7 @@
 #ifndef SYSCALL_TRACE_H
 #define SYSCALL_TRACE_H
 
-#include "exec/user/abitypes.h"
+#include "user/abitypes.h"
 #include "gdbstub/user.h"
 #include "qemu/plugin.h"
 #include "trace/trace-root.h"
diff --git a/include/exec/user/thunk.h b/include/user/thunk.h
similarity index 96%
rename from include/exec/user/thunk.h
rename to include/user/thunk.h
index 2ebfecf..2a2104b 100644
--- a/include/exec/user/thunk.h
+++ b/include/user/thunk.h
@@ -17,11 +17,15 @@
  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  */
 
-#ifndef THUNK_H
-#define THUNK_H
+#ifndef USER_THUNK_H
+#define USER_THUNK_H
+
+#ifndef CONFIG_USER_ONLY
+#error Cannot include this header from system emulation
+#endif
 
 #include "cpu.h"
-#include "exec/user/abitypes.h"
+#include "user/abitypes.h"
 
 /* types enums definitions */
 
diff --git a/linux-user/qemu.h b/linux-user/qemu.h
index 4777856..263f445 100644
--- a/linux-user/qemu.h
+++ b/linux-user/qemu.h
@@ -4,7 +4,7 @@
 #include "cpu.h"
 #include "exec/cpu_ldst.h"
 
-#include "exec/user/abitypes.h"
+#include "user/abitypes.h"
 
 #include "syscall_defs.h"
 #include "target_syscall.h"
diff --git a/linux-user/thunk.c b/linux-user/thunk.c
index 071aad4..3cd19e7 100644
--- a/linux-user/thunk.c
+++ b/linux-user/thunk.c
@@ -20,7 +20,7 @@
 #include "qemu/log.h"
 
 #include "qemu.h"
-#include "exec/user/thunk.h"
+#include "user/thunk.h"
 
 //#define DEBUG
 
diff --git a/linux-user/user-internals.h b/linux-user/user-internals.h
index ce11d9e..5c7f173 100644
--- a/linux-user/user-internals.h
+++ b/linux-user/user-internals.h
@@ -18,7 +18,7 @@
 #ifndef LINUX_USER_USER_INTERNALS_H
 #define LINUX_USER_USER_INTERNALS_H
 
-#include "exec/user/thunk.h"
+#include "user/thunk.h"
 #include "exec/exec-all.h"
 #include "exec/tb-flush.h"
 #include "qemu/log.h"
diff --git a/plugins/api.c b/plugins/api.c
index 3912c9c..2144da1 100644
--- a/plugins/api.c
+++ b/plugins/api.c
@@ -42,10 +42,10 @@
 #include "tcg/tcg.h"
 #include "exec/exec-all.h"
 #include "exec/gdbstub.h"
-#include "exec/ram_addr.h"
 #include "disas/disas.h"
 #include "plugin.h"
 #ifndef CONFIG_USER_ONLY
+#include "exec/ram_addr.h"
 #include "qemu/plugin-memory.h"
 #include "hw/boards.h"
 #else
diff --git a/plugins/core.c b/plugins/core.c
index 0213513..081323d 100644
--- a/plugins/core.c
+++ b/plugins/core.c
@@ -410,7 +410,7 @@
     struct qemu_plugin_cb *cb, *next;
     enum qemu_plugin_event ev = QEMU_PLUGIN_EV_VCPU_TB_TRANS;
 
-    /* no plugin_mask check here; caller should have checked */
+    /* no plugin_state->event_mask check here; caller should have checked */
 
     QLIST_FOREACH_SAFE_RCU(cb, &plugin.cb_lists[ev], entry, next) {
         qemu_plugin_vcpu_tb_trans_cb_t func = cb->f.vcpu_tb_trans;
diff --git a/scripts/coverity-scan/COMPONENTS.md b/scripts/coverity-scan/COMPONENTS.md
index 91be8d1..1537e49 100644
--- a/scripts/coverity-scan/COMPONENTS.md
+++ b/scripts/coverity-scan/COMPONENTS.md
@@ -121,7 +121,7 @@
   ~ (/qemu)?(/hw/usb/.*|/include/hw/usb/.*)
 
 user
-  ~ (/qemu)?(/linux-user/.*|/bsd-user/.*|/user-exec\.c|/thunk\.c|/include/exec/user/.*)
+  ~ (/qemu)?(/linux-user/.*|/bsd-user/.*|/user-exec\.c|/thunk\.c|/include/user/.*)
 
 util
   ~ (/qemu)?(/util/.*|/include/qemu/.*)
diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c
index 05f9ee41..0e2fbcb 100644
--- a/target/alpha/cpu.c
+++ b/target/alpha/cpu.c
@@ -28,25 +28,37 @@
 
 static void alpha_cpu_set_pc(CPUState *cs, vaddr value)
 {
-    AlphaCPU *cpu = ALPHA_CPU(cs);
-
-    cpu->env.pc = value;
+    CPUAlphaState *env = cpu_env(cs);
+    env->pc = value;
 }
 
 static vaddr alpha_cpu_get_pc(CPUState *cs)
 {
-    AlphaCPU *cpu = ALPHA_CPU(cs);
+    CPUAlphaState *env = cpu_env(cs);
+    return env->pc;
+}
 
-    return cpu->env.pc;
+static void alpha_cpu_synchronize_from_tb(CPUState *cs,
+                                          const TranslationBlock *tb)
+{
+    /* The program counter is always up to date with CF_PCREL. */
+    if (!(tb_cflags(tb) & CF_PCREL)) {
+        CPUAlphaState *env = cpu_env(cs);
+        env->pc = tb->pc;
+    }
 }
 
 static void alpha_restore_state_to_opc(CPUState *cs,
                                        const TranslationBlock *tb,
                                        const uint64_t *data)
 {
-    AlphaCPU *cpu = ALPHA_CPU(cs);
+    CPUAlphaState *env = cpu_env(cs);
 
-    cpu->env.pc = data[0];
+    if (tb_cflags(tb) & CF_PCREL) {
+        env->pc = (env->pc & TARGET_PAGE_MASK) | data[0];
+    } else {
+        env->pc = data[0];
+    }
 }
 
 static bool alpha_cpu_has_work(CPUState *cs)
@@ -81,6 +93,11 @@
     AlphaCPUClass *acc = ALPHA_CPU_GET_CLASS(dev);
     Error *local_err = NULL;
 
+#ifndef CONFIG_USER_ONLY
+    /* Use pc-relative instructions in system-mode */
+    cs->tcg_cflags |= CF_PCREL;
+#endif
+
     cpu_exec_realizefn(cs, &local_err);
     if (local_err != NULL) {
         error_propagate(errp, local_err);
@@ -193,6 +210,7 @@
 
 static const TCGCPUOps alpha_tcg_ops = {
     .initialize = alpha_translate_init,
+    .synchronize_from_tb = alpha_cpu_synchronize_from_tb,
     .restore_state_to_opc = alpha_restore_state_to_opc,
 
 #ifdef CONFIG_USER_ONLY
diff --git a/target/alpha/helper.c b/target/alpha/helper.c
index d6d4353..c5e4958 100644
--- a/target/alpha/helper.c
+++ b/target/alpha/helper.c
@@ -124,7 +124,7 @@
                               MMUAccessType access_type,
                               bool maperr, uintptr_t retaddr)
 {
-    AlphaCPU *cpu = ALPHA_CPU(cs);
+    CPUAlphaState *env = cpu_env(cs);
     target_ulong mmcsr, cause;
 
     /* Assuming !maperr, infer the missing protection. */
@@ -155,9 +155,9 @@
     }
 
     /* Record the arguments that PALcode would give to the kernel. */
-    cpu->env.trap_arg0 = address;
-    cpu->env.trap_arg1 = mmcsr;
-    cpu->env.trap_arg2 = cause;
+    env->trap_arg0 = address;
+    env->trap_arg1 = mmcsr;
+    env->trap_arg2 = cause;
 }
 #else
 /* Returns the OSF/1 entMM failure indication, or -1 on success.  */
diff --git a/target/alpha/translate.c b/target/alpha/translate.c
index a97cd54..db847e7 100644
--- a/target/alpha/translate.c
+++ b/target/alpha/translate.c
@@ -54,6 +54,9 @@
     uint32_t tbflags;
     int mem_idx;
 
+    /* True if generating pc-relative code.  */
+    bool pcrel;
+
     /* implver and amask values for this CPU.  */
     int implver;
     int amask;
@@ -252,6 +255,16 @@
     tcg_gen_st8_i64(val, tcg_env, get_flag_ofs(shift));
 }
 
+static void gen_pc_disp(DisasContext *ctx, TCGv dest, int32_t disp)
+{
+    uint64_t addr = ctx->base.pc_next + disp;
+    if (ctx->pcrel) {
+        tcg_gen_addi_i64(dest, cpu_pc, addr - ctx->base.pc_first);
+    } else {
+        tcg_gen_movi_i64(dest, addr);
+    }
+}
+
 static void gen_excp_1(int exception, int error_code)
 {
     TCGv_i32 tmp1, tmp2;
@@ -263,7 +276,7 @@
 
 static DisasJumpType gen_excp(DisasContext *ctx, int exception, int error_code)
 {
-    tcg_gen_movi_i64(cpu_pc, ctx->base.pc_next);
+    gen_pc_disp(ctx, cpu_pc, 0);
     gen_excp_1(exception, error_code);
     return DISAS_NORETURN;
 }
@@ -425,60 +438,49 @@
     return DISAS_NEXT;
 }
 
-static bool use_goto_tb(DisasContext *ctx, uint64_t dest)
+static void gen_goto_tb(DisasContext *ctx, int idx, int32_t disp)
 {
-    return translator_use_goto_tb(&ctx->base, dest);
+    if (translator_use_goto_tb(&ctx->base, ctx->base.pc_next + disp)) {
+        /* With PCREL, PC must always be up-to-date. */
+        if (ctx->pcrel) {
+            gen_pc_disp(ctx, cpu_pc, disp);
+            tcg_gen_goto_tb(idx);
+        } else {
+            tcg_gen_goto_tb(idx);
+            gen_pc_disp(ctx, cpu_pc, disp);
+        }
+        tcg_gen_exit_tb(ctx->base.tb, idx);
+    } else {
+        gen_pc_disp(ctx, cpu_pc, disp);
+        tcg_gen_lookup_and_goto_ptr();
+    }
 }
 
 static DisasJumpType gen_bdirect(DisasContext *ctx, int ra, int32_t disp)
 {
-    uint64_t dest = ctx->base.pc_next + (disp << 2);
-
     if (ra != 31) {
-        tcg_gen_movi_i64(ctx->ir[ra], ctx->base.pc_next);
+        gen_pc_disp(ctx, ctx->ir[ra], 0);
     }
 
     /* Notice branch-to-next; used to initialize RA with the PC.  */
     if (disp == 0) {
-        return 0;
-    } else if (use_goto_tb(ctx, dest)) {
-        tcg_gen_goto_tb(0);
-        tcg_gen_movi_i64(cpu_pc, dest);
-        tcg_gen_exit_tb(ctx->base.tb, 0);
-        return DISAS_NORETURN;
-    } else {
-        tcg_gen_movi_i64(cpu_pc, dest);
-        return DISAS_PC_UPDATED;
+        return DISAS_NEXT;
     }
+    gen_goto_tb(ctx, 0, disp);
+    return DISAS_NORETURN;
 }
 
 static DisasJumpType gen_bcond_internal(DisasContext *ctx, TCGCond cond,
                                         TCGv cmp, uint64_t imm, int32_t disp)
 {
-    uint64_t dest = ctx->base.pc_next + (disp << 2);
     TCGLabel *lab_true = gen_new_label();
 
-    if (use_goto_tb(ctx, dest)) {
-        tcg_gen_brcondi_i64(cond, cmp, imm, lab_true);
+    tcg_gen_brcondi_i64(cond, cmp, imm, lab_true);
+    gen_goto_tb(ctx, 0, 0);
+    gen_set_label(lab_true);
+    gen_goto_tb(ctx, 1, disp);
 
-        tcg_gen_goto_tb(0);
-        tcg_gen_movi_i64(cpu_pc, ctx->base.pc_next);
-        tcg_gen_exit_tb(ctx->base.tb, 0);
-
-        gen_set_label(lab_true);
-        tcg_gen_goto_tb(1);
-        tcg_gen_movi_i64(cpu_pc, dest);
-        tcg_gen_exit_tb(ctx->base.tb, 1);
-
-        return DISAS_NORETURN;
-    } else {
-        TCGv_i64 i = tcg_constant_i64(imm);
-        TCGv_i64 d = tcg_constant_i64(dest);
-        TCGv_i64 p = tcg_constant_i64(ctx->base.pc_next);
-
-        tcg_gen_movcond_i64(cond, cpu_pc, cmp, i, d, p);
-        return DISAS_PC_UPDATED;
-    }
+    return DISAS_NORETURN;
 }
 
 static DisasJumpType gen_bcond(DisasContext *ctx, TCGCond cond, int ra,
@@ -1106,7 +1108,7 @@
             }
 
             /* Allow interrupts to be recognized right away.  */
-            tcg_gen_movi_i64(cpu_pc, ctx->base.pc_next);
+            gen_pc_disp(ctx, cpu_pc, 0);
             return DISAS_PC_UPDATED_NOCHAIN;
 
         case 0x36:
@@ -1153,19 +1155,17 @@
 #else
     {
         TCGv tmp = tcg_temp_new();
-        uint64_t exc_addr = ctx->base.pc_next;
-        uint64_t entry = ctx->palbr;
+        uint64_t entry;
 
+        gen_pc_disp(ctx, tmp, 0);
         if (ctx->tbflags & ENV_FLAG_PAL_MODE) {
-            exc_addr |= 1;
+            tcg_gen_ori_i64(tmp, tmp, 1);
         } else {
-            tcg_gen_movi_i64(tmp, 1);
-            st_flag_byte(tmp, ENV_FLAG_PAL_SHIFT);
+            st_flag_byte(tcg_constant_i64(1), ENV_FLAG_PAL_SHIFT);
         }
-
-        tcg_gen_movi_i64(tmp, exc_addr);
         tcg_gen_st_i64(tmp, tcg_env, offsetof(CPUAlphaState, exc_addr));
 
+        entry = ctx->palbr;
         entry += (palcode & 0x80
                   ? 0x2000 + (palcode - 0x80) * 64
                   : 0x1000 + palcode * 64);
@@ -1382,7 +1382,7 @@
     real_islit = islit = extract32(insn, 12, 1);
     lit = extract32(insn, 13, 8);
 
-    disp21 = sextract32(insn, 0, 21);
+    disp21 = sextract32(insn, 0, 21) * 4;
     disp16 = sextract32(insn, 0, 16);
     disp12 = sextract32(insn, 0, 12);
 
@@ -2359,9 +2359,13 @@
         /* JMP, JSR, RET, JSR_COROUTINE.  These only differ by the branch
            prediction stack action, which of course we don't implement.  */
         vb = load_gpr(ctx, rb);
-        tcg_gen_andi_i64(cpu_pc, vb, ~3);
         if (ra != 31) {
-            tcg_gen_movi_i64(ctx->ir[ra], ctx->base.pc_next);
+            tmp = tcg_temp_new();
+            tcg_gen_andi_i64(tmp, vb, ~3);
+            gen_pc_disp(ctx, ctx->ir[ra], 0);
+            tcg_gen_mov_i64(cpu_pc, tmp);
+        } else {
+            tcg_gen_andi_i64(cpu_pc, vb, ~3);
         }
         ret = DISAS_PC_UPDATED;
         break;
@@ -2862,6 +2866,7 @@
 
     ctx->tbflags = ctx->base.tb->flags;
     ctx->mem_idx = alpha_env_mmu_index(env);
+    ctx->pcrel = ctx->base.tb->cflags & CF_PCREL;
     ctx->implver = env->implver;
     ctx->amask = env->amask;
 
@@ -2897,7 +2902,13 @@
 
 static void alpha_tr_insn_start(DisasContextBase *dcbase, CPUState *cpu)
 {
-    tcg_gen_insn_start(dcbase->pc_next);
+    DisasContext *ctx = container_of(dcbase, DisasContext, base);
+
+    if (ctx->pcrel) {
+        tcg_gen_insn_start(dcbase->pc_next & ~TARGET_PAGE_MASK);
+    } else {
+        tcg_gen_insn_start(dcbase->pc_next);
+    }
 }
 
 static void alpha_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu)
@@ -2920,14 +2931,10 @@
     case DISAS_NORETURN:
         break;
     case DISAS_TOO_MANY:
-        if (use_goto_tb(ctx, ctx->base.pc_next)) {
-            tcg_gen_goto_tb(0);
-            tcg_gen_movi_i64(cpu_pc, ctx->base.pc_next);
-            tcg_gen_exit_tb(ctx->base.tb, 0);
-        }
-        /* FALLTHRU */
+        gen_goto_tb(ctx, 0, 0);
+        break;
     case DISAS_PC_STALE:
-        tcg_gen_movi_i64(cpu_pc, ctx->base.pc_next);
+        gen_pc_disp(ctx, cpu_pc, 0);
         /* FALLTHRU */
     case DISAS_PC_UPDATED:
         tcg_gen_lookup_and_goto_ptr();
diff --git a/target/hexagon/README b/target/hexagon/README
index 746ebec..7ffd517 100644
--- a/target/hexagon/README
+++ b/target/hexagon/README
@@ -43,11 +43,9 @@
 That file is consumed by the following python scripts to produce the indicated
 header files in <BUILD_DIR>/target/hexagon
         gen_opcodes_def.py              -> opcodes_def_generated.h.inc
-        gen_op_regs.py                  -> op_regs_generated.h.inc
         gen_printinsn.py                -> printinsn_generated.h.inc
         gen_op_attribs.py               -> op_attribs_generated.h.inc
         gen_helper_protos.py            -> helper_protos_generated.h.inc
-        gen_shortcode.py                -> shortcode_generated.h.inc
         gen_tcg_funcs.py                -> tcg_funcs_generated.c.inc
         gen_tcg_func_table.py           -> tcg_func_table_generated.c.inc
         gen_helper_funcs.py             -> helper_funcs_generated.c.inc
@@ -183,10 +181,11 @@
     }
 
 We also generate an analyze_<tag> function for each instruction.  Currently,
-these functions record the writes to registers by calling ctx_log_*.  During
-gen_start_packet, we invoke the analyze_<tag> function for each instruction in
-the packet, and we mark the implicit writes.  After the analysis is performed,
-we initialize the result register for each of the predicated assignments.
+these functions record the reads and writes to registers by calling ctx_log_*.
+During gen_start_packet, we invoke the analyze_<tag> function for each instruction in
+the packet, and we mark the implicit writes.  The analysis determines if the packet
+semantics can be short-circuited.  If not, we initialize the result register for each
+of the predicated assignments.
 
 In addition to instruction semantics, we use a generator to create the decode
 tree.  This generation is a four step process.
diff --git a/target/hexagon/attribs_def.h.inc b/target/hexagon/attribs_def.h.inc
index 87942d4..9e3a05f 100644
--- a/target/hexagon/attribs_def.h.inc
+++ b/target/hexagon/attribs_def.h.inc
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2024 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -117,6 +117,7 @@
 DEF_ATTRIB(IMPLICIT_READS_P2, "Reads the P2 register", "", "")
 DEF_ATTRIB(IMPLICIT_READS_P3, "Reads the P3 register", "", "")
 DEF_ATTRIB(IMPLICIT_WRITES_USR, "May write USR", "", "")
+DEF_ATTRIB(IMPLICIT_READS_SP, "Reads the SP register", "", "")
 DEF_ATTRIB(COMMUTES, "The operation is communitive", "", "")
 DEF_ATTRIB(DEALLOCRET, "dealloc_return", "", "")
 DEF_ATTRIB(DEALLOCFRAME, "deallocframe", "", "")
diff --git a/target/hexagon/decode.c b/target/hexagon/decode.c
index a40210c..23deba2 100644
--- a/target/hexagon/decode.c
+++ b/target/hexagon/decode.c
@@ -115,22 +115,13 @@
 decode_fill_newvalue_regno(Packet *packet)
 {
     int i, use_regidx, offset, def_idx, dst_idx;
-    uint16_t def_opcode, use_opcode;
-    char *dststr;
 
     for (i = 1; i < packet->num_insns; i++) {
         if (GET_ATTRIB(packet->insn[i].opcode, A_DOTNEWVALUE) &&
             !GET_ATTRIB(packet->insn[i].opcode, A_EXTENSION)) {
-            use_opcode = packet->insn[i].opcode;
 
-            /* It's a store, so we're adjusting the Nt field */
-            if (GET_ATTRIB(use_opcode, A_STORE)) {
-                use_regidx = strchr(opcode_reginfo[use_opcode], 't') -
-                    opcode_reginfo[use_opcode];
-            } else {    /* It's a Jump, so we're adjusting the Ns field */
-                use_regidx = strchr(opcode_reginfo[use_opcode], 's') -
-                    opcode_reginfo[use_opcode];
-            }
+            g_assert(packet->insn[i].new_read_idx != -1);
+            use_regidx = packet->insn[i].new_read_idx;
 
             /*
              * What's encoded at the N-field is the offset to who's producing
@@ -151,37 +142,9 @@
              */
             g_assert(!((def_idx < 0) || (def_idx > (packet->num_insns - 1))));
 
-            /*
-             * packet->insn[def_idx] is the producer
-             * Figure out which type of destination it produces
-             * and the corresponding index in the reginfo
-             */
-            def_opcode = packet->insn[def_idx].opcode;
-            dststr = strstr(opcode_wregs[def_opcode], "Rd");
-            if (dststr) {
-                dststr = strchr(opcode_reginfo[def_opcode], 'd');
-            } else {
-                dststr = strstr(opcode_wregs[def_opcode], "Rx");
-                if (dststr) {
-                    dststr = strchr(opcode_reginfo[def_opcode], 'x');
-                } else {
-                    dststr = strstr(opcode_wregs[def_opcode], "Re");
-                    if (dststr) {
-                        dststr = strchr(opcode_reginfo[def_opcode], 'e');
-                    } else {
-                        dststr = strstr(opcode_wregs[def_opcode], "Ry");
-                        if (dststr) {
-                            dststr = strchr(opcode_reginfo[def_opcode], 'y');
-                        } else {
-                            g_assert_not_reached();
-                        }
-                    }
-                }
-            }
-            g_assert(dststr != NULL);
-
             /* Now patch up the consumer with the register number */
-            dst_idx = dststr - opcode_reginfo[def_opcode];
+            g_assert(packet->insn[def_idx].dest_idx != -1);
+            dst_idx = packet->insn[def_idx].dest_idx;
             packet->insn[i].regno[use_regidx] =
                 packet->insn[def_idx].regno[dst_idx];
             /*
@@ -362,8 +325,7 @@
         for (flag = false, i = 0; i < last_insn + 1; i++) {
             int opcode = packet->insn[i].opcode;
 
-            if ((strstr(opcode_wregs[opcode], "Pd4") ||
-                 strstr(opcode_wregs[opcode], "Pe4")) &&
+            if (packet->insn[i].has_pred_dest &&
                 GET_ATTRIB(opcode, A_STORE) == 0) {
                 /* This should be a compare (not a store conditional) */
                 if (flag) {
diff --git a/target/hexagon/gen_analyze_funcs.py b/target/hexagon/gen_analyze_funcs.py
index a9af666..54bac19 100755
--- a/target/hexagon/gen_analyze_funcs.py
+++ b/target/hexagon/gen_analyze_funcs.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
 ##
-##  Copyright(c) 2022-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+##  Copyright(c) 2022-2024 Qualcomm Innovation Center, Inc. All Rights Reserved.
 ##
 ##  This program is free software; you can redistribute it and/or modify
 ##  it under the terms of the GNU General Public License as published by
@@ -43,59 +43,53 @@
     f.write("{\n")
 
     f.write("    Insn *insn G_GNUC_UNUSED = ctx->insn;\n")
-
-    i = 0
-    ## Analyze all the registers
-    for regtype, regid in regs:
-        reg = hex_common.get_register(tag, regtype, regid)
-        if reg.is_written():
-            reg.analyze_write(f, tag, i)
+    if (hex_common.is_hvx_insn(tag)):
+        if hex_common.has_hvx_helper(tag):
+            f.write(
+                "    const bool G_GNUC_UNUSED insn_has_hvx_helper = true;\n"
+            )
+            f.write("    ctx_start_hvx_insn(ctx);\n")
         else:
-            reg.analyze_read(f, i)
-        i += 1
+            f.write(
+                "    const bool G_GNUC_UNUSED insn_has_hvx_helper = false;\n"
+            )
 
-    has_generated_helper = not hex_common.skip_qemu_helper(
-        tag
-    ) and not hex_common.is_idef_parser_enabled(tag)
+    ## Declare all the registers
+    for regno, register in enumerate(regs):
+        reg_type, reg_id = register
+        reg = hex_common.get_register(tag, reg_type, reg_id)
+        reg.decl_reg_num(f, regno)
 
-    ## Mark HVX instructions with generated helpers
-    if (has_generated_helper and
-        "A_CVI" in hex_common.attribdict[tag]):
-        f.write("    ctx->has_hvx_helper = true;\n")
+    ## Analyze the register reads
+    for regno, register in enumerate(regs):
+        reg_type, reg_id = register
+        reg = hex_common.get_register(tag, reg_type, reg_id)
+        if reg.is_read():
+            reg.analyze_read(f, regno)
+
+    ## Analyze the register writes
+    for regno, register in enumerate(regs):
+        reg_type, reg_id = register
+        reg = hex_common.get_register(tag, reg_type, reg_id)
+        if reg.is_written():
+            reg.analyze_write(f, tag, regno)
 
     f.write("}\n\n")
 
 
 def main():
-    hex_common.read_semantics_file(sys.argv[1])
-    hex_common.read_attribs_file(sys.argv[2])
-    hex_common.read_overrides_file(sys.argv[3])
-    hex_common.read_overrides_file(sys.argv[4])
-    ## Whether or not idef-parser is enabled is
-    ## determined by the number of arguments to
-    ## this script:
-    ##
-    ##   5 args. -> not enabled,
-    ##   6 args. -> idef-parser enabled.
-    ##
-    ## The 6:th arg. then holds a list of the successfully
-    ## parsed instructions.
-    is_idef_parser_enabled = len(sys.argv) > 6
-    if is_idef_parser_enabled:
-        hex_common.read_idef_parser_enabled_file(sys.argv[5])
-    hex_common.calculate_attribs()
-    hex_common.init_registers()
+    hex_common.read_common_files()
     tagregs = hex_common.get_tagregs()
     tagimms = hex_common.get_tagimms()
 
     with open(sys.argv[-1], "w") as f:
-        f.write("#ifndef HEXAGON_TCG_FUNCS_H\n")
-        f.write("#define HEXAGON_TCG_FUNCS_H\n\n")
+        f.write("#ifndef HEXAGON_ANALYZE_FUNCS_C_INC\n")
+        f.write("#define HEXAGON_ANALYZE_FUNCS_C_INC\n\n")
 
         for tag in hex_common.tags:
             gen_analyze_func(f, tag, tagregs[tag], tagimms[tag])
 
-        f.write("#endif    /* HEXAGON_TCG_FUNCS_H */\n")
+        f.write("#endif    /* HEXAGON_ANALYZE_FUNCS_C_INC */\n")
 
 
 if __name__ == "__main__":
diff --git a/target/hexagon/gen_helper_funcs.py b/target/hexagon/gen_helper_funcs.py
index 9cc3d69..e9685bf 100755
--- a/target/hexagon/gen_helper_funcs.py
+++ b/target/hexagon/gen_helper_funcs.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
 ##
-##  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+##  Copyright(c) 2019-2024 Qualcomm Innovation Center, Inc. All Rights Reserved.
 ##
 ##  This program is free software; you can redistribute it and/or modify
 ##  it under the terms of the GNU General Public License as published by
@@ -102,24 +102,7 @@
 
 
 def main():
-    hex_common.read_semantics_file(sys.argv[1])
-    hex_common.read_attribs_file(sys.argv[2])
-    hex_common.read_overrides_file(sys.argv[3])
-    hex_common.read_overrides_file(sys.argv[4])
-    ## Whether or not idef-parser is enabled is
-    ## determined by the number of arguments to
-    ## this script:
-    ##
-    ##   5 args. -> not enabled,
-    ##   6 args. -> idef-parser enabled.
-    ##
-    ## The 6:th arg. then holds a list of the successfully
-    ## parsed instructions.
-    is_idef_parser_enabled = len(sys.argv) > 6
-    if is_idef_parser_enabled:
-        hex_common.read_idef_parser_enabled_file(sys.argv[5])
-    hex_common.calculate_attribs()
-    hex_common.init_registers()
+    hex_common.read_common_files()
     tagregs = hex_common.get_tagregs()
     tagimms = hex_common.get_tagimms()
 
diff --git a/target/hexagon/gen_helper_protos.py b/target/hexagon/gen_helper_protos.py
index c82b0f5..fd2bfd0 100755
--- a/target/hexagon/gen_helper_protos.py
+++ b/target/hexagon/gen_helper_protos.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
 ##
-##  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+##  Copyright(c) 2019-2024 Qualcomm Innovation Center, Inc. All Rights Reserved.
 ##
 ##  This program is free software; you can redistribute it and/or modify
 ##  it under the terms of the GNU General Public License as published by
@@ -40,28 +40,19 @@
         declared.append(arg.proto_arg)
 
     arguments = ", ".join(declared)
-    f.write(f"DEF_HELPER_{len(declared) - 1}({tag}, {arguments})\n")
+
+    ## Add the TCG_CALL_NO_RWG_SE flag to helpers that don't take the env
+    ## argument and aren't HVX instructions.  Since HVX instructions take
+    ## pointers to their arguments, they will have side effects.
+    if hex_common.need_env(tag) or hex_common.is_hvx_insn(tag):
+        f.write(f"DEF_HELPER_{len(declared) - 1}({tag}, {arguments})\n")
+    else:
+        f.write(f"DEF_HELPER_FLAGS_{len(declared) - 1}({tag}, "
+                f"TCG_CALL_NO_RWG_SE, {arguments})\n")
 
 
 def main():
-    hex_common.read_semantics_file(sys.argv[1])
-    hex_common.read_attribs_file(sys.argv[2])
-    hex_common.read_overrides_file(sys.argv[3])
-    hex_common.read_overrides_file(sys.argv[4])
-    ## Whether or not idef-parser is enabled is
-    ## determined by the number of arguments to
-    ## this script:
-    ##
-    ##   5 args. -> not enabled,
-    ##   6 args. -> idef-parser enabled.
-    ##
-    ## The 6:th arg. then holds a list of the successfully
-    ## parsed instructions.
-    is_idef_parser_enabled = len(sys.argv) > 6
-    if is_idef_parser_enabled:
-        hex_common.read_idef_parser_enabled_file(sys.argv[5])
-    hex_common.calculate_attribs()
-    hex_common.init_registers()
+    hex_common.read_common_files()
     tagregs = hex_common.get_tagregs()
     tagimms = hex_common.get_tagimms()
 
diff --git a/target/hexagon/gen_idef_parser_funcs.py b/target/hexagon/gen_idef_parser_funcs.py
index 550a48c..eb494ab 100644
--- a/target/hexagon/gen_idef_parser_funcs.py
+++ b/target/hexagon/gen_idef_parser_funcs.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
 ##
-##  Copyright(c) 2019-2023 rev.ng Labs Srl. All Rights Reserved.
+##  Copyright(c) 2019-2024 rev.ng Labs Srl. All Rights Reserved.
 ##
 ##  This program is free software; you can redistribute it and/or modify
 ##  it under the terms of the GNU General Public License as published by
@@ -44,13 +44,12 @@
 ##
 def main():
     hex_common.read_semantics_file(sys.argv[1])
-    hex_common.read_attribs_file(sys.argv[2])
     hex_common.calculate_attribs()
     hex_common.init_registers()
     tagregs = hex_common.get_tagregs()
     tagimms = hex_common.get_tagimms()
 
-    with open(sys.argv[3], "w") as f:
+    with open(sys.argv[-1], "w") as f:
         f.write('#include "macros.inc"\n\n')
 
         for tag in hex_common.tags:
diff --git a/target/hexagon/gen_op_attribs.py b/target/hexagon/gen_op_attribs.py
index 41074b8..9944822 100755
--- a/target/hexagon/gen_op_attribs.py
+++ b/target/hexagon/gen_op_attribs.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
 ##
-##  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+##  Copyright(c) 2019-2024 Qualcomm Innovation Center, Inc. All Rights Reserved.
 ##
 ##  This program is free software; you can redistribute it and/or modify
 ##  it under the terms of the GNU General Public License as published by
@@ -25,13 +25,12 @@
 
 def main():
     hex_common.read_semantics_file(sys.argv[1])
-    hex_common.read_attribs_file(sys.argv[2])
     hex_common.calculate_attribs()
 
     ##
     ##     Generate all the attributes associated with each instruction
     ##
-    with open(sys.argv[3], "w") as f:
+    with open(sys.argv[-1], "w") as f:
         for tag in hex_common.tags:
             f.write(
                 f"OP_ATTRIB({tag},ATTRIBS("
diff --git a/target/hexagon/gen_op_regs.py b/target/hexagon/gen_op_regs.py
deleted file mode 100755
index 7b7b338..0000000
--- a/target/hexagon/gen_op_regs.py
+++ /dev/null
@@ -1,125 +0,0 @@
-#!/usr/bin/env python3
-
-##
-##  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
-##
-##  This program is free software; you can redistribute it and/or modify
-##  it under the terms of the GNU General Public License as published by
-##  the Free Software Foundation; either version 2 of the License, or
-##  (at your option) any later version.
-##
-##  This program is distributed in the hope that it will be useful,
-##  but WITHOUT ANY WARRANTY; without even the implied warranty of
-##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-##  GNU General Public License for more details.
-##
-##  You should have received a copy of the GNU General Public License
-##  along with this program; if not, see <http://www.gnu.org/licenses/>.
-##
-
-import sys
-import re
-import string
-import hex_common
-
-
-##
-##     Generate the register and immediate operands for each instruction
-##
-def calculate_regid_reg(tag):
-    def letter_inc(x):
-        return chr(ord(x) + 1)
-
-    ordered_implregs = ["SP", "FP", "LR"]
-    srcdst_lett = "X"
-    src_lett = "S"
-    dst_lett = "D"
-    retstr = ""
-    mapdict = {}
-    for reg in ordered_implregs:
-        reg_rd = 0
-        reg_wr = 0
-        if ("A_IMPLICIT_WRITES_" + reg) in hex_common.attribdict[tag]:
-            reg_wr = 1
-        if reg_rd and reg_wr:
-            retstr += srcdst_lett
-            mapdict[srcdst_lett] = reg
-            srcdst_lett = letter_inc(srcdst_lett)
-        elif reg_rd:
-            retstr += src_lett
-            mapdict[src_lett] = reg
-            src_lett = letter_inc(src_lett)
-        elif reg_wr:
-            retstr += dst_lett
-            mapdict[dst_lett] = reg
-            dst_lett = letter_inc(dst_lett)
-    return retstr, mapdict
-
-
-def calculate_regid_letters(tag):
-    retstr, mapdict = calculate_regid_reg(tag)
-    return retstr
-
-
-def strip_reg_prefix(x):
-    y = x.replace("UREG.", "")
-    y = y.replace("MREG.", "")
-    return y.replace("GREG.", "")
-
-
-def main():
-    hex_common.read_semantics_file(sys.argv[1])
-    hex_common.read_attribs_file(sys.argv[2])
-    hex_common.init_registers()
-    tagregs = hex_common.get_tagregs(full=True)
-    tagimms = hex_common.get_tagimms()
-
-    with open(sys.argv[3], "w") as f:
-        for tag in hex_common.tags:
-            regs = tagregs[tag]
-            rregs = []
-            wregs = []
-            regids = ""
-            for regtype, regid, _, numregs in regs:
-                reg = hex_common.get_register(tag, regtype, regid)
-                if reg.is_read():
-                    if regid[0] not in regids:
-                        regids += regid[0]
-                    rregs.append(regtype + regid + numregs)
-                if reg.is_written():
-                    wregs.append(regtype + regid + numregs)
-                    if regid[0] not in regids:
-                        regids += regid[0]
-            for attrib in hex_common.attribdict[tag]:
-                if hex_common.attribinfo[attrib]["rreg"]:
-                    rregs.append(strip_reg_prefix(attribinfo[attrib]["rreg"]))
-                if hex_common.attribinfo[attrib]["wreg"]:
-                    wregs.append(strip_reg_prefix(attribinfo[attrib]["wreg"]))
-            regids += calculate_regid_letters(tag)
-            f.write(
-                f'REGINFO({tag},"{regids}",\t/*RD:*/\t"{",".join(rregs)}",'
-                f'\t/*WR:*/\t"{",".join(wregs)}")\n'
-            )
-
-        for tag in hex_common.tags:
-            imms = tagimms[tag]
-            f.write(f"IMMINFO({tag}")
-            if not imms:
-                f.write(""",'u',0,0,'U',0,0""")
-            for sign, size, shamt in imms:
-                if sign == "r":
-                    sign = "s"
-                if not shamt:
-                    shamt = "0"
-                f.write(f""",'{sign}',{size},{shamt}""")
-            if len(imms) == 1:
-                if sign.isupper():
-                    myu = "u"
-                else:
-                    myu = "U"
-                f.write(f""",'{myu}',0,0""")
-            f.write(")\n")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/target/hexagon/gen_opcodes_def.py b/target/hexagon/gen_opcodes_def.py
index cddd868..536f0eb 100755
--- a/target/hexagon/gen_opcodes_def.py
+++ b/target/hexagon/gen_opcodes_def.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
 ##
-##  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+##  Copyright(c) 2019-2024 Qualcomm Innovation Center, Inc. All Rights Reserved.
 ##
 ##  This program is free software; you can redistribute it and/or modify
 ##  it under the terms of the GNU General Public License as published by
@@ -29,7 +29,7 @@
     ##
     ##     Generate a list of all the opcodes
     ##
-    with open(sys.argv[3], "w") as f:
+    with open(sys.argv[-1], "w") as f:
         for tag in hex_common.tags:
             f.write(f"OPCODE({tag}),\n")
 
diff --git a/target/hexagon/gen_printinsn.py b/target/hexagon/gen_printinsn.py
index e570bd7..8bf4d09 100755
--- a/target/hexagon/gen_printinsn.py
+++ b/target/hexagon/gen_printinsn.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
 ##
-##  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+##  Copyright(c) 2019-2024 Qualcomm Innovation Center, Inc. All Rights Reserved.
 ##
 ##  This program is free software; you can redistribute it and/or modify
 ##  it under the terms of the GNU General Public License as published by
@@ -97,11 +97,10 @@
 
 def main():
     hex_common.read_semantics_file(sys.argv[1])
-    hex_common.read_attribs_file(sys.argv[2])
 
     immext_casere = re.compile(r"IMMEXT\(([A-Za-z])")
 
-    with open(sys.argv[3], "w") as f:
+    with open(sys.argv[-1], "w") as f:
         for tag in hex_common.tags:
             if not hex_common.behdict[tag]:
                 continue
diff --git a/target/hexagon/gen_shortcode.py b/target/hexagon/gen_shortcode.py
deleted file mode 100755
index deb9444..0000000
--- a/target/hexagon/gen_shortcode.py
+++ /dev/null
@@ -1,63 +0,0 @@
-#!/usr/bin/env python3
-
-##
-##  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
-##
-##  This program is free software; you can redistribute it and/or modify
-##  it under the terms of the GNU General Public License as published by
-##  the Free Software Foundation; either version 2 of the License, or
-##  (at your option) any later version.
-##
-##  This program is distributed in the hope that it will be useful,
-##  but WITHOUT ANY WARRANTY; without even the implied warranty of
-##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-##  GNU General Public License for more details.
-##
-##  You should have received a copy of the GNU General Public License
-##  along with this program; if not, see <http://www.gnu.org/licenses/>.
-##
-
-import sys
-import re
-import string
-import hex_common
-
-
-def gen_shortcode(f, tag):
-    f.write(f"DEF_SHORTCODE({tag}, {hex_common.semdict[tag]})\n")
-
-
-def main():
-    hex_common.read_semantics_file(sys.argv[1])
-    hex_common.read_attribs_file(sys.argv[2])
-    hex_common.calculate_attribs()
-    tagregs = hex_common.get_tagregs()
-    tagimms = hex_common.get_tagimms()
-
-    with open(sys.argv[3], "w") as f:
-        f.write("#ifndef DEF_SHORTCODE\n")
-        f.write("#define DEF_SHORTCODE(TAG,SHORTCODE)    /* Nothing */\n")
-        f.write("#endif\n")
-
-        for tag in hex_common.tags:
-            ## Skip the priv instructions
-            if "A_PRIV" in hex_common.attribdict[tag]:
-                continue
-            ## Skip the guest instructions
-            if "A_GUEST" in hex_common.attribdict[tag]:
-                continue
-            ## Skip the diag instructions
-            if tag == "Y6_diag":
-                continue
-            if tag == "Y6_diag0":
-                continue
-            if tag == "Y6_diag1":
-                continue
-
-            gen_shortcode(f, tag)
-
-        f.write("#undef DEF_SHORTCODE\n")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/target/hexagon/gen_tcg.h b/target/hexagon/gen_tcg.h
index 1c4391b..3fc1f4e 100644
--- a/target/hexagon/gen_tcg.h
+++ b/target/hexagon/gen_tcg.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2024 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -1369,3 +1369,6 @@
         gen_helper_raise_exception(tcg_env, excp); \
     } while (0)
 #endif
+
+#define fGEN_TCG_A2_nop(SHORTCODE) do { } while (0)
+#define fGEN_TCG_SA1_setin1(SHORTCODE) tcg_gen_movi_tl(RdV, -1)
diff --git a/target/hexagon/gen_tcg_func_table.py b/target/hexagon/gen_tcg_func_table.py
index f998ef0..978ac18 100755
--- a/target/hexagon/gen_tcg_func_table.py
+++ b/target/hexagon/gen_tcg_func_table.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
 ##
-##  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+##  Copyright(c) 2019-2024 Qualcomm Innovation Center, Inc. All Rights Reserved.
 ##
 ##  This program is free software; you can redistribute it and/or modify
 ##  it under the terms of the GNU General Public License as published by
@@ -25,12 +25,11 @@
 
 def main():
     hex_common.read_semantics_file(sys.argv[1])
-    hex_common.read_attribs_file(sys.argv[2])
     hex_common.calculate_attribs()
     tagregs = hex_common.get_tagregs()
     tagimms = hex_common.get_tagimms()
 
-    with open(sys.argv[3], "w") as f:
+    with open(sys.argv[-1], "w") as f:
         f.write("#ifndef HEXAGON_FUNC_TABLE_H\n")
         f.write("#define HEXAGON_FUNC_TABLE_H\n\n")
 
diff --git a/target/hexagon/gen_tcg_funcs.py b/target/hexagon/gen_tcg_funcs.py
index 3d8e3cb..05aa0a7 100755
--- a/target/hexagon/gen_tcg_funcs.py
+++ b/target/hexagon/gen_tcg_funcs.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
 ##
-##  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+##  Copyright(c) 2019-2024 Qualcomm Innovation Center, Inc. All Rights Reserved.
 ##
 ##  This program is free software; you can redistribute it and/or modify
 ##  it under the terms of the GNU General Public License as published by
@@ -108,24 +108,7 @@
 
 
 def main():
-    hex_common.read_semantics_file(sys.argv[1])
-    hex_common.read_attribs_file(sys.argv[2])
-    hex_common.read_overrides_file(sys.argv[3])
-    hex_common.read_overrides_file(sys.argv[4])
-    hex_common.calculate_attribs()
-    hex_common.init_registers()
-    ## Whether or not idef-parser is enabled is
-    ## determined by the number of arguments to
-    ## this script:
-    ##
-    ##   5 args. -> not enabled,
-    ##   6 args. -> idef-parser enabled.
-    ##
-    ## The 6:th arg. then holds a list of the successfully
-    ## parsed instructions.
-    is_idef_parser_enabled = len(sys.argv) > 6
-    if is_idef_parser_enabled:
-        hex_common.read_idef_parser_enabled_file(sys.argv[5])
+    is_idef_parser_enabled = hex_common.read_common_files()
     tagregs = hex_common.get_tagregs()
     tagimms = hex_common.get_tagimms()
 
diff --git a/target/hexagon/gen_trans_funcs.py b/target/hexagon/gen_trans_funcs.py
index 53e844a..9f86b4e 100755
--- a/target/hexagon/gen_trans_funcs.py
+++ b/target/hexagon/gen_trans_funcs.py
@@ -68,6 +68,9 @@
 ##         insn->regno[0] = args->Rd;
 ##         insn->regno[1] = args->Rs;
 ##         insn->regno[2] = args->Rt;
+##         insn->new_read_idx = -1;
+##         insn->dest_idx = 0;
+##         insn->has_pred_dest = false;
 ##         return true;
 ##     }
 ##
@@ -84,14 +87,21 @@
                 insn->opcode = {tag};
         """))
 
-        regno = 0
-        for reg in regs:
-            reg_type = reg[0]
-            reg_id = reg[1]
+        new_read_idx = -1
+        dest_idx = -1
+        has_pred_dest = "false"
+        for regno, (reg_type, reg_id, *_) in enumerate(regs):
+            reg = hex_common.get_register(tag, reg_type, reg_id)
             f.write(code_fmt(f"""\
                 insn->regno[{regno}] = args->{reg_type}{reg_id};
             """))
-            regno += 1
+            if reg.is_read() and reg.is_new():
+                new_read_idx = regno
+            # dest_idx should be the first destination, so check for -1
+            if reg.is_written() and dest_idx == -1:
+                dest_idx = regno
+            if reg_type == "P" and reg.is_written() and not reg.is_read():
+                has_pred_dest = "true"
 
         if len(imms) != 0:
             mark_which_imm_extended(f, tag)
@@ -112,6 +122,11 @@
                     insn->immed[{immno}] = args->{imm_type}{imm_letter};
                 """))
 
+        f.write(code_fmt(f"""\
+            insn->new_read_idx = {new_read_idx};
+            insn->dest_idx = {dest_idx};
+            insn->has_pred_dest = {has_pred_dest};
+        """))
         f.write(textwrap.dedent(f"""\
                 return true;
             {close_curly}
@@ -120,5 +135,6 @@
 
 if __name__ == "__main__":
     hex_common.read_semantics_file(sys.argv[1])
+    hex_common.init_registers()
     with open(sys.argv[2], "w") as f:
         gen_trans_funcs(f)
diff --git a/target/hexagon/hex_common.py b/target/hexagon/hex_common.py
index 195620c..15ed498 100755
--- a/target/hexagon/hex_common.py
+++ b/target/hexagon/hex_common.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
 ##
-##  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+##  Copyright(c) 2019-2024 Qualcomm Innovation Center, Inc. All Rights Reserved.
 ##
 ##  This program is free software; you can redistribute it and/or modify
 ##  it under the terms of the GNU General Public License as published by
@@ -26,7 +26,6 @@
 semdict = {}  # tag -> semantics
 attribdict = {}  # tag -> attributes
 macros = {}  # macro -> macro information...
-attribinfo = {}  # Register information and misc
 registers = {}  # register -> register functions
 new_registers = {}
 tags = []  # list of all tags
@@ -101,6 +100,7 @@
     add_qemu_macro_attrib('fLSBNEW1', 'A_IMPLICIT_READS_P1')
     add_qemu_macro_attrib('fLSBNEW1NOT', 'A_IMPLICIT_READS_P1')
     add_qemu_macro_attrib('fREAD_P3', 'A_IMPLICIT_READS_P3')
+    add_qemu_macro_attrib('fREAD_SP', 'A_IMPLICIT_READS_SP')
 
     # Recurse down macros, find attributes from sub-macros
     macroValues = list(macros.values())
@@ -197,6 +197,26 @@
     return dict(zip(tags, list(map(compute_tag_immediates, tags))))
 
 
+def need_p0(tag):
+    return "A_IMPLICIT_READS_P0" in attribdict[tag]
+
+
+def need_sp(tag):
+    return "A_IMPLICIT_READS_SP" in attribdict[tag]
+
+
+def is_hvx_insn(tag):
+    return "A_CVI" in attribdict[tag]
+
+
+def need_env(tag):
+    return ("A_STORE" in attribdict[tag] or
+            "A_LOAD" in attribdict[tag] or
+            "A_CVI_GATHER" in attribdict[tag] or
+            "A_CVI_SCATTER" in attribdict[tag] or
+            "A_IMPLICIT_WRITES_USR" in attribdict[tag])
+
+
 def need_slot(tag):
     if (
         "A_CVI_SCATTER" not in attribdict[tag]
@@ -241,6 +261,16 @@
     return tag in idef_parser_enabled
 
 
+def is_hvx_insn(tag):
+    return "A_CVI" in attribdict[tag]
+
+
+def has_hvx_helper(tag):
+    return (is_hvx_insn(tag) and
+            not skip_qemu_helper(tag) and
+            not is_idef_parser_enabled(tag))
+
+
 def imm_name(immlett):
     return f"{immlett}iV"
 
@@ -257,19 +287,6 @@
                 eval_line = ""
 
 
-def read_attribs_file(name):
-    attribre = re.compile(
-        r"DEF_ATTRIB\(([A-Za-z0-9_]+), ([^,]*), "
-        + r'"([A-Za-z0-9_\.]*)", "([A-Za-z0-9_\.]*)"\)'
-    )
-    for line in open(name, "rt").readlines():
-        if not attribre.match(line):
-            continue
-        (attrib_base, descr, rreg, wreg) = attribre.findall(line)[0]
-        attrib_base = "A_" + attrib_base
-        attribinfo[attrib_base] = {"rreg": rreg, "wreg": wreg, "descr": descr}
-
-
 def read_overrides_file(name):
     overridere = re.compile(r"#define fGEN_TCG_([A-Za-z0-9_]+)\(.*")
     for line in open(name, "rt").readlines():
@@ -397,10 +414,18 @@
 class OldSource(Source):
     def reg_tcg(self):
         return f"{self.regtype}{self.regid}V"
+    def is_old(self):
+        return True
+    def is_new(self):
+        return False
 
 class NewSource(Source):
     def reg_tcg(self):
         return f"{self.regtype}{self.regid}N"
+    def is_old(self):
+        return False
+    def is_new(self):
+        return True
 
 class ReadWrite:
     def reg_tcg(self):
@@ -413,6 +438,10 @@
         return True
     def is_readwrite(self):
         return True
+    def is_old(self):
+        return True
+    def is_new(self):
+        return False
 
 class GprDest(Register, Single, Dest):
     def decl_tcg(self, f, tag, regno):
@@ -425,7 +454,6 @@
             gen_log_reg_write(ctx, {self.reg_num}, {self.reg_tcg()});
         """))
     def analyze_write(self, f, tag, regno):
-        self.decl_reg_num(f, regno)
         predicated = "true" if is_predicated(tag) else "false"
         f.write(code_fmt(f"""\
             ctx_log_reg_write(ctx, {self.reg_num}, {predicated});
@@ -438,7 +466,6 @@
             TCGv {self.reg_tcg()} = hex_gpr[{self.reg_num}];
         """))
     def analyze_read(self, f, regno):
-        self.decl_reg_num(f, regno)
         f.write(code_fmt(f"""\
             ctx_log_reg_read(ctx, {self.reg_num});
         """))
@@ -449,9 +476,8 @@
             TCGv {self.reg_tcg()} = get_result_gpr(ctx, insn->regno[{regno}]);
         """))
     def analyze_read(self, f, regno):
-        self.decl_reg_num(f, regno)
         f.write(code_fmt(f"""\
-            ctx_log_reg_read(ctx, {self.reg_num});
+            ctx_log_reg_read_new(ctx, {self.reg_num});
         """))
 
 class GprReadWrite(Register, Single, ReadWrite):
@@ -471,8 +497,11 @@
         f.write(code_fmt(f"""\
             gen_log_reg_write(ctx, {self.reg_num}, {self.reg_tcg()});
         """))
+    def analyze_read(self, f, regno):
+        f.write(code_fmt(f"""\
+            ctx_log_reg_read(ctx, {self.reg_num});
+        """))
     def analyze_write(self, f, tag, regno):
-        self.decl_reg_num(f, regno)
         predicated = "true" if is_predicated(tag) else "false"
         f.write(code_fmt(f"""\
             ctx_log_reg_write(ctx, {self.reg_num}, {predicated});
@@ -493,7 +522,6 @@
             gen_write_ctrl_reg(ctx, {self.reg_num}, {self.reg_tcg()});
         """))
     def analyze_write(self, f, tag, regno):
-        self.decl_reg_num(f, regno)
         predicated = "true" if is_predicated(tag) else "false"
         f.write(code_fmt(f"""\
             ctx_log_reg_write(ctx, {self.reg_num}, {predicated});
@@ -511,7 +539,6 @@
             gen_read_ctrl_reg(ctx, {self.reg_num}, {self.reg_tcg()});
         """))
     def analyze_read(self, f, regno):
-        self.decl_reg_num(f, regno)
         f.write(code_fmt(f"""\
             ctx_log_reg_read(ctx, {self.reg_num});
         """))
@@ -532,7 +559,6 @@
         declared.append(self.reg_tcg())
         declared.append("CS")
     def analyze_read(self, f, regno):
-        self.decl_reg_num(f, regno)
         f.write(code_fmt(f"""\
             ctx_log_reg_read(ctx, {self.reg_num});
         """))
@@ -548,7 +574,6 @@
             gen_log_pred_write(ctx, {self.reg_num}, {self.reg_tcg()});
         """))
     def analyze_write(self, f, tag, regno):
-        self.decl_reg_num(f, regno)
         f.write(code_fmt(f"""\
             ctx_log_pred_write(ctx, {self.reg_num});
         """))
@@ -560,7 +585,6 @@
             TCGv {self.reg_tcg()} = hex_pred[{self.reg_num}];
         """))
     def analyze_read(self, f, regno):
-        self.decl_reg_num(f, regno)
         f.write(code_fmt(f"""\
             ctx_log_pred_read(ctx, {self.reg_num});
         """))
@@ -571,9 +595,8 @@
             TCGv {self.reg_tcg()} = get_result_pred(ctx, insn->regno[{regno}]);
         """))
     def analyze_read(self, f, regno):
-        self.decl_reg_num(f, regno)
         f.write(code_fmt(f"""\
-            ctx_log_pred_read(ctx, {self.reg_num});
+            ctx_log_pred_read_new(ctx, {self.reg_num});
         """))
 
 class PredReadWrite(Register, Single, ReadWrite):
@@ -587,8 +610,11 @@
         f.write(code_fmt(f"""\
             gen_log_pred_write(ctx, {self.reg_num}, {self.reg_tcg()});
         """))
+    def analyze_read(self, f, regno):
+        f.write(code_fmt(f"""\
+            ctx_log_pred_read(ctx, {self.reg_num});
+        """))
     def analyze_write(self, f, tag, regno):
-        self.decl_reg_num(f, regno)
         f.write(code_fmt(f"""\
             ctx_log_pred_write(ctx, {self.reg_num});
         """))
@@ -605,7 +631,6 @@
             gen_log_reg_write_pair(ctx, {self.reg_num}, {self.reg_tcg()});
         """))
     def analyze_write(self, f, tag, regno):
-        self.decl_reg_num(f, regno)
         predicated = "true" if is_predicated(tag) else "false"
         f.write(code_fmt(f"""\
             ctx_log_reg_write_pair(ctx, {self.reg_num}, {predicated});
@@ -621,7 +646,6 @@
                                     hex_gpr[{self.reg_num} + 1]);
         """))
     def analyze_read(self, f, regno):
-        self.decl_reg_num(f, regno)
         f.write(code_fmt(f"""\
             ctx_log_reg_read_pair(ctx, {self.reg_num});
         """))
@@ -640,8 +664,11 @@
         f.write(code_fmt(f"""\
             gen_log_reg_write_pair(ctx, {self.reg_num}, {self.reg_tcg()});
         """))
+    def analyze_read(self, f, regno):
+        f.write(code_fmt(f"""\
+            ctx_log_reg_read_pair(ctx, {self.reg_num});
+        """))
     def analyze_write(self, f, tag, regno):
-        self.decl_reg_num(f, regno)
         predicated = "true" if is_predicated(tag) else "false"
         f.write(code_fmt(f"""\
             ctx_log_reg_write_pair(ctx, {self.reg_num}, {predicated});
@@ -663,7 +690,6 @@
             gen_write_ctrl_reg_pair(ctx, {self.reg_num}, {self.reg_tcg()});
         """))
     def analyze_write(self, f, tag, regno):
-        self.decl_reg_num(f, regno)
         predicated = "true" if is_predicated(tag) else "false"
         f.write(code_fmt(f"""\
             ctx_log_reg_write_pair(ctx, {self.reg_num}, {predicated});
@@ -681,7 +707,6 @@
             gen_read_ctrl_reg_pair(ctx, {self.reg_num}, {self.reg_tcg()});
         """))
     def analyze_read(self, f, regno):
-        self.decl_reg_num(f, regno)
         f.write(code_fmt(f"""\
             ctx_log_reg_read_pair(ctx, {self.reg_num});
         """))
@@ -705,11 +730,11 @@
             /* {self.reg_tcg()} is *(MMVector *)({self.helper_arg_name()}) */
         """))
     def analyze_write(self, f, tag, regno):
-        self.decl_reg_num(f, regno)
         newv = hvx_newv(tag)
         predicated = "true" if is_predicated(tag) else "false"
         f.write(code_fmt(f"""\
-            ctx_log_vreg_write(ctx, {self.reg_num}, {newv}, {predicated});
+            ctx_log_vreg_write(ctx, {self.reg_num}, {newv}, {predicated},
+                               insn_has_hvx_helper);
         """))
 
 class VRegSource(Register, Hvx, OldSource):
@@ -728,9 +753,8 @@
             /* {self.reg_tcg()} is *(MMVector *)({self.helper_arg_name()}) */
         """))
     def analyze_read(self, f, regno):
-        self.decl_reg_num(f, regno)
         f.write(code_fmt(f"""\
-            ctx_log_vreg_read(ctx, {self.reg_num});
+            ctx_log_vreg_read(ctx, {self.reg_num}, insn_has_hvx_helper);
         """))
 
 class VRegNewSource(Register, Hvx, NewSource):
@@ -746,9 +770,8 @@
             /* {self.reg_tcg()} is *(MMVector *)({self.helper_arg_name()}) */
         """))
     def analyze_read(self, f, regno):
-        self.decl_reg_num(f, regno)
         f.write(code_fmt(f"""\
-            ctx_log_vreg_read(ctx, {self.reg_num});
+            ctx_log_vreg_read_new(ctx, {self.reg_num}, insn_has_hvx_helper);
         """))
 
 class VRegReadWrite(Register, Hvx, ReadWrite):
@@ -772,12 +795,16 @@
         f.write(code_fmt(f"""\
             /* {self.reg_tcg()} is *(MMVector *)({self.helper_arg_name()}) */
         """))
+    def analyze_read(self, f, regno):
+        f.write(code_fmt(f"""\
+            ctx_log_vreg_read(ctx, {self.reg_num}, insn_has_hvx_helper);
+        """))
     def analyze_write(self, f, tag, regno):
-        self.decl_reg_num(f, regno)
         newv = hvx_newv(tag)
         predicated = "true" if is_predicated(tag) else "false"
         f.write(code_fmt(f"""\
-            ctx_log_vreg_write(ctx, {self.reg_num}, {newv}, {predicated});
+            ctx_log_vreg_write(ctx, {self.reg_num}, {newv}, {predicated},
+                               insn_has_hvx_helper);
         """))
 
 class VRegTmp(Register, Hvx, ReadWrite):
@@ -803,12 +830,16 @@
         f.write(code_fmt(f"""\
             /* {self.reg_tcg()} is *(MMVector *)({self.helper_arg_name()}) */
         """))
+    def analyze_read(self, f, regno):
+        f.write(code_fmt(f"""\
+            ctx_log_vreg_read(ctx, {self.reg_num}, insn_has_hvx_helper);
+        """))
     def analyze_write(self, f, tag, regno):
-        self.decl_reg_num(f, regno)
         newv = hvx_newv(tag)
         predicated = "true" if is_predicated(tag) else "false"
         f.write(code_fmt(f"""\
-            ctx_log_vreg_write(ctx, {self.reg_num}, {newv}, {predicated});
+            ctx_log_vreg_write(ctx, {self.reg_num}, {newv}, {predicated},
+                               insn_has_hvx_helper);
         """))
 
 class VRegPairDest(Register, Hvx, Dest):
@@ -830,11 +861,11 @@
             /* {self.reg_tcg()} is *(MMVectorPair *)({self.helper_arg_name()}) */
         """))
     def analyze_write(self, f, tag, regno):
-        self.decl_reg_num(f, regno)
         newv = hvx_newv(tag)
         predicated = "true" if is_predicated(tag) else "false"
         f.write(code_fmt(f"""\
-            ctx_log_vreg_write_pair(ctx, {self.reg_num}, {newv}, {predicated});
+            ctx_log_vreg_write_pair(ctx, {self.reg_num}, {newv}, {predicated},
+                                    insn_has_hvx_helper);
         """))
 
 class VRegPairSource(Register, Hvx, OldSource):
@@ -860,9 +891,8 @@
             /* {self.reg_tcg()} is *(MMVectorPair *)({self.helper_arg_name()}) */
         """))
     def analyze_read(self, f, regno):
-        self.decl_reg_num(f, regno)
         f.write(code_fmt(f"""\
-            ctx_log_vreg_read_pair(ctx, {self.reg_num});
+            ctx_log_vreg_read_pair(ctx, {self.reg_num}, insn_has_hvx_helper);
         """))
 
 class VRegPairReadWrite(Register, Hvx, ReadWrite):
@@ -892,12 +922,16 @@
         f.write(code_fmt(f"""\
             /* {self.reg_tcg()} is *(MMVectorPair *)({self.helper_arg_name()}) */
         """))
+    def analyze_read(self, f, regno):
+        f.write(code_fmt(f"""\
+            ctx_log_vreg_read_pair(ctx, {self.reg_num}, insn_has_hvx_helper);
+        """))
     def analyze_write(self, f, tag, regno):
-        self.decl_reg_num(f, regno)
         newv = hvx_newv(tag)
         predicated = "true" if is_predicated(tag) else "false"
         f.write(code_fmt(f"""\
-            ctx_log_vreg_write_pair(ctx, {self.reg_num}, {newv}, {predicated});
+            ctx_log_vreg_write_pair(ctx, {self.reg_num}, {newv}, {predicated},
+                                    insn_has_hvx_helper);
         """))
 
 class QRegDest(Register, Hvx, Dest):
@@ -919,9 +953,8 @@
             /* {self.reg_tcg()} is *(MMQReg *)({self.helper_arg_name()}) */
         """))
     def analyze_write(self, f, tag, regno):
-        self.decl_reg_num(f, regno)
         f.write(code_fmt(f"""\
-            ctx_log_qreg_write(ctx, {self.reg_num});
+            ctx_log_qreg_write(ctx, {self.reg_num}, insn_has_hvx_helper);
         """))
 
 class QRegSource(Register, Hvx, OldSource):
@@ -941,9 +974,8 @@
             /* {self.reg_tcg()} is *(MMQReg *)({self.helper_arg_name()}) */
         """))
     def analyze_read(self, f, regno):
-        self.decl_reg_num(f, regno)
         f.write(code_fmt(f"""\
-            ctx_log_qreg_read(ctx, {self.reg_num});
+            ctx_log_qreg_read(ctx, {self.reg_num}, insn_has_hvx_helper);
         """))
 
 class QRegReadWrite(Register, Hvx, ReadWrite):
@@ -967,10 +999,13 @@
         f.write(code_fmt(f"""\
             /* {self.reg_tcg()} is *(MMQReg *)({self.helper_arg_name()}) */
         """))
-    def analyze_write(self, f, tag, regno):
-        self.decl_reg_num(f, regno)
+    def analyze_read(self, f, regno):
         f.write(code_fmt(f"""\
-            ctx_log_qreg_write(ctx, {self.reg_num});
+            ctx_log_qreg_read(ctx, {self.reg_num}, insn_has_hvx_helper);
+        """))
+    def analyze_write(self, f, tag, regno):
+        f.write(code_fmt(f"""\
+            ctx_log_qreg_write(ctx, {self.reg_num}, insn_has_hvx_helper);
         """))
 
 def init_registers():
@@ -1060,11 +1095,12 @@
     args = []
 
     ## First argument is the CPU state
-    args.append(HelperArg(
-        "env",
-        "tcg_env",
-        "CPUHexagonState *env"
-    ))
+    if need_env(tag):
+        args.append(HelperArg(
+            "env",
+            "tcg_env",
+            "CPUHexagonState *env"
+        ))
 
     ## For predicated instructions, we pass in the destination register
     if is_predicated(tag):
@@ -1118,6 +1154,18 @@
             "tcg_constant_tl(ctx->next_PC)",
             "target_ulong next_PC"
         ))
+    if need_p0(tag):
+        args.append(HelperArg(
+            "i32",
+            "hex_pred[0]",
+            "uint32_t P0"
+        ))
+    if need_sp(tag):
+        args.append(HelperArg(
+            "i32",
+            "hex_gpr[HEX_REG_SP]",
+            "uint32_t SP"
+        ))
     if need_slot(tag):
         args.append(HelperArg(
             "i32",
@@ -1131,3 +1179,24 @@
             "uint32_t part1"
         ))
     return args
+
+
+def read_common_files():
+    read_semantics_file(sys.argv[1])
+    read_overrides_file(sys.argv[2])
+    read_overrides_file(sys.argv[3])
+    ## Whether or not idef-parser is enabled is
+    ## determined by the number of arguments to
+    ## this script:
+    ##
+    ##   4 args. -> not enabled,
+    ##   5 args. -> idef-parser enabled.
+    ##
+    ## The 5:th arg. then holds a list of the successfully
+    ## parsed instructions.
+    is_idef_parser_enabled = len(sys.argv) > 5
+    if is_idef_parser_enabled:
+        read_idef_parser_enabled_file(sys.argv[4])
+    calculate_attribs()
+    init_registers()
+    return is_idef_parser_enabled
diff --git a/target/hexagon/insn.h b/target/hexagon/insn.h
index 3e7a22c..24dcf7f 100644
--- a/target/hexagon/insn.h
+++ b/target/hexagon/insn.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2024 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -39,6 +39,9 @@
     uint32_t slot:3;
     uint32_t which_extended:1;    /* If has an extender, which immediate */
     uint32_t new_value_producer_slot:4;
+    int32_t new_read_idx;
+    int32_t dest_idx;
+    bool has_pred_dest;
 
     bool part1;              /*
                               * cmp-jumps are split into two insns.
diff --git a/target/hexagon/macros.h b/target/hexagon/macros.h
index 1376d6c..feb798c 100644
--- a/target/hexagon/macros.h
+++ b/target/hexagon/macros.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2024 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -343,7 +343,7 @@
 
 #define fREAD_LR() (env->gpr[HEX_REG_LR])
 
-#define fREAD_SP() (env->gpr[HEX_REG_SP])
+#define fREAD_SP() (SP)
 #define fREAD_LC0 (env->gpr[HEX_REG_LC0])
 #define fREAD_LC1 (env->gpr[HEX_REG_LC1])
 #define fREAD_SA0 (env->gpr[HEX_REG_SA0])
@@ -358,7 +358,7 @@
 #endif
 #define fREAD_PC() (PC)
 
-#define fREAD_P0() (env->pred[0])
+#define fREAD_P0() (P0)
 
 #define fCHECK_PCALIGN(A)
 
diff --git a/target/hexagon/meson.build b/target/hexagon/meson.build
index fb480af..b0b253a 100644
--- a/target/hexagon/meson.build
+++ b/target/hexagon/meson.build
@@ -1,5 +1,5 @@
 ##
-##  Copyright(c) 2020-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+##  Copyright(c) 2020-2024 Qualcomm Innovation Center, Inc. All Rights Reserved.
 ##
 ##  This program is free software; you can redistribute it and/or modify
 ##  it under the terms of the GNU General Public License as published by
@@ -18,7 +18,6 @@
 hexagon_ss = ss.source_set()
 
 hex_common_py = 'hex_common.py'
-attribs_def = meson.current_source_dir() / 'attribs_def.h.inc'
 gen_tcg_h = meson.current_source_dir() / 'gen_tcg.h'
 gen_tcg_hvx_h = meson.current_source_dir() / 'gen_tcg_hvx.h'
 idef_parser_dir = meson.current_source_dir() / 'idef-parser'
@@ -42,28 +41,17 @@
 #
 # Step 2
 # We use Python scripts to generate the following files
-#     shortcode_generated.h.inc
 #     tcg_func_table_generated.c.inc
 #     printinsn_generated.h.inc
-#     op_regs_generated.h.inc
 #     op_attribs_generated.h.inc
 #     opcodes_def_generated.h.inc
 #
-shortcode_generated = custom_target(
-    'shortcode_generated.h.inc',
-    output: 'shortcode_generated.h.inc',
-    depends: [semantics_generated],
-    depend_files: [hex_common_py, attribs_def],
-    command: [python, files('gen_shortcode.py'), semantics_generated, attribs_def, '@OUTPUT@'],
-)
-hexagon_ss.add(shortcode_generated)
-
 tcg_func_table_generated = custom_target(
     'tcg_func_table_generated.c.inc',
     output: 'tcg_func_table_generated.c.inc',
     depends: [semantics_generated],
-    depend_files: [hex_common_py, attribs_def],
-    command: [python, files('gen_tcg_func_table.py'), semantics_generated, attribs_def, '@OUTPUT@'],
+    depend_files: [hex_common_py],
+    command: [python, files('gen_tcg_func_table.py'), semantics_generated, '@OUTPUT@'],
 )
 hexagon_ss.add(tcg_func_table_generated)
 
@@ -71,26 +59,17 @@
     'printinsn_generated.h.inc',
     output: 'printinsn_generated.h.inc',
     depends: [semantics_generated],
-    depend_files: [hex_common_py, attribs_def],
-    command: [python, files('gen_printinsn.py'), semantics_generated, attribs_def, '@OUTPUT@'],
+    depend_files: [hex_common_py],
+    command: [python, files('gen_printinsn.py'), semantics_generated, '@OUTPUT@'],
 )
 hexagon_ss.add(printinsn_generated)
 
-op_regs_generated = custom_target(
-    'op_regs_generated.h.inc',
-    output: 'op_regs_generated.h.inc',
-    depends: [semantics_generated],
-    depend_files: [hex_common_py, attribs_def],
-    command: [python, files('gen_op_regs.py'), semantics_generated, attribs_def, '@OUTPUT@'],
-)
-hexagon_ss.add(op_regs_generated)
-
 op_attribs_generated = custom_target(
     'op_attribs_generated.h.inc',
     output: 'op_attribs_generated.h.inc',
     depends: [semantics_generated],
-    depend_files: [hex_common_py, attribs_def],
-    command: [python, files('gen_op_attribs.py'), semantics_generated, attribs_def, '@OUTPUT@'],
+    depend_files: [hex_common_py],
+    command: [python, files('gen_op_attribs.py'), semantics_generated, '@OUTPUT@'],
 )
 hexagon_ss.add(op_attribs_generated)
 
@@ -98,8 +77,8 @@
     'opcodes_def_generated.h.inc',
     output: 'opcodes_def_generated.h.inc',
     depends: [semantics_generated],
-    depend_files: [hex_common_py, attribs_def],
-    command: [python, files('gen_opcodes_def.py'), semantics_generated, attribs_def, '@OUTPUT@'],
+    depend_files: [hex_common_py],
+    command: [python, files('gen_opcodes_def.py'), semantics_generated, '@OUTPUT@'],
 )
 hexagon_ss.add(opcodes_def_generated)
 
@@ -110,7 +89,7 @@
 #
 gen_dectree_import = executable(
     'gen_dectree_import',
-    'gen_dectree_import.c', opcodes_def_generated, op_regs_generated,
+    'gen_dectree_import.c', opcodes_def_generated,
     native: true, build_by_default: false)
 
 iset_py = custom_target(
@@ -298,7 +277,7 @@
         output: 'idef_parser_input.h.inc',
         depends: [semantics_generated],
         depend_files: [hex_common_py],
-        command: [python, files('gen_idef_parser_funcs.py'), semantics_generated, attribs_def, '@OUTPUT@'],
+        command: [python, files('gen_idef_parser_funcs.py'), semantics_generated, '@OUTPUT@'],
     )
 
     preprocessed_idef_parser_input_generated = custom_target(
@@ -367,12 +346,12 @@
     # Setup input and dependencies for the next step, this depends on whether or
     # not idef-parser is enabled
     helper_dep = [semantics_generated, idef_generated_tcg_c, idef_generated_tcg]
-    helper_in = [semantics_generated, attribs_def, gen_tcg_h, gen_tcg_hvx_h, idef_generated_list]
+    helper_in = [semantics_generated, gen_tcg_h, gen_tcg_hvx_h, idef_generated_list]
 else
     # Setup input and dependencies for the next step, this depends on whether or
     # not idef-parser is enabled
     helper_dep = [semantics_generated]
-    helper_in = [semantics_generated, attribs_def, gen_tcg_h, gen_tcg_hvx_h]
+    helper_in = [semantics_generated, gen_tcg_h, gen_tcg_hvx_h]
 endif
 
 #
@@ -386,7 +365,7 @@
     'helper_protos_generated.h.inc',
     output: 'helper_protos_generated.h.inc',
     depends: helper_dep,
-    depend_files: [hex_common_py, attribs_def, gen_tcg_h, gen_tcg_hvx_h],
+    depend_files: [hex_common_py, gen_tcg_h, gen_tcg_hvx_h],
     command: [python, files('gen_helper_protos.py'), helper_in, '@OUTPUT@'],
 )
 hexagon_ss.add(helper_protos_generated)
@@ -395,7 +374,7 @@
     'helper_funcs_generated.c.inc',
     output: 'helper_funcs_generated.c.inc',
     depends: helper_dep,
-    depend_files: [hex_common_py, attribs_def, gen_tcg_h, gen_tcg_hvx_h],
+    depend_files: [hex_common_py, gen_tcg_h, gen_tcg_hvx_h],
     command: [python, files('gen_helper_funcs.py'), helper_in, '@OUTPUT@'],
 )
 hexagon_ss.add(helper_funcs_generated)
@@ -404,7 +383,7 @@
     'tcg_funcs_generated.c.inc',
     output: 'tcg_funcs_generated.c.inc',
     depends: helper_dep,
-    depend_files: [hex_common_py, attribs_def, gen_tcg_h, gen_tcg_hvx_h],
+    depend_files: [hex_common_py, gen_tcg_h, gen_tcg_hvx_h],
     command: [python, files('gen_tcg_funcs.py'), helper_in, '@OUTPUT@'],
 )
 hexagon_ss.add(tcg_funcs_generated)
@@ -413,7 +392,7 @@
     'analyze_funcs_generated.c.inc',
     output: 'analyze_funcs_generated.c.inc',
     depends: helper_dep,
-    depend_files: [hex_common_py, attribs_def, gen_tcg_h, gen_tcg_hvx_h],
+    depend_files: [hex_common_py, gen_tcg_h, gen_tcg_hvx_h],
     command: [python, files('gen_analyze_funcs.py'), helper_in, '@OUTPUT@'],
 )
 hexagon_ss.add(analyze_funcs_generated)
diff --git a/target/hexagon/mmvec/decode_ext_mmvec.c b/target/hexagon/mmvec/decode_ext_mmvec.c
index 202d84c..f850d01 100644
--- a/target/hexagon/mmvec/decode_ext_mmvec.c
+++ b/target/hexagon/mmvec/decode_ext_mmvec.c
@@ -28,19 +28,15 @@
 {
     /* .new value for a MMVector store */
     int i, j;
-    const char *reginfo;
-    const char *destletters;
-    const char *dststr = NULL;
     uint16_t def_opcode;
-    char letter;
 
     for (i = 1; i < pkt->num_insns; i++) {
         uint16_t use_opcode = pkt->insn[i].opcode;
         if (GET_ATTRIB(use_opcode, A_DOTNEWVALUE) &&
             GET_ATTRIB(use_opcode, A_CVI) &&
             GET_ATTRIB(use_opcode, A_STORE)) {
-            int use_regidx = strchr(opcode_reginfo[use_opcode], 's') -
-                opcode_reginfo[use_opcode];
+            int use_regidx = pkt->insn[i].new_read_idx;
+            g_assert(pkt->insn[i].new_read_idx != -1);
             /*
              * What's encoded at the N-field is the offset to who's producing
              * the value.
@@ -68,31 +64,19 @@
 
             /* def_idx is the index of the producer */
             def_opcode = pkt->insn[def_idx].opcode;
-            reginfo = opcode_reginfo[def_opcode];
-            destletters = "dexy";
-            for (j = 0; (letter = destletters[j]) != 0; j++) {
-                dststr = strchr(reginfo, letter);
-                if (dststr != NULL) {
-                    break;
-                }
-            }
-            if ((dststr == NULL)  && GET_ATTRIB(def_opcode, A_CVI_GATHER)) {
+            if ((pkt->insn[def_idx].dest_idx == -1)  &&
+                GET_ATTRIB(def_opcode, A_CVI_GATHER)) {
                 pkt->insn[i].regno[use_regidx] = def_oreg;
                 pkt->insn[i].new_value_producer_slot = pkt->insn[def_idx].slot;
             } else {
-                if (dststr == NULL) {
+                if (pkt->insn[def_idx].dest_idx == -1) {
                     /* still not there, we have a bad packet */
                     g_assert_not_reached();
                 }
-                int def_regnum = pkt->insn[def_idx].regno[dststr - reginfo];
+                int def_regnum =
+                    pkt->insn[def_idx].regno[pkt->insn[def_idx].dest_idx];
                 /* Now patch up the consumer with the register number */
                 pkt->insn[i].regno[use_regidx] = def_regnum ^ def_oreg;
-                /* special case for (Vx,Vy) */
-                dststr = strchr(reginfo, 'y');
-                if (def_oreg && strchr(reginfo, 'x') && dststr) {
-                    def_regnum = pkt->insn[def_idx].regno[dststr - reginfo];
-                    pkt->insn[i].regno[use_regidx] = def_regnum;
-                }
                 /*
                  * We need to remember who produces this value to later
                  * check if it was dynamically cancelled
diff --git a/target/hexagon/opcodes.c b/target/hexagon/opcodes.c
index 1f7f3de..c8bde2f 100644
--- a/target/hexagon/opcodes.c
+++ b/target/hexagon/opcodes.c
@@ -36,41 +36,6 @@
 #undef OPCODE
 };
 
-const char * const opcode_reginfo[] = {
-#define IMMINFO(TAG, SIGN, SIZE, SHAMT, SIGN2, SIZE2, SHAMT2)    /* nothing */
-#define REGINFO(TAG, REGINFO, RREGS, WREGS) REGINFO,
-#include "op_regs_generated.h.inc"
-    NULL
-#undef REGINFO
-#undef IMMINFO
-};
-
-
-const char * const opcode_rregs[] = {
-#define IMMINFO(TAG, SIGN, SIZE, SHAMT, SIGN2, SIZE2, SHAMT2)    /* nothing */
-#define REGINFO(TAG, REGINFO, RREGS, WREGS) RREGS,
-#include "op_regs_generated.h.inc"
-    NULL
-#undef REGINFO
-#undef IMMINFO
-};
-
-
-const char * const opcode_wregs[] = {
-#define IMMINFO(TAG, SIGN, SIZE, SHAMT, SIGN2, SIZE2, SHAMT2)    /* nothing */
-#define REGINFO(TAG, REGINFO, RREGS, WREGS) WREGS,
-#include "op_regs_generated.h.inc"
-    NULL
-#undef REGINFO
-#undef IMMINFO
-};
-
-const char * const opcode_short_semantics[] = {
-#define DEF_SHORTCODE(TAG, SHORTCODE)              [TAG] = #SHORTCODE,
-#include "shortcode_generated.h.inc"
-#undef DEF_SHORTCODE
-    NULL
-};
 
 DECLARE_BITMAP(opcode_attribs[XX_LAST_OPCODE], A_ZZ_LASTATTRIB);
 
diff --git a/target/hexagon/opcodes.h b/target/hexagon/opcodes.h
index fa7e321..0ee11bd 100644
--- a/target/hexagon/opcodes.h
+++ b/target/hexagon/opcodes.h
@@ -40,10 +40,6 @@
 
 extern const char * const opcode_names[];
 
-extern const char * const opcode_reginfo[];
-extern const char * const opcode_rregs[];
-extern const char * const opcode_wregs[];
-
 typedef struct {
     const char * const encoding;
     const EncClass enc_class;
diff --git a/target/hexagon/translate.c b/target/hexagon/translate.c
index 47a870f..0904dc2 100644
--- a/target/hexagon/translate.c
+++ b/target/hexagon/translate.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2024 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -380,70 +380,8 @@
         return true;
     }
 
-    if (pkt->num_insns == 1) {
-        if (pkt->pkt_has_hvx) {
-            /*
-             * The HVX instructions with generated helpers use
-             * pass-by-reference, so they need the read/write overlap
-             * check below.
-             * The HVX instructions with overrides are OK.
-             */
-            if (!ctx->has_hvx_helper) {
-                return false;
-            }
-        } else {
-            return false;
-        }
-    }
-
-    /* Check for overlap between register reads and writes */
-    for (int i = 0; i < ctx->reg_log_idx; i++) {
-        int rnum = ctx->reg_log[i];
-        if (test_bit(rnum, ctx->regs_read)) {
-            return true;
-        }
-    }
-
-    /* Check for overlap between predicate reads and writes */
-    for (int i = 0; i < ctx->preg_log_idx; i++) {
-        int pnum = ctx->preg_log[i];
-        if (test_bit(pnum, ctx->pregs_read)) {
-            return true;
-        }
-    }
-
-    /* Check for overlap between HVX reads and writes */
-    for (int i = 0; i < ctx->vreg_log_idx; i++) {
-        int vnum = ctx->vreg_log[i];
-        if (test_bit(vnum, ctx->vregs_read)) {
-            return true;
-        }
-    }
-    if (!bitmap_empty(ctx->vregs_updated_tmp, NUM_VREGS)) {
-        int i = find_first_bit(ctx->vregs_updated_tmp, NUM_VREGS);
-        while (i < NUM_VREGS) {
-            if (test_bit(i, ctx->vregs_read)) {
-                return true;
-            }
-            i = find_next_bit(ctx->vregs_updated_tmp, NUM_VREGS, i + 1);
-        }
-    }
-    if (!bitmap_empty(ctx->vregs_select, NUM_VREGS)) {
-        int i = find_first_bit(ctx->vregs_select, NUM_VREGS);
-        while (i < NUM_VREGS) {
-            if (test_bit(i, ctx->vregs_read)) {
-                return true;
-            }
-            i = find_next_bit(ctx->vregs_select, NUM_VREGS, i + 1);
-        }
-    }
-
-    /* Check for overlap between HVX predicate reads and writes */
-    for (int i = 0; i < ctx->qreg_log_idx; i++) {
-        int qnum = ctx->qreg_log[i];
-        if (test_bit(qnum, ctx->qregs_read)) {
-            return true;
-        }
+    if (ctx->read_after_write || ctx->has_hvx_overlap) {
+        return true;
     }
 
     return false;
@@ -467,7 +405,8 @@
 static void analyze_packet(DisasContext *ctx)
 {
     Packet *pkt = ctx->pkt;
-    ctx->has_hvx_helper = false;
+    ctx->read_after_write = false;
+    ctx->has_hvx_overlap = false;
     for (int i = 0; i < pkt->num_insns; i++) {
         Insn *insn = &pkt->insn[i];
         ctx->insn = insn;
@@ -492,21 +431,19 @@
     ctx->next_PC = next_PC;
     ctx->reg_log_idx = 0;
     bitmap_zero(ctx->regs_written, TOTAL_PER_THREAD_REGS);
-    bitmap_zero(ctx->regs_read, TOTAL_PER_THREAD_REGS);
     bitmap_zero(ctx->predicated_regs, TOTAL_PER_THREAD_REGS);
     ctx->preg_log_idx = 0;
     bitmap_zero(ctx->pregs_written, NUM_PREGS);
-    bitmap_zero(ctx->pregs_read, NUM_PREGS);
     ctx->future_vregs_idx = 0;
     ctx->tmp_vregs_idx = 0;
     ctx->vreg_log_idx = 0;
+    bitmap_zero(ctx->vregs_written, NUM_VREGS);
     bitmap_zero(ctx->vregs_updated_tmp, NUM_VREGS);
     bitmap_zero(ctx->vregs_updated, NUM_VREGS);
     bitmap_zero(ctx->vregs_select, NUM_VREGS);
     bitmap_zero(ctx->predicated_future_vregs, NUM_VREGS);
     bitmap_zero(ctx->predicated_tmp_vregs, NUM_VREGS);
-    bitmap_zero(ctx->vregs_read, NUM_VREGS);
-    bitmap_zero(ctx->qregs_read, NUM_QREGS);
+    bitmap_zero(ctx->qregs_written, NUM_QREGS);
     ctx->qreg_log_idx = 0;
     for (i = 0; i < STORES_MAX; i++) {
         ctx->store_width[i] = 0;
diff --git a/target/hexagon/translate.h b/target/hexagon/translate.h
index 4dd59c6..00cc2bc 100644
--- a/target/hexagon/translate.h
+++ b/target/hexagon/translate.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2024 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -38,12 +38,10 @@
     int reg_log[REG_WRITES_MAX];
     int reg_log_idx;
     DECLARE_BITMAP(regs_written, TOTAL_PER_THREAD_REGS);
-    DECLARE_BITMAP(regs_read, TOTAL_PER_THREAD_REGS);
     DECLARE_BITMAP(predicated_regs, TOTAL_PER_THREAD_REGS);
     int preg_log[PRED_WRITES_MAX];
     int preg_log_idx;
     DECLARE_BITMAP(pregs_written, NUM_PREGS);
-    DECLARE_BITMAP(pregs_read, NUM_PREGS);
     uint8_t store_width[STORES_MAX];
     bool s1_store_processed;
     int future_vregs_idx;
@@ -52,22 +50,27 @@
     int tmp_vregs_num[VECTOR_TEMPS_MAX];
     int vreg_log[NUM_VREGS];
     int vreg_log_idx;
+    DECLARE_BITMAP(vregs_written, NUM_VREGS);
+    DECLARE_BITMAP(insn_vregs_written, NUM_VREGS);
     DECLARE_BITMAP(vregs_updated_tmp, NUM_VREGS);
     DECLARE_BITMAP(vregs_updated, NUM_VREGS);
     DECLARE_BITMAP(vregs_select, NUM_VREGS);
     DECLARE_BITMAP(predicated_future_vregs, NUM_VREGS);
     DECLARE_BITMAP(predicated_tmp_vregs, NUM_VREGS);
-    DECLARE_BITMAP(vregs_read, NUM_VREGS);
+    DECLARE_BITMAP(insn_vregs_read, NUM_VREGS);
     int qreg_log[NUM_QREGS];
     int qreg_log_idx;
-    DECLARE_BITMAP(qregs_read, NUM_QREGS);
+    DECLARE_BITMAP(qregs_written, NUM_QREGS);
+    DECLARE_BITMAP(insn_qregs_written, NUM_QREGS);
+    DECLARE_BITMAP(insn_qregs_read, NUM_QREGS);
     bool pre_commit;
     bool need_commit;
     TCGCond branch_cond;
     target_ulong branch_dest;
     bool is_tight_loop;
     bool short_circuit;
-    bool has_hvx_helper;
+    bool read_after_write;
+    bool has_hvx_overlap;
     TCGv new_value[TOTAL_PER_THREAD_REGS];
     TCGv new_pred_value[NUM_PREGS];
     TCGv pred_written;
@@ -75,6 +78,8 @@
     TCGv dczero_addr;
 } DisasContext;
 
+bool is_gather_store_insn(DisasContext *ctx);
+
 static inline void ctx_log_pred_write(DisasContext *ctx, int pnum)
 {
     if (!test_bit(pnum, ctx->pregs_written)) {
@@ -86,7 +91,14 @@
 
 static inline void ctx_log_pred_read(DisasContext *ctx, int pnum)
 {
-    set_bit(pnum, ctx->pregs_read);
+    if (test_bit(pnum, ctx->pregs_written)) {
+        ctx->read_after_write = true;
+    }
+}
+
+static inline void ctx_log_pred_read_new(DisasContext *ctx, int pnum)
+{
+    g_assert(test_bit(pnum, ctx->pregs_written));
 }
 
 static inline void ctx_log_reg_write(DisasContext *ctx, int rnum,
@@ -117,7 +129,14 @@
 
 static inline void ctx_log_reg_read(DisasContext *ctx, int rnum)
 {
-    set_bit(rnum, ctx->regs_read);
+    if (test_bit(rnum, ctx->regs_written)) {
+        ctx->read_after_write = true;
+    }
+}
+
+static inline void ctx_log_reg_read_new(DisasContext *ctx, int rnum)
+{
+    g_assert(test_bit(rnum, ctx->regs_written));
 }
 
 static inline void ctx_log_reg_read_pair(DisasContext *ctx, int rnum)
@@ -131,10 +150,25 @@
 intptr_t ctx_tmp_vreg_off(DisasContext *ctx, int regnum,
                           int num, bool alloc_ok);
 
+static inline void ctx_start_hvx_insn(DisasContext *ctx)
+{
+    bitmap_zero(ctx->insn_vregs_written, NUM_VREGS);
+    bitmap_zero(ctx->insn_vregs_read, NUM_VREGS);
+    bitmap_zero(ctx->insn_qregs_written, NUM_QREGS);
+    bitmap_zero(ctx->insn_qregs_read, NUM_QREGS);
+}
+
 static inline void ctx_log_vreg_write(DisasContext *ctx,
                                       int rnum, VRegWriteType type,
-                                      bool is_predicated)
+                                      bool is_predicated, bool has_helper)
 {
+    if (has_helper) {
+        set_bit(rnum, ctx->insn_vregs_written);
+        if (test_bit(rnum, ctx->insn_vregs_read)) {
+            ctx->has_hvx_overlap = true;
+        }
+    }
+    set_bit(rnum, ctx->vregs_written);
     if (type != EXT_TMP) {
         if (!test_bit(rnum, ctx->vregs_updated)) {
             ctx->vreg_log[ctx->vreg_log_idx] = rnum;
@@ -160,33 +194,77 @@
 
 static inline void ctx_log_vreg_write_pair(DisasContext *ctx,
                                            int rnum, VRegWriteType type,
-                                           bool is_predicated)
+                                           bool is_predicated, bool has_helper)
 {
-    ctx_log_vreg_write(ctx, rnum ^ 0, type, is_predicated);
-    ctx_log_vreg_write(ctx, rnum ^ 1, type, is_predicated);
+    ctx_log_vreg_write(ctx, rnum ^ 0, type, is_predicated, has_helper);
+    ctx_log_vreg_write(ctx, rnum ^ 1, type, is_predicated, has_helper);
 }
 
-static inline void ctx_log_vreg_read(DisasContext *ctx, int rnum)
+static inline void ctx_log_vreg_read(DisasContext *ctx, int rnum,
+                                     bool has_helper)
 {
-    set_bit(rnum, ctx->vregs_read);
+    if (has_helper) {
+        set_bit(rnum, ctx->insn_vregs_read);
+        if (test_bit(rnum, ctx->insn_vregs_written)) {
+            ctx->has_hvx_overlap = true;
+        }
+    }
+    if (test_bit(rnum, ctx->vregs_written)) {
+        ctx->read_after_write = true;
+    }
 }
 
-static inline void ctx_log_vreg_read_pair(DisasContext *ctx, int rnum)
+static inline void ctx_log_vreg_read_new(DisasContext *ctx, int rnum,
+                                         bool has_helper)
 {
-    ctx_log_vreg_read(ctx, rnum ^ 0);
-    ctx_log_vreg_read(ctx, rnum ^ 1);
+    g_assert(is_gather_store_insn(ctx) ||
+             test_bit(rnum, ctx->vregs_updated) ||
+             test_bit(rnum, ctx->vregs_select) ||
+             test_bit(rnum, ctx->vregs_updated_tmp));
+    if (has_helper) {
+        set_bit(rnum, ctx->insn_vregs_read);
+        if (test_bit(rnum, ctx->insn_vregs_written)) {
+            ctx->has_hvx_overlap = true;
+        }
+    }
+    if (is_gather_store_insn(ctx)) {
+        ctx->read_after_write = true;
+    }
+}
+
+static inline void ctx_log_vreg_read_pair(DisasContext *ctx, int rnum,
+                                          bool has_helper)
+{
+    ctx_log_vreg_read(ctx, rnum ^ 0, has_helper);
+    ctx_log_vreg_read(ctx, rnum ^ 1, has_helper);
 }
 
 static inline void ctx_log_qreg_write(DisasContext *ctx,
-                                      int rnum)
+                                      int rnum, bool has_helper)
 {
+    if (has_helper) {
+        set_bit(rnum, ctx->insn_qregs_written);
+        if (test_bit(rnum, ctx->insn_qregs_read)) {
+            ctx->has_hvx_overlap = true;
+        }
+    }
+    set_bit(rnum, ctx->qregs_written);
     ctx->qreg_log[ctx->qreg_log_idx] = rnum;
     ctx->qreg_log_idx++;
 }
 
-static inline void ctx_log_qreg_read(DisasContext *ctx, int qnum)
+static inline void ctx_log_qreg_read(DisasContext *ctx,
+                                     int qnum, bool has_helper)
 {
-    set_bit(qnum, ctx->qregs_read);
+    if (has_helper) {
+        set_bit(qnum, ctx->insn_qregs_read);
+        if (test_bit(qnum, ctx->insn_qregs_written)) {
+            ctx->has_hvx_overlap = true;
+        }
+    }
+    if (test_bit(qnum, ctx->qregs_written)) {
+        ctx->read_after_write = true;
+    }
 }
 
 extern TCGv hex_gpr[TOTAL_PER_THREAD_REGS];
@@ -205,7 +283,6 @@
 extern TCGv hex_vstore_size[VSTORES_MAX];
 extern TCGv hex_vstore_pending[VSTORES_MAX];
 
-bool is_gather_store_insn(DisasContext *ctx);
 void process_store(DisasContext *ctx, int slot_num);
 
 FIELD(PROBE_PKT_SCALAR_STORE_S0, MMU_IDX,       0, 2)
diff --git a/target/i386/nvmm/nvmm-all.c b/target/i386/nvmm/nvmm-all.c
index f9cced5..65768ac 100644
--- a/target/i386/nvmm/nvmm-all.c
+++ b/target/i386/nvmm/nvmm-all.c
@@ -982,7 +982,7 @@
         }
     }
 
-    cpu->accel->dirty = true;
+    qcpu->dirty = true;
     cpu->accel = qcpu;
 
     return 0;
diff --git a/target/i386/whpx/whpx-all.c b/target/i386/whpx/whpx-all.c
index b08e644..a6674a8 100644
--- a/target/i386/whpx/whpx-all.c
+++ b/target/i386/whpx/whpx-all.c
@@ -2236,7 +2236,7 @@
     }
 
     vcpu->interruptable = true;
-    cpu->accel->dirty = true;
+    vcpu->dirty = true;
     cpu->accel = vcpu;
     max_vcpu_index = max(max_vcpu_index, cpu->cpu_index);
     qemu_add_vm_change_state_handler(whpx_cpu_update_state, env);
diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index ebb6c90..e599ab9 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -705,16 +705,20 @@
         return;
     case 0x300f: /* addv Rm,Rn */
         {
-            TCGv t0, t1, t2;
-            t0 = tcg_temp_new();
-            tcg_gen_add_i32(t0, REG(B7_4), REG(B11_8));
+            TCGv Rn = REG(B11_8);
+            TCGv Rm = REG(B7_4);
+            TCGv result, t1, t2;
+
+            result = tcg_temp_new();
             t1 = tcg_temp_new();
-            tcg_gen_xor_i32(t1, t0, REG(B11_8));
             t2 = tcg_temp_new();
-            tcg_gen_xor_i32(t2, REG(B7_4), REG(B11_8));
+            tcg_gen_add_i32(result, Rm, Rn);
+            /* T = ((Rn ^ Rm) & (Result ^ Rn)) >> 31 */
+            tcg_gen_xor_i32(t1, result, Rn);
+            tcg_gen_xor_i32(t2, Rm, Rn);
             tcg_gen_andc_i32(cpu_sr_t, t1, t2);
             tcg_gen_shri_i32(cpu_sr_t, cpu_sr_t, 31);
-            tcg_gen_mov_i32(REG(B7_4), t0);
+            tcg_gen_mov_i32(Rn, result);
         }
         return;
     case 0x2009: /* and Rm,Rn */
@@ -929,16 +933,20 @@
         return;
     case 0x300b: /* subv Rm,Rn */
         {
-            TCGv t0, t1, t2;
-            t0 = tcg_temp_new();
-            tcg_gen_sub_i32(t0, REG(B11_8), REG(B7_4));
+            TCGv Rn = REG(B11_8);
+            TCGv Rm = REG(B7_4);
+            TCGv result, t1, t2;
+
+            result = tcg_temp_new();
             t1 = tcg_temp_new();
-            tcg_gen_xor_i32(t1, t0, REG(B7_4));
             t2 = tcg_temp_new();
-            tcg_gen_xor_i32(t2, REG(B11_8), REG(B7_4));
+            tcg_gen_sub_i32(result, Rn, Rm);
+            /* T = ((Rn ^ Rm) & (Result ^ Rn)) >> 31 */
+            tcg_gen_xor_i32(t1, result, Rn);
+            tcg_gen_xor_i32(t2, Rn, Rm);
             tcg_gen_and_i32(t1, t1, t2);
             tcg_gen_shri_i32(cpu_sr_t, t1, 31);
-            tcg_gen_mov_i32(REG(B11_8), t0);
+            tcg_gen_mov_i32(Rn, result);
         }
         return;
     case 0x2008: /* tst Rm,Rn */
diff --git a/tests/bench/bufferiszero-bench.c b/tests/bench/bufferiszero-bench.c
new file mode 100644
index 0000000..222695c
--- /dev/null
+++ b/tests/bench/bufferiszero-bench.c
@@ -0,0 +1,47 @@
+/*
+ * QEMU buffer_is_zero speed benchmark
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.  See the COPYING file in the
+ * top-level directory.
+ */
+#include "qemu/osdep.h"
+#include "qemu/cutils.h"
+#include "qemu/units.h"
+
+static void test(const void *opaque)
+{
+    size_t max = 64 * KiB;
+    void *buf = g_malloc0(max);
+    int accel_index = 0;
+
+    do {
+        if (accel_index != 0) {
+            g_test_message("%s", "");  /* gnu_printf Werror for simple "" */
+        }
+        for (size_t len = 1 * KiB; len <= max; len *= 4) {
+            double total = 0.0;
+
+            g_test_timer_start();
+            do {
+                buffer_is_zero_ge256(buf, len);
+                total += len;
+            } while (g_test_timer_elapsed() < 0.5);
+
+            total /= MiB;
+            g_test_message("buffer_is_zero #%d: %2zuKB %8.0f MB/sec",
+                           accel_index, len / (size_t)KiB,
+                           total / g_test_timer_last());
+        }
+        accel_index++;
+    } while (test_buffer_is_zero_next_accel());
+
+    g_free(buf);
+}
+
+int main(int argc, char **argv)
+{
+    g_test_init(&argc, &argv, NULL);
+    g_test_add_data_func("/cutils/bufferiszero/speed", NULL, test);
+    return g_test_run();
+}
diff --git a/tests/bench/meson.build b/tests/bench/meson.build
index 7e76338..4cd7a2f 100644
--- a/tests/bench/meson.build
+++ b/tests/bench/meson.build
@@ -21,6 +21,7 @@
 
 if have_block
   benchs += {
+     'bufferiszero-bench': [],
      'benchmark-crypto-hash': [crypto],
      'benchmark-crypto-hmac': [crypto],
      'benchmark-crypto-cipher': [crypto],
diff --git a/tests/tcg/hexagon/hvx_misc.c b/tests/tcg/hexagon/hvx_misc.c
index b45170a..1fe14b5 100644
--- a/tests/tcg/hexagon/hvx_misc.c
+++ b/tests/tcg/hexagon/hvx_misc.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2021-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2021-2024 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -231,6 +231,7 @@
 static void test_new_value_store(void)
 {
     void *p0 = buffer0;
+    void *p1 = buffer1;
     void *pout = output;
 
     asm("{\n\t"
@@ -242,6 +243,19 @@
     expect[0] = buffer0[0];
 
     check_output_w(__LINE__, 1);
+
+    /* Test the .new read from the high half of a pair */
+    asm("v7 = vmem(%0 + #0)\n\t"
+        "v12 = vmem(%1 + #0)\n\t"
+        "{\n\t"
+        "    v5:4 = vcombine(v12, v7)\n\t"
+        "    vmem(%2 + #0) = v5.new\n\t"
+        "}\n\t"
+        : : "r"(p0), "r"(p1), "r"(pout) : "v4", "v5", "v7", "v12", "memory");
+
+    expect[0] = buffer1[0];
+
+    check_output_w(__LINE__, 1);
 }
 
 static void test_max_temps()
diff --git a/tests/tcg/sh4/Makefile.target b/tests/tcg/sh4/Makefile.target
index 4d09291c..7852fa6 100644
--- a/tests/tcg/sh4/Makefile.target
+++ b/tests/tcg/sh4/Makefile.target
@@ -17,3 +17,9 @@
 
 test-macw: CFLAGS += -O -g
 TESTS += test-macw
+
+test-addv: CFLAGS += -O -g
+TESTS += test-addv
+
+test-subv: CFLAGS += -O -g
+TESTS += test-subv
diff --git a/tests/tcg/sh4/test-addv.c b/tests/tcg/sh4/test-addv.c
new file mode 100644
index 0000000..ca87fe7
--- /dev/null
+++ b/tests/tcg/sh4/test-addv.c
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+static void addv(const int a, const int b, const int res, const int carry)
+{
+    int o = a, c;
+
+    asm volatile("addv %2,%0\n"
+                 "movt %1\n"
+                 : "+r"(o), "=r"(c) : "r"(b) : );
+
+    if (c != carry || o != res) {
+        printf("ADDV %d, %d = %d/%d [T = %d/%d]\n", a, b, o, res, c, carry);
+        abort();
+    }
+}
+
+int main(void)
+{
+    addv(INT_MAX, 1, INT_MIN, 1);
+    addv(INT_MAX - 1, 1, INT_MAX, 0);
+
+    return 0;
+}
diff --git a/tests/tcg/sh4/test-subv.c b/tests/tcg/sh4/test-subv.c
new file mode 100644
index 0000000..a3c2db9
--- /dev/null
+++ b/tests/tcg/sh4/test-subv.c
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+static void subv(const int a, const int b, const int res, const int carry)
+{
+    int o = a, c;
+
+    asm volatile("subv %2,%0\n"
+                 "movt %1\n"
+                 : "+r"(o), "=r"(c) : "r"(b) : );
+
+    if (c != carry || o != res) {
+        printf("SUBV %d, %d = %d/%d [T = %d/%d]\n", a, b, o, res, c, carry);
+        abort();
+    }
+}
+
+int main(void)
+{
+    subv(INT_MIN, 1, INT_MAX, 1);
+    subv(INT_MAX, -1, INT_MIN, 1);
+    subv(INT_MAX, 1, INT_MAX - 1, 0);
+    subv(0, 1, -1, 0);
+    subv(-1, -1, 0, 0);
+
+    return 0;
+}
diff --git a/ui/cocoa.m b/ui/cocoa.m
index 25e0db9..981615a 100644
--- a/ui/cocoa.m
+++ b/ui/cocoa.m
@@ -50,23 +50,10 @@
 #include <Carbon/Carbon.h>
 #include "hw/core/cpu.h"
 
-#ifndef MAC_OS_X_VERSION_10_13
-#define MAC_OS_X_VERSION_10_13 101300
-#endif
-
 #ifndef MAC_OS_VERSION_14_0
 #define MAC_OS_VERSION_14_0 140000
 #endif
 
-/* 10.14 deprecates NSOnState and NSOffState in favor of
- * NSControlStateValueOn/Off, which were introduced in 10.13.
- * Define for older versions
- */
-#if MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_13
-#define NSControlStateValueOn NSOnState
-#define NSControlStateValueOff NSOffState
-#endif
-
 //#define DEBUG
 
 #ifdef DEBUG
diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index 3e6a5df..74864f7 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -26,265 +26,290 @@
 #include "qemu/bswap.h"
 #include "host/cpuinfo.h"
 
-static bool
-buffer_zero_int(const void *buf, size_t len)
+typedef bool (*biz_accel_fn)(const void *, size_t);
+
+static bool buffer_is_zero_int_lt256(const void *buf, size_t len)
 {
-    if (unlikely(len < 8)) {
-        /* For a very small buffer, simply accumulate all the bytes.  */
-        const unsigned char *p = buf;
-        const unsigned char *e = buf + len;
-        unsigned char t = 0;
+    uint64_t t;
+    const uint64_t *p, *e;
 
-        do {
-            t |= *p++;
-        } while (p < e);
-
-        return t == 0;
-    } else {
-        /* Otherwise, use the unaligned memory access functions to
-           handle the beginning and end of the buffer, with a couple
-           of loops handling the middle aligned section.  */
-        uint64_t t = ldq_he_p(buf);
-        const uint64_t *p = (uint64_t *)(((uintptr_t)buf + 8) & -8);
-        const uint64_t *e = (uint64_t *)(((uintptr_t)buf + len) & -8);
-
-        for (; p + 8 <= e; p += 8) {
-            __builtin_prefetch(p + 8);
-            if (t) {
-                return false;
-            }
-            t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];
-        }
-        while (p < e) {
-            t |= *p++;
-        }
-        t |= ldq_he_p(buf + len - 8);
-
-        return t == 0;
+    /*
+     * Use unaligned memory access functions to handle
+     * the beginning and end of the buffer.
+     */
+    if (unlikely(len <= 8)) {
+        return (ldl_he_p(buf) | ldl_he_p(buf + len - 4)) == 0;
     }
+
+    t = ldq_he_p(buf) | ldq_he_p(buf + len - 8);
+    p = QEMU_ALIGN_PTR_DOWN(buf + 8, 8);
+    e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 8);
+
+    /* Read 0 to 31 aligned words from the middle. */
+    while (p < e) {
+        t |= *p++;
+    }
+    return t == 0;
 }
 
-#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
+static bool buffer_is_zero_int_ge256(const void *buf, size_t len)
+{
+    /*
+     * Use unaligned memory access functions to handle
+     * the beginning and end of the buffer.
+     */
+    uint64_t t = ldq_he_p(buf) | ldq_he_p(buf + len - 8);
+    const uint64_t *p = QEMU_ALIGN_PTR_DOWN(buf + 8, 8);
+    const uint64_t *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 8);
+
+    /* Collect a partial block at the tail end. */
+    t |= e[-7] | e[-6] | e[-5] | e[-4] | e[-3] | e[-2] | e[-1];
+
+    /*
+     * Loop over 64 byte blocks.
+     * With the head and tail removed, e - p >= 30,
+     * so the loop must iterate at least 3 times.
+     */
+    do {
+        if (t) {
+            return false;
+        }
+        t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];
+        p += 8;
+    } while (p < e - 7);
+
+    return t == 0;
+}
+
+#if defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
 #include <immintrin.h>
 
-/* Note that each of these vectorized functions require len >= 64.  */
+/* Helper for preventing the compiler from reassociating
+   chains of binary vector operations.  */
+#define SSE_REASSOC_BARRIER(vec0, vec1) asm("" : "+x"(vec0), "+x"(vec1))
+
+/* Note that these vectorized functions may assume len >= 256.  */
 
 static bool __attribute__((target("sse2")))
 buffer_zero_sse2(const void *buf, size_t len)
 {
-    __m128i t = _mm_loadu_si128(buf);
-    __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
-    __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);
-    __m128i zero = _mm_setzero_si128();
+    /* Unaligned loads at head/tail.  */
+    __m128i v = *(__m128i_u *)(buf);
+    __m128i w = *(__m128i_u *)(buf + len - 16);
+    /* Align head/tail to 16-byte boundaries.  */
+    const __m128i *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16);
+    const __m128i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16);
+    __m128i zero = { 0 };
 
-    /* Loop over 16-byte aligned blocks of 64.  */
-    while (likely(p <= e)) {
-        __builtin_prefetch(p);
-        t = _mm_cmpeq_epi8(t, zero);
-        if (unlikely(_mm_movemask_epi8(t) != 0xFFFF)) {
+    /* Collect a partial block at tail end.  */
+    v |= e[-1]; w |= e[-2];
+    SSE_REASSOC_BARRIER(v, w);
+    v |= e[-3]; w |= e[-4];
+    SSE_REASSOC_BARRIER(v, w);
+    v |= e[-5]; w |= e[-6];
+    SSE_REASSOC_BARRIER(v, w);
+    v |= e[-7]; v |= w;
+
+    /*
+     * Loop over complete 128-byte blocks.
+     * With the head and tail removed, e - p >= 14, so the loop
+     * must iterate at least once.
+     */
+    do {
+        v = _mm_cmpeq_epi8(v, zero);
+        if (unlikely(_mm_movemask_epi8(v) != 0xFFFF)) {
             return false;
         }
-        t = p[-4] | p[-3] | p[-2] | p[-1];
-        p += 4;
-    }
+        v = p[0]; w = p[1];
+        SSE_REASSOC_BARRIER(v, w);
+        v |= p[2]; w |= p[3];
+        SSE_REASSOC_BARRIER(v, w);
+        v |= p[4]; w |= p[5];
+        SSE_REASSOC_BARRIER(v, w);
+        v |= p[6]; w |= p[7];
+        SSE_REASSOC_BARRIER(v, w);
+        v |= w;
+        p += 8;
+    } while (p < e - 7);
 
-    /* Finish the aligned tail.  */
-    t |= e[-3];
-    t |= e[-2];
-    t |= e[-1];
-
-    /* Finish the unaligned tail.  */
-    t |= _mm_loadu_si128(buf + len - 16);
-
-    return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF;
+    return _mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) == 0xFFFF;
 }
 
 #ifdef CONFIG_AVX2_OPT
-static bool __attribute__((target("sse4")))
-buffer_zero_sse4(const void *buf, size_t len)
-{
-    __m128i t = _mm_loadu_si128(buf);
-    __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
-    __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);
-
-    /* Loop over 16-byte aligned blocks of 64.  */
-    while (likely(p <= e)) {
-        __builtin_prefetch(p);
-        if (unlikely(!_mm_testz_si128(t, t))) {
-            return false;
-        }
-        t = p[-4] | p[-3] | p[-2] | p[-1];
-        p += 4;
-    }
-
-    /* Finish the aligned tail.  */
-    t |= e[-3];
-    t |= e[-2];
-    t |= e[-1];
-
-    /* Finish the unaligned tail.  */
-    t |= _mm_loadu_si128(buf + len - 16);
-
-    return _mm_testz_si128(t, t);
-}
-
 static bool __attribute__((target("avx2")))
 buffer_zero_avx2(const void *buf, size_t len)
 {
-    /* Begin with an unaligned head of 32 bytes.  */
-    __m256i t = _mm256_loadu_si256(buf);
-    __m256i *p = (__m256i *)(((uintptr_t)buf + 5 * 32) & -32);
-    __m256i *e = (__m256i *)(((uintptr_t)buf + len) & -32);
+    /* Unaligned loads at head/tail.  */
+    __m256i v = *(__m256i_u *)(buf);
+    __m256i w = *(__m256i_u *)(buf + len - 32);
+    /* Align head/tail to 32-byte boundaries.  */
+    const __m256i *p = QEMU_ALIGN_PTR_DOWN(buf + 32, 32);
+    const __m256i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 32);
+    __m256i zero = { 0 };
 
-    /* Loop over 32-byte aligned blocks of 128.  */
-    while (p <= e) {
-        __builtin_prefetch(p);
-        if (unlikely(!_mm256_testz_si256(t, t))) {
+    /* Collect a partial block at tail end.  */
+    v |= e[-1]; w |= e[-2];
+    SSE_REASSOC_BARRIER(v, w);
+    v |= e[-3]; w |= e[-4];
+    SSE_REASSOC_BARRIER(v, w);
+    v |= e[-5]; w |= e[-6];
+    SSE_REASSOC_BARRIER(v, w);
+    v |= e[-7]; v |= w;
+
+    /* Loop over complete 256-byte blocks.  */
+    for (; p < e - 7; p += 8) {
+        /* PTEST is not profitable here.  */
+        v = _mm256_cmpeq_epi8(v, zero);
+        if (unlikely(_mm256_movemask_epi8(v) != 0xFFFFFFFF)) {
             return false;
         }
-        t = p[-4] | p[-3] | p[-2] | p[-1];
-        p += 4;
-    } ;
+        v = p[0]; w = p[1];
+        SSE_REASSOC_BARRIER(v, w);
+        v |= p[2]; w |= p[3];
+        SSE_REASSOC_BARRIER(v, w);
+        v |= p[4]; w |= p[5];
+        SSE_REASSOC_BARRIER(v, w);
+        v |= p[6]; w |= p[7];
+        SSE_REASSOC_BARRIER(v, w);
+        v |= w;
+    }
 
-    /* Finish the last block of 128 unaligned.  */
-    t |= _mm256_loadu_si256(buf + len - 4 * 32);
-    t |= _mm256_loadu_si256(buf + len - 3 * 32);
-    t |= _mm256_loadu_si256(buf + len - 2 * 32);
-    t |= _mm256_loadu_si256(buf + len - 1 * 32);
-
-    return _mm256_testz_si256(t, t);
+    return _mm256_movemask_epi8(_mm256_cmpeq_epi8(v, zero)) == 0xFFFFFFFF;
 }
 #endif /* CONFIG_AVX2_OPT */
 
-#ifdef CONFIG_AVX512F_OPT
-static bool __attribute__((target("avx512f")))
-buffer_zero_avx512(const void *buf, size_t len)
-{
-    /* Begin with an unaligned head of 64 bytes.  */
-    __m512i t = _mm512_loadu_si512(buf);
-    __m512i *p = (__m512i *)(((uintptr_t)buf + 5 * 64) & -64);
-    __m512i *e = (__m512i *)(((uintptr_t)buf + len) & -64);
-
-    /* Loop over 64-byte aligned blocks of 256.  */
-    while (p <= e) {
-        __builtin_prefetch(p);
-        if (unlikely(_mm512_test_epi64_mask(t, t))) {
-            return false;
-        }
-        t = p[-4] | p[-3] | p[-2] | p[-1];
-        p += 4;
-    }
-
-    t |= _mm512_loadu_si512(buf + len - 4 * 64);
-    t |= _mm512_loadu_si512(buf + len - 3 * 64);
-    t |= _mm512_loadu_si512(buf + len - 2 * 64);
-    t |= _mm512_loadu_si512(buf + len - 1 * 64);
-
-    return !_mm512_test_epi64_mask(t, t);
-
-}
-#endif /* CONFIG_AVX512F_OPT */
-
-/*
- * Make sure that these variables are appropriately initialized when
- * SSE2 is enabled on the compiler command-line, but the compiler is
- * too old to support CONFIG_AVX2_OPT.
- */
-#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
-# define INIT_USED     0
-# define INIT_LENGTH   0
-# define INIT_ACCEL    buffer_zero_int
-#else
-# ifndef __SSE2__
-#  error "ISA selection confusion"
-# endif
-# define INIT_USED     CPUINFO_SSE2
-# define INIT_LENGTH   64
-# define INIT_ACCEL    buffer_zero_sse2
-#endif
-
-static unsigned used_accel = INIT_USED;
-static unsigned length_to_accel = INIT_LENGTH;
-static bool (*buffer_accel)(const void *, size_t) = INIT_ACCEL;
-
-static unsigned __attribute__((noinline))
-select_accel_cpuinfo(unsigned info)
-{
-    /* Array is sorted in order of algorithm preference. */
-    static const struct {
-        unsigned bit;
-        unsigned len;
-        bool (*fn)(const void *, size_t);
-    } all[] = {
-#ifdef CONFIG_AVX512F_OPT
-        { CPUINFO_AVX512F, 256, buffer_zero_avx512 },
-#endif
+static biz_accel_fn const accel_table[] = {
+    buffer_is_zero_int_ge256,
+    buffer_zero_sse2,
 #ifdef CONFIG_AVX2_OPT
-        { CPUINFO_AVX2,    128, buffer_zero_avx2 },
-        { CPUINFO_SSE4,     64, buffer_zero_sse4 },
+    buffer_zero_avx2,
 #endif
-        { CPUINFO_SSE2,     64, buffer_zero_sse2 },
-        { CPUINFO_ALWAYS,    0, buffer_zero_int },
-    };
+};
 
-    for (unsigned i = 0; i < ARRAY_SIZE(all); ++i) {
-        if (info & all[i].bit) {
-            length_to_accel = all[i].len;
-            buffer_accel = all[i].fn;
-            return all[i].bit;
-        }
+static unsigned best_accel(void)
+{
+    unsigned info = cpuinfo_init();
+
+#ifdef CONFIG_AVX2_OPT
+    if (info & CPUINFO_AVX2) {
+        return 2;
     }
-    return 0;
-}
-
-#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
-static void __attribute__((constructor)) init_accel(void)
-{
-    used_accel = select_accel_cpuinfo(cpuinfo_init());
-}
-#endif /* CONFIG_AVX2_OPT */
-
-bool test_buffer_is_zero_next_accel(void)
-{
-    /*
-     * Accumulate the accelerators that we've already tested, and
-     * remove them from the set to test this round.  We'll get back
-     * a zero from select_accel_cpuinfo when there are no more.
-     */
-    unsigned used = select_accel_cpuinfo(cpuinfo & ~used_accel);
-    used_accel |= used;
-    return used;
-}
-
-static bool select_accel_fn(const void *buf, size_t len)
-{
-    if (likely(len >= length_to_accel)) {
-        return buffer_accel(buf, len);
-    }
-    return buffer_zero_int(buf, len);
-}
-
-#else
-#define select_accel_fn  buffer_zero_int
-bool test_buffer_is_zero_next_accel(void)
-{
-    return false;
-}
 #endif
+    return info & CPUINFO_SSE2 ? 1 : 0;
+}
+
+#elif defined(__aarch64__) && defined(__ARM_NEON)
+#include <arm_neon.h>
 
 /*
- * Checks if a buffer is all zeroes
+ * Helper for preventing the compiler from reassociating
+ * chains of binary vector operations.
  */
-bool buffer_is_zero(const void *buf, size_t len)
+#define REASSOC_BARRIER(vec0, vec1) asm("" : "+w"(vec0), "+w"(vec1))
+
+static bool buffer_is_zero_simd(const void *buf, size_t len)
+{
+    uint32x4_t t0, t1, t2, t3;
+
+    /* Align head/tail to 16-byte boundaries.  */
+    const uint32x4_t *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16);
+    const uint32x4_t *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16);
+
+    /* Unaligned loads at head/tail.  */
+    t0 = vld1q_u32(buf) | vld1q_u32(buf + len - 16);
+
+    /* Collect a partial block at tail end.  */
+    t1 = e[-7] | e[-6];
+    t2 = e[-5] | e[-4];
+    t3 = e[-3] | e[-2];
+    t0 |= e[-1];
+    REASSOC_BARRIER(t0, t1);
+    REASSOC_BARRIER(t2, t3);
+    t0 |= t1;
+    t2 |= t3;
+    REASSOC_BARRIER(t0, t2);
+    t0 |= t2;
+
+    /*
+     * Loop over complete 128-byte blocks.
+     * With the head and tail removed, e - p >= 14, so the loop
+     * must iterate at least once.
+     */
+    do {
+        /*
+         * Reduce via UMAXV.  Whatever the actual result,
+         * it will only be zero if all input bytes are zero.
+         */
+        if (unlikely(vmaxvq_u32(t0) != 0)) {
+            return false;
+        }
+
+        t0 = p[0] | p[1];
+        t1 = p[2] | p[3];
+        t2 = p[4] | p[5];
+        t3 = p[6] | p[7];
+        REASSOC_BARRIER(t0, t1);
+        REASSOC_BARRIER(t2, t3);
+        t0 |= t1;
+        t2 |= t3;
+        REASSOC_BARRIER(t0, t2);
+        t0 |= t2;
+        p += 8;
+    } while (p < e - 7);
+
+    return vmaxvq_u32(t0) == 0;
+}
+
+#define best_accel() 1
+static biz_accel_fn const accel_table[] = {
+    buffer_is_zero_int_ge256,
+    buffer_is_zero_simd,
+};
+#else
+#define best_accel() 0
+static biz_accel_fn const accel_table[1] = {
+    buffer_is_zero_int_ge256
+};
+#endif
+
+static biz_accel_fn buffer_is_zero_accel;
+static unsigned accel_index;
+
+bool buffer_is_zero_ool(const void *buf, size_t len)
 {
     if (unlikely(len == 0)) {
         return true;
     }
+    if (!buffer_is_zero_sample3(buf, len)) {
+        return false;
+    }
+    /* All bytes are covered for any len <= 3.  */
+    if (unlikely(len <= 3)) {
+        return true;
+    }
 
-    /* Fetch the beginning of the buffer while we select the accelerator.  */
-    __builtin_prefetch(buf);
+    if (likely(len >= 256)) {
+        return buffer_is_zero_accel(buf, len);
+    }
+    return buffer_is_zero_int_lt256(buf, len);
+}
 
-    /* Use an optimized zero check if possible.  Note that this also
-       includes a check for an unrolled loop over 64-bit integers.  */
-    return select_accel_fn(buf, len);
+bool buffer_is_zero_ge256(const void *buf, size_t len)
+{
+    return buffer_is_zero_accel(buf, len);
+}
+
+bool test_buffer_is_zero_next_accel(void)
+{
+    if (accel_index != 0) {
+        buffer_is_zero_accel = accel_table[--accel_index];
+        return true;
+    }
+    return false;
+}
+
+static void __attribute__((constructor)) init_accel(void)
+{
+    accel_index = best_accel();
+    buffer_is_zero_accel = accel_table[accel_index];
 }