Merge tag 'for-upstream-urgent' of https://gitlab.com/bonzini/qemu into staging

Fixes for Python venv changes

# -----BEGIN PGP SIGNATURE-----
#
# iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmRn7D4UHHBib256aW5p
# QHJlZGhhdC5jb20ACgkQv/vSX3jHroOHbQgAiQW824iL2Iw+wjYckp0rwLxe53+z
# P4kCdQePrfKW3sPglbeDArPr4gzuo7bdj75dscZmco+nBU40qGqEpRHBqjQol5pE
# kcQsmqx+0Udbsc6kJe47fgSsBLD2KbT1QQCVBgScNuDviogQ0/PCLNWjk9V4OhgL
# 0ZlK8QFnuv0qNthS+oNjkNi6SYGYNOw+4LQ/WcLWnowwhNRGUvYoq9QdOCocfyxD
# t+1xQvF4Pxqnhbkni51JRoXv/Np8U/yDHMgonvw8BLxTMNAes4nV7ifzyW2pltnf
# YEHGUKYPtrPR9dKLr/Au9ktr7n3O5ikOEpPIPSi4BwFqzv6hdE4DDAMXDA==
# =Auyq
# -----END PGP SIGNATURE-----
# gpg: Signature made Fri 19 May 2023 02:38:06 PM PDT
# gpg:                using RSA key F13338574B662389866C7682BFFBD25F78C7AE83
# gpg:                issuer "pbonzini@redhat.com"
# gpg: Good signature from "Paolo Bonzini <bonzini@gnu.org>" [undefined]
# gpg:                 aka "Paolo Bonzini <pbonzini@redhat.com>" [undefined]
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg:          There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 46F5 9FBD 57D6 12E7 BFD4  E2F7 7E15 100C CD36 69B1
#      Subkey fingerprint: F133 3857 4B66 2389 866C  7682 BFFB D25F 78C7 AE83

* tag 'for-upstream-urgent' of https://gitlab.com/bonzini/qemu:
  scripts: make sure scripts are invoked via $(PYTHON)
  gitlab: custom-runners: preserve more artifacts for debugging
  mkvenv: pass first missing package to diagnose()
  configure: fix backwards-compatibility for meson sphinx_build option
  build: rebuild build.ninja using "meson setup --reconfigure"
  mkvenv: replace distlib.database with importlib.metadata/pkg_resources
  remove remaining traces of meson submodule

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
diff --git a/MAINTAINERS b/MAINTAINERS
index d0e604c..1b64664 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -225,6 +225,7 @@
 F: disas/hexagon.c
 F: configs/targets/hexagon-linux-user/default.mak
 F: docker/dockerfiles/debian-hexagon-cross.docker
+F: gdb-xml/hexagon*.xml
 
 Hexagon idef-parser
 M: Alessandro Di Federico <ale@rev.ng>
diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
index 34be1b9..5b73a39 100644
--- a/accel/tcg/plugin-gen.c
+++ b/accel/tcg/plugin-gen.c
@@ -254,33 +254,6 @@
     return op;
 }
 
-static TCGOp *copy_extu_i32_i64(TCGOp **begin_op, TCGOp *op)
-{
-    if (TCG_TARGET_REG_BITS == 32) {
-        /* mov_i32 */
-        op = copy_op(begin_op, op, INDEX_op_mov_i32);
-        /* mov_i32 w/ $0 */
-        op = copy_op(begin_op, op, INDEX_op_mov_i32);
-    } else {
-        /* extu_i32_i64 */
-        op = copy_op(begin_op, op, INDEX_op_extu_i32_i64);
-    }
-    return op;
-}
-
-static TCGOp *copy_mov_i64(TCGOp **begin_op, TCGOp *op)
-{
-    if (TCG_TARGET_REG_BITS == 32) {
-        /* 2x mov_i32 */
-        op = copy_op(begin_op, op, INDEX_op_mov_i32);
-        op = copy_op(begin_op, op, INDEX_op_mov_i32);
-    } else {
-        /* mov_i64 */
-        op = copy_op(begin_op, op, INDEX_op_mov_i64);
-    }
-    return op;
-}
-
 static TCGOp *copy_const_ptr(TCGOp **begin_op, TCGOp *op, void *ptr)
 {
     if (UINTPTR_MAX == UINT32_MAX) {
@@ -295,18 +268,6 @@
     return op;
 }
 
-static TCGOp *copy_extu_tl_i64(TCGOp **begin_op, TCGOp *op)
-{
-    if (TARGET_LONG_BITS == 32) {
-        /* extu_i32_i64 */
-        op = copy_extu_i32_i64(begin_op, op);
-    } else {
-        /* mov_i64 */
-        op = copy_mov_i64(begin_op, op);
-    }
-    return op;
-}
-
 static TCGOp *copy_ld_i64(TCGOp **begin_op, TCGOp *op)
 {
     if (TCG_TARGET_REG_BITS == 32) {
@@ -451,9 +412,6 @@
         tcg_debug_assert(begin_op && begin_op->opc == INDEX_op_ld_i32);
     }
 
-    /* extu_tl_i64 */
-    op = copy_extu_tl_i64(&begin_op, op);
-
     if (type == PLUGIN_GEN_CB_MEM) {
         /* call */
         op = copy_call(&begin_op, op, HELPER(plugin_vcpu_mem_cb),
diff --git a/configs/targets/hexagon-linux-user.mak b/configs/targets/hexagon-linux-user.mak
index 003ed0a..2765a4c 100644
--- a/configs/targets/hexagon-linux-user.mak
+++ b/configs/targets/hexagon-linux-user.mak
@@ -1 +1,2 @@
 TARGET_ARCH=hexagon
+TARGET_XML_FILES=gdb-xml/hexagon-core.xml gdb-xml/hexagon-hvx.xml
diff --git a/configure b/configure
index 1bdc7fd..80ca1c9 100755
--- a/configure
+++ b/configure
@@ -1269,7 +1269,7 @@
 : ${cross_cc_armeb="$cross_cc_arm"}
 : ${cross_cc_cflags_armeb="-mbig-endian"}
 : ${cross_cc_hexagon="hexagon-unknown-linux-musl-clang"}
-: ${cross_cc_cflags_hexagon="-mv67 -O2 -static"}
+: ${cross_cc_cflags_hexagon="-mv73 -O2 -static"}
 : ${cross_cc_cflags_i386="-m32"}
 : ${cross_cc_cflags_ppc="-m32 -mbig-endian"}
 : ${cross_cc_cflags_ppc64="-m64 -mbig-endian"}
diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst
index 7bb4d2f..e934e0a 100644
--- a/docs/about/deprecated.rst
+++ b/docs/about/deprecated.rst
@@ -328,6 +328,14 @@
 has used a properly allocated identifier. Deprecate the ``use-intel-id``
 machine compatibility parameter.
 
+``-device cxl-type3,memdev=xxxx`` (since 8.0)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``cxl-type3`` device initially only used a single memory backend.  With
+the addition of volatile memory support, it is now necessary to distinguish
+between persistent and volatile memory backends.  As such, memdev is deprecated
+in favor of persistent-memdev.
+
 
 Block device options
 ''''''''''''''''''''
diff --git a/docs/system/devices/cxl.rst b/docs/system/devices/cxl.rst
index 4c38223..f12011e 100644
--- a/docs/system/devices/cxl.rst
+++ b/docs/system/devices/cxl.rst
@@ -162,7 +162,7 @@
   |<------------------SYSTEM PHYSICAL ADDRESS MAP (1)----------------->|
   |    __________   __________________________________   __________    |
   |   |          | |                                  | |          |   |
-  |   | CFMW 0   | |  CXL Fixed Memory Window 1       | | CFMW 1   |   |
+  |   | CFMW 0   | |  CXL Fixed Memory Window 1       | | CFMW 2   |   |
   |   | HB0 only | |  Configured to interleave memory | | HB1 only |   |
   |   |          | |  memory accesses across HB0/HB1  | |          |   |
   |   |__________| |_____x____________________________| |__________|   |
@@ -208,8 +208,8 @@
 (1) **3 CXL Fixed Memory Windows (CFMW)** corresponding to different
     ranges of the system physical address map.  Each CFMW has
     particular interleave setup across the CXL Host Bridges (HB)
-    CFMW0 provides uninterleaved access to HB0, CFW2 provides
-    uninterleaved access to HB1. CFW1 provides interleaved memory access
+    CFMW0 provides uninterleaved access to HB0, CFMW2 provides
+    uninterleaved access to HB1. CFMW1 provides interleaved memory access
     across HB0 and HB1.
 
 (2) **Two CXL Host Bridges**. Each of these has 2 CXL Root Ports and
@@ -247,7 +247,7 @@
   |<------------------SYSTEM PHYSICAL ADDRESS MAP (1)----------------->|
   |    __________   __________________________________   __________    |
   |   |          | |                                  | |          |   |
-  |   | CFMW 0   | |  CXL Fixed Memory Window 1       | | CFMW 1   |   |
+  |   | CFMW 0   | |  CXL Fixed Memory Window 1       | | CFMW 2   |   |
   |   | HB0 only | |  Configured to interleave memory | | HB1 only |   |
   |   |          | |  memory accesses across HB0/HB1  | |          |   |
   |   |____x_____| |__________________________________| |__________|   |
@@ -300,22 +300,43 @@
 
 Example command lines
 ---------------------
-A very simple setup with just one directly attached CXL Type 3 device::
+A very simple setup with just one directly attached CXL Type 3 Persistent Memory device::
 
-  qemu-system-aarch64 -M virt,gic-version=3,cxl=on -m 4g,maxmem=8G,slots=8 -cpu max \
+  qemu-system-x86_64 -M q35,cxl=on -m 4G,maxmem=8G,slots=8 -smp 4 \
   ...
   -object memory-backend-file,id=cxl-mem1,share=on,mem-path=/tmp/cxltest.raw,size=256M \
   -object memory-backend-file,id=cxl-lsa1,share=on,mem-path=/tmp/lsa.raw,size=256M \
   -device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \
   -device cxl-rp,port=0,bus=cxl.1,id=root_port13,chassis=0,slot=2 \
-  -device cxl-type3,bus=root_port13,memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem0 \
+  -device cxl-type3,bus=root_port13,persistent-memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem0 \
+  -M cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.size=4G
+
+A very simple setup with just one directly attached CXL Type 3 Volatile Memory device::
+
+  qemu-system-aarch64 -M virt,gic-version=3,cxl=on -m 4g,maxmem=8G,slots=8 -cpu max \
+  ...
+  -object memory-backend-ram,id=vmem0,share=on,size=256M \
+  -device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \
+  -device cxl-rp,port=0,bus=cxl.1,id=root_port13,chassis=0,slot=2 \
+  -device cxl-type3,bus=root_port13,volatile-memdev=vmem0,id=cxl-vmem0 \
+  -M cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.size=4G
+
+The same volatile setup may optionally include an LSA region::
+
+  qemu-system-aarch64 -M virt,gic-version=3,cxl=on -m 4g,maxmem=8G,slots=8 -cpu max \
+  ...
+  -object memory-backend-ram,id=vmem0,share=on,size=256M \
+  -object memory-backend-file,id=cxl-lsa0,share=on,mem-path=/tmp/lsa.raw,size=256M \
+  -device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \
+  -device cxl-rp,port=0,bus=cxl.1,id=root_port13,chassis=0,slot=2 \
+  -device cxl-type3,bus=root_port13,volatile-memdev=vmem0,lsa=cxl-lsa0,id=cxl-vmem0 \
   -M cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.size=4G
 
 A setup suitable for 4 way interleave. Only one fixed window provided, to enable 2 way
 interleave across 2 CXL host bridges.  Each host bridge has 2 CXL Root Ports, with
 the CXL Type3 device directly attached (no switches).::
 
-  qemu-system-aarch64 -M virt,gic-version=3,cxl=on -m 4g,maxmem=8G,slots=8 -cpu max \
+  qemu-system-x86_64 -M q35,cxl=on -m 4G,maxmem=8G,slots=8 -smp 4 \
   ...
   -object memory-backend-file,id=cxl-mem1,share=on,mem-path=/tmp/cxltest.raw,size=256M \
   -object memory-backend-file,id=cxl-mem2,share=on,mem-path=/tmp/cxltest2.raw,size=256M \
@@ -328,18 +349,18 @@
   -device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \
   -device pxb-cxl,bus_nr=222,bus=pcie.0,id=cxl.2 \
   -device cxl-rp,port=0,bus=cxl.1,id=root_port13,chassis=0,slot=2 \
-  -device cxl-type3,bus=root_port13,memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem0 \
+  -device cxl-type3,bus=root_port13,persistent-memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem0 \
   -device cxl-rp,port=1,bus=cxl.1,id=root_port14,chassis=0,slot=3 \
-  -device cxl-type3,bus=root_port14,memdev=cxl-mem2,lsa=cxl-lsa2,id=cxl-pmem1 \
+  -device cxl-type3,bus=root_port14,persistent-memdev=cxl-mem2,lsa=cxl-lsa2,id=cxl-pmem1 \
   -device cxl-rp,port=0,bus=cxl.2,id=root_port15,chassis=0,slot=5 \
-  -device cxl-type3,bus=root_port15,memdev=cxl-mem3,lsa=cxl-lsa3,id=cxl-pmem2 \
+  -device cxl-type3,bus=root_port15,persistent-memdev=cxl-mem3,lsa=cxl-lsa3,id=cxl-pmem2 \
   -device cxl-rp,port=1,bus=cxl.2,id=root_port16,chassis=0,slot=6 \
-  -device cxl-type3,bus=root_port16,memdev=cxl-mem4,lsa=cxl-lsa4,id=cxl-pmem3 \
+  -device cxl-type3,bus=root_port16,persistent-memdev=cxl-mem4,lsa=cxl-lsa4,id=cxl-pmem3 \
   -M cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.targets.1=cxl.2,cxl-fmw.0.size=4G,cxl-fmw.0.interleave-granularity=8k
 
 An example of 4 devices below a switch suitable for 1, 2 or 4 way interleave::
 
-  qemu-system-aarch64 -M virt,gic-version=3,cxl=on -m 4g,maxmem=8G,slots=8 -cpu max \
+  qemu-system-x86_64 -M q35,cxl=on -m 4G,maxmem=8G,slots=8 -smp 4 \
   ...
   -object memory-backend-file,id=cxl-mem0,share=on,mem-path=/tmp/cxltest.raw,size=256M \
   -object memory-backend-file,id=cxl-mem1,share=on,mem-path=/tmp/cxltest1.raw,size=256M \
@@ -354,15 +375,23 @@
   -device cxl-rp,port=1,bus=cxl.1,id=root_port1,chassis=0,slot=1 \
   -device cxl-upstream,bus=root_port0,id=us0 \
   -device cxl-downstream,port=0,bus=us0,id=swport0,chassis=0,slot=4 \
-  -device cxl-type3,bus=swport0,memdev=cxl-mem0,lsa=cxl-lsa0,id=cxl-pmem0,size=256M \
+  -device cxl-type3,bus=swport0,persistent-memdev=cxl-mem0,lsa=cxl-lsa0,id=cxl-pmem0 \
   -device cxl-downstream,port=1,bus=us0,id=swport1,chassis=0,slot=5 \
-  -device cxl-type3,bus=swport1,memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem1,size=256M \
+  -device cxl-type3,bus=swport1,persistent-memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem1 \
   -device cxl-downstream,port=2,bus=us0,id=swport2,chassis=0,slot=6 \
-  -device cxl-type3,bus=swport2,memdev=cxl-mem2,lsa=cxl-lsa2,id=cxl-pmem2,size=256M \
+  -device cxl-type3,bus=swport2,persistent-memdev=cxl-mem2,lsa=cxl-lsa2,id=cxl-pmem2 \
   -device cxl-downstream,port=3,bus=us0,id=swport3,chassis=0,slot=7 \
-  -device cxl-type3,bus=swport3,memdev=cxl-mem3,lsa=cxl-lsa3,id=cxl-pmem3,size=256M \
+  -device cxl-type3,bus=swport3,persistent-memdev=cxl-mem3,lsa=cxl-lsa3,id=cxl-pmem3 \
   -M cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.size=4G,cxl-fmw.0.interleave-granularity=4k
 
+Deprecations
+------------
+
+The Type 3 device [memdev] attribute has been deprecated in favor of the
+[persistent-memdev] attributes. [memdev] will default to a persistent memory
+device for backward compatibility and is incapable of being used in combination
+with [persistent-memdev].
+
 Kernel Configuration Options
 ----------------------------
 
diff --git a/gdb-xml/hexagon-core.xml b/gdb-xml/hexagon-core.xml
new file mode 100644
index 0000000..e181163
--- /dev/null
+++ b/gdb-xml/hexagon-core.xml
@@ -0,0 +1,84 @@
+<?xml version="1.0"?>
+<!--
+  Copyright(c) 2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+
+  This work is licensed under the terms of the GNU GPL, version 2 or
+  (at your option) any later version. See the COPYING file in the
+  top-level directory.
+
+  Note: this file is intended to be use with LLDB, so it contains fields
+  that may be unknown to GDB. For more information on such fields, please
+  see:
+  https://github.com/llvm/llvm-project/blob/287aa6c4536408413b860e61fca0318a27214cf3/lldb/docs/lldb-gdb-remote.txt#L738-L860
+  https://github.com/llvm/llvm-project/blob/287aa6c4536408413b860e61fca0318a27214cf3/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp#L4275-L4335
+-->
+
+<!DOCTYPE feature SYSTEM "gdb-target.dtd">
+<feature name="org.gnu.gdb.hexagon.core">
+
+  <reg name="r00" altname="r0" bitsize="32" offset="0"   encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="0" generic="r00"/>
+  <reg name="r01" altname="r1" bitsize="32" offset="4"   encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="1" generic="r01"/>
+  <reg name="r02" altname="r2" bitsize="32" offset="8"   encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="2" generic="r02"/>
+  <reg name="r03" altname="r3" bitsize="32" offset="12"  encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="3" generic="r03"/>
+  <reg name="r04" altname="r4" bitsize="32" offset="16"  encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="4" generic="r04"/>
+  <reg name="r05" altname="r5" bitsize="32" offset="20"  encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="5" generic="r05"/>
+  <reg name="r06" altname="r6" bitsize="32" offset="24"  encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="6" generic="r06"/>
+  <reg name="r07" altname="r7" bitsize="32" offset="28"  encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="7" generic="r07"/>
+  <reg name="r08" altname="r8" bitsize="32" offset="32"  encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="8" generic="r08"/>
+  <reg name="r09" altname="r9" bitsize="32" offset="36"  encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="9" generic="r09"/>
+  <reg name="r10"              bitsize="32" offset="40"  encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="10"/>
+  <reg name="r11"              bitsize="32" offset="44"  encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="11"/>
+  <reg name="r12"              bitsize="32" offset="48"  encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="12"/>
+  <reg name="r13"              bitsize="32" offset="52"  encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="13"/>
+  <reg name="r14"              bitsize="32" offset="56"  encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="14"/>
+  <reg name="r15"              bitsize="32" offset="60"  encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="15"/>
+  <reg name="r16"              bitsize="32" offset="64"  encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="16"/>
+  <reg name="r17"              bitsize="32" offset="68"  encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="17"/>
+  <reg name="r18"              bitsize="32" offset="72"  encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="18"/>
+  <reg name="r19"              bitsize="32" offset="76"  encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="19"/>
+  <reg name="r20"              bitsize="32" offset="80"  encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="20"/>
+  <reg name="r21"              bitsize="32" offset="84"  encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="21"/>
+  <reg name="r22"              bitsize="32" offset="88"  encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="22"/>
+  <reg name="r23"              bitsize="32" offset="92"  encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="23"/>
+  <reg name="r24"              bitsize="32" offset="96"  encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="24"/>
+  <reg name="r25"              bitsize="32" offset="100" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="25"/>
+  <reg name="r26"              bitsize="32" offset="104" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="26"/>
+  <reg name="r27"              bitsize="32" offset="108" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="27"/>
+  <reg name="r28"              bitsize="32" offset="112" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="28"/>
+  <reg name="r29" altname="sp" bitsize="32" offset="116" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="29" generic="sp"/>
+  <reg name="r30" altname="fp" bitsize="32" offset="120" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="30" generic="fp"/>
+  <reg name="r31" altname="ra" bitsize="32" offset="124" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="31" generic="ra"/>
+  <reg name="sa0"              bitsize="32" offset="128" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="32"/>
+  <reg name="lc0"              bitsize="32" offset="132" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="33"/>
+  <reg name="sa1"              bitsize="32" offset="136" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="34"/>
+  <reg name="lc1"              bitsize="32" offset="140" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="35"/>
+  <reg name="p3_0"             bitsize="32" offset="144" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="36"/>
+  <reg name="c5"               bitsize="32" offset="148" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="37"/>
+  <reg name="m0"               bitsize="32" offset="152" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="38"/>
+  <reg name="m1"               bitsize="32" offset="156" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="39"/>
+  <reg name="usr"              bitsize="32" offset="160" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="40"/>
+  <reg name="pc"               bitsize="32" offset="164" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="41" generic="pc"/>
+  <reg name="ugp"              bitsize="32" offset="168" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="42"/>
+  <reg name="gp"               bitsize="32" offset="172" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="43"/>
+  <reg name="cs0"              bitsize="32" offset="176" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="44"/>
+  <reg name="cs1"              bitsize="32" offset="180" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="45"/>
+  <reg name="upcyclelo"        bitsize="32" offset="184" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="46"/>
+  <reg name="upcyclehi"        bitsize="32" offset="188" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="47"/>
+  <reg name="framelimit"       bitsize="32" offset="192" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="48"/>
+  <reg name="framekey"         bitsize="32" offset="196" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="49"/>
+  <reg name="pktcountlo"       bitsize="32" offset="200" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="50"/>
+  <reg name="pktcounthi"       bitsize="32" offset="204" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="51"/>
+  <reg name="pkt_cnt"          bitsize="32" offset="208" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="52"/>
+  <reg name="insn_cnt"         bitsize="32" offset="212" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="53"/>
+  <reg name="hvx_cnt"          bitsize="32" offset="216" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="54"/>
+  <reg name="c23"              bitsize="32" offset="220" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="55"/>
+  <reg name="c24"              bitsize="32" offset="224" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="56"/>
+  <reg name="c25"              bitsize="32" offset="228" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="57"/>
+  <reg name="c26"              bitsize="32" offset="232" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="58"/>
+  <reg name="c27"              bitsize="32" offset="236" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="59"/>
+  <reg name="c28"              bitsize="32" offset="240" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="60"/>
+  <reg name="c29"              bitsize="32" offset="244" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="61"/>
+  <reg name="utimerlo"         bitsize="32" offset="248" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="62"/>
+  <reg name="utimerhi"         bitsize="32" offset="252" encoding="uint" format="hex" group="Thread Registers" dwarf_regnum="63"/>
+
+</feature>
diff --git a/gdb-xml/hexagon-hvx.xml b/gdb-xml/hexagon-hvx.xml
new file mode 100644
index 0000000..5f2e220
--- /dev/null
+++ b/gdb-xml/hexagon-hvx.xml
@@ -0,0 +1,96 @@
+<?xml version="1.0"?>
+<!--
+  Copyright(c) 2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+
+  This work is licensed under the terms of the GNU GPL, version 2 or
+  (at your option) any later version. See the COPYING file in the
+  top-level directory.
+
+  Note: this file is intended to be use with LLDB, so it contains fields
+  that may be unknown to GDB. For more information on such fields, please
+  see:
+  https://github.com/llvm/llvm-project/blob/287aa6c4536408413b860e61fca0318a27214cf3/lldb/docs/lldb-gdb-remote.txt#L738-L860
+  https://github.com/llvm/llvm-project/blob/287aa6c4536408413b860e61fca0318a27214cf3/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp#L4275-L4335
+-->
+
+<!DOCTYPE feature SYSTEM "gdb-target.dtd">
+<feature name="org.gnu.gdb.hexagon.hvx">
+
+  <vector id="vud" type="uint64" count="16"/>
+  <vector id="vd" type="int64" count="16"/>
+  <vector id="vuw" type="uint32" count="32"/>
+  <vector id="vw" type="int32" count="32"/>
+  <vector id="vuh" type="uint16" count="64"/>
+  <vector id="vh" type="int16" count="64"/>
+  <vector id="vub" type="uint8" count="128"/>
+  <vector id="vb" type="int8" count="128"/>
+  <union id="hex_vec">
+    <field name="ud" type="vud"/>
+    <field name="d" type="vd"/>
+    <field name="uw" type="vuw"/>
+    <field name="w" type="vw"/>
+    <field name="uh" type="vuh"/>
+    <field name="h" type="vh"/>
+    <field name="ub" type="vub"/>
+    <field name="b" type="vb"/>
+  </union>
+
+  <flags id="ui2" size="1">
+    <field name="0" start="0" end="0"/>
+    <field name="1" start="1" end="1"/>
+  </flags>
+  <flags id="ui4" size="1">
+    <field name="0" start="0" end="0"/>
+    <field name="1" start="1" end="1"/>
+    <field name="2" start="2" end="2"/>
+    <field name="3" start="3" end="3"/>
+  </flags>
+  <vector id="vpd" type="uint8" count="16"/>
+  <vector id="vpw" type="ui4" count="32"/>
+  <vector id="vph" type="ui2" count="64"/>
+  <vector id="vpb" type="bool" count="128"/>
+  <union id="hex_vec_pred">
+    <field name="d" type="vpd"/>
+    <field name="w" type="vpw"/>
+    <field name="h" type="vph"/>
+    <field name="b" type="vpb"/>
+  </union>
+
+  <reg name="v0"  bitsize="1024" offset="256"  encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="88"/>
+  <reg name="v1"  bitsize="1024" offset="384"  encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="89"/>
+  <reg name="v2"  bitsize="1024" offset="512"  encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="90"/>
+  <reg name="v3"  bitsize="1024" offset="640"  encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="91"/>
+  <reg name="v4"  bitsize="1024" offset="768"  encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="92"/>
+  <reg name="v5"  bitsize="1024" offset="896"  encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="93"/>
+  <reg name="v6"  bitsize="1024" offset="1024" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="94"/>
+  <reg name="v7"  bitsize="1024" offset="1152" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="95"/>
+  <reg name="v8"  bitsize="1024" offset="1280" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="96"/>
+  <reg name="v9"  bitsize="1024" offset="1408" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="97"/>
+  <reg name="v10" bitsize="1024" offset="1536" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="98"/>
+  <reg name="v11" bitsize="1024" offset="1664" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="99"/>
+  <reg name="v12" bitsize="1024" offset="1792" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="100"/>
+  <reg name="v13" bitsize="1024" offset="1920" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="101"/>
+  <reg name="v14" bitsize="1024" offset="2048" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="102"/>
+  <reg name="v15" bitsize="1024" offset="2176" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="103"/>
+  <reg name="v16" bitsize="1024" offset="2304" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="104"/>
+  <reg name="v17" bitsize="1024" offset="2432" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="105"/>
+  <reg name="v18" bitsize="1024" offset="2560" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="106"/>
+  <reg name="v19" bitsize="1024" offset="2688" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="107"/>
+  <reg name="v20" bitsize="1024" offset="2816" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="108"/>
+  <reg name="v21" bitsize="1024" offset="2944" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="109"/>
+  <reg name="v22" bitsize="1024" offset="3072" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="110"/>
+  <reg name="v23" bitsize="1024" offset="3200" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="111"/>
+  <reg name="v24" bitsize="1024" offset="3328" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="112"/>
+  <reg name="v25" bitsize="1024" offset="3456" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="113"/>
+  <reg name="v26" bitsize="1024" offset="3584" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="114"/>
+  <reg name="v27" bitsize="1024" offset="3712" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="115"/>
+  <reg name="v28" bitsize="1024" offset="3840" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="116"/>
+  <reg name="v29" bitsize="1024" offset="3968" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="117"/>
+  <reg name="v30" bitsize="1024" offset="4096" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="118"/>
+  <reg name="v31" bitsize="1024" offset="4224" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="119"/>
+  <reg name="q0"  bitsize="128"  offset="4352" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="120"/>
+  <reg name="q1"  bitsize="128"  offset="4368" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="121"/>
+  <reg name="q2"  bitsize="128"  offset="4384" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="122"/>
+  <reg name="q3"  bitsize="128"  offset="4400" encoding="vector" format="hex" group="HVX Vector Registers" dwarf_regnum="123"/>
+
+</feature>
diff --git a/gdbstub/gdbstub.c b/gdbstub/gdbstub.c
index 0760d78..be18568 100644
--- a/gdbstub/gdbstub.c
+++ b/gdbstub/gdbstub.c
@@ -777,6 +777,10 @@
 /*
  * cmd_startswith -> cmd is compared using startswith
  *
+ * allow_stop_reply -> true iff the gdbstub can respond to this command with a
+ *   "stop reply" packet. The list of commands that accept such response is
+ *   defined at the GDB Remote Serial Protocol documentation. see:
+ *   https://sourceware.org/gdb/onlinedocs/gdb/Stop-Reply-Packets.html#Stop-Reply-Packets.
  *
  * schema definitions:
  * Each schema parameter entry consists of 2 chars,
@@ -802,6 +806,7 @@
     const char *cmd;
     bool cmd_startswith;
     const char *schema;
+    bool allow_stop_reply;
 } GdbCmdParseEntry;
 
 static inline int startswith(const char *string, const char *pattern)
@@ -835,6 +840,7 @@
             }
         }
 
+        gdbserver_state.allow_stop_reply = cmd->allow_stop_reply;
         cmd->handler(params, user_ctx);
         return 0;
     }
@@ -1283,11 +1289,14 @@
     gdbserver_state.g_cpu = cpu;
     gdbserver_state.c_cpu = cpu;
 
-    g_string_printf(gdbserver_state.str_buf, "T%02xthread:", GDB_SIGNAL_TRAP);
-    gdb_append_thread_id(cpu, gdbserver_state.str_buf);
-    g_string_append_c(gdbserver_state.str_buf, ';');
+    if (gdbserver_state.allow_stop_reply) {
+        g_string_printf(gdbserver_state.str_buf, "T%02xthread:", GDB_SIGNAL_TRAP);
+        gdb_append_thread_id(cpu, gdbserver_state.str_buf);
+        g_string_append_c(gdbserver_state.str_buf, ';');
+        gdbserver_state.allow_stop_reply = false;
 cleanup:
-    gdb_put_strbuf();
+        gdb_put_strbuf();
+    }
 }
 
 static void handle_v_kill(GArray *params, void *user_ctx)
@@ -1310,12 +1319,14 @@
         .handler = handle_v_cont,
         .cmd = "Cont",
         .cmd_startswith = 1,
+        .allow_stop_reply = true,
         .schema = "s0"
     },
     {
         .handler = handle_v_attach,
         .cmd = "Attach;",
         .cmd_startswith = 1,
+        .allow_stop_reply = true,
         .schema = "l0"
     },
     {
@@ -1698,10 +1709,13 @@
 
 static void handle_target_halt(GArray *params, void *user_ctx)
 {
-    g_string_printf(gdbserver_state.str_buf, "T%02xthread:", GDB_SIGNAL_TRAP);
-    gdb_append_thread_id(gdbserver_state.c_cpu, gdbserver_state.str_buf);
-    g_string_append_c(gdbserver_state.str_buf, ';');
-    gdb_put_strbuf();
+    if (gdbserver_state.allow_stop_reply) {
+        g_string_printf(gdbserver_state.str_buf, "T%02xthread:", GDB_SIGNAL_TRAP);
+        gdb_append_thread_id(gdbserver_state.c_cpu, gdbserver_state.str_buf);
+        g_string_append_c(gdbserver_state.str_buf, ';');
+        gdb_put_strbuf();
+        gdbserver_state.allow_stop_reply = false;
+    }
     /*
      * Remove all the breakpoints when this query is issued,
      * because gdb is doing an initial connect and the state
@@ -1725,7 +1739,8 @@
             static const GdbCmdParseEntry target_halted_cmd_desc = {
                 .handler = handle_target_halt,
                 .cmd = "?",
-                .cmd_startswith = 1
+                .cmd_startswith = 1,
+                .allow_stop_reply = true,
             };
             cmd_parser = &target_halted_cmd_desc;
         }
@@ -1736,6 +1751,7 @@
                 .handler = handle_continue,
                 .cmd = "c",
                 .cmd_startswith = 1,
+                .allow_stop_reply = true,
                 .schema = "L0"
             };
             cmd_parser = &continue_cmd_desc;
@@ -1747,6 +1763,7 @@
                 .handler = handle_cont_with_sig,
                 .cmd = "C",
                 .cmd_startswith = 1,
+                .allow_stop_reply = true,
                 .schema = "l0"
             };
             cmd_parser = &cont_with_sig_cmd_desc;
@@ -1785,6 +1802,7 @@
                 .handler = handle_step,
                 .cmd = "s",
                 .cmd_startswith = 1,
+                .allow_stop_reply = true,
                 .schema = "L0"
             };
             cmd_parser = &step_cmd_desc;
@@ -1976,6 +1994,7 @@
 {
     uint8_t reply;
 
+    gdbserver_state.allow_stop_reply = false;
 #ifndef CONFIG_USER_ONLY
     if (gdbserver_state.last_packet->len) {
         /* Waiting for a response to the last packet.  If we see the start
diff --git a/gdbstub/internals.h b/gdbstub/internals.h
index 94ddff4..33d21d6 100644
--- a/gdbstub/internals.h
+++ b/gdbstub/internals.h
@@ -65,6 +65,11 @@
     GByteArray *mem_buf;
     int sstep_flags;
     int supported_sstep_flags;
+    /*
+     * Whether we are allowed to send a stop reply packet at this moment.
+     * Must be set off after sending the stop reply itself.
+     */
+    bool allow_stop_reply;
 } GDBState;
 
 /* lives in main gdbstub.c */
diff --git a/gdbstub/softmmu.c b/gdbstub/softmmu.c
index 22ecd09..99d994e 100644
--- a/gdbstub/softmmu.c
+++ b/gdbstub/softmmu.c
@@ -43,6 +43,7 @@
     g_free(gdbserver_state.processes);
     gdbserver_state.processes = NULL;
     gdbserver_state.process_num = 0;
+    gdbserver_state.allow_stop_reply = false;
 }
 
 /*
@@ -139,6 +140,10 @@
         return;
     }
 
+    if (!gdbserver_state.allow_stop_reply) {
+        return;
+    }
+
     gdb_append_thread_id(cpu, tid);
 
     switch (state) {
@@ -205,6 +210,7 @@
 
 send_packet:
     gdb_put_packet(buf->str);
+    gdbserver_state.allow_stop_reply = false;
 
     /* disable single step if it was enabled */
     cpu_single_step(cpu, 0);
@@ -422,8 +428,11 @@
 
     trace_gdbstub_op_exiting((uint8_t)code);
 
-    snprintf(buf, sizeof(buf), "W%02x", (uint8_t)code);
-    gdb_put_packet(buf);
+    if (gdbserver_state.allow_stop_reply) {
+        snprintf(buf, sizeof(buf), "W%02x", (uint8_t)code);
+        gdb_put_packet(buf);
+        gdbserver_state.allow_stop_reply = false;
+    }
 
     qemu_chr_fe_deinit(&gdbserver_system_state.chr, true);
 }
diff --git a/gdbstub/user.c b/gdbstub/user.c
index 80488b6..5b375be 100644
--- a/gdbstub/user.c
+++ b/gdbstub/user.c
@@ -108,8 +108,11 @@
 
     trace_gdbstub_op_exiting((uint8_t)code);
 
-    snprintf(buf, sizeof(buf), "W%02x", (uint8_t)code);
-    gdb_put_packet(buf);
+    if (gdbserver_state.allow_stop_reply) {
+        snprintf(buf, sizeof(buf), "W%02x", (uint8_t)code);
+        gdb_put_packet(buf);
+        gdbserver_state.allow_stop_reply = false;
+    }
 }
 
 int gdb_handlesig(CPUState *cpu, int sig)
@@ -127,11 +130,14 @@
 
     if (sig != 0) {
         gdb_set_stop_cpu(cpu);
-        g_string_printf(gdbserver_state.str_buf,
-                        "T%02xthread:", gdb_target_signal_to_gdb(sig));
-        gdb_append_thread_id(cpu, gdbserver_state.str_buf);
-        g_string_append_c(gdbserver_state.str_buf, ';');
-        gdb_put_strbuf();
+        if (gdbserver_state.allow_stop_reply) {
+            g_string_printf(gdbserver_state.str_buf,
+                            "T%02xthread:", gdb_target_signal_to_gdb(sig));
+            gdb_append_thread_id(cpu, gdbserver_state.str_buf);
+            g_string_append_c(gdbserver_state.str_buf, ';');
+            gdb_put_strbuf();
+            gdbserver_state.allow_stop_reply = false;
+        }
     }
     /*
      * gdb_put_packet() might have detected that the peer terminated the
@@ -174,12 +180,14 @@
 {
     char buf[4];
 
-    if (!gdbserver_state.init || gdbserver_user_state.fd < 0) {
+    if (!gdbserver_state.init || gdbserver_user_state.fd < 0 ||
+        !gdbserver_state.allow_stop_reply) {
         return;
     }
 
     snprintf(buf, sizeof(buf), "X%02x", gdb_target_signal_to_gdb(sig));
     gdb_put_packet(buf);
+    gdbserver_state.allow_stop_reply = false;
 }
 
 static void gdb_accept_init(int fd)
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 06b514b..b99ae18 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -2146,7 +2146,7 @@
         exit(1);
     }
 
-    if (vms->mte && hvf_enabled()) {
+    if (vms->mte && (kvm_enabled() || hvf_enabled())) {
         error_report("mach-virt: %s does not support providing "
                      "MTE to the guest CPU",
                      current_accel_name());
@@ -2216,48 +2216,39 @@
         }
 
         if (vms->mte) {
-            if (tcg_enabled()) {
-                /* Create the memory region only once, but link to all cpus. */
-                if (!tag_sysmem) {
-                    /*
-                     * The property exists only if MemTag is supported.
-                     * If it is, we must allocate the ram to back that up.
-                     */
-                    if (!object_property_find(cpuobj, "tag-memory")) {
-                        error_report("MTE requested, but not supported "
-                                     "by the guest CPU");
-                        exit(1);
-                    }
-
-                    tag_sysmem = g_new(MemoryRegion, 1);
-                    memory_region_init(tag_sysmem, OBJECT(machine),
-                                       "tag-memory", UINT64_MAX / 32);
-
-                    if (vms->secure) {
-                        secure_tag_sysmem = g_new(MemoryRegion, 1);
-                        memory_region_init(secure_tag_sysmem, OBJECT(machine),
-                                           "secure-tag-memory",
-                                           UINT64_MAX / 32);
-
-                        /* As with ram, secure-tag takes precedence over tag. */
-                        memory_region_add_subregion_overlap(secure_tag_sysmem,
-                                                            0, tag_sysmem, -1);
-                    }
-                }
-
-                object_property_set_link(cpuobj, "tag-memory",
-                                         OBJECT(tag_sysmem), &error_abort);
-                if (vms->secure) {
-                    object_property_set_link(cpuobj, "secure-tag-memory",
-                                             OBJECT(secure_tag_sysmem),
-                                             &error_abort);
-                }
-            } else if (kvm_enabled()) {
-                if (!kvm_arm_mte_supported()) {
-                    error_report("MTE requested, but not supported by KVM");
+            /* Create the memory region only once, but link to all cpus. */
+            if (!tag_sysmem) {
+                /*
+                 * The property exists only if MemTag is supported.
+                 * If it is, we must allocate the ram to back that up.
+                 */
+                if (!object_property_find(cpuobj, "tag-memory")) {
+                    error_report("MTE requested, but not supported "
+                                 "by the guest CPU");
                     exit(1);
                 }
-                kvm_arm_enable_mte(cpuobj, &error_abort);
+
+                tag_sysmem = g_new(MemoryRegion, 1);
+                memory_region_init(tag_sysmem, OBJECT(machine),
+                                   "tag-memory", UINT64_MAX / 32);
+
+                if (vms->secure) {
+                    secure_tag_sysmem = g_new(MemoryRegion, 1);
+                    memory_region_init(secure_tag_sysmem, OBJECT(machine),
+                                       "secure-tag-memory", UINT64_MAX / 32);
+
+                    /* As with ram, secure-tag takes precedence over tag.  */
+                    memory_region_add_subregion_overlap(secure_tag_sysmem, 0,
+                                                        tag_sysmem, -1);
+                }
+            }
+
+            object_property_set_link(cpuobj, "tag-memory", OBJECT(tag_sysmem),
+                                     &error_abort);
+            if (vms->secure) {
+                object_property_set_link(cpuobj, "secure-tag-memory",
+                                         OBJECT(secure_tag_sysmem),
+                                         &error_abort);
             }
         }
 
diff --git a/hw/core/machine.c b/hw/core/machine.c
index 47a3484..07f763e 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -48,6 +48,7 @@
     { "e1000e", "migrate-timadj", "off" },
     { "virtio-mem", "x-early-migration", "false" },
     { "migration", "x-preempt-pre-7-2", "true" },
+    { TYPE_PCI_DEVICE, "x-pcie-err-unc-mask", "off" },
 };
 const size_t hw_compat_7_2_len = G_N_ELEMENTS(hw_compat_7_2);
 
diff --git a/hw/cxl/cxl-cdat.c b/hw/cxl/cxl-cdat.c
index 137abd0..d246d68 100644
--- a/hw/cxl/cxl-cdat.c
+++ b/hw/cxl/cxl-cdat.c
@@ -108,31 +108,21 @@
 static void ct3_load_cdat(CDATObject *cdat, Error **errp)
 {
     g_autofree CDATEntry *cdat_st = NULL;
+    g_autofree char *buf = NULL;
     uint8_t sum = 0;
     int num_ent;
-    int i = 0, ent = 1, file_size = 0;
+    int i = 0, ent = 1;
+    gsize file_size = 0;
     CDATSubHeader *hdr;
-    FILE *fp = NULL;
+    GError *error = NULL;
 
     /* Read CDAT file and create its cache */
-    fp = fopen(cdat->filename, "r");
-    if (!fp) {
-        error_setg(errp, "CDAT: Unable to open file");
+    if (!g_file_get_contents(cdat->filename, (gchar **)&buf,
+                             &file_size, &error)) {
+        error_setg(errp, "CDAT: File read failed: %s", error->message);
+        g_error_free(error);
         return;
     }
-
-    fseek(fp, 0, SEEK_END);
-    file_size = ftell(fp);
-    fseek(fp, 0, SEEK_SET);
-    cdat->buf = g_malloc0(file_size);
-
-    if (fread(cdat->buf, file_size, 1, fp) == 0) {
-        error_setg(errp, "CDAT: File read failed");
-        return;
-    }
-
-    fclose(fp);
-
     if (file_size < sizeof(CDATTableHeader)) {
         error_setg(errp, "CDAT: File too short");
         return;
@@ -140,9 +130,17 @@
     i = sizeof(CDATTableHeader);
     num_ent = 1;
     while (i < file_size) {
-        hdr = (CDATSubHeader *)(cdat->buf + i);
+        hdr = (CDATSubHeader *)(buf + i);
+        if (i + sizeof(CDATSubHeader) > file_size) {
+            error_setg(errp, "CDAT: Truncated table");
+            return;
+        }
         cdat_len_check(hdr, errp);
         i += hdr->length;
+        if (i > file_size) {
+            error_setg(errp, "CDAT: Truncated table");
+            return;
+        }
         num_ent++;
     }
     if (i != file_size) {
@@ -150,33 +148,26 @@
         return;
     }
 
-    cdat_st = g_malloc0(sizeof(*cdat_st) * num_ent);
-    if (!cdat_st) {
-        error_setg(errp, "CDAT: Failed to allocate entry array");
-        return;
-    }
+    cdat_st = g_new0(CDATEntry, num_ent);
 
     /* Set CDAT header, Entry = 0 */
-    cdat_st[0].base = cdat->buf;
+    cdat_st[0].base = buf;
     cdat_st[0].length = sizeof(CDATTableHeader);
     i = 0;
 
     while (i < cdat_st[0].length) {
-        sum += cdat->buf[i++];
+        sum += buf[i++];
     }
 
     /* Read CDAT structures */
     while (i < file_size) {
-        hdr = (CDATSubHeader *)(cdat->buf + i);
-        cdat_len_check(hdr, errp);
-
+        hdr = (CDATSubHeader *)(buf + i);
         cdat_st[ent].base = hdr;
         cdat_st[ent].length = hdr->length;
 
-        while (cdat->buf + i <
-               (uint8_t *)cdat_st[ent].base + cdat_st[ent].length) {
+        while (buf + i < (char *)cdat_st[ent].base + cdat_st[ent].length) {
             assert(i < file_size);
-            sum += cdat->buf[i++];
+            sum += buf[i++];
         }
 
         ent++;
@@ -187,6 +178,7 @@
     }
     cdat->entry_len = num_ent;
     cdat->entry = g_steal_pointer(&cdat_st);
+    cdat->buf = g_steal_pointer(&buf);
 }
 
 void cxl_doe_cdat_init(CXLComponentState *cxl_cstate, Error **errp)
@@ -218,7 +210,5 @@
         cdat->free_cdat_table(cdat->built_buf, cdat->built_buf_len,
                               cdat->private);
     }
-    if (cdat->buf) {
-        free(cdat->buf);
-    }
+    g_free(cdat->buf);
 }
diff --git a/hw/cxl/cxl-component-utils.c b/hw/cxl/cxl-component-utils.c
index b665d4f..378f108 100644
--- a/hw/cxl/cxl-component-utils.c
+++ b/hw/cxl/cxl-component-utils.c
@@ -38,23 +38,25 @@
     ComponentRegisters *cregs = &cxl_cstate->crb;
     uint32_t *cache_mem = cregs->cache_mem_registers;
     bool should_commit = false;
+    bool should_uncommit = false;
 
     switch (offset) {
     case A_CXL_HDM_DECODER0_CTRL:
         should_commit = FIELD_EX32(value, CXL_HDM_DECODER0_CTRL, COMMIT);
+        should_uncommit = !should_commit;
         break;
     default:
         break;
     }
 
-    memory_region_transaction_begin();
-    stl_le_p((uint8_t *)cache_mem + offset, value);
     if (should_commit) {
-        ARRAY_FIELD_DP32(cache_mem, CXL_HDM_DECODER0_CTRL, COMMIT, 0);
-        ARRAY_FIELD_DP32(cache_mem, CXL_HDM_DECODER0_CTRL, ERR, 0);
-        ARRAY_FIELD_DP32(cache_mem, CXL_HDM_DECODER0_CTRL, COMMITTED, 1);
+        value = FIELD_DP32(value, CXL_HDM_DECODER0_CTRL, ERR, 0);
+        value = FIELD_DP32(value, CXL_HDM_DECODER0_CTRL, COMMITTED, 1);
+    } else if (should_uncommit) {
+        value = FIELD_DP32(value, CXL_HDM_DECODER0_CTRL, ERR, 0);
+        value = FIELD_DP32(value, CXL_HDM_DECODER0_CTRL, COMMITTED, 0);
     }
-    memory_region_transaction_commit();
+    stl_le_p((uint8_t *)cache_mem + offset, value);
 }
 
 static void cxl_cache_mem_write_reg(void *opaque, hwaddr offset, uint64_t value,
diff --git a/hw/cxl/cxl-device-utils.c b/hw/cxl/cxl-device-utils.c
index 4c5e88a..86e1cea 100644
--- a/hw/cxl/cxl-device-utils.c
+++ b/hw/cxl/cxl-device-utils.c
@@ -269,3 +269,18 @@
 
     cxl_initialize_mailbox(cxl_dstate);
 }
+
+uint64_t cxl_device_get_timestamp(CXLDeviceState *cxl_dstate)
+{
+    uint64_t time, delta;
+    uint64_t final_time = 0;
+
+    if (cxl_dstate->timestamp.set) {
+        /* Find the delta from the last time the host set the time. */
+        time = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
+        delta = time - cxl_dstate->timestamp.last_set;
+        final_time = cxl_dstate->timestamp.host_set + delta;
+    }
+
+    return final_time;
+}
diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
index 206e04a..702e16c 100644
--- a/hw/cxl/cxl-mailbox-utils.c
+++ b/hw/cxl/cxl-mailbox-utils.c
@@ -23,7 +23,7 @@
  *     FOO    = 0x7f,
  *          #define BAR 0
  *  2. Implement the handler
- *    static ret_code cmd_foo_bar(struct cxl_cmd *cmd,
+ *    static CXLRetCode cmd_foo_bar(struct cxl_cmd *cmd,
  *                                  CXLDeviceState *cxl_dstate, uint16_t *len)
  *  3. Add the command to the cxl_cmd_set[][]
  *    [FOO][BAR] = { "FOO_BAR", cmd_foo_bar, x, y },
@@ -90,10 +90,10 @@
     CXL_MBOX_UNSUPPORTED_MAILBOX = 0x15,
     CXL_MBOX_INVALID_PAYLOAD_LENGTH = 0x16,
     CXL_MBOX_MAX = 0x17
-} ret_code;
+} CXLRetCode;
 
 struct cxl_cmd;
-typedef ret_code (*opcode_handler)(struct cxl_cmd *cmd,
+typedef CXLRetCode (*opcode_handler)(struct cxl_cmd *cmd,
                                    CXLDeviceState *cxl_dstate, uint16_t *len);
 struct cxl_cmd {
     const char *name;
@@ -105,16 +105,16 @@
 
 #define DEFINE_MAILBOX_HANDLER_ZEROED(name, size)                         \
     uint16_t __zero##name = size;                                         \
-    static ret_code cmd_##name(struct cxl_cmd *cmd,                       \
-                               CXLDeviceState *cxl_dstate, uint16_t *len) \
+    static CXLRetCode cmd_##name(struct cxl_cmd *cmd,                       \
+                                 CXLDeviceState *cxl_dstate, uint16_t *len) \
     {                                                                     \
         *len = __zero##name;                                              \
         memset(cmd->payload, 0, *len);                                    \
         return CXL_MBOX_SUCCESS;                                          \
     }
 #define DEFINE_MAILBOX_HANDLER_NOP(name)                                  \
-    static ret_code cmd_##name(struct cxl_cmd *cmd,                       \
-                               CXLDeviceState *cxl_dstate, uint16_t *len) \
+    static CXLRetCode cmd_##name(struct cxl_cmd *cmd,                       \
+                                 CXLDeviceState *cxl_dstate, uint16_t *len) \
     {                                                                     \
         return CXL_MBOX_SUCCESS;                                          \
     }
@@ -125,9 +125,9 @@
 DEFINE_MAILBOX_HANDLER_NOP(events_set_interrupt_policy);
 
 /* 8.2.9.2.1 */
-static ret_code cmd_firmware_update_get_info(struct cxl_cmd *cmd,
-                                             CXLDeviceState *cxl_dstate,
-                                             uint16_t *len)
+static CXLRetCode cmd_firmware_update_get_info(struct cxl_cmd *cmd,
+                                               CXLDeviceState *cxl_dstate,
+                                               uint16_t *len)
 {
     struct {
         uint8_t slots_supported;
@@ -141,7 +141,8 @@
     } QEMU_PACKED *fw_info;
     QEMU_BUILD_BUG_ON(sizeof(*fw_info) != 0x50);
 
-    if (cxl_dstate->pmem_size < CXL_CAPACITY_MULTIPLIER) {
+    if ((cxl_dstate->vmem_size < CXL_CAPACITY_MULTIPLIER) ||
+        (cxl_dstate->pmem_size < CXL_CAPACITY_MULTIPLIER)) {
         return CXL_MBOX_INTERNAL_ERROR;
     }
 
@@ -158,21 +159,12 @@
 }
 
 /* 8.2.9.3.1 */
-static ret_code cmd_timestamp_get(struct cxl_cmd *cmd,
-                                  CXLDeviceState *cxl_dstate,
-                                  uint16_t *len)
+static CXLRetCode cmd_timestamp_get(struct cxl_cmd *cmd,
+                                    CXLDeviceState *cxl_dstate,
+                                    uint16_t *len)
 {
-    uint64_t time, delta;
-    uint64_t final_time = 0;
+    uint64_t final_time = cxl_device_get_timestamp(cxl_dstate);
 
-    if (cxl_dstate->timestamp.set) {
-        /* First find the delta from the last time the host set the time. */
-        time = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
-        delta = time - cxl_dstate->timestamp.last_set;
-        final_time = cxl_dstate->timestamp.host_set + delta;
-    }
-
-    /* Then adjust the actual time */
     stq_le_p(cmd->payload, final_time);
     *len = 8;
 
@@ -180,7 +172,7 @@
 }
 
 /* 8.2.9.3.2 */
-static ret_code cmd_timestamp_set(struct cxl_cmd *cmd,
+static CXLRetCode cmd_timestamp_set(struct cxl_cmd *cmd,
                                   CXLDeviceState *cxl_dstate,
                                   uint16_t *len)
 {
@@ -200,9 +192,9 @@
 };
 
 /* 8.2.9.4.1 */
-static ret_code cmd_logs_get_supported(struct cxl_cmd *cmd,
-                                       CXLDeviceState *cxl_dstate,
-                                       uint16_t *len)
+static CXLRetCode cmd_logs_get_supported(struct cxl_cmd *cmd,
+                                         CXLDeviceState *cxl_dstate,
+                                         uint16_t *len)
 {
     struct {
         uint16_t entries;
@@ -223,9 +215,9 @@
 }
 
 /* 8.2.9.4.2 */
-static ret_code cmd_logs_get_log(struct cxl_cmd *cmd,
-                                 CXLDeviceState *cxl_dstate,
-                                 uint16_t *len)
+static CXLRetCode cmd_logs_get_log(struct cxl_cmd *cmd,
+                                   CXLDeviceState *cxl_dstate,
+                                   uint16_t *len)
 {
     struct {
         QemuUUID uuid;
@@ -264,9 +256,9 @@
 }
 
 /* 8.2.9.5.1.1 */
-static ret_code cmd_identify_memory_device(struct cxl_cmd *cmd,
-                                           CXLDeviceState *cxl_dstate,
-                                           uint16_t *len)
+static CXLRetCode cmd_identify_memory_device(struct cxl_cmd *cmd,
+                                             CXLDeviceState *cxl_dstate,
+                                             uint16_t *len)
 {
     struct {
         char fw_revision[0x10];
@@ -288,29 +280,29 @@
 
     CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate);
     CXLType3Class *cvc = CXL_TYPE3_GET_CLASS(ct3d);
-    uint64_t size = cxl_dstate->pmem_size;
 
-    if (!QEMU_IS_ALIGNED(size, CXL_CAPACITY_MULTIPLIER)) {
+    if ((!QEMU_IS_ALIGNED(cxl_dstate->vmem_size, CXL_CAPACITY_MULTIPLIER)) ||
+        (!QEMU_IS_ALIGNED(cxl_dstate->pmem_size, CXL_CAPACITY_MULTIPLIER))) {
         return CXL_MBOX_INTERNAL_ERROR;
     }
 
     id = (void *)cmd->payload;
     memset(id, 0, sizeof(*id));
 
-    /* PMEM only */
     snprintf(id->fw_revision, 0x10, "BWFW VERSION %02d", 0);
 
-    id->total_capacity = size / CXL_CAPACITY_MULTIPLIER;
-    id->persistent_capacity = size / CXL_CAPACITY_MULTIPLIER;
-    id->lsa_size = cvc->get_lsa_size(ct3d);
+    stq_le_p(&id->total_capacity, cxl_dstate->mem_size / CXL_CAPACITY_MULTIPLIER);
+    stq_le_p(&id->persistent_capacity, cxl_dstate->pmem_size / CXL_CAPACITY_MULTIPLIER);
+    stq_le_p(&id->volatile_capacity, cxl_dstate->vmem_size / CXL_CAPACITY_MULTIPLIER);
+    stl_le_p(&id->lsa_size, cvc->get_lsa_size(ct3d));
 
     *len = sizeof(*id);
     return CXL_MBOX_SUCCESS;
 }
 
-static ret_code cmd_ccls_get_partition_info(struct cxl_cmd *cmd,
-                                           CXLDeviceState *cxl_dstate,
-                                           uint16_t *len)
+static CXLRetCode cmd_ccls_get_partition_info(struct cxl_cmd *cmd,
+                                              CXLDeviceState *cxl_dstate,
+                                              uint16_t *len)
 {
     struct {
         uint64_t active_vmem;
@@ -319,25 +311,28 @@
         uint64_t next_pmem;
     } QEMU_PACKED *part_info = (void *)cmd->payload;
     QEMU_BUILD_BUG_ON(sizeof(*part_info) != 0x20);
-    uint64_t size = cxl_dstate->pmem_size;
 
-    if (!QEMU_IS_ALIGNED(size, CXL_CAPACITY_MULTIPLIER)) {
+    if ((!QEMU_IS_ALIGNED(cxl_dstate->vmem_size, CXL_CAPACITY_MULTIPLIER)) ||
+        (!QEMU_IS_ALIGNED(cxl_dstate->pmem_size, CXL_CAPACITY_MULTIPLIER))) {
         return CXL_MBOX_INTERNAL_ERROR;
     }
 
-    /* PMEM only */
-    part_info->active_vmem = 0;
-    part_info->next_vmem = 0;
-    part_info->active_pmem = size / CXL_CAPACITY_MULTIPLIER;
-    part_info->next_pmem = 0;
+    stq_le_p(&part_info->active_vmem, cxl_dstate->vmem_size / CXL_CAPACITY_MULTIPLIER);
+    /*
+     * When both next_vmem and next_pmem are 0, there is no pending change to
+     * partitioning.
+     */
+    stq_le_p(&part_info->next_vmem, 0);
+    stq_le_p(&part_info->active_pmem, cxl_dstate->pmem_size / CXL_CAPACITY_MULTIPLIER);
+    stq_le_p(&part_info->next_pmem, 0);
 
     *len = sizeof(*part_info);
     return CXL_MBOX_SUCCESS;
 }
 
-static ret_code cmd_ccls_get_lsa(struct cxl_cmd *cmd,
-                                 CXLDeviceState *cxl_dstate,
-                                 uint16_t *len)
+static CXLRetCode cmd_ccls_get_lsa(struct cxl_cmd *cmd,
+                                   CXLDeviceState *cxl_dstate,
+                                   uint16_t *len)
 {
     struct {
         uint32_t offset;
@@ -360,9 +355,9 @@
     return CXL_MBOX_SUCCESS;
 }
 
-static ret_code cmd_ccls_set_lsa(struct cxl_cmd *cmd,
-                                 CXLDeviceState *cxl_dstate,
-                                 uint16_t *len)
+static CXLRetCode cmd_ccls_set_lsa(struct cxl_cmd *cmd,
+                                   CXLDeviceState *cxl_dstate,
+                                   uint16_t *len)
 {
     struct set_lsa_pl {
         uint32_t offset;
diff --git a/hw/i386/acpi-common.c b/hw/i386/acpi-common.c
index 52e5c14..8a0932f 100644
--- a/hw/i386/acpi-common.c
+++ b/hw/i386/acpi-common.c
@@ -102,7 +102,7 @@
     MachineClass *mc = MACHINE_GET_CLASS(x86ms);
     const CPUArchIdList *apic_ids = mc->possible_cpu_arch_ids(MACHINE(x86ms));
     AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(adev);
-    AcpiTable table = { .sig = "APIC", .rev = 1, .oem_id = oem_id,
+    AcpiTable table = { .sig = "APIC", .rev = 3, .oem_id = oem_id,
                         .oem_table_id = oem_table_id };
 
     acpi_table_begin(&table, table_data);
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index d761c8c..b3d826a 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -116,7 +116,9 @@
     { "qemu64-" TYPE_X86_CPU, "model-id", "QEMU Virtual CPU version " v, },\
     { "athlon-" TYPE_X86_CPU, "model-id", "QEMU Virtual CPU version " v, },
 
-GlobalProperty pc_compat_8_0[] = {};
+GlobalProperty pc_compat_8_0[] = {
+    { "virtio-mem", "unplugged-inaccessible", "auto" },
+};
 const size_t pc_compat_8_0_len = G_N_ELEMENTS(pc_compat_8_0);
 
 GlobalProperty pc_compat_7_2[] = {
@@ -950,7 +952,6 @@
 void pc_memory_init(PCMachineState *pcms,
                     MemoryRegion *system_memory,
                     MemoryRegion *rom_memory,
-                    MemoryRegion **ram_memory,
                     uint64_t pci_hole64_size)
 {
     int linux_boot, i;
@@ -1008,7 +1009,6 @@
      * Split single memory region and use aliases to address portions of it,
      * done for backwards compatibility with older qemus.
      */
-    *ram_memory = machine->ram;
     ram_below_4g = g_malloc(sizeof(*ram_below_4g));
     memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", machine->ram,
                              0, x86ms->below_4g_mem_size);
@@ -1265,7 +1265,7 @@
 
 void pc_basic_device_init(struct PCMachineState *pcms,
                           ISABus *isa_bus, qemu_irq *gsi,
-                          ISADevice **rtc_state,
+                          ISADevice *rtc_state,
                           bool create_fdctrl,
                           uint32_t hpet_irqs)
 {
@@ -1318,7 +1318,17 @@
         pit_alt_irq = qdev_get_gpio_in(hpet, HPET_LEGACY_PIT_INT);
         rtc_irq = qdev_get_gpio_in(hpet, HPET_LEGACY_RTC_INT);
     }
-    *rtc_state = ISA_DEVICE(mc146818_rtc_init(isa_bus, 2000, rtc_irq));
+
+    if (rtc_irq) {
+        qdev_connect_gpio_out(DEVICE(rtc_state), 0, rtc_irq);
+    } else {
+        uint32_t irq = object_property_get_uint(OBJECT(rtc_state),
+                                                "irq",
+                                                &error_fatal);
+        isa_connect_gpio_out(rtc_state, 0, irq);
+    }
+    object_property_add_alias(OBJECT(pcms), "rtc-time", OBJECT(rtc_state),
+                              "date");
 
 #ifdef CONFIG_XEN_EMU
     if (xen_mode == XEN_EMULATE) {
@@ -1331,7 +1341,7 @@
     }
 #endif
 
-    qemu_register_boot_set(pc_boot_set, *rtc_state);
+    qemu_register_boot_set(pc_boot_set, rtc_state);
 
     if (!xen_enabled() &&
         (x86ms->pit == ON_OFF_AUTO_AUTO || x86ms->pit == ON_OFF_AUTO_ON)) {
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 66a849d..10070ea 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -32,6 +32,7 @@
 #include "hw/i386/pc.h"
 #include "hw/i386/apic.h"
 #include "hw/pci-host/i440fx.h"
+#include "hw/rtc/mc146818rtc.h"
 #include "hw/southbridge/piix.h"
 #include "hw/display/ramfb.h"
 #include "hw/firmware/smbios.h"
@@ -144,6 +145,7 @@
     if (xen_enabled()) {
         xen_hvm_init_pc(pcms, &ram_memory);
     } else {
+        ram_memory = machine->ram;
         if (!pcms->max_ram_below_4g) {
             pcms->max_ram_below_4g = 0xe0000000; /* default: 3.5G */
         }
@@ -198,7 +200,7 @@
     if (pcmc->smbios_defaults) {
         MachineClass *mc = MACHINE_GET_CLASS(machine);
         /* These values are guest ABI, do not change */
-        smbios_set_defaults("QEMU", "Standard PC (i440FX + PIIX, 1996)",
+        smbios_set_defaults("QEMU", mc->desc,
                             mc->name, pcmc->smbios_legacy_mode,
                             pcmc->smbios_uuid_encoded,
                             pcms->smbios_entry_point_type);
@@ -206,8 +208,7 @@
 
     /* allocate ram and load rom/bios */
     if (!xen_enabled()) {
-        pc_memory_init(pcms, system_memory,
-                       rom_memory, &ram_memory, hole64_size);
+        pc_memory_init(pcms, system_memory, rom_memory, hole64_size);
     } else {
         pc_system_flash_cleanup_unused(pcms);
         if (machine->kernel_filename != NULL) {
@@ -240,10 +241,17 @@
         piix3->pic = x86ms->gsi;
         piix3_devfn = piix3->dev.devfn;
         isa_bus = ISA_BUS(qdev_get_child_bus(DEVICE(piix3), "isa.0"));
+        rtc_state = ISA_DEVICE(object_resolve_path_component(OBJECT(pci_dev),
+                                                             "rtc"));
     } else {
         pci_bus = NULL;
-        isa_bus = isa_bus_new(NULL, get_system_memory(), system_io,
+        isa_bus = isa_bus_new(NULL, system_memory, system_io,
                               &error_abort);
+
+        rtc_state = isa_new(TYPE_MC146818_RTC);
+        qdev_prop_set_int32(DEVICE(rtc_state), "base_year", 2000);
+        isa_realize_and_unref(rtc_state, isa_bus, &error_fatal);
+
         i8257_dma_init(isa_bus, 0);
         pcms->hpet_enabled = false;
     }
@@ -269,7 +277,7 @@
     }
 
     /* init basic PC hardware */
-    pc_basic_device_init(pcms, isa_bus, x86ms->gsi, &rtc_state, true,
+    pc_basic_device_init(pcms, isa_bus, x86ms->gsi, rtc_state, true,
                          0x4);
 
     pc_nic_init(pcmc, isa_bus, pci_bus);
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index f02919d..8030d53 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -126,10 +126,10 @@
     DeviceState *lpc_dev;
     BusState *idebus[MAX_SATA_PORTS];
     ISADevice *rtc_state;
+    MemoryRegion *system_memory = get_system_memory();
     MemoryRegion *system_io = get_system_io();
     MemoryRegion *pci_memory;
     MemoryRegion *rom_memory;
-    MemoryRegion *ram_memory;
     GSIState *gsi_state;
     ISABus *isa_bus;
     int i;
@@ -192,14 +192,14 @@
         rom_memory = pci_memory;
     } else {
         pci_memory = NULL;
-        rom_memory = get_system_memory();
+        rom_memory = system_memory;
     }
 
     pc_guest_info_init(pcms);
 
     if (pcmc->smbios_defaults) {
         /* These values are guest ABI, do not change */
-        smbios_set_defaults("QEMU", "Standard PC (Q35 + ICH9, 2009)",
+        smbios_set_defaults("QEMU", mc->desc,
                             mc->name, pcmc->smbios_legacy_mode,
                             pcmc->smbios_uuid_encoded,
                             pcms->smbios_entry_point_type);
@@ -215,16 +215,15 @@
     }
 
     /* allocate ram and load rom/bios */
-    pc_memory_init(pcms, get_system_memory(), rom_memory, &ram_memory,
-                   pci_hole64_size);
+    pc_memory_init(pcms, system_memory, rom_memory, pci_hole64_size);
 
-    object_property_add_child(qdev_get_machine(), "q35", OBJECT(q35_host));
+    object_property_add_child(OBJECT(machine), "q35", OBJECT(q35_host));
     object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_RAM_MEM,
-                             OBJECT(ram_memory), NULL);
+                             OBJECT(machine->ram), NULL);
     object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_PCI_MEM,
                              OBJECT(pci_memory), NULL);
     object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_SYSTEM_MEM,
-                             OBJECT(get_system_memory()), NULL);
+                             OBJECT(system_memory), NULL);
     object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_IO_MEM,
                              OBJECT(system_io), NULL);
     object_property_set_int(OBJECT(q35_host), PCI_HOST_BELOW_4G_MEM_SIZE,
@@ -242,6 +241,8 @@
                       x86_machine_is_smm_enabled(x86ms));
     pci_realize_and_unref(lpc, host_bus, &error_fatal);
 
+    rtc_state = ISA_DEVICE(object_resolve_path_component(OBJECT(lpc), "rtc"));
+
     object_property_add_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP,
                              TYPE_HOTPLUG_HANDLER,
                              (Object **)&x86ms->acpi_dev,
@@ -291,7 +292,7 @@
     }
 
     /* init basic PC hardware */
-    pc_basic_device_init(pcms, isa_bus, x86ms->gsi, &rtc_state, !mc->no_floppy,
+    pc_basic_device_init(pcms, isa_bus, x86ms->gsi, rtc_state, !mc->no_floppy,
                          0xff0104);
 
     if (pcms->sata_enabled) {
diff --git a/hw/isa/Kconfig b/hw/isa/Kconfig
index 0156a66..c10cbc5 100644
--- a/hw/isa/Kconfig
+++ b/hw/isa/Kconfig
@@ -35,6 +35,7 @@
     bool
     select I8257
     select ISA_BUS
+    select MC146818RTC
 
 config PIIX4
     bool
@@ -79,3 +80,4 @@
     select I8257
     select ISA_BUS
     select ACPI_ICH9
+    select MC146818RTC
diff --git a/hw/isa/lpc_ich9.c b/hw/isa/lpc_ich9.c
index 9714b00..9c47a2f 100644
--- a/hw/isa/lpc_ich9.c
+++ b/hw/isa/lpc_ich9.c
@@ -658,6 +658,8 @@
     static const uint8_t acpi_enable_cmd = ICH9_APM_ACPI_ENABLE;
     static const uint8_t acpi_disable_cmd = ICH9_APM_ACPI_DISABLE;
 
+    object_initialize_child(obj, "rtc", &lpc->rtc, TYPE_MC146818_RTC);
+
     object_property_add_uint8_ptr(obj, ACPI_PM_PROP_SCI_INT,
                                   &lpc->sci_gsi, OBJ_PROP_FLAG_READ);
     object_property_add_uint8_ptr(OBJECT(lpc), ACPI_PM_PROP_ACPI_ENABLE_CMD,
@@ -723,6 +725,12 @@
 
     i8257_dma_init(isa_bus, 0);
 
+    /* RTC */
+    qdev_prop_set_int32(DEVICE(&lpc->rtc), "base_year", 2000);
+    if (!qdev_realize(DEVICE(&lpc->rtc), BUS(isa_bus), errp)) {
+        return;
+    }
+
     pci_bus_irqs(pci_bus, ich9_lpc_set_irq, d, ICH9_LPC_NB_PIRQS);
     pci_bus_map_irqs(pci_bus, ich9_lpc_map_irq);
     pci_bus_set_route_irq_fn(pci_bus, ich9_route_intx_pin_to_irq);
diff --git a/hw/isa/piix3.c b/hw/isa/piix3.c
index a9cb39b..f9103ea 100644
--- a/hw/isa/piix3.c
+++ b/hw/isa/piix3.c
@@ -28,6 +28,7 @@
 #include "hw/dma/i8257.h"
 #include "hw/southbridge/piix.h"
 #include "hw/irq.h"
+#include "hw/qdev-properties.h"
 #include "hw/isa/isa.h"
 #include "hw/xen/xen.h"
 #include "sysemu/runstate.h"
@@ -301,6 +302,12 @@
                                         PIIX_RCR_IOPORT, &d->rcr_mem, 1);
 
     i8257_dma_init(isa_bus, 0);
+
+    /* RTC */
+    qdev_prop_set_int32(DEVICE(&d->rtc), "base_year", 2000);
+    if (!qdev_realize(DEVICE(&d->rtc), BUS(isa_bus), errp)) {
+        return;
+    }
 }
 
 static void build_pci_isa_aml(AcpiDevAmlIf *adev, Aml *scope)
@@ -324,6 +331,13 @@
     qbus_build_aml(bus, scope);
 }
 
+static void pci_piix3_init(Object *obj)
+{
+    PIIX3State *d = PIIX3_PCI_DEVICE(obj);
+
+    object_initialize_child(obj, "rtc", &d->rtc, TYPE_MC146818_RTC);
+}
+
 static void pci_piix3_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
@@ -350,6 +364,7 @@
     .name = TYPE_PIIX3_PCI_DEVICE,
     .parent = TYPE_PCI_DEVICE,
     .instance_size = sizeof(PIIX3State),
+    .instance_init = pci_piix3_init,
     .abstract = true,
     .class_init = pci_piix3_class_init,
     .interfaces = (InterfaceInfo[]) {
diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index abe60b3..2adacbd 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -31,7 +31,8 @@
 };
 
 static int ct3_build_cdat_entries_for_mr(CDATSubHeader **cdat_table,
-                                         int dsmad_handle, MemoryRegion *mr)
+                                         int dsmad_handle, MemoryRegion *mr,
+                                         bool is_pmem, uint64_t dpa_base)
 {
     g_autofree CDATDsmas *dsmas = NULL;
     g_autofree CDATDslbis *dslbis0 = NULL;
@@ -50,9 +51,9 @@
             .length = sizeof(*dsmas),
         },
         .DSMADhandle = dsmad_handle,
-        .flags = CDAT_DSMAS_FLAG_NV,
-        .DPA_base = 0,
-        .DPA_length = int128_get64(mr->size),
+        .flags = is_pmem ? CDAT_DSMAS_FLAG_NV : 0,
+        .DPA_base = dpa_base,
+        .DPA_length = memory_region_size(mr),
     };
 
     /* For now, no memory side cache, plausiblish numbers */
@@ -130,10 +131,13 @@
             .length = sizeof(*dsemts),
         },
         .DSMAS_handle = dsmad_handle,
-        /* Reserved - the non volatile from DSMAS matters */
-        .EFI_memory_type_attr = 2,
+        /*
+         * NV: Reserved - the non volatile from DSMAS matters
+         * V: EFI_MEMORY_SP
+         */
+        .EFI_memory_type_attr = is_pmem ? 2 : 1,
         .DPA_offset = 0,
-        .DPA_length = int128_get64(mr->size),
+        .DPA_length = memory_region_size(mr),
     };
 
     /* Header always at start of structure */
@@ -150,33 +154,68 @@
 static int ct3_build_cdat_table(CDATSubHeader ***cdat_table, void *priv)
 {
     g_autofree CDATSubHeader **table = NULL;
-    MemoryRegion *nonvolatile_mr;
     CXLType3Dev *ct3d = priv;
+    MemoryRegion *volatile_mr = NULL, *nonvolatile_mr = NULL;
     int dsmad_handle = 0;
-    int rc;
+    int cur_ent = 0;
+    int len = 0;
+    int rc, i;
 
-    if (!ct3d->hostmem) {
+    if (!ct3d->hostpmem && !ct3d->hostvmem) {
         return 0;
     }
 
-    nonvolatile_mr = host_memory_backend_get_memory(ct3d->hostmem);
-    if (!nonvolatile_mr) {
-        return -EINVAL;
+    if (ct3d->hostvmem) {
+        volatile_mr = host_memory_backend_get_memory(ct3d->hostvmem);
+        if (!volatile_mr) {
+            return -EINVAL;
+        }
+        len += CT3_CDAT_NUM_ENTRIES;
     }
 
-    table = g_malloc0(CT3_CDAT_NUM_ENTRIES * sizeof(*table));
+    if (ct3d->hostpmem) {
+        nonvolatile_mr = host_memory_backend_get_memory(ct3d->hostpmem);
+        if (!nonvolatile_mr) {
+            return -EINVAL;
+        }
+        len += CT3_CDAT_NUM_ENTRIES;
+    }
+
+    table = g_malloc0(len * sizeof(*table));
     if (!table) {
         return -ENOMEM;
     }
 
-    rc = ct3_build_cdat_entries_for_mr(table, dsmad_handle++, nonvolatile_mr);
-    if (rc < 0) {
-        return rc;
+    /* Now fill them in */
+    if (volatile_mr) {
+        rc = ct3_build_cdat_entries_for_mr(table, dsmad_handle++, volatile_mr,
+                                           false, 0);
+        if (rc < 0) {
+            return rc;
+        }
+        cur_ent = CT3_CDAT_NUM_ENTRIES;
     }
 
+    if (nonvolatile_mr) {
+        rc = ct3_build_cdat_entries_for_mr(&(table[cur_ent]), dsmad_handle++,
+                                           nonvolatile_mr, true,
+                                           (volatile_mr ?
+                                            memory_region_size(volatile_mr) : 0));
+        if (rc < 0) {
+            goto error_cleanup;
+        }
+        cur_ent += CT3_CDAT_NUM_ENTRIES;
+    }
+    assert(len == cur_ent);
+
     *cdat_table = g_steal_pointer(&table);
 
-    return CT3_CDAT_NUM_ENTRIES;
+    return len;
+error_cleanup:
+    for (i = 0; i < cur_ent; i++) {
+        g_free(table[i]);
+    }
+    return rc;
 }
 
 static void ct3_free_cdat_table(CDATSubHeader **cdat_table, int num, void *priv)
@@ -264,16 +303,42 @@
 {
     CXLComponentState *cxl_cstate = &ct3d->cxl_cstate;
     uint8_t *dvsec;
+    uint32_t range1_size_hi, range1_size_lo,
+             range1_base_hi = 0, range1_base_lo = 0,
+             range2_size_hi = 0, range2_size_lo = 0,
+             range2_base_hi = 0, range2_base_lo = 0;
+
+    /*
+     * Volatile memory is mapped as (0x0)
+     * Persistent memory is mapped at (volatile->size)
+     */
+    if (ct3d->hostvmem) {
+        range1_size_hi = ct3d->hostvmem->size >> 32;
+        range1_size_lo = (2 << 5) | (2 << 2) | 0x3 |
+                         (ct3d->hostvmem->size & 0xF0000000);
+        if (ct3d->hostpmem) {
+            range2_size_hi = ct3d->hostpmem->size >> 32;
+            range2_size_lo = (2 << 5) | (2 << 2) | 0x3 |
+                             (ct3d->hostpmem->size & 0xF0000000);
+        }
+    } else {
+        range1_size_hi = ct3d->hostpmem->size >> 32;
+        range1_size_lo = (2 << 5) | (2 << 2) | 0x3 |
+                         (ct3d->hostpmem->size & 0xF0000000);
+    }
 
     dvsec = (uint8_t *)&(CXLDVSECDevice){
         .cap = 0x1e,
         .ctrl = 0x2,
         .status2 = 0x2,
-        .range1_size_hi = ct3d->hostmem->size >> 32,
-        .range1_size_lo = (2 << 5) | (2 << 2) | 0x3 |
-        (ct3d->hostmem->size & 0xF0000000),
-        .range1_base_hi = 0,
-        .range1_base_lo = 0,
+        .range1_size_hi = range1_size_hi,
+        .range1_size_lo = range1_size_lo,
+        .range1_base_hi = range1_base_hi,
+        .range1_base_lo = range1_base_lo,
+        .range2_size_hi = range2_size_hi,
+        .range2_size_lo = range2_size_lo,
+        .range2_base_hi = range2_base_hi,
+        .range2_base_lo = range2_base_lo,
     };
     cxl_component_create_dvsec(cxl_cstate, CXL2_TYPE3_DEVICE,
                                PCIE_CXL_DEVICE_DVSEC_LENGTH,
@@ -314,14 +379,32 @@
 {
     ComponentRegisters *cregs = &ct3d->cxl_cstate.crb;
     uint32_t *cache_mem = cregs->cache_mem_registers;
+    uint32_t ctrl;
 
     assert(which == 0);
 
+    ctrl = ldl_le_p(cache_mem + R_CXL_HDM_DECODER0_CTRL);
     /* TODO: Sanity checks that the decoder is possible */
-    ARRAY_FIELD_DP32(cache_mem, CXL_HDM_DECODER0_CTRL, COMMIT, 0);
-    ARRAY_FIELD_DP32(cache_mem, CXL_HDM_DECODER0_CTRL, ERR, 0);
+    ctrl = FIELD_DP32(ctrl, CXL_HDM_DECODER0_CTRL, ERR, 0);
+    ctrl = FIELD_DP32(ctrl, CXL_HDM_DECODER0_CTRL, COMMITTED, 1);
 
-    ARRAY_FIELD_DP32(cache_mem, CXL_HDM_DECODER0_CTRL, COMMITTED, 1);
+    stl_le_p(cache_mem + R_CXL_HDM_DECODER0_CTRL, ctrl);
+}
+
+static void hdm_decoder_uncommit(CXLType3Dev *ct3d, int which)
+{
+    ComponentRegisters *cregs = &ct3d->cxl_cstate.crb;
+    uint32_t *cache_mem = cregs->cache_mem_registers;
+    uint32_t ctrl;
+
+    assert(which == 0);
+
+    ctrl = ldl_le_p(cache_mem + R_CXL_HDM_DECODER0_CTRL);
+
+    ctrl = FIELD_DP32(ctrl, CXL_HDM_DECODER0_CTRL, ERR, 0);
+    ctrl = FIELD_DP32(ctrl, CXL_HDM_DECODER0_CTRL, COMMITTED, 0);
+
+    stl_le_p(cache_mem + R_CXL_HDM_DECODER0_CTRL, ctrl);
 }
 
 static int ct3d_qmp_uncor_err_to_cxl(CxlUncorErrorType qmp_err)
@@ -392,6 +475,7 @@
     CXLType3Dev *ct3d = container_of(cxl_cstate, CXLType3Dev, cxl_cstate);
     uint32_t *cache_mem = cregs->cache_mem_registers;
     bool should_commit = false;
+    bool should_uncommit = false;
     int which_hdm = -1;
 
     assert(size == 4);
@@ -400,6 +484,7 @@
     switch (offset) {
     case A_CXL_HDM_DECODER0_CTRL:
         should_commit = FIELD_EX32(value, CXL_HDM_DECODER0_CTRL, COMMIT);
+        should_uncommit = !should_commit;
         which_hdm = 0;
         break;
     case A_CXL_RAS_UNC_ERR_STATUS:
@@ -486,42 +571,77 @@
     stl_le_p((uint8_t *)cache_mem + offset, value);
     if (should_commit) {
         hdm_decoder_commit(ct3d, which_hdm);
+    } else if (should_uncommit) {
+        hdm_decoder_uncommit(ct3d, which_hdm);
     }
 }
 
 static bool cxl_setup_memory(CXLType3Dev *ct3d, Error **errp)
 {
     DeviceState *ds = DEVICE(ct3d);
-    MemoryRegion *mr;
-    char *name;
 
-    if (!ct3d->hostmem) {
-        error_setg(errp, "memdev property must be set");
+    if (!ct3d->hostmem && !ct3d->hostvmem && !ct3d->hostpmem) {
+        error_setg(errp, "at least one memdev property must be set");
+        return false;
+    } else if (ct3d->hostmem && ct3d->hostpmem) {
+        error_setg(errp, "[memdev] cannot be used with new "
+                         "[persistent-memdev] property");
+        return false;
+    } else if (ct3d->hostmem) {
+        /* Use of hostmem property implies pmem */
+        ct3d->hostpmem = ct3d->hostmem;
+        ct3d->hostmem = NULL;
+    }
+
+    if (ct3d->hostpmem && !ct3d->lsa) {
+        error_setg(errp, "lsa property must be set for persistent devices");
         return false;
     }
 
-    mr = host_memory_backend_get_memory(ct3d->hostmem);
-    if (!mr) {
-        error_setg(errp, "memdev property must be set");
-        return false;
+    if (ct3d->hostvmem) {
+        MemoryRegion *vmr;
+        char *v_name;
+
+        vmr = host_memory_backend_get_memory(ct3d->hostvmem);
+        if (!vmr) {
+            error_setg(errp, "volatile memdev must have backing device");
+            return false;
+        }
+        memory_region_set_nonvolatile(vmr, false);
+        memory_region_set_enabled(vmr, true);
+        host_memory_backend_set_mapped(ct3d->hostvmem, true);
+        if (ds->id) {
+            v_name = g_strdup_printf("cxl-type3-dpa-vmem-space:%s", ds->id);
+        } else {
+            v_name = g_strdup("cxl-type3-dpa-vmem-space");
+        }
+        address_space_init(&ct3d->hostvmem_as, vmr, v_name);
+        ct3d->cxl_dstate.vmem_size = memory_region_size(vmr);
+        ct3d->cxl_dstate.mem_size += memory_region_size(vmr);
+        g_free(v_name);
     }
-    memory_region_set_nonvolatile(mr, true);
-    memory_region_set_enabled(mr, true);
-    host_memory_backend_set_mapped(ct3d->hostmem, true);
 
-    if (ds->id) {
-        name = g_strdup_printf("cxl-type3-dpa-space:%s", ds->id);
-    } else {
-        name = g_strdup("cxl-type3-dpa-space");
-    }
-    address_space_init(&ct3d->hostmem_as, mr, name);
-    g_free(name);
+    if (ct3d->hostpmem) {
+        MemoryRegion *pmr;
+        char *p_name;
 
-    ct3d->cxl_dstate.pmem_size = ct3d->hostmem->size;
-
-    if (!ct3d->lsa) {
-        error_setg(errp, "lsa property must be set");
-        return false;
+        pmr = host_memory_backend_get_memory(ct3d->hostpmem);
+        if (!pmr) {
+            error_setg(errp, "persistent memdev must have backing device");
+            return false;
+        }
+        memory_region_set_nonvolatile(pmr, true);
+        memory_region_set_enabled(pmr, true);
+        host_memory_backend_set_mapped(ct3d->hostpmem, true);
+        if (ds->id) {
+            p_name = g_strdup_printf("cxl-type3-dpa-pmem-space:%s", ds->id);
+        } else {
+            p_name = g_strdup("cxl-type3-dpa-pmem-space");
+        }
+        address_space_init(&ct3d->hostpmem_as, pmr, p_name);
+        ct3d->cxl_dstate.pmem_size = memory_region_size(pmr);
+        ct3d->cxl_dstate.mem_size += memory_region_size(pmr);
+        g_free(p_name);
     }
 
     return true;
@@ -593,6 +713,9 @@
     cxl_cstate->cdat.free_cdat_table = ct3_free_cdat_table;
     cxl_cstate->cdat.private = ct3d;
     cxl_doe_cdat_init(cxl_cstate, errp);
+    if (*errp) {
+        goto err_free_special_ops;
+    }
 
     pcie_cap_deverr_init(pci_dev);
     /* Leave a bit of room for expansion */
@@ -605,9 +728,15 @@
 
 err_release_cdat:
     cxl_doe_cdat_release(cxl_cstate);
+err_free_special_ops:
     g_free(regs->special_ops);
 err_address_space_free:
-    address_space_destroy(&ct3d->hostmem_as);
+    if (ct3d->hostpmem) {
+        address_space_destroy(&ct3d->hostpmem_as);
+    }
+    if (ct3d->hostvmem) {
+        address_space_destroy(&ct3d->hostvmem_as);
+    }
     return;
 }
 
@@ -620,7 +749,12 @@
     pcie_aer_exit(pci_dev);
     cxl_doe_cdat_release(cxl_cstate);
     g_free(regs->special_ops);
-    address_space_destroy(&ct3d->hostmem_as);
+    if (ct3d->hostpmem) {
+        address_space_destroy(&ct3d->hostpmem_as);
+    }
+    if (ct3d->hostvmem) {
+        address_space_destroy(&ct3d->hostvmem_as);
+    }
 }
 
 /* TODO: Support multiple HDM decoders and DPA skip */
@@ -655,51 +789,77 @@
     return true;
 }
 
+static int cxl_type3_hpa_to_as_and_dpa(CXLType3Dev *ct3d,
+                                       hwaddr host_addr,
+                                       unsigned int size,
+                                       AddressSpace **as,
+                                       uint64_t *dpa_offset)
+{
+    MemoryRegion *vmr = NULL, *pmr = NULL;
+
+    if (ct3d->hostvmem) {
+        vmr = host_memory_backend_get_memory(ct3d->hostvmem);
+    }
+    if (ct3d->hostpmem) {
+        pmr = host_memory_backend_get_memory(ct3d->hostpmem);
+    }
+
+    if (!vmr && !pmr) {
+        return -ENODEV;
+    }
+
+    if (!cxl_type3_dpa(ct3d, host_addr, dpa_offset)) {
+        return -EINVAL;
+    }
+
+    if (*dpa_offset > ct3d->cxl_dstate.mem_size) {
+        return -EINVAL;
+    }
+
+    if (vmr) {
+        if (*dpa_offset < memory_region_size(vmr)) {
+            *as = &ct3d->hostvmem_as;
+        } else {
+            *as = &ct3d->hostpmem_as;
+            *dpa_offset -= memory_region_size(vmr);
+        }
+    } else {
+        *as = &ct3d->hostpmem_as;
+    }
+
+    return 0;
+}
+
 MemTxResult cxl_type3_read(PCIDevice *d, hwaddr host_addr, uint64_t *data,
                            unsigned size, MemTxAttrs attrs)
 {
-    CXLType3Dev *ct3d = CXL_TYPE3(d);
-    uint64_t dpa_offset;
-    MemoryRegion *mr;
+    uint64_t dpa_offset = 0;
+    AddressSpace *as = NULL;
+    int res;
 
-    /* TODO support volatile region */
-    mr = host_memory_backend_get_memory(ct3d->hostmem);
-    if (!mr) {
+    res = cxl_type3_hpa_to_as_and_dpa(CXL_TYPE3(d), host_addr, size,
+                                      &as, &dpa_offset);
+    if (res) {
         return MEMTX_ERROR;
     }
 
-    if (!cxl_type3_dpa(ct3d, host_addr, &dpa_offset)) {
-        return MEMTX_ERROR;
-    }
-
-    if (dpa_offset > int128_get64(mr->size)) {
-        return MEMTX_ERROR;
-    }
-
-    return address_space_read(&ct3d->hostmem_as, dpa_offset, attrs, data, size);
+    return address_space_read(as, dpa_offset, attrs, data, size);
 }
 
 MemTxResult cxl_type3_write(PCIDevice *d, hwaddr host_addr, uint64_t data,
                             unsigned size, MemTxAttrs attrs)
 {
-    CXLType3Dev *ct3d = CXL_TYPE3(d);
-    uint64_t dpa_offset;
-    MemoryRegion *mr;
+    uint64_t dpa_offset = 0;
+    AddressSpace *as = NULL;
+    int res;
 
-    mr = host_memory_backend_get_memory(ct3d->hostmem);
-    if (!mr) {
-        return MEMTX_OK;
+    res = cxl_type3_hpa_to_as_and_dpa(CXL_TYPE3(d), host_addr, size,
+                                      &as, &dpa_offset);
+    if (res) {
+        return MEMTX_ERROR;
     }
 
-    if (!cxl_type3_dpa(ct3d, host_addr, &dpa_offset)) {
-        return MEMTX_OK;
-    }
-
-    if (dpa_offset > int128_get64(mr->size)) {
-        return MEMTX_OK;
-    }
-    return address_space_write(&ct3d->hostmem_as, dpa_offset, attrs,
-                               &data, size);
+    return address_space_write(as, dpa_offset, attrs, &data, size);
 }
 
 static void ct3d_reset(DeviceState *dev)
@@ -714,7 +874,11 @@
 
 static Property ct3_props[] = {
     DEFINE_PROP_LINK("memdev", CXLType3Dev, hostmem, TYPE_MEMORY_BACKEND,
-                     HostMemoryBackend *),
+                     HostMemoryBackend *), /* for backward compatibility */
+    DEFINE_PROP_LINK("persistent-memdev", CXLType3Dev, hostpmem,
+                     TYPE_MEMORY_BACKEND, HostMemoryBackend *),
+    DEFINE_PROP_LINK("volatile-memdev", CXLType3Dev, hostvmem,
+                     TYPE_MEMORY_BACKEND, HostMemoryBackend *),
     DEFINE_PROP_LINK("lsa", CXLType3Dev, lsa, TYPE_MEMORY_BACKEND,
                      HostMemoryBackend *),
     DEFINE_PROP_UINT64("sn", CXLType3Dev, sn, UI64_NULL),
@@ -726,6 +890,10 @@
 {
     MemoryRegion *mr;
 
+    if (!ct3d->lsa) {
+        return 0;
+    }
+
     mr = host_memory_backend_get_memory(ct3d->lsa);
     return memory_region_size(mr);
 }
@@ -743,6 +911,10 @@
     MemoryRegion *mr;
     void *lsa;
 
+    if (!ct3d->lsa) {
+        return 0;
+    }
+
     mr = host_memory_backend_get_memory(ct3d->lsa);
     validate_lsa_access(mr, size, offset);
 
@@ -758,6 +930,10 @@
     MemoryRegion *mr;
     void *lsa;
 
+    if (!ct3d->lsa) {
+        return;
+    }
+
     mr = host_memory_backend_get_memory(ct3d->lsa);
     validate_lsa_access(mr, size, offset);
 
@@ -929,7 +1105,7 @@
     pc->config_read = ct3d_config_read;
 
     set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
-    dc->desc = "CXL PMEM Device (Type 3)";
+    dc->desc = "CXL Memory Device (Type 3)";
     dc->reset = ct3d_reset;
     device_class_set_props(dc, ct3_props);
 
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 447f669..8ce239a 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -805,7 +805,6 @@
     }
 
     if (!get_vhost_net(nc->peer)) {
-        virtio_add_feature(&features, VIRTIO_F_RING_RESET);
         return features;
     }
 
diff --git a/hw/pci-bridge/Kconfig b/hw/pci-bridge/Kconfig
index 02614f4..6707736 100644
--- a/hw/pci-bridge/Kconfig
+++ b/hw/pci-bridge/Kconfig
@@ -3,6 +3,11 @@
     default y if PCI_DEVICES
     depends on PCI_EXPRESS && MSI_NONBROKEN
 
+config PCIE_PCI_BRIDGE
+    bool
+    default y if PCIE_PORT
+    depends on PCIE_PORT
+
 config PXB
     bool
     default y if Q35 || ARM_VIRT
diff --git a/hw/pci-bridge/cxl_upstream.c b/hw/pci-bridge/cxl_upstream.c
index 9df436c..ef47e5d 100644
--- a/hw/pci-bridge/cxl_upstream.c
+++ b/hw/pci-bridge/cxl_upstream.c
@@ -346,6 +346,9 @@
     cxl_cstate->cdat.free_cdat_table = free_default_cdat_table;
     cxl_cstate->cdat.private = d;
     cxl_doe_cdat_init(cxl_cstate, errp);
+    if (*errp) {
+        goto err_cap;
+    }
 
     return;
 
diff --git a/hw/pci-bridge/meson.build b/hw/pci-bridge/meson.build
index fe92d43..0edc6c7 100644
--- a/hw/pci-bridge/meson.build
+++ b/hw/pci-bridge/meson.build
@@ -2,7 +2,8 @@
 pci_ss.add(files('pci_bridge_dev.c'))
 pci_ss.add(when: 'CONFIG_I82801B11', if_true: files('i82801b11.c'))
 pci_ss.add(when: 'CONFIG_IOH3420', if_true: files('ioh3420.c'))
-pci_ss.add(when: 'CONFIG_PCIE_PORT', if_true: files('pcie_root_port.c', 'gen_pcie_root_port.c', 'pcie_pci_bridge.c'))
+pci_ss.add(when: 'CONFIG_PCIE_PORT', if_true: files('pcie_root_port.c', 'gen_pcie_root_port.c'))
+pci_ss.add(when: 'CONFIG_PCIE_PCI_BRIDGE', if_true: files('pcie_pci_bridge.c'))
 pci_ss.add(when: 'CONFIG_PXB', if_true: files('pci_expander_bridge.c'),
                                if_false: files('pci_expander_bridge_stubs.c'))
 pci_ss.add(when: 'CONFIG_XIO3130', if_true: files('xio3130_upstream.c', 'xio3130_downstream.c'))
diff --git a/hw/pci-host/i440fx.c b/hw/pci-host/i440fx.c
index 262f82c..61e7b97 100644
--- a/hw/pci-host/i440fx.c
+++ b/hw/pci-host/i440fx.c
@@ -27,6 +27,7 @@
 #include "qemu/range.h"
 #include "hw/i386/pc.h"
 #include "hw/pci/pci.h"
+#include "hw/pci/pci_bus.h"
 #include "hw/pci/pci_host.h"
 #include "hw/pci-host/i440fx.h"
 #include "hw/qdev-properties.h"
@@ -217,10 +218,10 @@
     PCIHostState *s = PCI_HOST_BRIDGE(dev);
     SysBusDevice *sbd = SYS_BUS_DEVICE(dev);
 
-    sysbus_add_io(sbd, 0xcf8, &s->conf_mem);
+    memory_region_add_subregion(s->bus->address_space_io, 0xcf8, &s->conf_mem);
     sysbus_init_ioports(sbd, 0xcf8, 4);
 
-    sysbus_add_io(sbd, 0xcfc, &s->data_mem);
+    memory_region_add_subregion(s->bus->address_space_io, 0xcfc, &s->data_mem);
     sysbus_init_ioports(sbd, 0xcfc, 4);
 
     /* register i440fx 0xcf8 port as coalesced pio */
@@ -291,12 +292,12 @@
     object_property_add_const_link(qdev_get_machine(), "smram",
                                    OBJECT(&f->smram));
 
-    init_pam(dev, f->ram_memory, f->system_memory, f->pci_address_space,
-             &f->pam_regions[0], PAM_BIOS_BASE, PAM_BIOS_SIZE);
+    init_pam(&f->pam_regions[0], OBJECT(d), f->ram_memory, f->system_memory,
+             f->pci_address_space, PAM_BIOS_BASE, PAM_BIOS_SIZE);
     for (i = 0; i < ARRAY_SIZE(f->pam_regions) - 1; ++i) {
-        init_pam(dev, f->ram_memory, f->system_memory, f->pci_address_space,
-                 &f->pam_regions[i+1], PAM_EXPAN_BASE + i * PAM_EXPAN_SIZE,
-                 PAM_EXPAN_SIZE);
+        init_pam(&f->pam_regions[i + 1], OBJECT(d), f->ram_memory,
+                 f->system_memory, f->pci_address_space,
+                 PAM_EXPAN_BASE + i * PAM_EXPAN_SIZE, PAM_EXPAN_SIZE);
     }
 
     ram_size = ram_size / 8 / 1024 / 1024;
diff --git a/hw/pci-host/pam.c b/hw/pci-host/pam.c
index 454dd12..68e9884 100644
--- a/hw/pci-host/pam.c
+++ b/hw/pci-host/pam.c
@@ -30,24 +30,24 @@
 #include "qemu/osdep.h"
 #include "hw/pci-host/pam.h"
 
-void init_pam(DeviceState *dev, MemoryRegion *ram_memory,
+void init_pam(PAMMemoryRegion *mem, Object *owner, MemoryRegion *ram_memory,
               MemoryRegion *system_memory, MemoryRegion *pci_address_space,
-              PAMMemoryRegion *mem, uint32_t start, uint32_t size)
+              uint32_t start, uint32_t size)
 {
     int i;
 
     /* RAM */
-    memory_region_init_alias(&mem->alias[3], OBJECT(dev), "pam-ram", ram_memory,
+    memory_region_init_alias(&mem->alias[3], owner, "pam-ram", ram_memory,
                              start, size);
     /* ROM (XXX: not quite correct) */
-    memory_region_init_alias(&mem->alias[1], OBJECT(dev), "pam-rom", ram_memory,
+    memory_region_init_alias(&mem->alias[1], owner, "pam-rom", ram_memory,
                              start, size);
     memory_region_set_readonly(&mem->alias[1], true);
 
     /* XXX: should distinguish read/write cases */
-    memory_region_init_alias(&mem->alias[0], OBJECT(dev), "pam-pci", pci_address_space,
+    memory_region_init_alias(&mem->alias[0], owner, "pam-pci", pci_address_space,
                              start, size);
-    memory_region_init_alias(&mem->alias[2], OBJECT(dev), "pam-pci", ram_memory,
+    memory_region_init_alias(&mem->alias[2], owner, "pam-pci", ram_memory,
                              start, size);
 
     memory_region_transaction_begin();
diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c
index 2639086..fd18920 100644
--- a/hw/pci-host/q35.c
+++ b/hw/pci-host/q35.c
@@ -50,10 +50,12 @@
     Q35PCIHost *s = Q35_HOST_DEVICE(dev);
     SysBusDevice *sbd = SYS_BUS_DEVICE(dev);
 
-    sysbus_add_io(sbd, MCH_HOST_BRIDGE_CONFIG_ADDR, &pci->conf_mem);
+    memory_region_add_subregion(s->mch.address_space_io,
+                                MCH_HOST_BRIDGE_CONFIG_ADDR, &pci->conf_mem);
     sysbus_init_ioports(sbd, MCH_HOST_BRIDGE_CONFIG_ADDR, 4);
 
-    sysbus_add_io(sbd, MCH_HOST_BRIDGE_CONFIG_DATA, &pci->data_mem);
+    memory_region_add_subregion(s->mch.address_space_io,
+                                MCH_HOST_BRIDGE_CONFIG_DATA, &pci->data_mem);
     sysbus_init_ioports(sbd, MCH_HOST_BRIDGE_CONFIG_DATA, 4);
 
     /* register q35 0xcf8 port as coalesced pio */
@@ -643,12 +645,12 @@
     object_property_add_const_link(qdev_get_machine(), "smram",
                                    OBJECT(&mch->smram));
 
-    init_pam(DEVICE(mch), mch->ram_memory, mch->system_memory,
-             mch->pci_address_space, &mch->pam_regions[0],
+    init_pam(&mch->pam_regions[0], OBJECT(mch), mch->ram_memory,
+             mch->system_memory, mch->pci_address_space,
              PAM_BIOS_BASE, PAM_BIOS_SIZE);
     for (i = 0; i < ARRAY_SIZE(mch->pam_regions) - 1; ++i) {
-        init_pam(DEVICE(mch), mch->ram_memory, mch->system_memory,
-                 mch->pci_address_space, &mch->pam_regions[i+1],
+        init_pam(&mch->pam_regions[i + 1], OBJECT(mch), mch->ram_memory,
+                 mch->system_memory, mch->pci_address_space,
                  PAM_EXPAN_BASE + i * PAM_EXPAN_SIZE, PAM_EXPAN_SIZE);
     }
 }
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 8a87ccc..1cc7c89 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -79,6 +79,8 @@
     DEFINE_PROP_STRING("failover_pair_id", PCIDevice,
                        failover_pair_id),
     DEFINE_PROP_UINT32("acpi-index",  PCIDevice, acpi_index, 0),
+    DEFINE_PROP_BIT("x-pcie-err-unc-mask", PCIDevice, cap_present,
+                    QEMU_PCIE_ERR_UNC_MASK_BITNR, true),
     DEFINE_PROP_END_OF_LIST()
 };
 
@@ -2307,15 +2309,14 @@
                                Error **errp)
 {
     int64_t size;
-    char *path;
+    g_autofree char *path = NULL;
     void *ptr;
     char name[32];
     const VMStateDescription *vmsd;
 
-    if (!pdev->romfile)
+    if (!pdev->romfile || !strlen(pdev->romfile)) {
         return;
-    if (strlen(pdev->romfile) == 0)
-        return;
+    }
 
     if (!pdev->rom_bar) {
         /*
@@ -2350,23 +2351,20 @@
     size = get_image_size(path);
     if (size < 0) {
         error_setg(errp, "failed to find romfile \"%s\"", pdev->romfile);
-        g_free(path);
         return;
     } else if (size == 0) {
         error_setg(errp, "romfile \"%s\" is empty", pdev->romfile);
-        g_free(path);
         return;
     } else if (size > 2 * GiB) {
         error_setg(errp, "romfile \"%s\" too large (size cannot exceed 2 GiB)",
                    pdev->romfile);
-        g_free(path);
         return;
     }
     if (pdev->romsize != -1) {
         if (size > pdev->romsize) {
-            error_setg(errp, "romfile \"%s\" (%u bytes) is too large for ROM size %u",
+            error_setg(errp, "romfile \"%s\" (%u bytes) "
+                       "is too large for ROM size %u",
                        pdev->romfile, (uint32_t)size, pdev->romsize);
-            g_free(path);
             return;
         }
     } else {
@@ -2374,21 +2372,18 @@
     }
 
     vmsd = qdev_get_vmsd(DEVICE(pdev));
+    snprintf(name, sizeof(name), "%s.rom",
+             vmsd ? vmsd->name : object_get_typename(OBJECT(pdev)));
 
-    if (vmsd) {
-        snprintf(name, sizeof(name), "%s.rom", vmsd->name);
-    } else {
-        snprintf(name, sizeof(name), "%s.rom", object_get_typename(OBJECT(pdev)));
-    }
     pdev->has_rom = true;
-    memory_region_init_rom(&pdev->rom, OBJECT(pdev), name, pdev->romsize, &error_fatal);
+    memory_region_init_rom(&pdev->rom, OBJECT(pdev), name, pdev->romsize,
+                           &error_fatal);
+
     ptr = memory_region_get_ram_ptr(&pdev->rom);
     if (load_image_size(path, ptr, size) < 0) {
         error_setg(errp, "failed to load romfile \"%s\"", pdev->romfile);
-        g_free(path);
         return;
     }
-    g_free(path);
 
     if (is_default_rom) {
         /* Only the default rom images will be patched (if needed). */
diff --git a/hw/pci/pcie_aer.c b/hw/pci/pcie_aer.c
index 103667c..374d593 100644
--- a/hw/pci/pcie_aer.c
+++ b/hw/pci/pcie_aer.c
@@ -112,10 +112,13 @@
 
     pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS,
                  PCI_ERR_UNC_SUPPORTED);
-    pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK,
-                 PCI_ERR_UNC_MASK_DEFAULT);
-    pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK,
-                 PCI_ERR_UNC_SUPPORTED);
+
+    if (dev->cap_present & QEMU_PCIE_ERR_UNC_MASK) {
+        pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK,
+                     PCI_ERR_UNC_MASK_DEFAULT);
+        pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK,
+                     PCI_ERR_UNC_SUPPORTED);
+    }
 
     pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER,
                  PCI_ERR_UNC_SEVERITY_DEFAULT);
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index ddc9c7b..1baea16 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2166,7 +2166,7 @@
                 break;
             }
         }
-    } while ((index < htabslots) && !qemu_file_rate_limit(f));
+    } while ((index < htabslots) && !migration_rate_exceeded(f));
 
     if (index >= htabslots) {
         assert(index == htabslots);
@@ -2237,7 +2237,7 @@
             assert(index == htabslots);
             index = 0;
         }
-    } while ((examined < htabslots) && (!qemu_file_rate_limit(f) || final));
+    } while ((examined < htabslots) && (!migration_rate_exceeded(f) || final));
 
     if (index >= htabslots) {
         assert(index == htabslots);
diff --git a/hw/s390x/s390-stattrib.c b/hw/s390x/s390-stattrib.c
index aed919a..220e845 100644
--- a/hw/s390x/s390-stattrib.c
+++ b/hw/s390x/s390-stattrib.c
@@ -209,7 +209,7 @@
         return -ENOMEM;
     }
 
-    while (final ? 1 : qemu_file_rate_limit(f) == 0) {
+    while (final ? 1 : migration_rate_exceeded(f) == 0) {
         reallen = sac->get_stattr(sas, &start_gfn, buflen, buf);
         if (reallen < 0) {
             g_free(buf);
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index 8361e70..bd7c12b 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -68,7 +68,7 @@
  */
 static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
 {
-    return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx);
+    return svq->num_free;
 }
 
 /**
@@ -263,6 +263,7 @@
         return -EINVAL;
     }
 
+    svq->num_free -= ndescs;
     svq->desc_state[qemu_head].elem = elem;
     svq->desc_state[qemu_head].ndescs = ndescs;
     vhost_svq_kick(svq);
@@ -449,6 +450,7 @@
     last_used_chain = vhost_svq_last_desc_of_chain(svq, num, used_elem.id);
     svq->desc_next[last_used_chain] = svq->free_head;
     svq->free_head = used_elem.id;
+    svq->num_free += num;
 
     *len = used_elem.len;
     return g_steal_pointer(&svq->desc_state[used_elem.id].elem);
@@ -659,6 +661,7 @@
     svq->iova_tree = iova_tree;
 
     svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq));
+    svq->num_free = svq->vring.num;
     driver_size = vhost_svq_driver_area_size(svq);
     device_size = vhost_svq_device_area_size(svq);
     svq->vring.desc = qemu_memalign(qemu_real_host_page_size(), driver_size);
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index 926a489..6efe051 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -107,6 +107,9 @@
 
     /* Next head to consume from the device */
     uint16_t last_used_idx;
+
+    /* Size of SVQ vring free descriptors */
+    uint16_t num_free;
 } VhostShadowVirtqueue;
 
 bool vhost_svq_valid_features(uint64_t features, Error **errp);
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index e5285df..e3ec872 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -42,17 +42,7 @@
 #define VHOST_USER_F_PROTOCOL_FEATURES 30
 #define VHOST_USER_BACKEND_MAX_FDS     8
 
-/*
- * Set maximum number of RAM slots supported to
- * the maximum number supported by the target
- * hardware plaform.
- */
-#if defined(TARGET_X86) || defined(TARGET_X86_64) || \
-    defined(TARGET_ARM) || defined(TARGET_AARCH64)
-#include "hw/acpi/acpi.h"
-#define VHOST_USER_MAX_RAM_SLOTS ACPI_MAX_RAM_SLOTS
-
-#elif defined(TARGET_PPC) || defined(TARGET_PPC64)
+#if defined(TARGET_PPC) || defined(TARGET_PPC64)
 #include "hw/ppc/spapr.h"
 #define VHOST_USER_MAX_RAM_SLOTS SPAPR_MAX_RAM_SLOTS
 
@@ -2677,7 +2667,20 @@
                                           VIRTIO_CONFIG_S_DRIVER |
                                           VIRTIO_CONFIG_S_DRIVER_OK);
     } else {
-        return vhost_user_set_status(dev, 0);
+        return 0;
+    }
+}
+
+static void vhost_user_reset_status(struct vhost_dev *dev)
+{
+    /* Set device status only for last queue pair */
+    if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
+        return;
+    }
+
+    if (virtio_has_feature(dev->protocol_features,
+                           VHOST_USER_PROTOCOL_F_STATUS)) {
+        vhost_user_set_status(dev, 0);
     }
 }
 
@@ -2716,4 +2719,5 @@
         .vhost_get_inflight_fd = vhost_user_get_inflight_fd,
         .vhost_set_inflight_fd = vhost_user_set_inflight_fd,
         .vhost_dev_start = vhost_user_dev_start,
+        .vhost_reset_status = vhost_user_reset_status,
 };
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index bc6bad2..b3094e8 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -26,6 +26,7 @@
 #include "cpu.h"
 #include "trace.h"
 #include "qapi/error.h"
+#include "hw/virtio/virtio-access.h"
 
 /*
  * Return one past the end of the end of section. Be careful with uint64_t
@@ -60,13 +61,21 @@
                      iova_min, section->offset_within_address_space);
         return true;
     }
+    /*
+     * While using vIOMMU, sometimes the section will be larger than iova_max,
+     * but the memory that actually maps is smaller, so move the check to
+     * function vhost_vdpa_iommu_map_notify(). That function will use the actual
+     * size that maps to the kernel
+     */
 
-    llend = vhost_vdpa_section_end(section);
-    if (int128_gt(llend, int128_make64(iova_max))) {
-        error_report("RAM section out of device range (max=0x%" PRIx64
-                     ", end addr=0x%" PRIx64 ")",
-                     iova_max, int128_get64(llend));
-        return true;
+    if (!memory_region_is_iommu(section->mr)) {
+        llend = vhost_vdpa_section_end(section);
+        if (int128_gt(llend, int128_make64(iova_max))) {
+            error_report("RAM section out of device range (max=0x%" PRIx64
+                         ", end addr=0x%" PRIx64 ")",
+                         iova_max, int128_get64(llend));
+            return true;
+        }
     }
 
     return false;
@@ -185,6 +194,115 @@
     v->iotlb_batch_begin_sent = false;
 }
 
+static void vhost_vdpa_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
+{
+    struct vdpa_iommu *iommu = container_of(n, struct vdpa_iommu, n);
+
+    hwaddr iova = iotlb->iova + iommu->iommu_offset;
+    struct vhost_vdpa *v = iommu->dev;
+    void *vaddr;
+    int ret;
+    Int128 llend;
+
+    if (iotlb->target_as != &address_space_memory) {
+        error_report("Wrong target AS \"%s\", only system memory is allowed",
+                     iotlb->target_as->name ? iotlb->target_as->name : "none");
+        return;
+    }
+    RCU_READ_LOCK_GUARD();
+    /* check if RAM section out of device range */
+    llend = int128_add(int128_makes64(iotlb->addr_mask), int128_makes64(iova));
+    if (int128_gt(llend, int128_make64(v->iova_range.last))) {
+        error_report("RAM section out of device range (max=0x%" PRIx64
+                     ", end addr=0x%" PRIx64 ")",
+                     v->iova_range.last, int128_get64(llend));
+        return;
+    }
+
+    if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
+        bool read_only;
+
+        if (!memory_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, NULL)) {
+            return;
+        }
+        ret = vhost_vdpa_dma_map(v, VHOST_VDPA_GUEST_PA_ASID, iova,
+                                 iotlb->addr_mask + 1, vaddr, read_only);
+        if (ret) {
+            error_report("vhost_vdpa_dma_map(%p, 0x%" HWADDR_PRIx ", "
+                         "0x%" HWADDR_PRIx ", %p) = %d (%m)",
+                         v, iova, iotlb->addr_mask + 1, vaddr, ret);
+        }
+    } else {
+        ret = vhost_vdpa_dma_unmap(v, VHOST_VDPA_GUEST_PA_ASID, iova,
+                                   iotlb->addr_mask + 1);
+        if (ret) {
+            error_report("vhost_vdpa_dma_unmap(%p, 0x%" HWADDR_PRIx ", "
+                         "0x%" HWADDR_PRIx ") = %d (%m)",
+                         v, iova, iotlb->addr_mask + 1, ret);
+        }
+    }
+}
+
+static void vhost_vdpa_iommu_region_add(MemoryListener *listener,
+                                        MemoryRegionSection *section)
+{
+    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
+
+    struct vdpa_iommu *iommu;
+    Int128 end;
+    int iommu_idx;
+    IOMMUMemoryRegion *iommu_mr;
+    int ret;
+
+    iommu_mr = IOMMU_MEMORY_REGION(section->mr);
+
+    iommu = g_malloc0(sizeof(*iommu));
+    end = int128_add(int128_make64(section->offset_within_region),
+                     section->size);
+    end = int128_sub(end, int128_one());
+    iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
+                                                   MEMTXATTRS_UNSPECIFIED);
+    iommu->iommu_mr = iommu_mr;
+    iommu_notifier_init(&iommu->n, vhost_vdpa_iommu_map_notify,
+                        IOMMU_NOTIFIER_IOTLB_EVENTS,
+                        section->offset_within_region,
+                        int128_get64(end),
+                        iommu_idx);
+    iommu->iommu_offset = section->offset_within_address_space -
+                          section->offset_within_region;
+    iommu->dev = v;
+
+    ret = memory_region_register_iommu_notifier(section->mr, &iommu->n, NULL);
+    if (ret) {
+        g_free(iommu);
+        return;
+    }
+
+    QLIST_INSERT_HEAD(&v->iommu_list, iommu, iommu_next);
+    memory_region_iommu_replay(iommu->iommu_mr, &iommu->n);
+
+    return;
+}
+
+static void vhost_vdpa_iommu_region_del(MemoryListener *listener,
+                                        MemoryRegionSection *section)
+{
+    struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
+
+    struct vdpa_iommu *iommu;
+
+    QLIST_FOREACH(iommu, &v->iommu_list, iommu_next)
+    {
+        if (MEMORY_REGION(iommu->iommu_mr) == section->mr &&
+            iommu->n.start == section->offset_within_region) {
+            memory_region_unregister_iommu_notifier(section->mr, &iommu->n);
+            QLIST_REMOVE(iommu, iommu_next);
+            g_free(iommu);
+            break;
+        }
+    }
+}
+
 static void vhost_vdpa_listener_region_add(MemoryListener *listener,
                                            MemoryRegionSection *section)
 {
@@ -199,6 +317,10 @@
                                             v->iova_range.last)) {
         return;
     }
+    if (memory_region_is_iommu(section->mr)) {
+        vhost_vdpa_iommu_region_add(listener, section);
+        return;
+    }
 
     if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                  (section->offset_within_region & ~TARGET_PAGE_MASK))) {
@@ -278,6 +400,9 @@
                                             v->iova_range.last)) {
         return;
     }
+    if (memory_region_is_iommu(section->mr)) {
+        vhost_vdpa_iommu_region_del(listener, section);
+    }
 
     if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                  (section->offset_within_region & ~TARGET_PAGE_MASK))) {
@@ -288,7 +413,8 @@
     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
     llend = vhost_vdpa_section_end(section);
 
-    trace_vhost_vdpa_listener_region_del(v, iova, int128_get64(llend));
+    trace_vhost_vdpa_listener_region_del(v, iova,
+        int128_get64(int128_sub(llend, int128_one())));
 
     if (int128_ge(int128_make64(iova), llend)) {
         return;
@@ -315,10 +441,28 @@
         vhost_iova_tree_remove(v->iova_tree, *result);
     }
     vhost_vdpa_iotlb_batch_begin_once(v);
+    /*
+     * The unmap ioctl doesn't accept a full 64-bit. need to check it
+     */
+    if (int128_eq(llsize, int128_2_64())) {
+        llsize = int128_rshift(llsize, 1);
+        ret = vhost_vdpa_dma_unmap(v, VHOST_VDPA_GUEST_PA_ASID, iova,
+                                   int128_get64(llsize));
+
+        if (ret) {
+            error_report("vhost_vdpa_dma_unmap(%p, 0x%" HWADDR_PRIx ", "
+                         "0x%" HWADDR_PRIx ") = %d (%m)",
+                         v, iova, int128_get64(llsize), ret);
+        }
+        iova += int128_get64(llsize);
+    }
     ret = vhost_vdpa_dma_unmap(v, VHOST_VDPA_GUEST_PA_ASID, iova,
                                int128_get64(llsize));
+
     if (ret) {
-        error_report("vhost_vdpa dma unmap error!");
+        error_report("vhost_vdpa_dma_unmap(%p, 0x%" HWADDR_PRIx ", "
+                     "0x%" HWADDR_PRIx ") = %d (%m)",
+                     v, iova, int128_get64(llsize), ret);
     }
 
     memory_region_unref(section->mr);
@@ -1163,7 +1307,13 @@
     }
 
     if (started) {
-        memory_listener_register(&v->listener, &address_space_memory);
+        if (vhost_dev_has_iommu(dev) && (v->shadow_vqs_enabled)) {
+            error_report("SVQ can not work while IOMMU enable, please disable"
+                         "IOMMU and try again");
+            return -1;
+        }
+        memory_listener_register(&v->listener, dev->vdev->dma_as);
+
         return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
     }
 
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 746d130..23da579 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -107,7 +107,7 @@
     }
 }
 
-static bool vhost_dev_has_iommu(struct vhost_dev *dev)
+bool vhost_dev_has_iommu(struct vhost_dev *dev)
 {
     VirtIODevice *vdev = dev->vdev;
 
diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c
index 2fe8045..c729a1f 100644
--- a/hw/virtio/virtio-crypto.c
+++ b/hw/virtio/virtio-crypto.c
@@ -476,15 +476,17 @@
         size_t max_len;
         CryptoDevBackendSymOpInfo *op_info = req->op_info.u.sym_op_info;
 
-        max_len = op_info->iv_len +
-                  op_info->aad_len +
-                  op_info->src_len +
-                  op_info->dst_len +
-                  op_info->digest_result_len;
+        if (op_info) {
+            max_len = op_info->iv_len +
+                      op_info->aad_len +
+                      op_info->src_len +
+                      op_info->dst_len +
+                      op_info->digest_result_len;
 
-        /* Zeroize and free request data structure */
-        memset(op_info, 0, sizeof(*op_info) + max_len);
-        g_free(op_info);
+            /* Zeroize and free request data structure */
+            memset(op_info, 0, sizeof(*op_info) + max_len);
+            g_free(op_info);
+        }
     } else if (req->flags == QCRYPTODEV_BACKEND_ALG_ASYM) {
         CryptoDevBackendAsymOpInfo *op_info = req->op_info.u.asym_op_info;
         if (op_info) {
diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
index 957fe77..538b695 100644
--- a/hw/virtio/virtio-mem.c
+++ b/hw/virtio/virtio-mem.c
@@ -1341,7 +1341,7 @@
                      TYPE_MEMORY_BACKEND, HostMemoryBackend *),
 #if defined(VIRTIO_MEM_HAS_LEGACY_GUESTS)
     DEFINE_PROP_ON_OFF_AUTO(VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP, VirtIOMEM,
-                            unplugged_inaccessible, ON_OFF_AUTO_AUTO),
+                            unplugged_inaccessible, ON_OFF_AUTO_ON),
 #endif
     DEFINE_PROP_BOOL(VIRTIO_MEM_EARLY_MIGRATION_PROP, VirtIOMEM,
                      early_migration, true),
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 02fb84a..edbc0da 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -716,6 +716,38 @@
     }
 }
 
+static void virtio_pci_ats_ctrl_trigger(PCIDevice *pci_dev, bool enable)
+{
+    VirtIOPCIProxy *proxy = VIRTIO_PCI(pci_dev);
+    VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+    VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+
+    vdev->device_iotlb_enabled = enable;
+
+    if (k->toggle_device_iotlb) {
+        k->toggle_device_iotlb(vdev);
+    }
+}
+
+static void pcie_ats_config_write(PCIDevice *dev, uint32_t address,
+                                  uint32_t val, int len)
+{
+    uint32_t off;
+    uint16_t ats_cap = dev->exp.ats_cap;
+
+    if (!ats_cap || address < ats_cap) {
+        return;
+    }
+    off = address - ats_cap;
+    if (off >= PCI_EXT_CAP_ATS_SIZEOF) {
+        return;
+    }
+
+    if (range_covers_byte(off, len, PCI_ATS_CTRL + 1)) {
+        virtio_pci_ats_ctrl_trigger(dev, !!(val & PCI_ATS_CTRL_ENABLE));
+    }
+}
+
 static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
                                 uint32_t val, int len)
 {
@@ -729,6 +761,10 @@
         pcie_cap_flr_write_config(pci_dev, address, val, len);
     }
 
+    if (proxy->flags & VIRTIO_PCI_FLAG_ATS) {
+        pcie_ats_config_write(pci_dev, address, val, len);
+    }
+
     if (range_covers_byte(address, len, PCI_COMMAND)) {
         if (!(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
             virtio_set_disabled(vdev, true);
diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h
index d589f78..02befda 100644
--- a/include/hw/cxl/cxl_device.h
+++ b/include/hw/cxl/cxl_device.h
@@ -119,8 +119,10 @@
         uint64_t host_set;
     } timestamp;
 
-    /* memory region for persistent memory, HDM */
+    /* memory region size, HDM */
+    uint64_t mem_size;
     uint64_t pmem_size;
+    uint64_t vmem_size;
 } CXLDeviceState;
 
 /* Initialize the register block for a device */
@@ -245,12 +247,15 @@
     PCIDevice parent_obj;
 
     /* Properties */
-    HostMemoryBackend *hostmem;
+    HostMemoryBackend *hostmem; /* deprecated */
+    HostMemoryBackend *hostvmem;
+    HostMemoryBackend *hostpmem;
     HostMemoryBackend *lsa;
     uint64_t sn;
 
     /* State */
-    AddressSpace hostmem_as;
+    AddressSpace hostvmem_as;
+    AddressSpace hostpmem_as;
     CXLComponentState cxl_cstate;
     CXLDeviceState cxl_dstate;
 
@@ -282,4 +287,6 @@
 MemTxResult cxl_type3_write(PCIDevice *d, hwaddr host_addr, uint64_t data,
                             unsigned size, MemTxAttrs attrs);
 
+uint64_t cxl_device_get_timestamp(CXLDeviceState *cxlds);
+
 #endif
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 84935fc..79e7558 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -162,13 +162,12 @@
 void pc_memory_init(PCMachineState *pcms,
                     MemoryRegion *system_memory,
                     MemoryRegion *rom_memory,
-                    MemoryRegion **ram_memory,
                     uint64_t pci_hole64_size);
 uint64_t pc_pci_hole64_start(void);
 DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus);
 void pc_basic_device_init(struct PCMachineState *pcms,
                           ISABus *isa_bus, qemu_irq *gsi,
-                          ISADevice **rtc_state,
+                          ISADevice *rtc_state,
                           bool create_fdctrl,
                           uint32_t hpet_irqs);
 void pc_cmos_init(PCMachineState *pcms,
diff --git a/include/hw/pci-host/pam.h b/include/hw/pci-host/pam.h
index c1fd06b..005916f 100644
--- a/include/hw/pci-host/pam.h
+++ b/include/hw/pci-host/pam.h
@@ -87,8 +87,9 @@
     unsigned current;
 } PAMMemoryRegion;
 
-void init_pam(DeviceState *dev, MemoryRegion *ram, MemoryRegion *system,
-              MemoryRegion *pci, PAMMemoryRegion *mem, uint32_t start, uint32_t size);
+void init_pam(PAMMemoryRegion *mem, Object *owner, MemoryRegion *ram,
+              MemoryRegion *system, MemoryRegion *pci,
+              uint32_t start, uint32_t size);
 void pam_update(PAMMemoryRegion *mem, int idx, uint8_t val);
 
 #endif /* QEMU_PAM_H */
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 935b4b9..e6d0574 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -207,6 +207,8 @@
     QEMU_PCIE_EXTCAP_INIT = (1 << QEMU_PCIE_EXTCAP_INIT_BITNR),
 #define QEMU_PCIE_CXL_BITNR 10
     QEMU_PCIE_CAP_CXL = (1 << QEMU_PCIE_CXL_BITNR),
+#define QEMU_PCIE_ERR_UNC_MASK_BITNR 11
+    QEMU_PCIE_ERR_UNC_MASK = (1 << QEMU_PCIE_ERR_UNC_MASK_BITNR),
 };
 
 typedef struct PCIINTxRoute {
diff --git a/include/hw/southbridge/ich9.h b/include/hw/southbridge/ich9.h
index 7004eec..fd01649 100644
--- a/include/hw/southbridge/ich9.h
+++ b/include/hw/southbridge/ich9.h
@@ -6,6 +6,7 @@
 #include "hw/intc/ioapic.h"
 #include "hw/pci/pci.h"
 #include "hw/pci/pci_device.h"
+#include "hw/rtc/mc146818rtc.h"
 #include "exec/memory.h"
 #include "qemu/notify.h"
 #include "qom/object.h"
@@ -30,6 +31,7 @@
     */
     uint8_t irr[PCI_SLOT_MAX][PCI_NUM_PINS];
 
+    MC146818RtcState rtc;
     APMState apm;
     ICH9LPCPMRegs pm;
     uint32_t sci_level; /* track sci level */
diff --git a/include/hw/southbridge/piix.h b/include/hw/southbridge/piix.h
index 0bf48e9..a840340 100644
--- a/include/hw/southbridge/piix.h
+++ b/include/hw/southbridge/piix.h
@@ -13,6 +13,7 @@
 #define HW_SOUTHBRIDGE_PIIX_H
 
 #include "hw/pci/pci_device.h"
+#include "hw/rtc/mc146818rtc.h"
 
 /* PIRQRC[A:D]: PIRQx Route Control Registers */
 #define PIIX_PIRQCA 0x60
@@ -51,6 +52,8 @@
     /* This member isn't used. Just for save/load compatibility */
     int32_t pci_irq_levels_vmstate[PIIX_NUM_PIRQS];
 
+    MC146818RtcState rtc;
+
     /* Reset Control Register contents */
     uint8_t rcr;
 
diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
index c278a2a..e64bfc7 100644
--- a/include/hw/virtio/vhost-vdpa.h
+++ b/include/hw/virtio/vhost-vdpa.h
@@ -52,6 +52,8 @@
     struct vhost_dev *dev;
     Error *migration_blocker;
     VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
+    QLIST_HEAD(, vdpa_iommu) iommu_list;
+    IOMMUNotifier n;
 } VhostVDPA;
 
 int vhost_vdpa_get_iova_range(int fd, struct vhost_vdpa_iova_range *iova_range);
@@ -61,4 +63,13 @@
 int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, uint32_t asid, hwaddr iova,
                          hwaddr size);
 
+typedef struct vdpa_iommu {
+    struct vhost_vdpa *dev;
+    IOMMUMemoryRegion *iommu_mr;
+    hwaddr iommu_offset;
+    IOMMUNotifier n;
+    QLIST_ENTRY(vdpa_iommu) iommu_next;
+} VDPAIOMMUState;
+
+
 #endif
diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index a52f273..f7f10c8 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -336,4 +336,5 @@
                            struct vhost_inflight *inflight);
 int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
                            struct vhost_inflight *inflight);
+bool vhost_dev_has_iommu(struct vhost_dev *dev);
 #endif
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index f6b38f7..af86ed7 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -155,6 +155,7 @@
     QLIST_HEAD(, VirtQueue) *vector_queues;
     QTAILQ_ENTRY(VirtIODevice) next;
     EventNotifier config_notifier;
+    bool device_iotlb_enabled;
 };
 
 struct VirtioDeviceClass {
@@ -212,6 +213,7 @@
     const VMStateDescription *vmsd;
     bool (*primary_unplug_pending)(void *opaque);
     struct vhost_dev *(*get_vhost)(VirtIODevice *vdev);
+    void (*toggle_device_iotlb)(VirtIODevice *vdev);
 };
 
 void virtio_instance_init_common(Object *proxy_obj, void *data,
diff --git a/include/migration/colo.h b/include/migration/colo.h
index 7ef3154..eaac07f 100644
--- a/include/migration/colo.h
+++ b/include/migration/colo.h
@@ -28,7 +28,6 @@
 int migration_incoming_enable_colo(void);
 void migration_incoming_disable_colo(void);
 bool migration_incoming_colo_enabled(void);
-void *colo_process_incoming_thread(void *opaque);
 bool migration_incoming_in_colo_state(void);
 
 COLOMode get_colo_mode(void);
@@ -44,5 +43,13 @@
  */
 void colo_checkpoint_delay_set(void);
 
+/*
+ * Starts COLO incoming process. Called from process_incoming_migration_co()
+ * after loading the state.
+ *
+ * Called with BQL locked, may temporary release BQL.
+ */
+int coroutine_fn colo_incoming_co(void);
+
 void colo_shutdown(void);
 #endif
diff --git a/include/migration/qemu-file-types.h b/include/migration/qemu-file-types.h
index 1436f9c..9ba163f 100644
--- a/include/migration/qemu-file-types.h
+++ b/include/migration/qemu-file-types.h
@@ -165,6 +165,16 @@
 
 void qemu_put_counted_string(QEMUFile *f, const char *name);
 
-int qemu_file_rate_limit(QEMUFile *f);
+/**
+ * migration_rate_exceeded: Check if we have exceeded rate for this interval
+ *
+ * Checks if we have already transferred more data that we are allowed
+ * in the current interval.
+ *
+ * @f: QEMUFile used for main migration channel
+ *
+ * Returns if we should stop sending data for this interval.
+ */
+bool migration_rate_exceeded(QEMUFile *f);
 
 #endif
diff --git a/linux-user/hexagon/cpu_loop.c b/linux-user/hexagon/cpu_loop.c
index b84e25b..7f1499e 100644
--- a/linux-user/hexagon/cpu_loop.c
+++ b/linux-user/hexagon/cpu_loop.c
@@ -63,6 +63,9 @@
         case EXCP_ATOMIC:
             cpu_exec_step_atomic(cs);
             break;
+        case EXCP_DEBUG:
+            force_sig_fault(TARGET_SIGTRAP, TARGET_TRAP_BRKPT, 0);
+            break;
         default:
             EXCP_DUMP(env, "\nqemu: unhandled CPU exception %#x - aborting\n",
                      trapnr);
diff --git a/linux-user/hexagon/target_elf.h b/linux-user/hexagon/target_elf.h
index b4e9f40..36056fc 100644
--- a/linux-user/hexagon/target_elf.h
+++ b/linux-user/hexagon/target_elf.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -20,7 +20,10 @@
 
 static inline const char *cpu_get_model(uint32_t eflags)
 {
-    /* For now, treat anything newer than v5 as a v67 */
+    static char buf[32];
+    int err;
+
+    /* For now, treat anything newer than v5 as a v73 */
     /* FIXME - Disable instructions that are newer than the specified arch */
     if (eflags == 0x04 ||    /* v5  */
         eflags == 0x05 ||    /* v55 */
@@ -30,11 +33,18 @@
         eflags == 0x65 ||    /* v65 */
         eflags == 0x66 ||    /* v66 */
         eflags == 0x67 ||    /* v67 */
-        eflags == 0x8067     /* v67t */
+        eflags == 0x8067 ||  /* v67t */
+        eflags == 0x68 ||    /* v68 */
+        eflags == 0x69 ||    /* v69 */
+        eflags == 0x71 ||    /* v71 */
+        eflags == 0x8071 ||  /* v71t */
+        eflags == 0x73       /* v73 */
        ) {
-        return "v67";
+        return "v73";
     }
-    return "unknown";
+
+    err = snprintf(buf, sizeof(buf), "unknown (0x%x)", eflags);
+    return err >= 0 && err < sizeof(buf) ? buf : "unknown";
 }
 
 #endif
diff --git a/meson.build b/meson.build
index 41c87c4..0a5cdef 100644
--- a/meson.build
+++ b/meson.build
@@ -2105,6 +2105,7 @@
 config_host_data.set('CONFIG_GTK', gtk.found())
 config_host_data.set('CONFIG_VTE', vte.found())
 config_host_data.set('CONFIG_GTK_CLIPBOARD', have_gtk_clipboard)
+config_host_data.set('CONFIG_HEXAGON_IDEF_PARSER', get_option('hexagon_idef_parser'))
 config_host_data.set('CONFIG_LIBATTR', have_old_libattr)
 config_host_data.set('CONFIG_LIBCAP_NG', libcap_ng.found())
 config_host_data.set('CONFIG_EBPF', libbpf.found())
diff --git a/meson_options.txt b/meson_options.txt
index 972c458..9023738 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -304,6 +304,8 @@
        description: 'block migration in the main migration stream')
 option('replication', type: 'feature', value: 'auto',
        description: 'replication support')
+option('colo_proxy', type: 'feature', value: 'auto',
+       description: 'colo-proxy support')
 option('bochs', type: 'feature', value: 'auto',
        description: 'bochs image format support')
 option('cloop', type: 'feature', value: 'auto',
diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
index 20f36e6..032fc5f 100644
--- a/migration/block-dirty-bitmap.c
+++ b/migration/block-dirty-bitmap.c
@@ -706,7 +706,7 @@
     QSIMPLEQ_FOREACH(dbms, &s->dbms_list, entry) {
         while (!dbms->bulk_completed) {
             bulk_phase_send_chunk(f, s, dbms);
-            if (limit && qemu_file_rate_limit(f)) {
+            if (limit && migration_rate_exceeded(f)) {
                 return;
             }
         }
diff --git a/migration/block.c b/migration/block.c
index 12617b4..b9580a6 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -23,6 +23,7 @@
 #include "block/dirty-bitmap.h"
 #include "migration/misc.h"
 #include "migration.h"
+#include "migration-stats.h"
 #include "migration/register.h"
 #include "qemu-file.h"
 #include "migration/vmstate.h"
@@ -625,7 +626,7 @@
 
     blk_mig_lock();
     while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
-        if (qemu_file_rate_limit(f)) {
+        if (migration_rate_exceeded(f)) {
             break;
         }
         if (blk->ret < 0) {
@@ -762,7 +763,7 @@
     /* control the rate of transfer */
     blk_mig_lock();
     while (block_mig_state.read_done * BLK_MIG_BLOCK_SIZE <
-           qemu_file_get_rate_limit(f) &&
+           migration_rate_get() &&
            block_mig_state.submitted < MAX_PARALLEL_IO &&
            (block_mig_state.submitted + block_mig_state.read_done) <
            MAX_IO_BUFFERS) {
diff --git a/migration/colo.c b/migration/colo.c
index 6c7c313..72f4f7b 100644
--- a/migration/colo.c
+++ b/migration/colo.c
@@ -145,8 +145,8 @@
     qemu_sem_post(&mis->colo_incoming_sem);
 
     /* For Secondary VM, jump to incoming co */
-    if (mis->migration_incoming_co) {
-        qemu_coroutine_enter(mis->migration_incoming_co);
+    if (mis->colo_incoming_co) {
+        qemu_coroutine_enter(mis->colo_incoming_co);
     }
 }
 
@@ -817,7 +817,7 @@
     }
 }
 
-void *colo_process_incoming_thread(void *opaque)
+static void *colo_process_incoming_thread(void *opaque)
 {
     MigrationIncomingState *mis = opaque;
     QEMUFile *fb = NULL;
@@ -918,3 +918,40 @@
     rcu_unregister_thread();
     return NULL;
 }
+
+int coroutine_fn colo_incoming_co(void)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    Error *local_err = NULL;
+    QemuThread th;
+
+    assert(qemu_mutex_iothread_locked());
+
+    if (!migration_incoming_colo_enabled()) {
+        return 0;
+    }
+
+    /* Make sure all file formats throw away their mutable metadata */
+    bdrv_activate_all(&local_err);
+    if (local_err) {
+        error_report_err(local_err);
+        return -EINVAL;
+    }
+
+    qemu_thread_create(&th, "COLO incoming", colo_process_incoming_thread,
+                       mis, QEMU_THREAD_JOINABLE);
+
+    mis->colo_incoming_co = qemu_coroutine_self();
+    qemu_coroutine_yield();
+    mis->colo_incoming_co = NULL;
+
+    qemu_mutex_unlock_iothread();
+    /* Wait checkpoint incoming thread exit before free resource */
+    qemu_thread_join(&th);
+    qemu_mutex_lock_iothread();
+
+    /* We hold the global iothread lock, so it is safe here */
+    colo_release_ram_cache();
+
+    return 0;
+}
diff --git a/migration/meson.build b/migration/meson.build
index dc8b1da..a8e01e7 100644
--- a/migration/meson.build
+++ b/migration/meson.build
@@ -1,5 +1,6 @@
 # Files needed by unit tests
 migration_files = files(
+  'migration-stats.c',
   'page_cache.c',
   'xbzrle.c',
   'vmstate-types.c',
@@ -18,11 +19,9 @@
   'fd.c',
   'global_state.c',
   'migration-hmp-cmds.c',
-  'migration-stats.c',
   'migration.c',
   'multifd.c',
   'multifd-zlib.c',
-  'multifd-zlib.c',
   'ram-compress.c',
   'options.c',
   'postcopy-ram.c',
diff --git a/migration/migration-stats.c b/migration/migration-stats.c
index 2f2cea9..f98c826 100644
--- a/migration/migration-stats.c
+++ b/migration/migration-stats.c
@@ -12,6 +12,57 @@
 
 #include "qemu/osdep.h"
 #include "qemu/stats64.h"
+#include "qemu-file.h"
+#include "trace.h"
 #include "migration-stats.h"
 
 MigrationAtomicStats mig_stats;
+
+bool migration_rate_exceeded(QEMUFile *f)
+{
+    if (qemu_file_get_error(f)) {
+        return true;
+    }
+
+    uint64_t rate_limit_start = stat64_get(&mig_stats.rate_limit_start);
+    uint64_t rate_limit_current = migration_transferred_bytes(f);
+    uint64_t rate_limit_used = rate_limit_current - rate_limit_start;
+    uint64_t rate_limit_max = stat64_get(&mig_stats.rate_limit_max);
+
+    if (rate_limit_max == RATE_LIMIT_DISABLED) {
+        return false;
+    }
+    if (rate_limit_max > 0 && rate_limit_used > rate_limit_max) {
+        return true;
+    }
+    return false;
+}
+
+uint64_t migration_rate_get(void)
+{
+    return stat64_get(&mig_stats.rate_limit_max);
+}
+
+#define XFER_LIMIT_RATIO (1000 / BUFFER_DELAY)
+
+void migration_rate_set(uint64_t limit)
+{
+    /*
+     * 'limit' is per second.  But we check it each BUFER_DELAY miliseconds.
+     */
+    stat64_set(&mig_stats.rate_limit_max, limit / XFER_LIMIT_RATIO);
+}
+
+void migration_rate_reset(QEMUFile *f)
+{
+    stat64_set(&mig_stats.rate_limit_start, migration_transferred_bytes(f));
+}
+
+uint64_t migration_transferred_bytes(QEMUFile *f)
+{
+    uint64_t multifd = stat64_get(&mig_stats.multifd_bytes);
+    uint64_t qemu_file = qemu_file_transferred(f);
+
+    trace_migration_transferred_bytes(qemu_file, multifd);
+    return qemu_file + multifd;
+}
diff --git a/migration/migration-stats.h b/migration/migration-stats.h
index cf8a4f0..ac2260e 100644
--- a/migration/migration-stats.h
+++ b/migration/migration-stats.h
@@ -16,6 +16,18 @@
 #include "qemu/stats64.h"
 
 /*
+ * Amount of time to allocate to each "chunk" of bandwidth-throttled
+ * data.
+ */
+#define BUFFER_DELAY     100
+
+/*
+ * If rate_limit_max is 0, there is special code to remove the rate
+ * limit.
+ */
+#define RATE_LIMIT_DISABLED 0
+
+/*
  * These are the ram migration statistic counters.  It is loosely
  * based on MigrationStats.  We change to Stat64 any counter that
  * needs to be updated using atomic ops (can be accessed by more than
@@ -70,6 +82,14 @@
      */
     Stat64 precopy_bytes;
     /*
+     * Amount of transferred data at the start of current cycle.
+     */
+    Stat64 rate_limit_start;
+    /*
+     * Maximum amount of data we can send in a cycle.
+     */
+    Stat64 rate_limit_max;
+    /*
      * Total number of bytes transferred.
      */
     Stat64 transferred;
@@ -81,4 +101,39 @@
 
 extern MigrationAtomicStats mig_stats;
 
+/**
+ * migration_rate_get: Get the maximum amount that can be transferred.
+ *
+ * Returns the maximum number of bytes that can be transferred in a cycle.
+ */
+uint64_t migration_rate_get(void);
+
+/**
+ * migration_rate_reset: Reset the rate limit counter.
+ *
+ * This is called when we know we start a new transfer cycle.
+ *
+ * @f: QEMUFile used for main migration channel
+ */
+void migration_rate_reset(QEMUFile *f);
+
+/**
+ * migration_rate_set: Set the maximum amount that can be transferred.
+ *
+ * Sets the maximum amount of bytes that can be transferred in one cycle.
+ *
+ * @new_rate: new maximum amount
+ */
+void migration_rate_set(uint64_t new_rate);
+
+/**
+ * migration_transferred_bytes: Return number of bytes transferred
+ *
+ * @f: QEMUFile used for main migration channel
+ *
+ * Returns how many bytes have we transferred since the beginning of
+ * the migration.  It accounts for bytes sent through any migration
+ * channel, multifd, qemu_file, rdma, ....
+ */
+uint64_t migration_transferred_bytes(QEMUFile *f);
 #endif
diff --git a/migration/migration.c b/migration/migration.c
index 00d8ba8..5de7f73 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -511,7 +511,6 @@
     MigrationIncomingState *mis = migration_incoming_get_current();
     PostcopyState ps;
     int ret;
-    Error *local_err = NULL;
 
     assert(mis->from_src_file);
 
@@ -520,12 +519,14 @@
         goto fail;
     }
 
-    mis->migration_incoming_co = qemu_coroutine_self();
     mis->largest_page_size = qemu_ram_pagesize_largest();
     postcopy_state_set(POSTCOPY_INCOMING_NONE);
     migrate_set_state(&mis->state, MIGRATION_STATUS_NONE,
                       MIGRATION_STATUS_ACTIVE);
+
+    mis->loadvm_co = qemu_coroutine_self();
     ret = qemu_loadvm_state(mis->from_src_file);
+    mis->loadvm_co = NULL;
 
     ps = postcopy_state_get();
     trace_process_incoming_migration_co_end(ret, ps);
@@ -553,35 +554,14 @@
         goto fail;
     }
 
-    /* we get COLO info, and know if we are in COLO mode */
-    if (migration_incoming_colo_enabled()) {
-        QemuThread colo_incoming_thread;
-
-        /* Make sure all file formats throw away their mutable metadata */
-        bdrv_activate_all(&local_err);
-        if (local_err) {
-            error_report_err(local_err);
-            goto fail;
-        }
-
-        qemu_thread_create(&colo_incoming_thread, "COLO incoming",
-             colo_process_incoming_thread, mis, QEMU_THREAD_JOINABLE);
-        qemu_coroutine_yield();
-
-        qemu_mutex_unlock_iothread();
-        /* Wait checkpoint incoming thread exit before free resource */
-        qemu_thread_join(&colo_incoming_thread);
-        qemu_mutex_lock_iothread();
-        /* We hold the global iothread lock, so it is safe here */
-        colo_release_ram_cache();
+    if (colo_incoming_co() < 0) {
+        goto fail;
     }
 
     mis->bh = qemu_bh_new(process_incoming_migration_bh, mis);
     qemu_bh_schedule(mis->bh);
-    mis->migration_incoming_co = NULL;
     return;
 fail:
-    local_err = NULL;
     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                       MIGRATION_STATUS_FAILED);
     qemu_fclose(mis->from_src_file);
@@ -2140,7 +2120,7 @@
      * will notice we're in POSTCOPY_ACTIVE and not actually
      * wrap their state up here
      */
-    qemu_file_set_rate_limit(ms->to_dst_file, bandwidth);
+    migration_rate_set(bandwidth);
     if (migrate_postcopy_ram()) {
         /* Ping just for debugging, helps line traces up */
         qemu_savevm_send_ping(ms->to_dst_file, 2);
@@ -2324,7 +2304,7 @@
                  * them if migration fails or is cancelled.
                  */
                 s->block_inactive = !migrate_colo();
-                qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
+                migration_rate_set(RATE_LIMIT_DISABLED);
                 ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
                                                          s->block_inactive);
             }
@@ -2645,16 +2625,9 @@
     }
 }
 
-/* How many bytes have we transferred since the beginning of the migration */
-static uint64_t migration_total_bytes(MigrationState *s)
-{
-    return qemu_file_transferred(s->to_dst_file) +
-        stat64_get(&mig_stats.multifd_bytes);
-}
-
 static void migration_calculate_complete(MigrationState *s)
 {
-    uint64_t bytes = migration_total_bytes(s);
+    uint64_t bytes = migration_transferred_bytes(s->to_dst_file);
     int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
     int64_t transfer_time;
 
@@ -2680,7 +2653,7 @@
      * wrong speed calculation.
      */
     s->iteration_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
-    s->iteration_initial_bytes = migration_total_bytes(s);
+    s->iteration_initial_bytes = migration_transferred_bytes(s->to_dst_file);
     s->iteration_initial_pages = ram_get_total_transferred_pages();
 }
 
@@ -2695,7 +2668,7 @@
         return;
     }
 
-    current_bytes = migration_total_bytes(s);
+    current_bytes = migration_transferred_bytes(s->to_dst_file);
     transferred = current_bytes - s->iteration_initial_bytes;
     time_spent = current_time - s->iteration_start_time;
     bandwidth = (double)transferred / time_spent;
@@ -2719,7 +2692,7 @@
             stat64_get(&mig_stats.dirty_bytes_last_sync) / bandwidth;
     }
 
-    qemu_file_reset_rate_limit(s->to_dst_file);
+    migration_rate_reset(s->to_dst_file);
 
     update_iteration_initial_status(s);
 
@@ -2872,7 +2845,7 @@
 
     bool urgent = false;
     migration_update_counters(s, now);
-    if (qemu_file_rate_limit(s->to_dst_file)) {
+    if (migration_rate_exceeded(s->to_dst_file)) {
 
         if (qemu_file_get_error(s->to_dst_file)) {
             return false;
@@ -2994,7 +2967,7 @@
     trace_migration_thread_setup_complete();
 
     while (migration_is_active(s)) {
-        if (urgent || !qemu_file_rate_limit(s->to_dst_file)) {
+        if (urgent || !migration_rate_exceeded(s->to_dst_file)) {
             MigIterateState iter_state = migration_iteration_run(s);
             if (iter_state == MIG_ITERATE_SKIP) {
                 continue;
@@ -3068,7 +3041,7 @@
     rcu_register_thread();
     object_ref(OBJECT(s));
 
-    qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
+    migration_rate_set(RATE_LIMIT_DISABLED);
 
     setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
     /*
@@ -3240,7 +3213,7 @@
         notifier_list_notify(&migration_state_notifiers, s);
     }
 
-    qemu_file_set_rate_limit(s->to_dst_file, rate_limit);
+    migration_rate_set(rate_limit);
     qemu_file_set_blocking(s->to_dst_file, true);
 
     /*
diff --git a/migration/migration.h b/migration/migration.h
index 7721c76..48a4612 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -162,8 +162,15 @@
 
     int state;
 
+    /*
+     * The incoming migration coroutine, non-NULL during qemu_loadvm_state().
+     * Used to wake the migration incoming coroutine from rdma code. How much is
+     * it safe - it's a question.
+     */
+    Coroutine *loadvm_co;
+
     /* The coroutine we should enter (back) after failover */
-    Coroutine *migration_incoming_co;
+    Coroutine *colo_incoming_co;
     QemuSemaphore colo_incoming_sem;
 
     /*
diff --git a/migration/multifd.c b/migration/multifd.c
index 5c4298e..0bf5958 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -175,6 +175,7 @@
 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
 {
     MultiFDInit_t msg = {};
+    size_t size = sizeof(msg);
     int ret;
 
     msg.magic = cpu_to_be32(MULTIFD_MAGIC);
@@ -182,10 +183,12 @@
     msg.id = p->id;
     memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
 
-    ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
+    ret = qio_channel_write_all(p->c, (char *)&msg, size, errp);
     if (ret != 0) {
         return -1;
     }
+    stat64_add(&mig_stats.multifd_bytes, size);
+    stat64_add(&mig_stats.transferred, size);
     return 0;
 }
 
@@ -395,7 +398,6 @@
     static int next_channel;
     MultiFDSendParams *p = NULL; /* make happy gcc */
     MultiFDPages_t *pages = multifd_send_state->pages;
-    uint64_t transferred;
 
     if (qatomic_read(&multifd_send_state->exiting)) {
         return -1;
@@ -430,11 +432,7 @@
     p->packet_num = multifd_send_state->packet_num++;
     multifd_send_state->pages = p->pages;
     p->pages = pages;
-    transferred = ((uint64_t) pages->num) * p->page_size + p->packet_len;
-    qemu_file_acct_rate_limit(f, transferred);
     qemu_mutex_unlock(&p->mutex);
-    stat64_add(&mig_stats.transferred, transferred);
-    stat64_add(&mig_stats.multifd_bytes, transferred);
     qemu_sem_post(&p->sem);
 
     return 1;
@@ -716,6 +714,8 @@
                 if (ret != 0) {
                     break;
                 }
+                stat64_add(&mig_stats.multifd_bytes, p->packet_len);
+                stat64_add(&mig_stats.transferred, p->packet_len);
             } else {
                 /* Send header using the same writev call */
                 p->iov[0].iov_len = p->packet_len;
@@ -728,6 +728,8 @@
                 break;
             }
 
+            stat64_add(&mig_stats.multifd_bytes, p->next_packet_size);
+            stat64_add(&mig_stats.transferred, p->next_packet_size);
             qemu_mutex_lock(&p->mutex);
             p->pending_job--;
             qemu_mutex_unlock(&p->mutex);
diff --git a/migration/options.c b/migration/options.c
index c2a278e..b62ab30 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -23,6 +23,7 @@
 #include "migration/colo.h"
 #include "migration/misc.h"
 #include "migration.h"
+#include "migration-stats.h"
 #include "qemu-file.h"
 #include "ram.h"
 #include "options.h"
@@ -1242,8 +1243,7 @@
     if (params->has_max_bandwidth) {
         s->parameters.max_bandwidth = params->max_bandwidth;
         if (s->to_dst_file && !migration_in_postcopy()) {
-            qemu_file_set_rate_limit(s->to_dst_file,
-                                s->parameters.max_bandwidth);
+            migration_rate_set(s->parameters.max_bandwidth);
         }
     }
 
@@ -1272,8 +1272,7 @@
     if (params->has_max_postcopy_bandwidth) {
         s->parameters.max_postcopy_bandwidth = params->max_postcopy_bandwidth;
         if (s->to_dst_file && migration_in_postcopy()) {
-            qemu_file_set_rate_limit(s->to_dst_file,
-                    s->parameters.max_postcopy_bandwidth);
+            migration_rate_set(s->parameters.max_postcopy_bandwidth);
         }
     }
     if (params->has_max_cpu_throttle) {
diff --git a/migration/options.h b/migration/options.h
index 5cca332..45991af 100644
--- a/migration/options.h
+++ b/migration/options.h
@@ -17,13 +17,6 @@
 #include "hw/qdev-properties.h"
 #include "hw/qdev-properties-system.h"
 
-/* constants */
-
-/* Amount of time to allocate to each "chunk" of bandwidth-throttled
- * data. */
-#define BUFFER_DELAY     100
-#define XFER_LIMIT_RATIO (1000 / BUFFER_DELAY)
-
 /* migration properties */
 
 extern Property migration_properties[];
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 5970547..acc2826 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -27,6 +27,7 @@
 #include "qemu/error-report.h"
 #include "qemu/iov.h"
 #include "migration.h"
+#include "migration-stats.h"
 #include "qemu-file.h"
 #include "trace.h"
 #include "options.h"
@@ -40,17 +41,6 @@
     QIOChannel *ioc;
     bool is_writable;
 
-    /*
-     * Maximum amount of data in bytes to transfer during one
-     * rate limiting time window
-     */
-    uint64_t rate_limit_max;
-    /*
-     * Total amount of data in bytes queued for transfer
-     * during this rate limiting time window
-     */
-    uint64_t rate_limit_used;
-
     /* The sum of bytes transferred on the wire */
     uint64_t total_transferred;
 
@@ -301,7 +291,8 @@
                                    &local_error) < 0) {
             qemu_file_set_error_obj(f, -EIO, local_error);
         } else {
-            f->total_transferred += iov_size(f->iov, f->iovcnt);
+            uint64_t size = iov_size(f->iov, f->iovcnt);
+            f->total_transferred += size;
         }
 
         qemu_iovec_release_ram(f);
@@ -352,9 +343,6 @@
     if (f->hooks && f->hooks->save_page) {
         int ret = f->hooks->save_page(f, block_offset,
                                       offset, size, bytes_sent);
-        if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
-            qemu_file_acct_rate_limit(f, size);
-        }
 
         if (ret != RAM_SAVE_CONTROL_DELAYED &&
             ret != RAM_SAVE_CONTROL_NOT_SUPP) {
@@ -518,7 +506,6 @@
         return;
     }
 
-    f->rate_limit_used += size;
     add_to_iovec(f, buf, size, may_free);
 }
 
@@ -536,7 +523,6 @@
             l = size;
         }
         memcpy(f->buf + f->buf_index, buf, l);
-        f->rate_limit_used += l;
         add_buf_to_iovec(f, l);
         if (qemu_file_get_error(f)) {
             break;
@@ -553,7 +539,6 @@
     }
 
     f->buf[f->buf_index] = v;
-    f->rate_limit_used++;
     add_buf_to_iovec(f, 1);
 }
 
@@ -727,40 +712,6 @@
     return f->total_transferred;
 }
 
-int qemu_file_rate_limit(QEMUFile *f)
-{
-    if (qemu_file_get_error(f)) {
-        return 1;
-    }
-    if (f->rate_limit_max > 0 && f->rate_limit_used > f->rate_limit_max) {
-        return 1;
-    }
-    return 0;
-}
-
-uint64_t qemu_file_get_rate_limit(QEMUFile *f)
-{
-    return f->rate_limit_max;
-}
-
-void qemu_file_set_rate_limit(QEMUFile *f, uint64_t limit)
-{
-    /*
-     * 'limit' is per second.  But we check it each 100 miliseconds.
-     */
-    f->rate_limit_max = limit / XFER_LIMIT_RATIO;
-}
-
-void qemu_file_reset_rate_limit(QEMUFile *f)
-{
-    f->rate_limit_used = 0;
-}
-
-void qemu_file_acct_rate_limit(QEMUFile *f, uint64_t len)
-{
-    f->rate_limit_used += len;
-}
-
 void qemu_put_be16(QEMUFile *f, unsigned int v)
 {
     qemu_put_byte(f, v >> 8);
diff --git a/migration/qemu-file.h b/migration/qemu-file.h
index bcc3908..e649718 100644
--- a/migration/qemu-file.h
+++ b/migration/qemu-file.h
@@ -130,17 +130,6 @@
  * accounting information tracks the total migration traffic.
  */
 void qemu_file_credit_transfer(QEMUFile *f, size_t size);
-void qemu_file_reset_rate_limit(QEMUFile *f);
-/*
- * qemu_file_acct_rate_limit:
- *
- * Report on a number of bytes the have been transferred
- * out of band from the main file object I/O methods, and
- * need to be applied to the rate limiting calcuations
- */
-void qemu_file_acct_rate_limit(QEMUFile *f, uint64_t len);
-void qemu_file_set_rate_limit(QEMUFile *f, uint64_t new_rate);
-uint64_t qemu_file_get_rate_limit(QEMUFile *f);
 int qemu_file_get_error_obj(QEMUFile *f, Error **errp);
 int qemu_file_get_error_obj_any(QEMUFile *f1, QEMUFile *f2, Error **errp);
 void qemu_file_set_error_obj(QEMUFile *f, int ret, Error *err);
diff --git a/migration/ram.c b/migration/ram.c
index 5900cab..9fb076f 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -3116,7 +3116,7 @@
 
         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
         i = 0;
-        while ((ret = qemu_file_rate_limit(f)) == 0 ||
+        while ((ret = migration_rate_exceeded(f)) == 0 ||
                postcopy_has_request(rs)) {
             int pages;
 
diff --git a/migration/rdma.c b/migration/rdma.c
index 2cd8f1c..2e4dcff 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -3342,9 +3342,8 @@
             }
         }
         rdma_ack_cm_event(cm_event);
-
-        if (mis->migration_incoming_co) {
-            qemu_coroutine_enter(mis->migration_incoming_co);
+        if (mis->loadvm_co) {
+            qemu_coroutine_enter(mis->loadvm_co);
         }
         return;
     }
diff --git a/migration/savevm.c b/migration/savevm.c
index e337883..03795ce 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1338,7 +1338,7 @@
             !(se->ops->has_postcopy && se->ops->has_postcopy(se->opaque))) {
             continue;
         }
-        if (qemu_file_rate_limit(f)) {
+        if (migration_rate_exceeded(f)) {
             return 0;
         }
         trace_savevm_section_start(se->idstr, se->section_id);
diff --git a/migration/trace-events b/migration/trace-events
index f39818c..cdaef7a 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -186,6 +186,9 @@
 process_incoming_migration_co_postcopy_end_main(void) ""
 postcopy_preempt_enabled(bool value) "%d"
 
+# migration-stats
+migration_transferred_bytes(uint64_t qemu_file, uint64_t multifd) "qemu_file %" PRIu64 " multifd %" PRIu64
+
 # channel.c
 migration_set_incoming_channel(void *ioc, const char *ioctype) "ioc=%p ioctype=%s"
 migration_set_outgoing_channel(void *ioc, const char *ioctype, const char *hostname, void *err)  "ioc=%p ioctype=%s hostname=%s err=%p"
diff --git a/net/meson.build b/net/meson.build
index 87afca3..6f4ecde 100644
--- a/net/meson.build
+++ b/net/meson.build
@@ -1,13 +1,10 @@
 softmmu_ss.add(files(
   'announce.c',
   'checksum.c',
-  'colo-compare.c',
-  'colo.c',
   'dump.c',
   'eth.c',
   'filter-buffer.c',
   'filter-mirror.c',
-  'filter-rewriter.c',
   'filter.c',
   'hub.c',
   'net-hmp-cmds.c',
@@ -19,6 +16,16 @@
   'util.c',
 ))
 
+if get_option('replication').allowed() or \
+    get_option('colo_proxy').allowed()
+  softmmu_ss.add(files('colo-compare.c'))
+  softmmu_ss.add(files('colo.c'))
+endif
+
+if get_option('colo_proxy').allowed()
+  softmmu_ss.add(files('filter-rewriter.c'))
+endif
+
 softmmu_ss.add(when: 'CONFIG_TCG', if_true: files('filter-replay.c'))
 
 if have_l2tpv3
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index 53124e1..5714fd9 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -91,6 +91,7 @@
   printf "%s\n" '  capstone        Whether and how to find the capstone library'
   printf "%s\n" '  cloop           cloop image format support'
   printf "%s\n" '  cocoa           Cocoa user interface (macOS only)'
+  printf "%s\n" '  colo-proxy      colo-proxy support'
   printf "%s\n" '  coreaudio       CoreAudio sound support'
   printf "%s\n" '  crypto-afalg    Linux AF_ALG crypto backend driver'
   printf "%s\n" '  curl            CURL block device driver'
@@ -252,6 +253,8 @@
     --disable-cloop) printf "%s" -Dcloop=disabled ;;
     --enable-cocoa) printf "%s" -Dcocoa=enabled ;;
     --disable-cocoa) printf "%s" -Dcocoa=disabled ;;
+    --enable-colo-proxy) printf "%s" -Dcolo_proxy=enabled ;;
+    --disable-colo-proxy) printf "%s" -Dcolo_proxy=disabled ;;
     --enable-coreaudio) printf "%s" -Dcoreaudio=enabled ;;
     --disable-coreaudio) printf "%s" -Dcoreaudio=disabled ;;
     --with-coroutine=*) quote_sh "-Dcoroutine_backend=$2" ;;
diff --git a/stubs/colo-compare.c b/stubs/colo-compare.c
new file mode 100644
index 0000000..ec72666
--- /dev/null
+++ b/stubs/colo-compare.c
@@ -0,0 +1,7 @@
+#include "qemu/osdep.h"
+#include "qemu/notify.h"
+#include "net/colo-compare.h"
+
+void colo_compare_cleanup(void)
+{
+}
diff --git a/stubs/colo.c b/stubs/colo.c
index cf9816d..f33379d 100644
--- a/stubs/colo.c
+++ b/stubs/colo.c
@@ -10,11 +10,9 @@
 {
 }
 
-void *colo_process_incoming_thread(void *opaque)
+int coroutine_fn colo_incoming_co(void)
 {
-    error_report("Impossible happend: trying to start COLO thread when COLO "
-                 "module is not built in");
-    abort();
+    return 0;
 }
 
 void colo_checkpoint_delay_set(void)
diff --git a/stubs/meson.build b/stubs/meson.build
index 8412cad..a56645e 100644
--- a/stubs/meson.build
+++ b/stubs/meson.build
@@ -46,6 +46,7 @@
 stub_ss.add(files('trace-control.c'))
 stub_ss.add(files('uuid.c'))
 stub_ss.add(files('colo.c'))
+stub_ss.add(files('colo-compare.c'))
 stub_ss.add(files('vmstate.c'))
 stub_ss.add(files('vm-stop.c'))
 stub_ss.add(files('win32-kbd-hook.c'))
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index f6a88e5..5182ed0 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -1480,7 +1480,6 @@
                                      qdev_prop_allow_set_link_before_realize,
                                      OBJ_PROP_LINK_STRONG);
         }
-        cpu->has_mte = true;
     }
 #endif
 }
@@ -1617,7 +1616,7 @@
         }
         if (cpu->tag_memory) {
             error_setg(errp,
-                       "Cannot enable %s when guest CPUs has tag memory enabled",
+                       "Cannot enable %s when guest CPUs has MTE enabled",
                        current_accel_name());
             return;
         }
@@ -1997,10 +1996,10 @@
     }
 
 #ifndef CONFIG_USER_ONLY
-    if (!cpu->has_mte && cpu_isar_feature(aa64_mte, cpu)) {
+    if (cpu->tag_memory == NULL && cpu_isar_feature(aa64_mte, cpu)) {
         /*
-         * Disable the MTE feature bits if we do not have the feature
-         * setup by the machine.
+         * Disable the MTE feature bits if we do not have tag-memory
+         * provided by the machine.
          */
         cpu->isar.id_aa64pfr1 =
             FIELD_DP64(cpu->isar.id_aa64pfr1, ID_AA64PFR1, MTE, 0);
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index c3463e3..d469a26 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -935,9 +935,6 @@
      */
     uint32_t psci_conduit;
 
-    /* CPU has Memory Tag Extension */
-    bool has_mte;
-
     /* For v8M, initial value of the Secure VTOR */
     uint32_t init_svtor;
     /* For v8M, initial value of the Non-secure VTOR */
@@ -1056,7 +1053,6 @@
     bool prop_pauth;
     bool prop_pauth_impdef;
     bool prop_lpa2;
-    OnOffAuto prop_mte;
 
     /* DCZ blocksize, in log_2(words), ie low 4 bits of DCZID_EL0 */
     uint32_t dcz_blocksize;
diff --git a/target/arm/kvm.c b/target/arm/kvm.c
index 9553488..84da493 100644
--- a/target/arm/kvm.c
+++ b/target/arm/kvm.c
@@ -31,7 +31,6 @@
 #include "hw/boards.h"
 #include "hw/irq.h"
 #include "qemu/log.h"
-#include "migration/blocker.h"
 
 const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
     KVM_CAP_LAST_INFO
@@ -1065,37 +1064,3 @@
 void kvm_arch_accel_class_init(ObjectClass *oc)
 {
 }
-
-void kvm_arm_enable_mte(Object *cpuobj, Error **errp)
-{
-    static bool tried_to_enable;
-    static bool succeeded_to_enable;
-    Error *mte_migration_blocker = NULL;
-    int ret;
-
-    if (!tried_to_enable) {
-        /*
-         * MTE on KVM is enabled on a per-VM basis (and retrying doesn't make
-         * sense), and we only want a single migration blocker as well.
-         */
-        tried_to_enable = true;
-
-        ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_MTE, 0);
-        if (ret) {
-            error_setg_errno(errp, -ret, "Failed to enable KVM_CAP_ARM_MTE");
-            return;
-        }
-
-        /* TODO: add proper migration support with MTE enabled */
-        error_setg(&mte_migration_blocker,
-                   "Live migration disabled due to MTE enabled");
-        if (migrate_add_blocker(mte_migration_blocker, errp)) {
-            error_free(mte_migration_blocker);
-            return;
-        }
-        succeeded_to_enable = true;
-    }
-    if (succeeded_to_enable) {
-        object_property_set_bool(cpuobj, "has_mte", true, NULL);
-    }
-}
diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
index 1893f38..810db33 100644
--- a/target/arm/kvm64.c
+++ b/target/arm/kvm64.c
@@ -756,11 +756,6 @@
     return kvm_check_extension(kvm_state, KVM_CAP_STEAL_TIME);
 }
 
-bool kvm_arm_mte_supported(void)
-{
-    return kvm_check_extension(kvm_state, KVM_CAP_ARM_MTE);
-}
-
 QEMU_BUILD_BUG_ON(KVM_ARM64_SVE_VQ_MIN != 1);
 
 uint32_t kvm_arm_sve_get_vls(CPUState *cs)
diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h
index 2083547..330fbe5 100644
--- a/target/arm/kvm_arm.h
+++ b/target/arm/kvm_arm.h
@@ -314,13 +314,6 @@
 bool kvm_arm_sve_supported(void);
 
 /**
- * kvm_arm_mte_supported:
- *
- * Returns: true if KVM can enable MTE, and false otherwise.
- */
-bool kvm_arm_mte_supported(void);
-
-/**
  * kvm_arm_get_max_vm_ipa_size:
  * @ms: Machine state handle
  * @fixed_ipa: True when the IPA limit is fixed at 40. This is the case
@@ -384,8 +377,6 @@
 
 int kvm_arm_set_irq(int cpu, int irqtype, int irq, int level);
 
-void kvm_arm_enable_mte(Object *cpuobj, Error **errp);
-
 #else
 
 /*
@@ -412,11 +403,6 @@
     return false;
 }
 
-static inline bool kvm_arm_mte_supported(void)
-{
-    return false;
-}
-
 /*
  * These functions should never actually be called without KVM support.
  */
@@ -465,11 +451,6 @@
     g_assert_not_reached();
 }
 
-static inline void kvm_arm_enable_mte(Object *cpuobj, Error **errp)
-{
-    g_assert_not_reached();
-}
-
 #endif
 
 static inline const char *gic_class_name(void)
diff --git a/target/hexagon/README b/target/hexagon/README
index ebafc78..4381117 100644
--- a/target/hexagon/README
+++ b/target/hexagon/README
@@ -4,10 +4,10 @@
 image processing, machine learning, and other workloads.
 
 The following versions of the Hexagon core are supported
-    Scalar core: v67
-    https://developer.qualcomm.com/downloads/qualcomm-hexagon-v67-programmer-s-reference-manual
-    HVX extension: v66
-    https://developer.qualcomm.com/downloads/qualcomm-hexagon-v66-hvx-programmer-s-reference-manual
+    Scalar core: v73
+    https://developer.qualcomm.com/downloads/qualcomm-hexagon-v73-programmers-reference-manual-rev-aa
+    HVX extension: v73
+    https://developer.qualcomm.com/downloads/qualcomm-hexagon-v73-hvx-programmers-reference-manual-rev-aa
 
 We presented an overview of the project at the 2019 KVM Forum.
     https://kvmforum2019.sched.com/event/Tmwc/qemu-hexagon-automatic-translation-of-the-isa-manual-pseudcode-to-tiny-code-instructions-of-a-vliw-architecture-niccolo-izzo-revng-taylor-simpson-qualcomm-innovation-center
@@ -87,7 +87,7 @@
         TCGv RsV = hex_gpr[insn->regno[1]];
         TCGv RtV = hex_gpr[insn->regno[2]];
         gen_helper_A2_add(RdV, cpu_env, RsV, RtV);
-        gen_log_reg_write(RdN, RdV);
+        gen_log_reg_write(ctx, RdN, RdV);
     }
 
 helper_funcs_generated.c.inc
@@ -186,7 +186,7 @@
 these functions record the writes to registers by calling ctx_log_*.  During
 gen_start_packet, we invoke the analyze_<tag> function for each instruction in
 the packet, and we mark the implicit writes.  After the analysis is performed,
-we initialize hex_new_value for each of the predicated assignments.
+we initialize the result register for each of the predicated assignments.
 
 In addition to instruction semantics, we use a generator to create the decode
 tree.  This generation is also a two step process.  The first step is to run
@@ -304,4 +304,4 @@
         At the start of execution of a packet for a given PC
             br helper_debug_start_packet if env->gpr[41] == 0xdeadbeef
         At the end of execution of a packet for a given PC
-            br helper_debug_commit_end if env->this_PC == 0xdeadbeef
+            br helper_debug_commit_end if this_PC == 0xdeadbeef
diff --git a/target/hexagon/arch.c b/target/hexagon/arch.c
index da79b41..d053d68 100644
--- a/target/hexagon/arch.c
+++ b/target/hexagon/arch.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -224,6 +224,7 @@
 
 void arch_fpop_end(CPUHexagonState *env)
 {
+    const bool pkt_need_commit = true;
     int flags = get_float_exception_flags(&env->fp_status);
     if (flags != 0) {
         SOFTFLOAT_TEST_FLAG(float_flag_inexact, FPINPF, FPINPE);
diff --git a/target/hexagon/attribs_def.h.inc b/target/hexagon/attribs_def.h.inc
index 9874d16..21d457f 100644
--- a/target/hexagon/attribs_def.h.inc
+++ b/target/hexagon/attribs_def.h.inc
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -52,6 +52,12 @@
 DEF_ATTRIB(REGWRSIZE_8B, "Memory width is 8 bytes", "", "")
 DEF_ATTRIB(MEMLIKE, "Memory-like instruction", "", "")
 DEF_ATTRIB(MEMLIKE_PACKET_RULES, "follows Memory-like packet rules", "", "")
+DEF_ATTRIB(RELEASE, "Releases a lock", "", "")
+DEF_ATTRIB(ACQUIRE, "Acquires a lock", "", "")
+
+DEF_ATTRIB(RLS_INNER, "Store release inner visibility", "", "")
+DEF_ATTRIB(RLS_ALL_THREAD, "Store release among all threads", "", "")
+DEF_ATTRIB(RLS_SAME_THREAD, "Store release with the same thread", "", "")
 
 /* V6 Vector attributes */
 DEF_ATTRIB(CVI, "Executes on the HVX extension", "", "")
@@ -63,23 +69,27 @@
 DEF_ATTRIB(CVI_VX, "Multiply instruction executes on HVX", "", "")
 DEF_ATTRIB(CVI_VX_DV, "Double vector multiply insn executes on HVX", "", "")
 DEF_ATTRIB(CVI_VS, "Shift instruction executes on HVX", "", "")
+DEF_ATTRIB(CVI_VS_3SRC, "This shift needs to borrow a source register", "", "")
 DEF_ATTRIB(CVI_VS_VX, "Permute/shift and multiply insn executes on HVX", "", "")
 DEF_ATTRIB(CVI_VA, "ALU instruction executes on HVX", "", "")
 DEF_ATTRIB(CVI_VA_DV, "Double vector alu instruction executes on HVX", "", "")
 DEF_ATTRIB(CVI_4SLOT, "Consumes all the vector execution resources", "", "")
 DEF_ATTRIB(CVI_TMP, "Transient Memory Load not written to register", "", "")
+DEF_ATTRIB(CVI_REMAP, "Register Renaming not written to register file", "", "")
 DEF_ATTRIB(CVI_GATHER, "CVI Gather operation", "", "")
 DEF_ATTRIB(CVI_SCATTER, "CVI Scatter operation", "", "")
 DEF_ATTRIB(CVI_SCATTER_RELEASE, "CVI Store Release for scatter", "", "")
 DEF_ATTRIB(CVI_TMP_DST, "CVI instruction that doesn't write a register", "", "")
 DEF_ATTRIB(CVI_SLOT23, "Can execute in slot 2 or slot 3 (HVX)", "", "")
 
+DEF_ATTRIB(VTCM_ALLBANK_ACCESS, "Allocates in all VTCM schedulers.", "", "")
 
 /* Change-of-flow attributes */
 DEF_ATTRIB(JUMP, "Jump-type instruction", "", "")
 DEF_ATTRIB(INDIRECT, "Absolute register jump", "", "")
 DEF_ATTRIB(CALL, "Function call instruction", "", "")
 DEF_ATTRIB(COF, "Change-of-flow instruction", "", "")
+DEF_ATTRIB(HINTED_COF, "This instruction is a hinted change-of-flow", "", "")
 DEF_ATTRIB(CONDEXEC, "May be cancelled by a predicate", "", "")
 DEF_ATTRIB(DOTNEWVALUE, "Uses a register value generated in this pkt", "", "")
 DEF_ATTRIB(NEWCMPJUMP, "Compound compare and jump", "", "")
@@ -102,6 +112,10 @@
 DEF_ATTRIB(IMPLICIT_WRITES_P2, "Writes Predicate 1", "", "UREG.P2")
 DEF_ATTRIB(IMPLICIT_WRITES_P3, "May write Predicate 3", "", "UREG.P3")
 DEF_ATTRIB(IMPLICIT_READS_PC, "Reads the PC register", "", "")
+DEF_ATTRIB(IMPLICIT_READS_P0, "Reads the P0 register", "", "")
+DEF_ATTRIB(IMPLICIT_READS_P1, "Reads the P1 register", "", "")
+DEF_ATTRIB(IMPLICIT_READS_P2, "Reads the P2 register", "", "")
+DEF_ATTRIB(IMPLICIT_READS_P3, "Reads the P3 register", "", "")
 DEF_ATTRIB(IMPLICIT_WRITES_USR, "May write USR", "", "")
 DEF_ATTRIB(WRITES_PRED_REG, "Writes a predicate register", "", "")
 DEF_ATTRIB(COMMUTES, "The operation is communitive", "", "")
@@ -140,6 +154,8 @@
 DEF_ATTRIB(ICINVA, "icinva", "", "")
 DEF_ATTRIB(DCCLEANINVA, "dccleaninva", "", "")
 
+DEF_ATTRIB(NO_INTRINSIC, "Don't generate an intrisic", "", "")
+
 /* Documentation Notes */
 DEF_ATTRIB(NOTE_CONDITIONAL, "can be conditionally executed", "", "")
 DEF_ATTRIB(NOTE_NEWVAL_SLOT0, "New-value oprnd must execute on slot 0", "", "")
@@ -148,7 +164,11 @@
 DEF_ATTRIB(NOTE_AXOK, "May only be grouped with ALU32 or non-FP XTYPE.", "", "")
 DEF_ATTRIB(NOTE_LATEPRED, "The predicate can not be used as a .new", "", "")
 DEF_ATTRIB(NOTE_NVSLOT0, "Can execute only in slot 0 (ST)", "", "")
+DEF_ATTRIB(NOTE_NOVP, "Cannot be paired with a HVX permute instruction", "", "")
+DEF_ATTRIB(NOTE_VA_UNARY, "Combined with HVX ALU op (must be unary)", "", "")
 
+/* V6 MMVector Notes for Documentation */
+DEF_ATTRIB(NOTE_SHIFT_RESOURCE, "Uses the HVX shift resource.", "", "")
 /* Restrictions to make note of */
 DEF_ATTRIB(RESTRICT_NOSLOT1_STORE, "Packet must not have slot 1 store", "", "")
 DEF_ATTRIB(RESTRICT_LATEPRED, "Predicate can not be used as a .new.", "", "")
diff --git a/target/hexagon/cpu.c b/target/hexagon/cpu.c
index ab40cfc..f155936 100644
--- a/target/hexagon/cpu.c
+++ b/target/hexagon/cpu.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -24,9 +24,32 @@
 #include "hw/qdev-properties.h"
 #include "fpu/softfloat-helpers.h"
 #include "tcg/tcg.h"
+#include "exec/gdbstub.h"
 
-static void hexagon_v67_cpu_init(Object *obj)
+static void hexagon_v67_cpu_init(Object *obj) { }
+static void hexagon_v68_cpu_init(Object *obj) { }
+static void hexagon_v69_cpu_init(Object *obj) { }
+static void hexagon_v71_cpu_init(Object *obj) { }
+static void hexagon_v73_cpu_init(Object *obj) { }
+
+static void hexagon_cpu_list_entry(gpointer data, gpointer user_data)
 {
+    ObjectClass *oc = data;
+    char *name = g_strdup(object_class_get_name(oc));
+    if (g_str_has_suffix(name, HEXAGON_CPU_TYPE_SUFFIX)) {
+        name[strlen(name) - strlen(HEXAGON_CPU_TYPE_SUFFIX)] = '\0';
+    }
+    qemu_printf("  %s\n", name);
+    g_free(name);
+}
+
+void hexagon_cpu_list(void)
+{
+    GSList *list;
+    list = object_class_get_list_sorted(TYPE_HEXAGON_CPU, false);
+    qemu_printf("Available CPUs:\n");
+    g_slist_foreach(list, hexagon_cpu_list_entry, NULL);
+    g_slist_free(list);
 }
 
 static ObjectClass *hexagon_cpu_class_by_name(const char *cpu_model)
@@ -52,6 +75,8 @@
 static Property hexagon_lldb_stack_adjust_property =
     DEFINE_PROP_UNSIGNED("lldb-stack-adjust", HexagonCPU, lldb_stack_adjust,
                          0, qdev_prop_uint32, target_ulong);
+static Property hexagon_short_circuit_property =
+    DEFINE_PROP_BOOL("short-circuit", HexagonCPU, short_circuit, true);
 
 const char * const hexagon_regnames[TOTAL_PER_THREAD_REGS] = {
    "r0", "r1",  "r2",  "r3",  "r4",   "r5",  "r6",  "r7",
@@ -315,6 +340,11 @@
         return;
     }
 
+    gdb_register_coprocessor(cs, hexagon_hvx_gdb_read_register,
+                             hexagon_hvx_gdb_write_register,
+                             NUM_VREGS + NUM_QREGS,
+                             "hexagon-hvx.xml", 0);
+
     qemu_init_vcpu(cs);
     cpu_reset(cs);
 
@@ -328,6 +358,7 @@
     cpu_set_cpustate_pointers(cpu);
     qdev_property_add_static(DEVICE(obj), &hexagon_lldb_compat_property);
     qdev_property_add_static(DEVICE(obj), &hexagon_lldb_stack_adjust_property);
+    qdev_property_add_static(DEVICE(obj), &hexagon_short_circuit_property);
 }
 
 #include "hw/core/tcg-cpu-ops.h"
@@ -358,8 +389,9 @@
     cc->get_pc = hexagon_cpu_get_pc;
     cc->gdb_read_register = hexagon_gdb_read_register;
     cc->gdb_write_register = hexagon_gdb_write_register;
-    cc->gdb_num_core_regs = TOTAL_PER_THREAD_REGS + NUM_VREGS + NUM_QREGS;
+    cc->gdb_num_core_regs = TOTAL_PER_THREAD_REGS;
     cc->gdb_stop_before_watchpoint = true;
+    cc->gdb_core_xml_file = "hexagon-core.xml";
     cc->disas_set_info = hexagon_cpu_disas_set_info;
     cc->tcg_ops = &hexagon_tcg_ops;
 }
@@ -382,6 +414,10 @@
         .class_init = hexagon_cpu_class_init,
     },
     DEFINE_CPU(TYPE_HEXAGON_CPU_V67,              hexagon_v67_cpu_init),
+    DEFINE_CPU(TYPE_HEXAGON_CPU_V68,              hexagon_v68_cpu_init),
+    DEFINE_CPU(TYPE_HEXAGON_CPU_V69,              hexagon_v69_cpu_init),
+    DEFINE_CPU(TYPE_HEXAGON_CPU_V71,              hexagon_v71_cpu_init),
+    DEFINE_CPU(TYPE_HEXAGON_CPU_V73,              hexagon_v73_cpu_init),
 };
 
 DEFINE_TYPES(hexagon_cpu_type_infos)
diff --git a/target/hexagon/cpu.h b/target/hexagon/cpu.h
index 81b663e..bfcb105 100644
--- a/target/hexagon/cpu.h
+++ b/target/hexagon/cpu.h
@@ -43,6 +43,13 @@
 #define CPU_RESOLVING_TYPE TYPE_HEXAGON_CPU
 
 #define TYPE_HEXAGON_CPU_V67 HEXAGON_CPU_TYPE_NAME("v67")
+#define TYPE_HEXAGON_CPU_V68 HEXAGON_CPU_TYPE_NAME("v68")
+#define TYPE_HEXAGON_CPU_V69 HEXAGON_CPU_TYPE_NAME("v69")
+#define TYPE_HEXAGON_CPU_V71 HEXAGON_CPU_TYPE_NAME("v71")
+#define TYPE_HEXAGON_CPU_V73 HEXAGON_CPU_TYPE_NAME("v73")
+
+void hexagon_cpu_list(void);
+#define cpu_list hexagon_cpu_list
 
 #define MMU_USER_IDX 0
 
@@ -78,28 +85,21 @@
 typedef struct CPUArchState {
     target_ulong gpr[TOTAL_PER_THREAD_REGS];
     target_ulong pred[NUM_PREGS];
-    target_ulong branch_taken;
 
     /* For comparing with LLDB on target - see adjust_stack_ptrs function */
     target_ulong last_pc_dumped;
     target_ulong stack_start;
 
     uint8_t slot_cancelled;
-    target_ulong new_value[TOTAL_PER_THREAD_REGS];
+    target_ulong new_value_usr;
 
     /*
      * Only used when HEX_DEBUG is on, but unconditionally included
      * to reduce recompile time when turning HEX_DEBUG on/off.
      */
-    target_ulong this_PC;
     target_ulong reg_written[TOTAL_PER_THREAD_REGS];
 
-    target_ulong new_pred_value[NUM_PREGS];
-    target_ulong pred_written;
-
     MemLog mem_log_stores[STORES_MAX];
-    target_ulong pkt_has_store_s1;
-    target_ulong dczero_addr;
 
     float_status fp_status;
 
@@ -146,6 +146,7 @@
 
     bool lldb_compat;
     target_ulong lldb_stack_adjust;
+    bool short_circuit;
 };
 
 #include "cpu_bits.h"
diff --git a/target/hexagon/decode.c b/target/hexagon/decode.c
index 041c8de..946c55c 100644
--- a/target/hexagon/decode.c
+++ b/target/hexagon/decode.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -797,7 +797,26 @@
     return bits == 0x2;
 }
 
-static void
+static bool has_valid_slot_assignment(Packet *pkt)
+{
+    int used_slots = 0;
+    for (int i = 0; i < pkt->num_insns; i++) {
+        int slot_mask;
+        Insn *insn = &pkt->insn[i];
+        if (decode_opcode_ends_loop(insn->opcode)) {
+            /* We overload slot 0 for endloop. */
+            continue;
+        }
+        slot_mask = 1 << insn->slot;
+        if (used_slots & slot_mask) {
+            return false;
+        }
+        used_slots |= slot_mask;
+    }
+    return true;
+}
+
+static bool
 decode_set_slot_number(Packet *pkt)
 {
     int slot;
@@ -886,6 +905,8 @@
         /* Then push it to slot0 */
         pkt->insn[slot1_iidx].slot = 0;
     }
+
+    return has_valid_slot_assignment(pkt);
 }
 
 /*
@@ -961,8 +982,11 @@
     decode_apply_extenders(pkt);
     if (!disas_only) {
         decode_remove_extenders(pkt);
+        if (!decode_set_slot_number(pkt)) {
+            /* Invalid packet */
+            return 0;
+        }
     }
-    decode_set_slot_number(pkt);
     decode_fill_newvalue_regno(pkt);
 
     if (pkt->pkt_has_hvx) {
diff --git a/target/hexagon/gdbstub.c b/target/hexagon/gdbstub.c
index 46083da..54d37e0 100644
--- a/target/hexagon/gdbstub.c
+++ b/target/hexagon/gdbstub.c
@@ -25,6 +25,14 @@
     HexagonCPU *cpu = HEXAGON_CPU(cs);
     CPUHexagonState *env = &cpu->env;
 
+    if (n == HEX_REG_P3_0_ALIASED) {
+        uint32_t p3_0 = 0;
+        for (int i = 0; i < NUM_PREGS; i++) {
+            p3_0 = deposit32(p3_0, i * 8, 8, env->pred[i]);
+        }
+        return gdb_get_regl(mem_buf, p3_0);
+    }
+
     if (n < TOTAL_PER_THREAD_REGS) {
         return gdb_get_regl(mem_buf, env->gpr[n]);
     }
@@ -37,6 +45,14 @@
     HexagonCPU *cpu = HEXAGON_CPU(cs);
     CPUHexagonState *env = &cpu->env;
 
+    if (n == HEX_REG_P3_0_ALIASED) {
+        uint32_t p3_0 = ldtul_p(mem_buf);
+        for (int i = 0; i < NUM_PREGS; i++) {
+            env->pred[i] = extract32(p3_0, i * 8, 8);
+        }
+        return sizeof(target_ulong);
+    }
+
     if (n < TOTAL_PER_THREAD_REGS) {
         env->gpr[n] = ldtul_p(mem_buf);
         return sizeof(target_ulong);
@@ -44,3 +60,71 @@
 
     g_assert_not_reached();
 }
+
+static int gdb_get_vreg(CPUHexagonState *env, GByteArray *mem_buf, int n)
+{
+    int total = 0;
+    int i;
+    for (i = 0; i < ARRAY_SIZE(env->VRegs[n].uw); i++) {
+        total += gdb_get_regl(mem_buf, env->VRegs[n].uw[i]);
+    }
+    return total;
+}
+
+static int gdb_get_qreg(CPUHexagonState *env, GByteArray *mem_buf, int n)
+{
+    int total = 0;
+    int i;
+    for (i = 0; i < ARRAY_SIZE(env->QRegs[n].uw); i++) {
+        total += gdb_get_regl(mem_buf, env->QRegs[n].uw[i]);
+    }
+    return total;
+}
+
+int hexagon_hvx_gdb_read_register(CPUHexagonState *env, GByteArray *mem_buf, int n)
+{
+    if (n < NUM_VREGS) {
+        return gdb_get_vreg(env, mem_buf, n);
+    }
+    n -= NUM_VREGS;
+
+    if (n < NUM_QREGS) {
+        return gdb_get_qreg(env, mem_buf, n);
+    }
+
+    g_assert_not_reached();
+}
+
+static int gdb_put_vreg(CPUHexagonState *env, uint8_t *mem_buf, int n)
+{
+    int i;
+    for (i = 0; i < ARRAY_SIZE(env->VRegs[n].uw); i++) {
+        env->VRegs[n].uw[i] = ldtul_p(mem_buf);
+        mem_buf += 4;
+    }
+    return MAX_VEC_SIZE_BYTES;
+}
+
+static int gdb_put_qreg(CPUHexagonState *env, uint8_t *mem_buf, int n)
+{
+    int i;
+    for (i = 0; i < ARRAY_SIZE(env->QRegs[n].uw); i++) {
+        env->QRegs[n].uw[i] = ldtul_p(mem_buf);
+        mem_buf += 4;
+    }
+    return MAX_VEC_SIZE_BYTES / 8;
+}
+
+int hexagon_hvx_gdb_write_register(CPUHexagonState *env, uint8_t *mem_buf, int n)
+{
+   if (n < NUM_VREGS) {
+        return gdb_put_vreg(env, mem_buf, n);
+    }
+    n -= NUM_VREGS;
+
+    if (n < NUM_QREGS) {
+        return gdb_put_qreg(env, mem_buf, n);
+    }
+
+    g_assert_not_reached();
+}
diff --git a/target/hexagon/gen_analyze_funcs.py b/target/hexagon/gen_analyze_funcs.py
index c74443d..00868cc 100755
--- a/target/hexagon/gen_analyze_funcs.py
+++ b/target/hexagon/gen_analyze_funcs.py
@@ -35,47 +35,55 @@
     predicated = "true" if is_predicated(tag) else "false"
     if regtype == "R":
         if regid in {"ss", "tt"}:
-            f.write(f"//    const int {regN} = insn->regno[{regno}];\n")
+            f.write(f"    const int {regN} = insn->regno[{regno}];\n")
+            f.write(f"    ctx_log_reg_read_pair(ctx, {regN});\n")
         elif regid in {"dd", "ee", "xx", "yy"}:
             f.write(f"    const int {regN} = insn->regno[{regno}];\n")
             f.write(f"    ctx_log_reg_write_pair(ctx, {regN}, {predicated});\n")
         elif regid in {"s", "t", "u", "v"}:
-            f.write(f"//    const int {regN} = insn->regno[{regno}];\n")
+            f.write(f"    const int {regN} = insn->regno[{regno}];\n")
+            f.write(f"    ctx_log_reg_read(ctx, {regN});\n")
         elif regid in {"d", "e", "x", "y"}:
             f.write(f"    const int {regN} = insn->regno[{regno}];\n")
             f.write(f"    ctx_log_reg_write(ctx, {regN}, {predicated});\n")
         else:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     elif regtype == "P":
         if regid in {"s", "t", "u", "v"}:
-            f.write(f"//    const int {regN} = insn->regno[{regno}];\n")
+            f.write(f"    const int {regN} = insn->regno[{regno}];\n")
+            f.write(f"    ctx_log_pred_read(ctx, {regN});\n")
         elif regid in {"d", "e", "x"}:
             f.write(f"    const int {regN} = insn->regno[{regno}];\n")
             f.write(f"    ctx_log_pred_write(ctx, {regN});\n")
         else:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     elif regtype == "C":
         if regid == "ss":
             f.write(
-                f"//    const int {regN} = insn->regno[{regno}] " "+ HEX_REG_SA0;\n"
+                f"    const int {regN} = insn->regno[{regno}] "
+                "+ HEX_REG_SA0;\n"
             )
+            f.write(f"    ctx_log_reg_read_pair(ctx, {regN});\n")
         elif regid == "dd":
             f.write(f"    const int {regN} = insn->regno[{regno}] " "+ HEX_REG_SA0;\n")
             f.write(f"    ctx_log_reg_write_pair(ctx, {regN}, {predicated});\n")
         elif regid == "s":
             f.write(
-                f"//    const int {regN} = insn->regno[{regno}] " "+ HEX_REG_SA0;\n"
+                f"    const int {regN} = insn->regno[{regno}] "
+                "+ HEX_REG_SA0;\n"
             )
+            f.write(f"    ctx_log_reg_read(ctx, {regN});\n")
         elif regid == "d":
             f.write(f"    const int {regN} = insn->regno[{regno}] " "+ HEX_REG_SA0;\n")
             f.write(f"    ctx_log_reg_write(ctx, {regN}, {predicated});\n")
         else:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     elif regtype == "M":
         if regid == "u":
-            f.write(f"//    const int {regN} = insn->regno[{regno}];\n")
+            f.write(f"    const int {regN} = insn->regno[{regno}];\n")
+            f.write(f"    ctx_log_reg_read(ctx, {regN});\n")
         else:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     elif regtype == "V":
         newv = "EXT_DFL"
         if hex_common.is_new_result(tag):
@@ -88,22 +96,25 @@
                 f"    ctx_log_vreg_write_pair(ctx, {regN}, {newv}, " f"{predicated});\n"
             )
         elif regid in {"uu", "vv"}:
-            f.write(f"//    const int {regN} = insn->regno[{regno}];\n")
+            f.write(f"    const int {regN} = insn->regno[{regno}];\n")
+            f.write(f"    ctx_log_vreg_read_pair(ctx, {regN});\n")
         elif regid in {"s", "u", "v", "w"}:
-            f.write(f"//    const int {regN} = insn->regno[{regno}];\n")
+            f.write(f"    const int {regN} = insn->regno[{regno}];\n")
+            f.write(f"    ctx_log_vreg_read(ctx, {regN});\n")
         elif regid in {"d", "x", "y"}:
             f.write(f"    const int {regN} = insn->regno[{regno}];\n")
             f.write(f"    ctx_log_vreg_write(ctx, {regN}, {newv}, " f"{predicated});\n")
         else:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     elif regtype == "Q":
         if regid in {"d", "e", "x"}:
             f.write(f"    const int {regN} = insn->regno[{regno}];\n")
             f.write(f"    ctx_log_qreg_write(ctx, {regN});\n")
         elif regid in {"s", "t", "u", "v"}:
-            f.write(f"//    const int {regN} = insn->regno[{regno}];\n")
+            f.write(f"    const int {regN} = insn->regno[{regno}];\n")
+            f.write(f"    ctx_log_qreg_read(ctx, {regN});\n")
         else:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     elif regtype == "G":
         if regid in {"dd"}:
             f.write(f"//    const int {regN} = insn->regno[{regno}];\n")
@@ -114,7 +125,7 @@
         elif regid in {"s"}:
             f.write(f"//    const int {regN} = insn->regno[{regno}];\n")
         else:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     elif regtype == "S":
         if regid in {"dd"}:
             f.write(f"//    const int {regN} = insn->regno[{regno}];\n")
@@ -125,30 +136,33 @@
         elif regid in {"s"}:
             f.write(f"//    const int {regN} = insn->regno[{regno}];\n")
         else:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     else:
-        print("Bad register parse: ", regtype, regid)
+        hex_common.bad_register(regtype, regid)
 
 
 def analyze_opn_new(f, tag, regtype, regid, regno):
     regN = f"{regtype}{regid}N"
     if regtype == "N":
         if regid in {"s", "t"}:
-            f.write(f"//    const int {regN} = insn->regno[{regno}];\n")
+            f.write(f"    const int {regN} = insn->regno[{regno}];\n")
+            f.write(f"    ctx_log_reg_read(ctx, {regN});\n")
         else:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     elif regtype == "P":
         if regid in {"t", "u", "v"}:
-            f.write(f"//    const int {regN} = insn->regno[{regno}];\n")
+            f.write(f"    const int {regN} = insn->regno[{regno}];\n")
+            f.write(f"    ctx_log_pred_read(ctx, {regN});\n")
         else:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     elif regtype == "O":
         if regid == "s":
-            f.write(f"//    const int {regN} = insn->regno[{regno}];\n")
+            f.write(f"    const int {regN} = insn->regno[{regno}];\n")
+            f.write(f"    ctx_log_vreg_read(ctx, {regN});\n")
         else:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     else:
-        print("Bad register parse: ", regtype, regid)
+        hex_common.bad_register(regtype, regid)
 
 
 def analyze_opn(f, tag, regtype, regid, toss, numregs, i):
@@ -160,9 +174,9 @@
         elif hex_common.is_new_val(regtype, regid, tag):
             analyze_opn_new(f, tag, regtype, regid, i)
         else:
-            print("Bad register parse: ", regtype, regid, toss, numregs)
+            hex_common.bad_register(regtype, regid, toss, numregs)
     else:
-        print("Bad register parse: ", regtype, regid, toss, numregs)
+        hex_common.bad_register(regtype, regid, toss, numregs)
 
 
 ##
@@ -174,8 +188,10 @@
 ##         Insn *insn G_GNUC_UNUSED = ctx->insn;
 ##         const int RdN = insn->regno[0];
 ##         ctx_log_reg_write(ctx, RdN, false);
-##     //    const int RsN = insn->regno[1];
-##     //    const int RtN = insn->regno[2];
+##         const int RsN = insn->regno[1];
+##         ctx_log_reg_read(ctx, RsN);
+##         const int RtN = insn->regno[2];
+##         ctx_log_reg_read(ctx, RtN);
 ##     }
 ##
 def gen_analyze_func(f, tag, regs, imms):
@@ -193,8 +209,11 @@
     has_generated_helper = not hex_common.skip_qemu_helper(
         tag
     ) and not hex_common.is_idef_parser_enabled(tag)
-    if has_generated_helper and "A_SCALAR_LOAD" in hex_common.attribdict[tag]:
-        f.write("    ctx->need_pkt_has_store_s1 = true;\n")
+
+    ## Mark HVX instructions with generated helpers
+    if (has_generated_helper and
+        "A_CVI" in hex_common.attribdict[tag]):
+        f.write("    ctx->has_hvx_helper = true;\n")
 
     f.write("}\n\n")
 
diff --git a/target/hexagon/gen_helper_funcs.py b/target/hexagon/gen_helper_funcs.py
index c73d792..e80550f 100755
--- a/target/hexagon/gen_helper_funcs.py
+++ b/target/hexagon/gen_helper_funcs.py
@@ -87,9 +87,9 @@
         elif hex_common.is_new_val(regtype, regid, tag):
             gen_helper_arg_new(f, regtype, regid, i)
         else:
-            print("Bad register parse: ", regtype, regid, toss, numregs)
+            hex_common.bad_register(regtype, regid, toss, numregs)
     else:
-        print("Bad register parse: ", regtype, regid, toss, numregs)
+        hex_common.bad_register(regtype, regid, toss, numregs)
 
 
 def gen_helper_arg_imm(f, immlett):
@@ -135,7 +135,7 @@
         else:
             gen_helper_dest_decl(f, regtype, regid, i)
     else:
-        print("Bad register parse: ", regtype, regid, toss, numregs)
+        hex_common.bad_register(regtype, regid, toss, numregs)
 
 
 def gen_helper_src_var_ext(f, regtype, regid):
@@ -185,7 +185,7 @@
         else:
             gen_helper_return(f, regtype, regid, i)
     else:
-        print("Bad register parse: ", regtype, regid, toss, numregs)
+        hex_common.bad_register(regtype, regid, toss, numregs)
 
 
 ##
@@ -239,7 +239,7 @@
                     else:
                         gen_helper_return_type(f, regtype, regid, i)
                 else:
-                    print("Bad register parse: ", regtype, regid, toss, numregs)
+                    hex_common.bad_register(regtype, regid, toss, numregs)
             i += 1
 
         if numscalarresults == 0:
@@ -262,7 +262,7 @@
                         # This is the return value of the function
                         continue
                 else:
-                    print("Bad register parse: ", regtype, regid, toss, numregs)
+                    hex_common.bad_register(regtype, regid, toss, numregs)
                 i += 1
 
         ## For conditional instructions, we pass in the destination register
@@ -287,6 +287,8 @@
 
         if hex_common.need_pkt_has_multi_cof(tag):
             f.write(", uint32_t pkt_has_multi_cof")
+        if (hex_common.need_pkt_need_commit(tag)):
+            f.write(", uint32_t pkt_need_commit")
 
         if hex_common.need_PC(tag):
             if i > 0:
@@ -301,7 +303,7 @@
         if hex_common.need_slot(tag):
             if i > 0:
                 f.write(", ")
-            f.write("uint32_t slot")
+            f.write("uint32_t slotval")
             i += 1
         if hex_common.need_part1(tag):
             if i > 0:
@@ -327,7 +329,12 @@
                     if hex_common.is_hvx_reg(regtype):
                         gen_helper_src_var_ext(f, regtype, regid)
                 else:
-                    print("Bad register parse: ", regtype, regid, toss, numregs)
+                    hex_common.bad_register(regtype, regid, toss, numregs)
+
+        if hex_common.need_slot(tag):
+            if "A_LOAD" in hex_common.attribdict[tag]:
+                f.write("    bool pkt_has_store_s1 = slotval & 0x1;\n")
+            f.write("    uint32_t slot = slotval >> 1;\n")
 
         if "A_FPOP" in hex_common.attribdict[tag]:
             f.write("    arch_fpop_start(env);\n")
diff --git a/target/hexagon/gen_helper_protos.py b/target/hexagon/gen_helper_protos.py
index 187cd6e..3dedd76 100755
--- a/target/hexagon/gen_helper_protos.py
+++ b/target/hexagon/gen_helper_protos.py
@@ -52,7 +52,7 @@
     elif hex_common.is_single(regid):
         f.write(f", {def_helper_types[regtype]}")
     else:
-        print("Bad register parse: ", regtype, regid, toss, numregs)
+        hex_common.bad_register(regtype, regid, toss, numregs)
 
 
 ##
@@ -86,6 +86,8 @@
             def_helper_size = len(regs) + len(imms) + numscalarreadwrite + 1
             if hex_common.need_pkt_has_multi_cof(tag):
                 def_helper_size += 1
+            if hex_common.need_pkt_need_commit(tag):
+                def_helper_size += 1
             if hex_common.need_part1(tag):
                 def_helper_size += 1
             if hex_common.need_slot(tag):
@@ -103,6 +105,8 @@
             def_helper_size = len(regs) + len(imms) + numscalarreadwrite
             if hex_common.need_pkt_has_multi_cof(tag):
                 def_helper_size += 1
+            if hex_common.need_pkt_need_commit(tag):
+                def_helper_size += 1
             if hex_common.need_part1(tag):
                 def_helper_size += 1
             if hex_common.need_slot(tag):
@@ -156,10 +160,12 @@
         for immlett, bits, immshift in imms:
             f.write(", s32")
 
-        ## Add the arguments for the instruction pkt_has_multi_cof, slot and
-        ## part1 (if needed)
+        ## Add the arguments for the instruction pkt_has_multi_cof,
+        ## pkt_needs_commit, PC, next_PC, slot, and part1 (if needed)
         if hex_common.need_pkt_has_multi_cof(tag):
             f.write(", i32")
+        if hex_common.need_pkt_need_commit(tag):
+            f.write(', i32')
         if hex_common.need_PC(tag):
             f.write(", i32")
         if hex_common.helper_needs_next_PC(tag):
diff --git a/target/hexagon/gen_idef_parser_funcs.py b/target/hexagon/gen_idef_parser_funcs.py
index afe68bd..29160fc 100644
--- a/target/hexagon/gen_idef_parser_funcs.py
+++ b/target/hexagon/gen_idef_parser_funcs.py
@@ -103,12 +103,29 @@
                 continue
             if tag.startswith("V6_"):
                 continue
-            if tag.startswith("F"):
+            if ( tag.startswith("F") and
+                 tag not in {
+                     "F2_sfimm_p",
+                     "F2_sfimm_n",
+                     "F2_dfimm_p",
+                     "F2_dfimm_n",
+                     "F2_dfmpyll",
+                     "F2_dfmpylh"
+                 }):
                 continue
             if tag.endswith("_locked"):
                 continue
             if "A_COF" in hex_common.attribdict[tag]:
                 continue
+            if ( tag.startswith('R6_release_') ):
+                continue
+            ## Skip instructions that are incompatible with short-circuit
+            ## packet register writes
+            if ( tag == 'S2_insert' or
+                 tag == 'S2_insert_rp' or
+                 tag == 'S2_asr_r_svw_trun' or
+                 tag == 'A2_swiz' ):
+                continue
 
             regs = tagregs[tag]
             imms = tagimms[tag]
@@ -130,7 +147,7 @@
                 elif is_single_new:
                     arguments.append(f"{prefix}{regtype}{regid}N")
                 else:
-                    print("Bad register parse: ", regtype, regid, toss, numregs)
+                    hex_common.bad_register(regtype, regid, toss, numregs)
 
             for immlett, bits, immshift in imms:
                 arguments.append(hex_common.imm_name(immlett))
diff --git a/target/hexagon/gen_tcg.h b/target/hexagon/gen_tcg.h
index 329e7a1..d78d99d 100644
--- a/target/hexagon/gen_tcg.h
+++ b/target/hexagon/gen_tcg.h
@@ -501,6 +501,38 @@
     do { RsV = RsV; } while (0)
 
 /*
+ * allocframe(#uiV)
+ *     RxV == r29
+ */
+#define fGEN_TCG_S2_allocframe(SHORTCODE) \
+    gen_allocframe(ctx, RxV, uiV)
+
+/* sub-instruction version (no RxV, so handle it manually) */
+#define fGEN_TCG_SS2_allocframe(SHORTCODE) \
+    do { \
+        TCGv r29 = tcg_temp_new(); \
+        tcg_gen_mov_tl(r29, hex_gpr[HEX_REG_SP]); \
+        gen_allocframe(ctx, r29, uiV); \
+        gen_log_reg_write(ctx, HEX_REG_SP, r29); \
+    } while (0)
+
+/*
+ * Rdd32 = deallocframe(Rs32):raw
+ *     RddV == r31:30
+ *     RsV  == r30
+ */
+#define fGEN_TCG_L2_deallocframe(SHORTCODE) \
+    gen_deallocframe(ctx, RddV, RsV)
+
+/* sub-instruction version (no RddV/RsV, so handle it manually) */
+#define fGEN_TCG_SL2_deallocframe(SHORTCODE) \
+    do { \
+        TCGv_i64 r31_30 = tcg_temp_new_i64(); \
+        gen_deallocframe(ctx, r31_30, hex_gpr[HEX_REG_FP]); \
+        gen_log_reg_write_pair(ctx, HEX_REG_FP, r31_30); \
+    } while (0)
+
+/*
  * dealloc_return
  * Assembler mapped to
  * r31:30 = dealloc_return(r30):raw
@@ -515,7 +547,7 @@
     do { \
         TCGv_i64 RddV = get_result_gpr_pair(ctx, HEX_REG_FP); \
         gen_return(ctx, RddV, hex_gpr[HEX_REG_FP]); \
-        gen_log_reg_write_pair(HEX_REG_FP, RddV); \
+        gen_log_reg_write_pair(ctx, HEX_REG_FP, RddV); \
     } while (0)
 
 /*
@@ -549,9 +581,9 @@
 #define fGEN_TCG_SL2_return_f(SHORTCODE) \
     gen_cond_return_subinsn(ctx, TCG_COND_NE, hex_pred[0])
 #define fGEN_TCG_SL2_return_tnew(SHORTCODE) \
-    gen_cond_return_subinsn(ctx, TCG_COND_EQ, hex_new_pred_value[0])
+    gen_cond_return_subinsn(ctx, TCG_COND_EQ, ctx->new_pred_value[0])
 #define fGEN_TCG_SL2_return_fnew(SHORTCODE) \
-    gen_cond_return_subinsn(ctx, TCG_COND_NE, hex_new_pred_value[0])
+    gen_cond_return_subinsn(ctx, TCG_COND_NE, ctx->new_pred_value[0])
 
 /*
  * Mathematical operations with more than one definition require
@@ -560,7 +592,16 @@
 #define fGEN_TCG_A5_ACS(SHORTCODE) \
     do { \
         gen_helper_vacsh_pred(PeV, cpu_env, RxxV, RssV, RttV); \
-        gen_helper_vacsh_val(RxxV, cpu_env, RxxV, RssV, RttV); \
+        gen_helper_vacsh_val(RxxV, cpu_env, RxxV, RssV, RttV, \
+                             tcg_constant_tl(ctx->need_commit)); \
+    } while (0)
+
+#define fGEN_TCG_S2_cabacdecbin(SHORTCODE) \
+    do { \
+        TCGv p0 = tcg_temp_new(); \
+        gen_helper_cabacdecbin_pred(p0, RssV, RttV); \
+        gen_helper_cabacdecbin_val(RddV, RssV, RttV); \
+        gen_log_pred_write(ctx, 0, p0); \
     } while (0)
 
 /*
@@ -653,6 +694,8 @@
     gen_call(ctx, riV)
 #define fGEN_TCG_J2_callr(SHORTCODE) \
     gen_callr(ctx, RsV)
+#define fGEN_TCG_J2_callrh(SHORTCODE) \
+    gen_callr(ctx, RsV)
 
 #define fGEN_TCG_J2_callt(SHORTCODE) \
     gen_cond_call(ctx, PuV, TCG_COND_EQ, riV)
@@ -663,6 +706,27 @@
 #define fGEN_TCG_J2_callrf(SHORTCODE) \
     gen_cond_callr(ctx, TCG_COND_NE, PuV, RsV)
 
+#define fGEN_TCG_J2_loop0r(SHORTCODE) \
+    gen_loop0r(ctx, RsV, riV)
+#define fGEN_TCG_J2_loop1r(SHORTCODE) \
+    gen_loop1r(ctx, RsV, riV)
+#define fGEN_TCG_J2_loop0i(SHORTCODE) \
+    gen_loop0i(ctx, UiV, riV)
+#define fGEN_TCG_J2_loop1i(SHORTCODE) \
+    gen_loop1i(ctx, UiV, riV)
+#define fGEN_TCG_J2_ploop1sr(SHORTCODE) \
+    gen_ploopNsr(ctx, 1, RsV, riV)
+#define fGEN_TCG_J2_ploop1si(SHORTCODE) \
+    gen_ploopNsi(ctx, 1, UiV, riV)
+#define fGEN_TCG_J2_ploop2sr(SHORTCODE) \
+    gen_ploopNsr(ctx, 2, RsV, riV)
+#define fGEN_TCG_J2_ploop2si(SHORTCODE) \
+    gen_ploopNsi(ctx, 2, UiV, riV)
+#define fGEN_TCG_J2_ploop3sr(SHORTCODE) \
+    gen_ploopNsr(ctx, 3, RsV, riV)
+#define fGEN_TCG_J2_ploop3si(SHORTCODE) \
+    gen_ploopNsi(ctx, 3, UiV, riV)
+
 #define fGEN_TCG_J2_endloop0(SHORTCODE) \
     gen_endloop0(ctx)
 #define fGEN_TCG_J2_endloop1(SHORTCODE) \
@@ -847,10 +911,20 @@
 #define fGEN_TCG_J4_tstbit0_fp1_jump_t(SHORTCODE) \
     gen_cmpnd_tstbit0_jmp(ctx, 1, RsV, TCG_COND_NE, riV)
 
+/* p0 = cmp.eq(r0, #7) */
+#define fGEN_TCG_SA1_cmpeqi(SHORTCODE) \
+    do { \
+        TCGv p0 = tcg_temp_new(); \
+        gen_comparei(TCG_COND_EQ, p0, RsV, uiV); \
+        gen_log_pred_write(ctx, 0, p0); \
+    } while (0)
+
 #define fGEN_TCG_J2_jump(SHORTCODE) \
     gen_jump(ctx, riV)
 #define fGEN_TCG_J2_jumpr(SHORTCODE) \
     gen_jumpr(ctx, RsV)
+#define fGEN_TCG_J2_jumprh(SHORTCODE) \
+    gen_jumpr(ctx, RsV)
 #define fGEN_TCG_J4_jumpseti(SHORTCODE) \
     do { \
         tcg_gen_movi_tl(RdV, UiV); \
@@ -1044,6 +1118,22 @@
         gen_jump(ctx, riV); \
     } while (0)
 
+/* if (p0.new) r0 = #0 */
+#define fGEN_TCG_SA1_clrtnew(SHORTCODE) \
+    do { \
+        tcg_gen_movcond_tl(TCG_COND_EQ, RdV, \
+                           ctx->new_pred_value[0], tcg_constant_tl(0), \
+                           RdV, tcg_constant_tl(0)); \
+    } while (0)
+
+/* if (!p0.new) r0 = #0 */
+#define fGEN_TCG_SA1_clrfnew(SHORTCODE) \
+    do { \
+        tcg_gen_movcond_tl(TCG_COND_NE, RdV, \
+                           ctx->new_pred_value[0], tcg_constant_tl(0), \
+                           RdV, tcg_constant_tl(0)); \
+    } while (0)
+
 #define fGEN_TCG_J2_pause(SHORTCODE) \
     do { \
         uiV = uiV; \
@@ -1067,9 +1157,9 @@
     gen_cond_jumpr31(ctx, TCG_COND_NE, hex_pred[0])
 
 #define fGEN_TCG_SL2_jumpr31_tnew(SHORTCODE) \
-    gen_cond_jumpr31(ctx, TCG_COND_EQ, hex_new_pred_value[0])
+    gen_cond_jumpr31(ctx, TCG_COND_EQ, ctx->new_pred_value[0])
 #define fGEN_TCG_SL2_jumpr31_fnew(SHORTCODE) \
-    gen_cond_jumpr31(ctx, TCG_COND_NE, hex_new_pred_value[0])
+    gen_cond_jumpr31(ctx, TCG_COND_NE, ctx->new_pred_value[0])
 
 /* Count trailing zeros/ones */
 #define fGEN_TCG_S2_ct0(SHORTCODE) \
@@ -1095,6 +1185,24 @@
         tcg_gen_extrl_i64_i32(RdV, tmp); \
     } while (0)
 
+#define fGEN_TCG_S2_insert(SHORTCODE) \
+    do { \
+        int width = uiV; \
+        int offset = UiV; \
+        if (width != 0) { \
+            if (offset + width > 32) { \
+                width = 32 - offset; \
+            } \
+            tcg_gen_deposit_tl(RxV, RxV, RsV, offset, width); \
+        } \
+    } while (0)
+#define fGEN_TCG_S2_insert_rp(SHORTCODE) \
+    gen_insert_rp(ctx, RxV, RsV, RttV)
+#define fGEN_TCG_S2_asr_r_svw_trun(SHORTCODE) \
+    gen_asr_r_svw_trun(ctx, RdV, RssV, RtV)
+#define fGEN_TCG_A2_swiz(SHORTCODE) \
+    tcg_gen_bswap_tl(RdV, RsV)
+
 /* Floating point */
 #define fGEN_TCG_F2_conv_sf2df(SHORTCODE) \
     gen_helper_conv_sf2df(RddV, cpu_env, RsV)
@@ -1236,6 +1344,24 @@
         uiV = uiV; \
     } while (0)
 
+#define fGEN_TCG_L2_loadw_aq(SHORTCODE)                 SHORTCODE
+#define fGEN_TCG_L4_loadd_aq(SHORTCODE)                 SHORTCODE
+
+/* Nothing to do for these in qemu, need to suppress compiler warnings */
+#define fGEN_TCG_R6_release_at_vi(SHORTCODE) \
+    do { \
+        RsV = RsV; \
+    } while (0)
+#define fGEN_TCG_R6_release_st_vi(SHORTCODE) \
+    do { \
+        RsV = RsV; \
+    } while (0)
+
+#define fGEN_TCG_S2_storew_rl_at_vi(SHORTCODE)          SHORTCODE
+#define fGEN_TCG_S4_stored_rl_at_vi(SHORTCODE)          SHORTCODE
+#define fGEN_TCG_S2_storew_rl_st_vi(SHORTCODE)          SHORTCODE
+#define fGEN_TCG_S4_stored_rl_st_vi(SHORTCODE)          SHORTCODE
+
 #define fGEN_TCG_J2_trap0(SHORTCODE) \
     do { \
         uiV = uiV; \
diff --git a/target/hexagon/gen_tcg_funcs.py b/target/hexagon/gen_tcg_funcs.py
index fcb3384..c73467b 100755
--- a/target/hexagon/gen_tcg_funcs.py
+++ b/target/hexagon/gen_tcg_funcs.py
@@ -37,7 +37,7 @@
     elif regtype == "C":
         f.write(f"    const int {regN} = insn->regno[{regno}] + HEX_REG_SA0;\n")
     else:
-        print("Bad register parse: ", regtype, regid)
+        hex_common.bad_register(regtype, regid)
     f.write(f"    TCGv_i64 {regtype}{regid}V = " f"get_result_gpr_pair(ctx, {regN});\n")
 
 
@@ -53,7 +53,7 @@
         f.write(f"    const int {regN} = insn->regno[{regno}];\n")
         f.write(f"    TCGv {regtype}{regid}V = tcg_temp_new();\n")
     else:
-        print("Bad register parse: ", regtype, regid)
+        hex_common.bad_register(regtype, regid)
 
 
 def genptr_decl(f, tag, regtype, regid, regno):
@@ -71,7 +71,7 @@
         elif regid in {"d", "e", "x", "y"}:
             genptr_decl_writable(f, tag, regtype, regid, regno)
         else:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     elif regtype == "P":
         if regid in {"s", "t", "u", "v"}:
             f.write(
@@ -80,7 +80,7 @@
         elif regid in {"d", "e", "x"}:
             genptr_decl_writable(f, tag, regtype, regid, regno)
         else:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     elif regtype == "C":
         if regid == "ss":
             f.write(f"    TCGv_i64 {regtype}{regid}V = " f"tcg_temp_new_i64();\n")
@@ -96,7 +96,7 @@
         elif regid == "d":
             genptr_decl_writable(f, tag, regtype, regid, regno)
         else:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     elif regtype == "M":
         if regid == "u":
             f.write(f"    const int {regtype}{regid}N = " f"insn->regno[{regno}];\n")
@@ -105,7 +105,7 @@
                 "HEX_REG_M0];\n"
             )
         else:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     elif regtype == "V":
         if regid in {"dd"}:
             f.write(f"    const int {regtype}{regid}N = " f"insn->regno[{regno}];\n")
@@ -159,7 +159,7 @@
                     f"{regtype}{regid}V_off);\n"
                 )
         else:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     elif regtype == "Q":
         if regid in {"d", "e", "x"}:
             f.write(f"    const int {regtype}{regid}N = " f"insn->regno[{regno}];\n")
@@ -180,9 +180,9 @@
             if not hex_common.skip_qemu_helper(tag):
                 f.write(f"    TCGv_ptr {regtype}{regid}V = " "tcg_temp_new_ptr();\n")
         else:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     else:
-        print("Bad register parse: ", regtype, regid)
+        hex_common.bad_register(regtype, regid)
 
 
 def genptr_decl_new(f, tag, regtype, regid, regno):
@@ -190,18 +190,18 @@
         if regid in {"s", "t"}:
             f.write(
                 f"    TCGv {regtype}{regid}N = "
-                f"hex_new_value[insn->regno[{regno}]];\n"
+                f"get_result_gpr(ctx, insn->regno[{regno}]);\n"
             )
         else:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     elif regtype == "P":
         if regid in {"t", "u", "v"}:
             f.write(
                 f"    TCGv {regtype}{regid}N = "
-                f"hex_new_pred_value[insn->regno[{regno}]];\n"
+                f"ctx->new_pred_value[insn->regno[{regno}]];\n"
             )
         else:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     elif regtype == "O":
         if regid == "s":
             f.write(
@@ -218,9 +218,9 @@
                     f"tcg_constant_tl({regtype}{regid}N_num);\n"
                 )
         else:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     else:
-        print("Bad register parse: ", regtype, regid)
+        hex_common.bad_register(regtype, regid)
 
 
 def genptr_decl_opn(f, tag, regtype, regid, toss, numregs, i):
@@ -232,9 +232,9 @@
         elif hex_common.is_new_val(regtype, regid, tag):
             genptr_decl_new(f, tag, regtype, regid, i)
         else:
-            print("Bad register parse: ", regtype, regid, toss, numregs)
+            hex_common.bad_register(regtype, regid, toss, numregs)
     else:
-        print("Bad register parse: ", regtype, regid, toss, numregs)
+        hex_common.bad_register(regtype, regid, toss, numregs)
 
 
 def genptr_decl_imm(f, immlett):
@@ -266,7 +266,7 @@
                     f"hex_gpr[{regtype}{regid}N]);\n"
                 )
         elif regid not in {"s", "t", "u", "v"}:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     elif regtype == "P":
         if regid == "x":
             f.write(
@@ -274,7 +274,7 @@
                 f"hex_pred[{regtype}{regid}N]);\n"
             )
         elif regid not in {"s", "t", "u", "v"}:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     elif regtype == "C":
         if regid == "ss":
             f.write(
@@ -287,10 +287,10 @@
                 f"{regtype}{regid}V);\n"
             )
         else:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     elif regtype == "M":
         if regid != "u":
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     elif regtype == "V":
         if regid in {"uu", "vv", "xx"}:
             f.write(f"    tcg_gen_gvec_mov(MO_64, {regtype}{regid}V_off,\n")
@@ -311,7 +311,7 @@
             f.write(f"        vreg_src_off(ctx, {regtype}{regid}N),\n")
             f.write("        sizeof(MMVector), sizeof(MMVector));\n")
         else:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     elif regtype == "Q":
         if regid in {"s", "t", "u", "v"}:
             if not hex_common.skip_qemu_helper(tag):
@@ -326,23 +326,23 @@
             )
             f.write("        sizeof(MMQReg), sizeof(MMQReg));\n")
         else:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     else:
-        print("Bad register parse: ", regtype, regid)
+        hex_common.bad_register(regtype, regid)
 
 
 def genptr_src_read_new(f, regtype, regid):
     if regtype == "N":
         if regid not in {"s", "t"}:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     elif regtype == "P":
         if regid not in {"t", "u", "v"}:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     elif regtype == "O":
         if regid != "s":
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     else:
-        print("Bad register parse: ", regtype, regid)
+        hex_common.bad_register(regtype, regid)
 
 
 def genptr_src_read_opn(f, regtype, regid, tag):
@@ -354,9 +354,9 @@
         elif hex_common.is_new_val(regtype, regid, tag):
             genptr_src_read_new(f, regtype, regid)
         else:
-            print("Bad register parse: ", regtype, regid, toss, numregs)
+            hex_common.bad_register(regtype, regid, toss, numregs)
     else:
-        print("Bad register parse: ", regtype, regid, toss, numregs)
+        hex_common.bad_register(regtype, regid, toss, numregs)
 
 
 def gen_helper_call_opn(f, tag, regtype, regid, toss, numregs, i):
@@ -370,9 +370,9 @@
         elif hex_common.is_new_val(regtype, regid, tag):
             f.write(f"{regtype}{regid}N")
         else:
-            print("Bad register parse: ", regtype, regid, toss, numregs)
+            hex_common.bad_register(regtype, regid, toss, numregs)
     else:
-        print("Bad register parse: ", regtype, regid, toss, numregs)
+        hex_common.bad_register(regtype, regid, toss, numregs)
 
 
 def gen_helper_decl_imm(f, immlett):
@@ -387,7 +387,8 @@
 
 
 def genptr_dst_write_pair(f, tag, regtype, regid):
-    f.write(f"    gen_log_reg_write_pair({regtype}{regid}N, " f"{regtype}{regid}V);\n")
+    f.write(f"    gen_log_reg_write_pair(ctx, {regtype}{regid}N, "
+            f"{regtype}{regid}V);\n")
 
 
 def genptr_dst_write(f, tag, regtype, regid):
@@ -396,10 +397,11 @@
             genptr_dst_write_pair(f, tag, regtype, regid)
         elif regid in {"d", "e", "x", "y"}:
             f.write(
-                f"    gen_log_reg_write({regtype}{regid}N, " f"{regtype}{regid}V);\n"
+                f"    gen_log_reg_write(ctx, {regtype}{regid}N, "
+                f"{regtype}{regid}V);\n"
             )
         else:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     elif regtype == "P":
         if regid in {"d", "e", "x"}:
             f.write(
@@ -407,7 +409,7 @@
                 f"{regtype}{regid}V);\n"
             )
         else:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     elif regtype == "C":
         if regid == "dd":
             f.write(
@@ -420,9 +422,9 @@
                 f"{regtype}{regid}V);\n"
             )
         else:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     else:
-        print("Bad register parse: ", regtype, regid)
+        hex_common.bad_register(regtype, regid)
 
 
 def genptr_dst_write_ext(f, tag, regtype, regid, newv="EXT_DFL"):
@@ -438,12 +440,12 @@
                 f"{regtype}{regid}N, {newv});\n"
             )
         elif regid not in {"dd", "d", "x"}:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     elif regtype == "Q":
         if regid not in {"d", "e", "x"}:
-            print("Bad register parse: ", regtype, regid)
+            hex_common.bad_register(regtype, regid)
     else:
-        print("Bad register parse: ", regtype, regid)
+        hex_common.bad_register(regtype, regid)
 
 
 def genptr_dst_write_opn(f, regtype, regid, tag):
@@ -466,7 +468,7 @@
         else:
             genptr_dst_write(f, tag, regtype, regid)
     else:
-        print("Bad register parse: ", regtype, regid, toss, numregs)
+        hex_common.bad_register(regtype, regid, toss, numregs)
 
 
 ##
@@ -481,7 +483,7 @@
 ##        TCGv RsV = hex_gpr[insn->regno[1]];
 ##        TCGv RtV = hex_gpr[insn->regno[2]];
 ##        <GEN>
-##        gen_log_reg_write(RdN, RdV);
+##        gen_log_reg_write(ctx, RdN, RdV);
 ##    }
 ##
 ##       where <GEN> depends on hex_common.skip_qemu_helper(tag)
@@ -530,7 +532,7 @@
             elif hex_common.is_new_val(regtype, regid, tag):
                 declared.append(f"{regtype}{regid}N")
             else:
-                print("Bad register parse: ", regtype, regid, toss, numregs)
+                hex_common.bad_register(regtype, regid, toss, numregs)
 
         ## Handle immediates
         for immlett, bits, immshift in imms:
@@ -548,10 +550,13 @@
         if hex_common.need_pkt_has_multi_cof(tag):
             f.write("    TCGv pkt_has_multi_cof = ")
             f.write("tcg_constant_tl(ctx->pkt->pkt_has_multi_cof);\n")
+        if hex_common.need_pkt_need_commit(tag):
+            f.write("    TCGv pkt_need_commit = ")
+            f.write("tcg_constant_tl(ctx->need_commit);\n")
         if hex_common.need_part1(tag):
             f.write("    TCGv part1 = tcg_constant_tl(insn->part1);\n")
         if hex_common.need_slot(tag):
-            f.write("    TCGv slot = tcg_constant_tl(insn->slot);\n")
+            f.write("    TCGv slotval = gen_slotval(ctx);\n")
         if hex_common.need_PC(tag):
             f.write("    TCGv PC = tcg_constant_tl(ctx->pkt->pc);\n")
         if hex_common.helper_needs_next_PC(tag):
@@ -594,12 +599,14 @@
 
         if hex_common.need_pkt_has_multi_cof(tag):
             f.write(", pkt_has_multi_cof")
+        if hex_common.need_pkt_need_commit(tag):
+            f.write(", pkt_need_commit")
         if hex_common.need_PC(tag):
             f.write(", PC")
         if hex_common.helper_needs_next_PC(tag):
             f.write(", next_PC")
         if hex_common.need_slot(tag):
-            f.write(", slot")
+            f.write(", slotval")
         if hex_common.need_part1(tag):
             f.write(", part1")
         f.write(");\n")
diff --git a/target/hexagon/gen_tcg_hvx.h b/target/hexagon/gen_tcg_hvx.h
index d4aefe8..44bae53 100644
--- a/target/hexagon/gen_tcg_hvx.h
+++ b/target/hexagon/gen_tcg_hvx.h
@@ -128,6 +128,41 @@
     tcg_gen_gvec_mov(MO_64, VdV_off, VuV_off, \
                      sizeof(MMVector), sizeof(MMVector))
 
+#define fGEN_TCG_V6_vassign_tmp(SHORTCODE) \
+    tcg_gen_gvec_mov(MO_64, VdV_off, VuV_off, \
+                     sizeof(MMVector), sizeof(MMVector))
+
+#define fGEN_TCG_V6_vcombine_tmp(SHORTCODE) \
+    do { \
+        tcg_gen_gvec_mov(MO_64, VddV_off, VvV_off, \
+                         sizeof(MMVector), sizeof(MMVector)); \
+        tcg_gen_gvec_mov(MO_64, VddV_off + sizeof(MMVector), VuV_off, \
+                         sizeof(MMVector), sizeof(MMVector)); \
+    } while (0)
+
+/*
+ * Vector combine
+ *
+ * Be careful that the source and dest don't overlap
+ */
+#define fGEN_TCG_V6_vcombine(SHORTCODE) \
+    do { \
+        if (VddV_off != VuV_off) { \
+            tcg_gen_gvec_mov(MO_64, VddV_off, VvV_off, \
+                             sizeof(MMVector), sizeof(MMVector)); \
+            tcg_gen_gvec_mov(MO_64, VddV_off + sizeof(MMVector), VuV_off, \
+                             sizeof(MMVector), sizeof(MMVector)); \
+        } else { \
+            intptr_t tmpoff = offsetof(CPUHexagonState, vtmp); \
+            tcg_gen_gvec_mov(MO_64, tmpoff, VuV_off, \
+                             sizeof(MMVector), sizeof(MMVector)); \
+            tcg_gen_gvec_mov(MO_64, VddV_off, VvV_off, \
+                             sizeof(MMVector), sizeof(MMVector)); \
+            tcg_gen_gvec_mov(MO_64, VddV_off + sizeof(MMVector), tmpoff, \
+                             sizeof(MMVector), sizeof(MMVector)); \
+        } \
+    } while (0)
+
 /* Vector conditional move */
 #define fGEN_TCG_VEC_CMOV(PRED) \
     do { \
diff --git a/target/hexagon/genptr.c b/target/hexagon/genptr.c
index 244063b..cb2aa28 100644
--- a/target/hexagon/genptr.c
+++ b/target/hexagon/genptr.c
@@ -45,7 +45,7 @@
 
 #define IMMUTABLE (~0)
 
-static const target_ulong reg_immut_masks[TOTAL_PER_THREAD_REGS] = {
+const target_ulong reg_immut_masks[TOTAL_PER_THREAD_REGS] = {
     [HEX_REG_USR] = 0xc13000c0,
     [HEX_REG_PC] = IMMUTABLE,
     [HEX_REG_GP] = 0x3f,
@@ -68,58 +68,72 @@
     }
 }
 
-static TCGv get_result_gpr(DisasContext *ctx, int rnum)
+TCGv get_result_gpr(DisasContext *ctx, int rnum)
 {
-    return hex_new_value[rnum];
+    if (ctx->need_commit) {
+        if (rnum == HEX_REG_USR) {
+            return hex_new_value_usr;
+        } else {
+            if (ctx->new_value[rnum] == NULL) {
+                ctx->new_value[rnum] = tcg_temp_new();
+                tcg_gen_movi_tl(ctx->new_value[rnum], 0);
+            }
+            return ctx->new_value[rnum];
+        }
+    } else {
+        return hex_gpr[rnum];
+    }
 }
 
 static TCGv_i64 get_result_gpr_pair(DisasContext *ctx, int rnum)
 {
     TCGv_i64 result = tcg_temp_new_i64();
-    tcg_gen_concat_i32_i64(result, hex_new_value[rnum],
-                                   hex_new_value[rnum + 1]);
+    tcg_gen_concat_i32_i64(result, get_result_gpr(ctx, rnum),
+                                   get_result_gpr(ctx, rnum + 1));
     return result;
 }
 
-void gen_log_reg_write(int rnum, TCGv val)
+void gen_log_reg_write(DisasContext *ctx, int rnum, TCGv val)
 {
     const target_ulong reg_mask = reg_immut_masks[rnum];
 
     gen_masked_reg_write(val, hex_gpr[rnum], reg_mask);
-    tcg_gen_mov_tl(hex_new_value[rnum], val);
+    tcg_gen_mov_tl(get_result_gpr(ctx, rnum), val);
     if (HEX_DEBUG) {
         /* Do this so HELPER(debug_commit_end) will know */
         tcg_gen_movi_tl(hex_reg_written[rnum], 1);
     }
 }
 
-static void gen_log_reg_write_pair(int rnum, TCGv_i64 val)
+static void gen_log_reg_write_pair(DisasContext *ctx, int rnum, TCGv_i64 val)
 {
-    const target_ulong reg_mask_low = reg_immut_masks[rnum];
-    const target_ulong reg_mask_high = reg_immut_masks[rnum + 1];
     TCGv val32 = tcg_temp_new();
 
     /* Low word */
     tcg_gen_extrl_i64_i32(val32, val);
-    gen_masked_reg_write(val32, hex_gpr[rnum], reg_mask_low);
-    tcg_gen_mov_tl(hex_new_value[rnum], val32);
-    if (HEX_DEBUG) {
-        /* Do this so HELPER(debug_commit_end) will know */
-        tcg_gen_movi_tl(hex_reg_written[rnum], 1);
-    }
+    gen_log_reg_write(ctx, rnum, val32);
 
     /* High word */
     tcg_gen_extrh_i64_i32(val32, val);
-    gen_masked_reg_write(val32, hex_gpr[rnum + 1], reg_mask_high);
-    tcg_gen_mov_tl(hex_new_value[rnum + 1], val32);
-    if (HEX_DEBUG) {
-        /* Do this so HELPER(debug_commit_end) will know */
-        tcg_gen_movi_tl(hex_reg_written[rnum + 1], 1);
+    gen_log_reg_write(ctx, rnum + 1, val32);
+}
+
+TCGv get_result_pred(DisasContext *ctx, int pnum)
+{
+    if (ctx->need_commit) {
+        if (ctx->new_pred_value[pnum] == NULL) {
+            ctx->new_pred_value[pnum] = tcg_temp_new();
+            tcg_gen_movi_tl(ctx->new_pred_value[pnum], 0);
+        }
+        return ctx->new_pred_value[pnum];
+    } else {
+        return hex_pred[pnum];
     }
 }
 
 void gen_log_pred_write(DisasContext *ctx, int pnum, TCGv val)
 {
+    TCGv pred = get_result_pred(ctx, pnum);
     TCGv base_val = tcg_temp_new();
 
     tcg_gen_andi_tl(base_val, val, 0xff);
@@ -132,12 +146,13 @@
      * straight assignment.  Otherwise, do an and.
      */
     if (!test_bit(pnum, ctx->pregs_written)) {
-        tcg_gen_mov_tl(hex_new_pred_value[pnum], base_val);
+        tcg_gen_mov_tl(pred, base_val);
     } else {
-        tcg_gen_and_tl(hex_new_pred_value[pnum],
-                       hex_new_pred_value[pnum], base_val);
+        tcg_gen_and_tl(pred, pred, base_val);
     }
-    tcg_gen_ori_tl(hex_pred_written, hex_pred_written, 1 << pnum);
+    if (HEX_DEBUG) {
+        tcg_gen_ori_tl(ctx->pred_written, ctx->pred_written, 1 << pnum);
+    }
     set_bit(pnum, ctx->pregs_written);
 }
 
@@ -231,7 +246,7 @@
     if (reg_num == HEX_REG_P3_0_ALIASED) {
         gen_write_p3_0(ctx, val);
     } else {
-        gen_log_reg_write(reg_num, val);
+        gen_log_reg_write(ctx, reg_num, val);
         if (reg_num == HEX_REG_QEMU_PKT_CNT) {
             ctx->num_packets = 0;
         }
@@ -255,7 +270,7 @@
         tcg_gen_extrh_i64_i32(val32, val);
         tcg_gen_mov_tl(result, val32);
     } else {
-        gen_log_reg_write_pair(reg_num, val);
+        gen_log_reg_write_pair(ctx, reg_num, val);
         if (reg_num == HEX_REG_QEMU_PKT_CNT) {
             ctx->num_packets = 0;
             ctx->num_insns = 0;
@@ -383,6 +398,14 @@
     tcg_gen_movi_tl(hex_llsc_addr, ~0);
 }
 
+#ifndef CONFIG_HEXAGON_IDEF_PARSER
+static TCGv gen_slotval(DisasContext *ctx)
+{
+    int slotval = (ctx->pkt->pkt_has_store_s1 & 1) | (ctx->insn->slot << 1);
+    return tcg_constant_tl(slotval);
+}
+#endif
+
 void gen_store32(TCGv vaddr, TCGv src, int width, uint32_t slot)
 {
     tcg_gen_mov_tl(hex_store_addr[slot], vaddr);
@@ -457,9 +480,9 @@
     if (ctx->pkt->pkt_has_multi_cof) {
         /* If there are multiple branches in a packet, ignore the second one */
         tcg_gen_movcond_tl(TCG_COND_NE, hex_gpr[HEX_REG_PC],
-                           hex_branch_taken, tcg_constant_tl(0),
+                           ctx->branch_taken, tcg_constant_tl(0),
                            hex_gpr[HEX_REG_PC], addr);
-        tcg_gen_movi_tl(hex_branch_taken, 1);
+        tcg_gen_movi_tl(ctx->branch_taken, 1);
     } else {
         tcg_gen_mov_tl(hex_gpr[HEX_REG_PC], addr);
     }
@@ -480,7 +503,7 @@
         ctx->branch_cond = TCG_COND_ALWAYS;
         if (pred != NULL) {
             ctx->branch_cond = cond;
-            tcg_gen_mov_tl(hex_branch_taken, pred);
+            tcg_gen_mov_tl(ctx->branch_taken, pred);
         }
         ctx->branch_dest = dest;
     }
@@ -518,6 +541,55 @@
     tcg_gen_movcond_tl(cond, res, arg1, arg2, one, zero);
 }
 
+#ifndef CONFIG_HEXAGON_IDEF_PARSER
+static inline void gen_loop0r(DisasContext *ctx, TCGv RsV, int riV)
+{
+    fIMMEXT(riV);
+    fPCALIGN(riV);
+    gen_log_reg_write(ctx, HEX_REG_LC0, RsV);
+    gen_log_reg_write(ctx, HEX_REG_SA0, tcg_constant_tl(ctx->pkt->pc + riV));
+    gen_set_usr_fieldi(ctx, USR_LPCFG, 0);
+}
+
+static void gen_loop0i(DisasContext *ctx, int count, int riV)
+{
+    gen_loop0r(ctx, tcg_constant_tl(count), riV);
+}
+
+static inline void gen_loop1r(DisasContext *ctx, TCGv RsV, int riV)
+{
+    fIMMEXT(riV);
+    fPCALIGN(riV);
+    gen_log_reg_write(ctx, HEX_REG_LC1, RsV);
+    gen_log_reg_write(ctx, HEX_REG_SA1, tcg_constant_tl(ctx->pkt->pc + riV));
+}
+
+static void gen_loop1i(DisasContext *ctx, int count, int riV)
+{
+    gen_loop1r(ctx, tcg_constant_tl(count), riV);
+}
+
+static void gen_ploopNsr(DisasContext *ctx, int N, TCGv RsV, int riV)
+{
+    fIMMEXT(riV);
+    fPCALIGN(riV);
+    gen_log_reg_write(ctx, HEX_REG_LC0, RsV);
+    gen_log_reg_write(ctx, HEX_REG_SA0, tcg_constant_tl(ctx->pkt->pc + riV));
+    gen_set_usr_fieldi(ctx, USR_LPCFG, N);
+    gen_log_pred_write(ctx, 3, tcg_constant_tl(0));
+}
+
+static void gen_ploopNsi(DisasContext *ctx, int N, int count, int riV)
+{
+    gen_ploopNsr(ctx, N, tcg_constant_tl(count), riV);
+}
+
+static inline void gen_comparei(TCGCond cond, TCGv res, TCGv arg1, int arg2)
+{
+    gen_compare(cond, res, arg1, tcg_constant_tl(arg2));
+}
+#endif
+
 static void gen_cond_jumpr(DisasContext *ctx, TCGv dst_pc,
                            TCGCond cond, TCGv pred)
 {
@@ -547,7 +619,7 @@
         gen_log_pred_write(ctx, pnum, pred);
     } else {
         TCGv pred = tcg_temp_new();
-        tcg_gen_mov_tl(pred, hex_new_pred_value[pnum]);
+        tcg_gen_mov_tl(pred, ctx->new_pred_value[pnum]);
         gen_cond_jump(ctx, cond2, pred, pc_off);
     }
 }
@@ -604,7 +676,7 @@
         gen_log_pred_write(ctx, pnum, pred);
     } else {
         TCGv pred = tcg_temp_new();
-        tcg_gen_mov_tl(pred, hex_new_pred_value[pnum]);
+        tcg_gen_mov_tl(pred, ctx->new_pred_value[pnum]);
         gen_cond_jump(ctx, cond, pred, pc_off);
     }
 }
@@ -665,6 +737,18 @@
     gen_set_label(skip);
 }
 
+#ifndef CONFIG_HEXAGON_IDEF_PARSER
+/* frame = ((LR << 32) | FP) ^ (FRAMEKEY << 32)) */
+static TCGv_i64 gen_frame_scramble(void)
+{
+    TCGv_i64 frame = tcg_temp_new_i64();
+    TCGv tmp = tcg_temp_new();
+    tcg_gen_xor_tl(tmp, hex_gpr[HEX_REG_LR], hex_gpr[HEX_REG_FRAMEKEY]);
+    tcg_gen_concat_i32_i64(frame, hex_gpr[HEX_REG_FP], tmp);
+    return frame;
+}
+#endif
+
 /* frame ^= (int64_t)FRAMEKEY << 32 */
 static void gen_frame_unscramble(TCGv_i64 frame)
 {
@@ -681,6 +765,41 @@
     tcg_gen_qemu_ld_i64(frame, EA, ctx->mem_idx, MO_TEUQ);
 }
 
+#ifndef CONFIG_HEXAGON_IDEF_PARSER
+/* Stack overflow check */
+static void gen_framecheck(TCGv EA, int framesize)
+{
+    /* Not modelled in linux-user mode */
+    /* Placeholder for system mode */
+#ifndef CONFIG_USER_ONLY
+    g_assert_not_reached();
+#endif
+}
+
+static void gen_allocframe(DisasContext *ctx, TCGv r29, int framesize)
+{
+    TCGv r30 = tcg_temp_new();
+    TCGv_i64 frame;
+    tcg_gen_addi_tl(r30, r29, -8);
+    frame = gen_frame_scramble();
+    gen_store8(cpu_env, r30, frame, ctx->insn->slot);
+    gen_log_reg_write(ctx, HEX_REG_FP, r30);
+    gen_framecheck(r30, framesize);
+    tcg_gen_subi_tl(r29, r30, framesize);
+}
+
+static void gen_deallocframe(DisasContext *ctx, TCGv_i64 r31_30, TCGv r30)
+{
+    TCGv r29 = tcg_temp_new();
+    TCGv_i64 frame = tcg_temp_new_i64();
+    gen_load_frame(ctx, frame, r30);
+    gen_frame_unscramble(frame);
+    tcg_gen_mov_i64(r31_30, frame);
+    tcg_gen_addi_tl(r29, r30, 8);
+    gen_log_reg_write(ctx, HEX_REG_SP, r29);
+}
+#endif
+
 static void gen_return(DisasContext *ctx, TCGv_i64 dst, TCGv src)
 {
     /*
@@ -719,7 +838,7 @@
 {
     TCGv_i64 RddV = get_result_gpr_pair(ctx, HEX_REG_FP);
     gen_cond_return(ctx, RddV, hex_gpr[HEX_REG_FP], pred, cond);
-    gen_log_reg_write_pair(HEX_REG_FP, RddV);
+    gen_log_reg_write_pair(ctx, HEX_REG_FP, RddV);
 }
 
 static void gen_endloop0(DisasContext *ctx)
@@ -730,15 +849,13 @@
 
     /*
      *    if (lpcfg == 1) {
-     *        hex_new_pred_value[3] = 0xff;
-     *        hex_pred_written |= 1 << 3;
+     *        p3 = 0xff;
      *    }
      */
     TCGLabel *label1 = gen_new_label();
     tcg_gen_brcondi_tl(TCG_COND_NE, lpcfg, 1, label1);
     {
-        tcg_gen_movi_tl(hex_new_pred_value[3], 0xff);
-        tcg_gen_ori_tl(hex_pred_written, hex_pred_written, 1 << 3);
+        gen_log_pred_write(ctx, 3, tcg_constant_tl(0xff));
     }
     gen_set_label(label1);
 
@@ -807,14 +924,12 @@
 
     /*
      *    if (lpcfg == 1) {
-     *        hex_new_pred_value[3] = 0xff;
-     *        hex_pred_written |= 1 << 3;
+     *        p3 = 0xff;
      *    }
      */
     tcg_gen_brcondi_tl(TCG_COND_NE, lpcfg, 1, label1);
     {
-        tcg_gen_movi_tl(hex_new_pred_value[3], 0xff);
-        tcg_gen_ori_tl(hex_pred_written, hex_pred_written, 1 << 3);
+        gen_log_pred_write(ctx, 3, tcg_constant_tl(0xff));
     }
     gen_set_label(label1);
 
@@ -877,6 +992,7 @@
 /* Shift left with saturation */
 static void gen_shl_sat(DisasContext *ctx, TCGv dst, TCGv src, TCGv shift_amt)
 {
+    TCGv tmp = tcg_temp_new();    /* In case dst == src */
     TCGv usr = get_result_gpr(ctx, HEX_REG_USR);
     TCGv sh32 = tcg_temp_new();
     TCGv dst_sar = tcg_temp_new();
@@ -901,17 +1017,17 @@
      */
 
     tcg_gen_andi_tl(sh32, shift_amt, 31);
-    tcg_gen_movcond_tl(TCG_COND_EQ, dst, sh32, shift_amt,
+    tcg_gen_movcond_tl(TCG_COND_EQ, tmp, sh32, shift_amt,
                        src, tcg_constant_tl(0));
-    tcg_gen_shl_tl(dst, dst, sh32);
-    tcg_gen_sar_tl(dst_sar, dst, sh32);
+    tcg_gen_shl_tl(tmp, tmp, sh32);
+    tcg_gen_sar_tl(dst_sar, tmp, sh32);
     tcg_gen_movcond_tl(TCG_COND_LT, satval, src, tcg_constant_tl(0), min, max);
 
     tcg_gen_setcond_tl(TCG_COND_NE, ovf, dst_sar, src);
     tcg_gen_shli_tl(ovf, ovf, reg_field_info[USR_OVF].offset);
     tcg_gen_or_tl(usr, usr, ovf);
 
-    tcg_gen_movcond_tl(TCG_COND_EQ, dst, dst_sar, src, dst, satval);
+    tcg_gen_movcond_tl(TCG_COND_EQ, dst, dst_sar, src, tmp, satval);
 }
 
 static void gen_sar(TCGv dst, TCGv src, TCGv shift_amt)
@@ -969,6 +1085,105 @@
     gen_set_label(done);
 }
 
+static void gen_insert_rp(DisasContext *ctx, TCGv RxV, TCGv RsV, TCGv_i64 RttV)
+{
+    /*
+     * int width = fZXTN(6, 32, (fGETWORD(1, RttV)));
+     * int offset = fSXTN(7, 32, (fGETWORD(0, RttV)));
+     * size8u_t mask = ((fCONSTLL(1) << width) - 1);
+     * if (offset < 0) {
+     *     RxV = 0;
+     * } else {
+     *     RxV &= ~(mask << offset);
+     *     RxV |= ((RsV & mask) << offset);
+     * }
+     */
+
+    TCGv width = tcg_temp_new();
+    TCGv offset = tcg_temp_new();
+    TCGv_i64 mask = tcg_temp_new_i64();
+    TCGv_i64 result = tcg_temp_new_i64();
+    TCGv_i64 tmp = tcg_temp_new_i64();
+    TCGv_i64 offset64 = tcg_temp_new_i64();
+    TCGLabel *label = gen_new_label();
+    TCGLabel *done = gen_new_label();
+
+    tcg_gen_extrh_i64_i32(width, RttV);
+    tcg_gen_extract_tl(width, width, 0, 6);
+    tcg_gen_extrl_i64_i32(offset, RttV);
+    tcg_gen_sextract_tl(offset, offset, 0, 7);
+    /* Possible values for offset are -64 .. 63 */
+    tcg_gen_brcondi_tl(TCG_COND_GE, offset, 0, label);
+    /* For negative offsets, zero out the result */
+    tcg_gen_movi_tl(RxV, 0);
+    tcg_gen_br(done);
+    gen_set_label(label);
+    /* At this point, possible values of offset are 0 .. 63 */
+    tcg_gen_ext_i32_i64(mask, width);
+    tcg_gen_shl_i64(mask, tcg_constant_i64(1), mask);
+    tcg_gen_subi_i64(mask, mask, 1);
+    tcg_gen_extu_i32_i64(result, RxV);
+    tcg_gen_ext_i32_i64(tmp, offset);
+    tcg_gen_shl_i64(tmp, mask, tmp);
+    tcg_gen_andc_i64(result, result, tmp);
+    tcg_gen_extu_i32_i64(tmp, RsV);
+    tcg_gen_and_i64(tmp, tmp, mask);
+    tcg_gen_extu_i32_i64(offset64, offset);
+    tcg_gen_shl_i64(tmp, tmp, offset64);
+    tcg_gen_or_i64(result, result, tmp);
+    tcg_gen_extrl_i64_i32(RxV, result);
+    gen_set_label(done);
+}
+
+static void gen_asr_r_svw_trun(DisasContext *ctx, TCGv RdV,
+                               TCGv_i64 RssV, TCGv RtV)
+{
+    /*
+     * for (int i = 0; i < 2; i++) {
+     *     fSETHALF(i, RdV, fGETHALF(0, ((fSXTN(7, 32, RtV) > 0) ?
+     *         (fCAST4_8s(fGETWORD(i, RssV)) >> fSXTN(7, 32, RtV)) :
+     *         (fCAST4_8s(fGETWORD(i, RssV)) << -fSXTN(7, 32, RtV)))));
+     * }
+     */
+    TCGv shift_amt32 = tcg_temp_new();
+    TCGv_i64 shift_amt64 = tcg_temp_new_i64();
+    TCGv_i64 tmp64 = tcg_temp_new_i64();
+    TCGv tmp32 = tcg_temp_new();
+    TCGLabel *label = gen_new_label();
+    TCGLabel *zero = gen_new_label();
+    TCGLabel *done =  gen_new_label();
+
+    tcg_gen_sextract_tl(shift_amt32, RtV, 0, 7);
+    /* Possible values of shift_amt32 are -64 .. 63 */
+    tcg_gen_brcondi_tl(TCG_COND_LE, shift_amt32, 0, label);
+    /* After branch, possible values of shift_amt32 are 1 .. 63 */
+    tcg_gen_ext_i32_i64(shift_amt64, shift_amt32);
+    for (int i = 0; i < 2; i++) {
+        tcg_gen_sextract_i64(tmp64, RssV, i * 32, 32);
+        tcg_gen_sar_i64(tmp64, tmp64, shift_amt64);
+        tcg_gen_extrl_i64_i32(tmp32, tmp64);
+        tcg_gen_deposit_tl(RdV, RdV, tmp32, i * 16, 16);
+    }
+    tcg_gen_br(done);
+    gen_set_label(label);
+    tcg_gen_neg_tl(shift_amt32, shift_amt32);
+    /*At this point, possible values of shift_amt32 are 0 .. 64 */
+    tcg_gen_brcondi_tl(TCG_COND_GT, shift_amt32, 63, zero);
+    /*At this point, possible values of shift_amt32 are 0 .. 63 */
+    tcg_gen_ext_i32_i64(shift_amt64, shift_amt32);
+    for (int i = 0; i < 2; i++) {
+        tcg_gen_sextract_i64(tmp64, RssV, i * 32, 32);
+        tcg_gen_shl_i64(tmp64, tmp64, shift_amt64);
+        tcg_gen_extrl_i64_i32(tmp32, tmp64);
+        tcg_gen_deposit_tl(RdV, RdV, tmp32, i * 16, 16);
+    }
+    tcg_gen_br(done);
+    gen_set_label(zero);
+    /* When the shift_amt is 64, zero out the result */
+    tcg_gen_movi_tl(RdV, 0);
+    gen_set_label(done);
+}
+
 static intptr_t vreg_src_off(DisasContext *ctx, int num)
 {
     intptr_t offset = offsetof(CPUHexagonState, VRegs[num]);
@@ -1008,7 +1223,11 @@
 
 static intptr_t get_result_qreg(DisasContext *ctx, int qnum)
 {
-    return  offsetof(CPUHexagonState, future_QRegs[qnum]);
+    if (ctx->need_commit) {
+        return  offsetof(CPUHexagonState, future_QRegs[qnum]);
+    } else {
+        return  offsetof(CPUHexagonState, QRegs[qnum]);
+    }
 }
 
 static void gen_vreg_load(DisasContext *ctx, intptr_t dstoff, TCGv src,
@@ -1134,22 +1353,28 @@
 
 void gen_sat_i32_ovfl(TCGv ovfl, TCGv dest, TCGv source, int width)
 {
-    gen_sat_i32(dest, source, width);
-    tcg_gen_setcond_tl(TCG_COND_NE, ovfl, source, dest);
+    TCGv tmp = tcg_temp_new();    /* In case dest == source */
+    gen_sat_i32(tmp, source, width);
+    tcg_gen_setcond_tl(TCG_COND_NE, ovfl, source, tmp);
+    tcg_gen_mov_tl(dest, tmp);
 }
 
 void gen_satu_i32(TCGv dest, TCGv source, int width)
 {
+    TCGv tmp = tcg_temp_new();    /* In case dest == source */
     TCGv max_val = tcg_constant_tl((1 << width) - 1);
     TCGv zero = tcg_constant_tl(0);
-    tcg_gen_movcond_tl(TCG_COND_GTU, dest, source, max_val, max_val, source);
-    tcg_gen_movcond_tl(TCG_COND_LT, dest, source, zero, zero, dest);
+    tcg_gen_movcond_tl(TCG_COND_GTU, tmp, source, max_val, max_val, source);
+    tcg_gen_movcond_tl(TCG_COND_LT, tmp, source, zero, zero, tmp);
+    tcg_gen_mov_tl(dest, tmp);
 }
 
 void gen_satu_i32_ovfl(TCGv ovfl, TCGv dest, TCGv source, int width)
 {
-    gen_satu_i32(dest, source, width);
-    tcg_gen_setcond_tl(TCG_COND_NE, ovfl, source, dest);
+    TCGv tmp = tcg_temp_new();    /* In case dest == source */
+    gen_satu_i32(tmp, source, width);
+    tcg_gen_setcond_tl(TCG_COND_NE, ovfl, source, tmp);
+    tcg_gen_mov_tl(dest, tmp);
 }
 
 void gen_sat_i64(TCGv_i64 dest, TCGv_i64 source, int width)
@@ -1162,27 +1387,33 @@
 
 void gen_sat_i64_ovfl(TCGv ovfl, TCGv_i64 dest, TCGv_i64 source, int width)
 {
+    TCGv_i64 tmp = tcg_temp_new_i64(); /* In case dest == source */
     TCGv_i64 ovfl_64;
-    gen_sat_i64(dest, source, width);
+    gen_sat_i64(tmp, source, width);
     ovfl_64 = tcg_temp_new_i64();
-    tcg_gen_setcond_i64(TCG_COND_NE, ovfl_64, dest, source);
+    tcg_gen_setcond_i64(TCG_COND_NE, ovfl_64, tmp, source);
+    tcg_gen_mov_i64(dest, tmp);
     tcg_gen_trunc_i64_tl(ovfl, ovfl_64);
 }
 
 void gen_satu_i64(TCGv_i64 dest, TCGv_i64 source, int width)
 {
+    TCGv_i64 tmp = tcg_temp_new_i64();    /* In case dest == source */
     TCGv_i64 max_val = tcg_constant_i64((1LL << width) - 1LL);
     TCGv_i64 zero = tcg_constant_i64(0);
-    tcg_gen_movcond_i64(TCG_COND_GTU, dest, source, max_val, max_val, source);
-    tcg_gen_movcond_i64(TCG_COND_LT, dest, source, zero, zero, dest);
+    tcg_gen_movcond_i64(TCG_COND_GTU, tmp, source, max_val, max_val, source);
+    tcg_gen_movcond_i64(TCG_COND_LT, tmp, source, zero, zero, tmp);
+    tcg_gen_mov_i64(dest, tmp);
 }
 
 void gen_satu_i64_ovfl(TCGv ovfl, TCGv_i64 dest, TCGv_i64 source, int width)
 {
+    TCGv_i64 tmp = tcg_temp_new_i64();    /* In case dest == source */
     TCGv_i64 ovfl_64;
-    gen_satu_i64(dest, source, width);
+    gen_satu_i64(tmp, source, width);
     ovfl_64 = tcg_temp_new_i64();
-    tcg_gen_setcond_i64(TCG_COND_NE, ovfl_64, dest, source);
+    tcg_gen_setcond_i64(TCG_COND_NE, ovfl_64, tmp, source);
+    tcg_gen_mov_i64(dest, tmp);
     tcg_gen_trunc_i64_tl(ovfl, ovfl_64);
 }
 
diff --git a/target/hexagon/genptr.h b/target/hexagon/genptr.h
index 76e497a..a4b43c2 100644
--- a/target/hexagon/genptr.h
+++ b/target/hexagon/genptr.h
@@ -35,7 +35,9 @@
 void gen_store8i(TCGv_env cpu_env, TCGv vaddr, int64_t src, uint32_t slot);
 TCGv gen_read_reg(TCGv result, int num);
 TCGv gen_read_preg(TCGv pred, uint8_t num);
-void gen_log_reg_write(int rnum, TCGv val);
+TCGv get_result_gpr(DisasContext *ctx, int rnum);
+TCGv get_result_pred(DisasContext *ctx, int pnum);
+void gen_log_reg_write(DisasContext *ctx, int rnum, TCGv val);
 void gen_log_pred_write(DisasContext *ctx, int pnum, TCGv val);
 void gen_set_usr_field(DisasContext *ctx, int field, TCGv val);
 void gen_set_usr_fieldi(DisasContext *ctx, int field, int x);
@@ -58,4 +60,6 @@
 void gen_set_half_i64(int N, TCGv_i64 result, TCGv src);
 void probe_noshuf_load(TCGv va, int s, int mi);
 
+extern const target_ulong reg_immut_masks[TOTAL_PER_THREAD_REGS];
+
 #endif
diff --git a/target/hexagon/helper.h b/target/hexagon/helper.h
index ed7f984..fa0ebaf 100644
--- a/target/hexagon/helper.h
+++ b/target/hexagon/helper.h
@@ -21,7 +21,7 @@
 DEF_HELPER_FLAGS_2(raise_exception, TCG_CALL_NO_RETURN, noreturn, env, i32)
 DEF_HELPER_1(debug_start_packet, void, env)
 DEF_HELPER_FLAGS_3(debug_check_store_width, TCG_CALL_NO_WG, void, env, int, int)
-DEF_HELPER_FLAGS_3(debug_commit_end, TCG_CALL_NO_WG, void, env, int, int)
+DEF_HELPER_FLAGS_5(debug_commit_end, TCG_CALL_NO_WG, void, env, i32, int, int, int)
 DEF_HELPER_2(commit_store, void, env, int)
 DEF_HELPER_3(gather_store, void, env, i32, int)
 DEF_HELPER_1(commit_hvx_stores, void, env)
@@ -29,8 +29,10 @@
 DEF_HELPER_FLAGS_1(fbrev, TCG_CALL_NO_RWG_SE, i32, i32)
 DEF_HELPER_3(sfrecipa, i64, env, f32, f32)
 DEF_HELPER_2(sfinvsqrta, i64, env, f32)
-DEF_HELPER_4(vacsh_val, s64, env, s64, s64, s64)
+DEF_HELPER_5(vacsh_val, s64, env, s64, s64, s64, i32)
 DEF_HELPER_FLAGS_4(vacsh_pred, TCG_CALL_NO_RWG_SE, s32, env, s64, s64, s64)
+DEF_HELPER_FLAGS_2(cabacdecbin_val, TCG_CALL_NO_RWG_SE, s64, s64, s64)
+DEF_HELPER_FLAGS_2(cabacdecbin_pred, TCG_CALL_NO_RWG_SE, s32, s64, s64)
 
 /* Floating point */
 DEF_HELPER_2(conv_sf2df, f64, env, f32)
diff --git a/target/hexagon/hex_common.py b/target/hexagon/hex_common.py
index 40f28ca..f3aac55 100755
--- a/target/hexagon/hex_common.py
+++ b/target/hexagon/hex_common.py
@@ -30,6 +30,9 @@
 overrides = {}  # tags with helper overrides
 idef_parser_enabled = {}  # tags enabled for idef-parser
 
+def bad_register(*args):
+    args_str = ", ".join(map(str, args))
+    raise Exception(f"Bad register parse: {args_str}")
 
 # We should do this as a hash for performance,
 # but to keep order let's keep it as a list.
@@ -97,6 +100,12 @@
     add_qemu_macro_attrib("fSET_LPCFG", "A_IMPLICIT_WRITES_USR")
     add_qemu_macro_attrib("fLOAD", "A_SCALAR_LOAD")
     add_qemu_macro_attrib("fSTORE", "A_SCALAR_STORE")
+    add_qemu_macro_attrib('fLSBNEW0', 'A_IMPLICIT_READS_P0')
+    add_qemu_macro_attrib('fLSBNEW0NOT', 'A_IMPLICIT_READS_P0')
+    add_qemu_macro_attrib('fREAD_P0', 'A_IMPLICIT_READS_P0')
+    add_qemu_macro_attrib('fLSBNEW1', 'A_IMPLICIT_READS_P1')
+    add_qemu_macro_attrib('fLSBNEW1NOT', 'A_IMPLICIT_READS_P1')
+    add_qemu_macro_attrib('fREAD_P3', 'A_IMPLICIT_READS_P3')
 
     # Recurse down macros, find attributes from sub-macros
     macroValues = list(macros.values())
@@ -241,9 +250,10 @@
 
 def need_slot(tag):
     if (
-        ("A_CONDEXEC" in attribdict[tag] and "A_JUMP" not in attribdict[tag])
-        or "A_STORE" in attribdict[tag]
-        or "A_LOAD" in attribdict[tag]
+        "A_CVI_SCATTER" not in attribdict[tag]
+        and "A_CVI_GATHER" not in attribdict[tag]
+        and ("A_STORE" in attribdict[tag]
+             or "A_LOAD" in attribdict[tag])
     ):
         return 1
     else:
@@ -270,6 +280,9 @@
     return "A_COF" in attribdict[tag]
 
 
+def need_pkt_need_commit(tag):
+    return 'A_IMPLICIT_WRITES_USR' in attribdict[tag]
+
 def need_condexec_reg(tag, regs):
     if "A_CONDEXEC" in attribdict[tag]:
         for regtype, regid, toss, numregs in regs:
diff --git a/target/hexagon/iclass.c b/target/hexagon/iclass.c
index 6091286..c3f8523 100644
--- a/target/hexagon/iclass.c
+++ b/target/hexagon/iclass.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -51,8 +51,10 @@
         return SLOTS_0;
     } else if ((opcode == J2_trap0) ||
                (opcode == Y2_isync) ||
-               (opcode == J2_pause) || (opcode == J4_hintjumpr)) {
+               (opcode == J2_pause)) {
         return SLOTS_2;
+    } else if (opcode == J4_hintjumpr) {
+        return SLOTS_23;
     } else if (GET_ATTRIB(opcode, A_CRSLOT23)) {
         return SLOTS_23;
     } else if (GET_ATTRIB(opcode, A_RESTRICT_PREFERSLOT0)) {
diff --git a/target/hexagon/idef-parser/idef-parser.lex b/target/hexagon/idef-parser/idef-parser.lex
index 5eb8ac5..cd5958e 100644
--- a/target/hexagon/idef-parser/idef-parser.lex
+++ b/target/hexagon/idef-parser/idef-parser.lex
@@ -401,12 +401,39 @@
                            }
                            return SIGN;
                          }
-"0x"{HEX_DIGIT}+         |
-{DIGIT}+                 { yylval->rvalue.type = IMMEDIATE;
-                           yylval->rvalue.bit_width = 32;
-                           yylval->rvalue.signedness = SIGNED;
+"0x"{HEX_DIGIT}+         { uint64_t value = strtoull(yytext, NULL, 0);
+                           yylval->rvalue.type = IMMEDIATE;
                            yylval->rvalue.imm.type = VALUE;
-                           yylval->rvalue.imm.value = strtoull(yytext, NULL, 0);
+                           yylval->rvalue.imm.value = value;
+                           if (value <= INT_MAX) {
+                               yylval->rvalue.bit_width = sizeof(int) * 8;
+                               yylval->rvalue.signedness = SIGNED;
+                           } else if (value <= UINT_MAX) {
+                               yylval->rvalue.bit_width = sizeof(unsigned int) * 8;
+                               yylval->rvalue.signedness = UNSIGNED;
+                           } else if (value <= LONG_MAX) {
+                               yylval->rvalue.bit_width = sizeof(long) * 8;
+                               yylval->rvalue.signedness = SIGNED;
+                           } else if (value <= ULONG_MAX) {
+                               yylval->rvalue.bit_width = sizeof(unsigned long) * 8;
+                               yylval->rvalue.signedness = UNSIGNED;
+                           } else {
+                               g_assert_not_reached();
+                           }
+                           return IMM; }
+{DIGIT}+                 { int64_t value = strtoll(yytext, NULL, 0);
+                           yylval->rvalue.type = IMMEDIATE;
+                           yylval->rvalue.imm.type = VALUE;
+                           yylval->rvalue.imm.value = value;
+                           if (value >= INT_MIN && value <= INT_MAX) {
+                               yylval->rvalue.bit_width = sizeof(int) * 8;
+                               yylval->rvalue.signedness = SIGNED;
+                           } else if (value >= LONG_MIN && value <= LONG_MAX) {
+                               yylval->rvalue.bit_width = sizeof(long) * 8;
+                               yylval->rvalue.signedness = SIGNED;
+                           } else {
+                              g_assert_not_reached();
+                           }
                            return IMM; }
 "0x"{HEX_DIGIT}+"ULL"    |
 {DIGIT}+"ULL"            { yylval->rvalue.type = IMMEDIATE;
diff --git a/target/hexagon/idef-parser/idef-parser.y b/target/hexagon/idef-parser/idef-parser.y
index 5444fd4..5c98395 100644
--- a/target/hexagon/idef-parser/idef-parser.y
+++ b/target/hexagon/idef-parser/idef-parser.y
@@ -594,8 +594,6 @@
        | CAST rvalue
          {
              @1.last_column = @2.last_column;
-             /* Assign target signedness */
-             $2.signedness = $1.signedness;
              $$ = gen_cast_op(c, &@1, &$2, $1.bit_width, $1.signedness);
          }
        | rvalue EQ rvalue
@@ -685,7 +683,7 @@
              yyassert(c, &@1, $5.type == IMMEDIATE &&
                       $5.imm.type == VALUE,
                       "SXT expects immediate values\n");
-             $$ = gen_extend_op(c, &@1, &$3, $5.imm.value, &$7, SIGNED);
+             $$ = gen_extend_op(c, &@1, &$3, 64, &$7, SIGNED);
          }
        | ZXT '(' rvalue ',' IMM ',' rvalue ')'
          {
@@ -693,7 +691,7 @@
              yyassert(c, &@1, $5.type == IMMEDIATE &&
                       $5.imm.type == VALUE,
                       "ZXT expects immediate values\n");
-             $$ = gen_extend_op(c, &@1, &$3, $5.imm.value, &$7, UNSIGNED);
+             $$ = gen_extend_op(c, &@1, &$3, 64, &$7, UNSIGNED);
          }
        | '(' rvalue ')'
          {
diff --git a/target/hexagon/idef-parser/parser-helpers.c b/target/hexagon/idef-parser/parser-helpers.c
index 8734218..7b5ebaf 100644
--- a/target/hexagon/idef-parser/parser-helpers.c
+++ b/target/hexagon/idef-parser/parser-helpers.c
@@ -167,8 +167,9 @@
     EMIT(c, "hex_gpr[%u]", reg->id);
 }
 
-void imm_print(Context *c, YYLTYPE *locp, HexImm *imm)
+void imm_print(Context *c, YYLTYPE *locp, HexValue *rvalue)
 {
+    HexImm *imm = &rvalue->imm;
     switch (imm->type) {
     case I:
         EMIT(c, "i");
@@ -177,7 +178,21 @@
         EMIT(c, "%ciV", imm->id);
         break;
     case VALUE:
-        EMIT(c, "((int64_t) %" PRIu64 "ULL)", (int64_t) imm->value);
+        if (rvalue->bit_width == 32) {
+            if (rvalue->signedness == UNSIGNED) {
+                EMIT(c, "((uint32_t) 0x%" PRIx32 ")", (uint32_t) imm->value);
+            }  else {
+                EMIT(c, "((int32_t) 0x%" PRIx32 ")", (int32_t) imm->value);
+            }
+        } else if (rvalue->bit_width == 64) {
+            if (rvalue->signedness == UNSIGNED) {
+                EMIT(c, "((uint64_t) 0x%" PRIx64 "ULL)", (uint64_t) imm->value);
+            } else {
+                EMIT(c, "((int64_t) 0x%" PRIx64 "LL)", (int64_t) imm->value);
+            }
+        } else {
+            g_assert_not_reached();
+        }
         break;
     case QEMU_TMP:
         EMIT(c, "qemu_tmp_%" PRIu64, imm->index);
@@ -213,7 +228,7 @@
       tmp_print(c, locp, &rvalue->tmp);
       break;
   case IMMEDIATE:
-      imm_print(c, locp, &rvalue->imm);
+      imm_print(c, locp, rvalue);
       break;
   case VARID:
       var_print(c, locp, &rvalue->var);
@@ -386,13 +401,10 @@
 
     if (rvalue->type == IMMEDIATE) {
         HexValue res = gen_imm_qemu_tmp(c, locp, 64, rvalue->signedness);
-        bool is_unsigned = (rvalue->signedness == UNSIGNED);
-        const char *sign_suffix = is_unsigned ? "u" : "";
         gen_c_int_type(c, locp, 64, rvalue->signedness);
-        OUT(c, locp, " ", &res, " = ");
-        OUT(c, locp, "(", sign_suffix, "int64_t) ");
-        OUT(c, locp, "(", sign_suffix, "int32_t) ");
-        OUT(c, locp, rvalue, ";\n");
+        OUT(c, locp, " ", &res, " = (");
+        gen_c_int_type(c, locp, 64, rvalue->signedness);
+        OUT(c, locp, ")", rvalue, ";\n");
         return res;
     } else {
         HexValue res = gen_tmp(c, locp, 64, rvalue->signedness);
@@ -959,33 +971,18 @@
                      unsigned target_width,
                      HexSignedness signedness)
 {
+    HexValue res;
     assert_signedness(c, locp, src->signedness);
     if (src->bit_width == target_width) {
-        return *src;
-    } else if (src->type == IMMEDIATE) {
-        HexValue res = *src;
-        res.bit_width = target_width;
-        res.signedness = signedness;
-        return res;
+        res = *src;
+    } else if (src->bit_width < target_width) {
+        res = gen_rvalue_extend(c, locp, src);
     } else {
-        HexValue res = gen_tmp(c, locp, target_width, signedness);
-        /* Truncate */
-        if (src->bit_width > target_width) {
-            OUT(c, locp, "tcg_gen_trunc_i64_tl(", &res, ", ", src, ");\n");
-        } else {
-            assert_signedness(c, locp, src->signedness);
-            if (src->signedness == UNSIGNED) {
-                /* Extend unsigned */
-                OUT(c, locp, "tcg_gen_extu_i32_i64(",
-                    &res, ", ", src, ");\n");
-            } else {
-                /* Extend signed */
-                OUT(c, locp, "tcg_gen_ext_i32_i64(",
-                    &res, ", ", src, ");\n");
-            }
-        }
-        return res;
+        /* src->bit_width > target_width */
+        res = gen_rvalue_truncate(c, locp, src);
     }
+    res.signedness = signedness;
+    return res;
 }
 
 
@@ -1123,7 +1120,7 @@
                        HexValue *value,
                        HexSignedness signedness)
 {
-    unsigned bit_width = (dst_width = 64) ? 64 : 32;
+    unsigned bit_width = (dst_width == 64) ? 64 : 32;
     HexValue value_m = *value;
     HexValue src_width_m = *src_width;
 
@@ -1318,7 +1315,7 @@
     value_m = rvalue_materialize(c, locp, &value_m);
     OUT(c,
         locp,
-        "gen_log_reg_write(", &reg->reg.id, ", ",
+        "gen_log_reg_write(ctx, ", &reg->reg.id, ", ",
         &value_m, ");\n");
 }
 
@@ -1854,7 +1851,7 @@
         *pred = gen_tmp(c, locp, 32, UNSIGNED);
         if (is_dotnew) {
             OUT(c, locp, "tcg_gen_mov_i32(", pred,
-                ", hex_new_pred_value[");
+                ", ctx->new_pred_value[");
             OUT(c, locp, pred_str, "]);\n");
         } else {
             OUT(c, locp, "gen_read_preg(", pred, ", ", pred_str, ");\n");
diff --git a/target/hexagon/idef-parser/parser-helpers.h b/target/hexagon/idef-parser/parser-helpers.h
index 1239d23..7c58087 100644
--- a/target/hexagon/idef-parser/parser-helpers.h
+++ b/target/hexagon/idef-parser/parser-helpers.h
@@ -80,7 +80,7 @@
 
 void reg_print(Context *c, YYLTYPE *locp, HexReg *reg);
 
-void imm_print(Context *c, YYLTYPE *locp, HexImm *imm);
+void imm_print(Context *c, YYLTYPE *locp, HexValue *rvalue);
 
 void var_print(Context *c, YYLTYPE *locp, HexVar *var);
 
diff --git a/target/hexagon/imported/branch.idef b/target/hexagon/imported/branch.idef
index 88f5f48..93e2e37 100644
--- a/target/hexagon/imported/branch.idef
+++ b/target/hexagon/imported/branch.idef
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -34,6 +34,9 @@
 Q6INSN(J2_jumpr,"jumpr Rs32",ATTRIBS(A_JINDIR), "indirect unconditional jump",
 {fJUMPR(RsN,RsV,COF_TYPE_JUMPR);})
 
+Q6INSN(J2_jumprh,"jumprh Rs32",ATTRIBS(A_JINDIR, A_HINTED_COF), "indirect unconditional jump",
+{fJUMPR(RsN,RsV,COF_TYPE_JUMPR);})
+
 #define OLDCOND_JUMP(TAG,OPER,OPER2,ATTRIB,DESCR,SEMANTICS) \
 Q6INSN(TAG##t,"if (Pu4) "OPER":nt "OPER2,ATTRIB,DESCR,{fBRANCH_SPECULATE_STALL(fLSBOLD(PuV),,SPECULATE_NOT_TAKEN,12,0); if (fLSBOLD(PuV)) { SEMANTICS; }}) \
 Q6INSN(TAG##f,"if (!Pu4) "OPER":nt "OPER2,ATTRIB,DESCR,{fBRANCH_SPECULATE_STALL(fLSBOLDNOT(PuV),,SPECULATE_NOT_TAKEN,12,0); if (fLSBOLDNOT(PuV)) { SEMANTICS; }}) \
@@ -196,6 +199,8 @@
 Q6INSN(J2_callrf,"if (!Pu4) callr Rs32",ATTRIBS(CINDIR_STD),"indirect conditional call if false",
 {fBRANCH_SPECULATE_STALL(fLSBOLDNOT(PuV),,SPECULATE_NOT_TAKEN,12,0);if (fLSBOLDNOT(PuV)) { fCALLR(RsV); }})
 
+Q6INSN(J2_callrh,"callrh Rs32",ATTRIBS(CINDIR_STD, A_HINTED_COF), "hinted indirect unconditional call",
+{ fCALLR(RsV); })
 
 
 
diff --git a/target/hexagon/imported/encode_pp.def b/target/hexagon/imported/encode_pp.def
index d71c04c..0cd30a5 100644
--- a/target/hexagon/imported/encode_pp.def
+++ b/target/hexagon/imported/encode_pp.def
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -382,14 +382,23 @@
 DEF_ENC32(L4_return_tnew_pnt, ICLASS_LD" 011 0 000 sssss PP0010vv ---ddddd")
 DEF_ENC32(L4_return_fnew_pnt, ICLASS_LD" 011 0 000 sssss PP1010vv ---ddddd")
 
-DEF_ENC32(L2_loadw_locked,ICLASS_LD" 001 0 000 sssss PP00---- -00ddddd")
+DEF_ENC32(L2_loadw_locked,ICLASS_LD" 001 0 000 sssss PP000--- 000ddddd")
 
 
 
+DEF_ENC32(L2_loadw_aq,        ICLASS_LD" 001 0 000 sssss PP001--- 000ddddd")
+DEF_ENC32(L4_loadd_aq,        ICLASS_LD" 001 0 000 sssss PP011--- 000ddddd")
 
+DEF_ENC32(R6_release_at_vi,    ICLASS_ST" 000 01 11sssss PP0ttttt --0011dd")
+DEF_ENC32(R6_release_st_vi,   ICLASS_ST" 000 01 11sssss PP0ttttt --1011dd")
 
+DEF_ENC32(S2_storew_rl_at_vi,  ICLASS_ST" 000 01 01sssss PP-ttttt --0010dd")
+DEF_ENC32(S2_storew_rl_st_vi, ICLASS_ST" 000 01 01sssss PP-ttttt --1010dd")
 
-DEF_ENC32(L4_loadd_locked,ICLASS_LD" 001 0 000 sssss PP01---- -00ddddd")
+DEF_ENC32(S4_stored_rl_at_vi,  ICLASS_ST" 000 01 11sssss PP0ttttt --0010dd")
+DEF_ENC32(S4_stored_rl_st_vi, ICLASS_ST" 000 01 11sssss PP0ttttt --1010dd")
+
+DEF_ENC32(L4_loadd_locked,ICLASS_LD" 001 0 000 sssss PP010--- 000ddddd")
 DEF_EXT_SPACE(EXTRACTW,   ICLASS_LD" 001 0 000 iiiii PP0iiiii -01iiiii")
 DEF_ENC32(Y2_dcfetchbo,   ICLASS_LD" 010 0 000 sssss PP0--iii iiiiiiii")
 
@@ -479,8 +488,8 @@
 /*                               x bus/cache     */
 /*                                    x store/cache     */
 DEF_ENC32(S2_allocframe,   ICLASS_ST" 000 01 00xxxxx PP000iii iiiiiiii")
-DEF_ENC32(S2_storew_locked,ICLASS_ST" 000 01 01sssss PP-ttttt ------dd")
-DEF_ENC32(S4_stored_locked,ICLASS_ST" 000 01 11sssss PP0ttttt ------dd")
+DEF_ENC32(S2_storew_locked,ICLASS_ST" 000 01 01sssss PP-ttttt ----00dd")
+DEF_ENC32(S4_stored_locked,ICLASS_ST" 000 01 11sssss PP0ttttt ----00dd")
 DEF_ENC32(Y2_dczeroa,      ICLASS_ST" 000 01 10sssss PP0----- --------")
 
 
@@ -515,6 +524,7 @@
 
 DEF_FIELDROW_DESC32(ICLASS_J" 0000 -------- PP------ --------","[#0] PC=(Rs), R31=return")
 DEF_ENC32(J2_callr,     ICLASS_J" 0000  101sssss  PP------  --------")
+DEF_ENC32(J2_callrh,    ICLASS_J" 0000  110sssss  PP------  --------")
 
 DEF_FIELDROW_DESC32(ICLASS_J" 0001 -------- PP------ --------","[#1] if (Pu) PC=(Rs), R31=return")
 DEF_ENC32(J2_callrt,    ICLASS_J" 0001  000sssss  PP----uu  --------")
@@ -522,6 +532,7 @@
 
 DEF_FIELDROW_DESC32(ICLASS_J" 0010 -------- PP------ --------","[#2] PC=(Rs); ")
 DEF_ENC32(J2_jumpr,      ICLASS_J" 0010  100sssss  PP------  --------")
+DEF_ENC32(J2_jumprh,     ICLASS_J" 0010  110sssss  PP------  --------")
 DEF_ENC32(J4_hintjumpr,  ICLASS_J" 0010  101sssss  PP------  --------")
 
 DEF_FIELDROW_DESC32(ICLASS_J" 0011 -------- PP------ --------","[#3] if (Pu) PC=(Rs) ")
diff --git a/target/hexagon/imported/ldst.idef b/target/hexagon/imported/ldst.idef
index 237634b..5319817 100644
--- a/target/hexagon/imported/ldst.idef
+++ b/target/hexagon/imported/ldst.idef
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -128,6 +128,24 @@
 
 #define A_RETURN A_RESTRICT_COF_MAX1,A_RESTRICT_SLOT0ONLY,A_RESTRICT_NOSLOT1_STORE,A_RET_TYPE,A_DEALLOCRET
 
+/**** Load Acquire Store Release Instructions****/
+
+
+
+Q6INSN(L2_loadw_aq,"Rd32=memw_aq(Rs32)",ATTRIBS(A_REGWRSIZE_4B,A_ACQUIRE,A_RESTRICT_SLOT0ONLY,A_MEMSIZE_4B,A_LOAD),"Load Acquire Word",
+{ fEA_REG(RsV); fLOAD(1,4,u,EA,RdV); })
+Q6INSN(L4_loadd_aq,"Rdd32=memd_aq(Rs32)",ATTRIBS(A_REGWRSIZE_8B,A_ACQUIRE,A_RESTRICT_SLOT0ONLY,A_MEMSIZE_8B,A_LOAD),"Load Acquire Double integer",
+{ fEA_REG(RsV); fLOAD(1,8,u,EA,RddV); })
+
+Q6INSN(R6_release_at_vi,"release(Rs32):at",ATTRIBS(A_MEMSIZE_0B,A_RELEASE,A_STORE,A_VTCM_ALLBANK_ACCESS,A_RLS_INNER,A_RLS_ALL_THREAD,A_RESTRICT_NOPACKET,A_RESTRICT_SLOT0ONLY), "Release lock", {fEA_REG(RsV); fSTORE(1,0,EA,RsV); })
+Q6INSN(R6_release_st_vi,"release(Rs32):st",ATTRIBS(A_MEMSIZE_0B,A_RELEASE,A_STORE,A_VTCM_ALLBANK_ACCESS,A_RLS_INNER,A_RLS_SAME_THREAD,A_RESTRICT_NOPACKET,A_RESTRICT_SLOT0ONLY), "Release lock", {fEA_REG(RsV); fSTORE(1,0,EA,RsV); })
+
+Q6INSN(S2_storew_rl_at_vi,"memw_rl(Rs32):at=Rt32",ATTRIBS(A_REGWRSIZE_4B,A_RELEASE,A_VTCM_ALLBANK_ACCESS,A_RLS_INNER,A_RLS_ALL_THREAD,A_RESTRICT_NOPACKET,A_MEMSIZE_4B,A_STORE,A_RESTRICT_SLOT0ONLY),"Store Release Word", { fEA_REG(RsV); fSTORE(1,4,EA,RtV); })
+Q6INSN(S4_stored_rl_at_vi,"memd_rl(Rs32):at=Rtt32",ATTRIBS(A_REGWRSIZE_8B,A_RELEASE,A_VTCM_ALLBANK_ACCESS,A_RLS_INNER,A_RLS_ALL_THREAD,A_RESTRICT_NOPACKET,A_MEMSIZE_8B,A_STORE,A_RESTRICT_SLOT0ONLY),"Store Release Double integer", { fEA_REG(RsV); fSTORE(1,8,EA,RttV); })
+
+Q6INSN(S2_storew_rl_st_vi,"memw_rl(Rs32):st=Rt32",ATTRIBS(A_REGWRSIZE_4B,A_RELEASE,A_VTCM_ALLBANK_ACCESS,A_RLS_INNER,A_RLS_SAME_THREAD,A_RESTRICT_NOPACKET,A_MEMSIZE_4B,A_STORE,A_RESTRICT_SLOT0ONLY),"Store Release Word", { fEA_REG(RsV); fSTORE(1,4,EA,RtV); })
+Q6INSN(S4_stored_rl_st_vi,"memd_rl(Rs32):st=Rtt32",ATTRIBS(A_REGWRSIZE_8B,A_RELEASE,A_VTCM_ALLBANK_ACCESS,A_RLS_INNER,A_RLS_SAME_THREAD,A_RESTRICT_NOPACKET,A_MEMSIZE_8B,A_STORE,A_RESTRICT_SLOT0ONLY),"Store Release Double integer", { fEA_REG(RsV); fSTORE(1,8,EA,RttV); })
+
 Q6INSN(L2_deallocframe,"Rdd32=deallocframe(Rs32):raw", ATTRIBS(A_REGWRSIZE_8B,A_MEMSIZE_8B,A_LOAD,A_DEALLOCFRAME), "Deallocate stack frame",
 { fHIDE(size8u_t tmp;) fEA_REG(RsV);
   fLOAD(1,8,u,EA,tmp);
diff --git a/target/hexagon/imported/mmvec/encode_ext.def b/target/hexagon/imported/mmvec/encode_ext.def
index 6fbbe2c..402438f 100644
--- a/target/hexagon/imported/mmvec/encode_ext.def
+++ b/target/hexagon/imported/mmvec/encode_ext.def
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -257,6 +257,11 @@
 DEF_ENC(V6_vasruwuhsat,         ICLASS_CJ" 1 000 vvv vvttt PP 1 uuuuu 100 ddddd") //
 DEF_ENC(V6_vasruhubsat,            ICLASS_CJ" 1 000 vvv vvttt PP 1 uuuuu 101 ddddd") //
 
+DEF_ENC(V6_vasrvuhubrndsat,"00011101000vvvvvPP0uuuuu011ddddd")
+DEF_ENC(V6_vasrvuhubsat,"00011101000vvvvvPP0uuuuu010ddddd")
+DEF_ENC(V6_vasrvwuhrndsat,"00011101000vvvvvPP0uuuuu001ddddd")
+DEF_ENC(V6_vasrvwuhsat,"00011101000vvvvvPP0uuuuu000ddddd")
+
 /***************************************************************
 *
 *  Group #1, Uses Q6 Rt32
@@ -716,6 +721,7 @@
 
 DEF_ENC(V6_vavguw,        ICLASS_CJ" 1 111 000 vvvvv PP 1 uuuuu 010 ddddd") //
 DEF_ENC(V6_vavguwrnd,    ICLASS_CJ" 1 111 000 vvvvv PP 1 uuuuu 011 ddddd") //
+DEF_ENC(V6_vassign_tmp,"00011110--0---01PP0uuuuu110ddddd")
 DEF_ENC(V6_vavgb,        ICLASS_CJ" 1 111 000 vvvvv PP 1 uuuuu 100 ddddd") //
 DEF_ENC(V6_vavgbrnd,    ICLASS_CJ" 1 111 000 vvvvv PP 1 uuuuu 101 ddddd") //
 DEF_ENC(V6_vnavgb,        ICLASS_CJ" 1 111 000 vvvvv PP 1 uuuuu 110 ddddd") //
@@ -730,6 +736,8 @@
 DEF_ENC(V6_vsatuwuh,    ICLASS_CJ" 1 111 001 vvvvv PP 0 uuuuu 110 ddddd") //
 DEF_ENC(V6_vdealb4w,     ICLASS_CJ" 1 111 001 vvvvv PP 0 uuuuu 111 ddddd") //
 
+DEF_ENC(V6_v6mpyvubs10_vxx, 	ICLASS_CJ" 1 111 001 vvvvv PP 1 uuuuu 0ii xxxxx")
+DEF_ENC(V6_v6mpyhubs10_vxx, 	ICLASS_CJ" 1 111 001 vvvvv PP 1 uuuuu 1ii xxxxx")
 
 DEF_ENC(V6_vmpyowh_rnd,     ICLASS_CJ" 1 111 010 vvvvv PP 0 uuuuu 000 ddddd") //
 DEF_ENC(V6_vshuffeb,      ICLASS_CJ" 1 111 010 vvvvv PP 0 uuuuu 001 ddddd") //
@@ -739,6 +747,11 @@
 DEF_ENC(V6_vshufoeh,      ICLASS_CJ" 1 111 010 vvvvv PP 0 uuuuu 101 ddddd") //
 DEF_ENC(V6_vshufoeb,      ICLASS_CJ" 1 111 010 vvvvv PP 0 uuuuu 110 ddddd") //
 DEF_ENC(V6_vcombine,     ICLASS_CJ" 1 111 010 vvvvv PP 0 uuuuu 111 ddddd") //
+DEF_ENC(V6_vcombine_tmp,"00011110101vvvvvPP0uuuuu111ddddd")
+
+DEF_ENC(V6_v6mpyvubs10,  ICLASS_CJ" 1 111 010 vvvvv PP 1 uuuuu 0ii ddddd")
+DEF_ENC(V6_v6mpyhubs10,  ICLASS_CJ" 1 111 010 vvvvv PP 1 uuuuu 1ii ddddd")
+
 
 DEF_ENC(V6_vmpyieoh,     ICLASS_CJ" 1 111 011 vvvvv PP 0 uuuuu 000 ddddd") //
 DEF_ENC(V6_vadduwsat,     ICLASS_CJ" 1 111 011 vvvvv PP 0 uuuuu 001 ddddd") //
@@ -789,6 +802,7 @@
 DEF_ENC(V6_vrounduwuh,     ICLASS_CJ" 1 111 111 vvvvv PP 0 uuuuu 100 ddddd") //
 DEF_ENC(V6_vmpyewuh,    ICLASS_CJ" 1 111 111 vvvvv PP 0 uuuuu 101 ddddd")
 DEF_ENC(V6_vmpyowh,        ICLASS_CJ" 1 111 111 vvvvv PP 0 uuuuu 111 ddddd")
+DEF_ENC(V6_vmpyuhvs,"00011111110vvvvvPP1uuuuu111ddddd")
 
 
 #endif /* NO MMVEC */
diff --git a/target/hexagon/imported/mmvec/ext.idef b/target/hexagon/imported/mmvec/ext.idef
index 8ca5a60..ead32c2 100644
--- a/target/hexagon/imported/mmvec/ext.idef
+++ b/target/hexagon/imported/mmvec/ext.idef
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -62,6 +62,9 @@
 DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
 
 
+#define ITERATOR_INSN_SHIFT3_SLOT(WIDTH,TAG,SYNTAX,DESCR,CODE) \
+EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS,A_CVI_VS_3SRC,A_NOTE_SHIFT_RESOURCE,A_NOTE_NOVP,A_NOTE_VA_UNARY),  \
+DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
 
 #define ITERATOR_INSN_SHIFT_SLOT_VV_LATE(WIDTH,TAG,SYNTAX,DESCR,CODE) \
 EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS),  \
@@ -116,6 +119,10 @@
 EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV),  \
 DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
 
+#define ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_VX_FWD(WIDTH,TAG,SYNTAX,DESCR,CODE) \
+EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV),  \
+DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
+
 #define ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
 ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX2,DESCR,CODE)
 
@@ -976,6 +983,22 @@
 NARROWING_SHIFT(16,vasrhbsat,fSETBYTE,b,h,:sat,fVSATB,fVNOROUND,0x7)
 NARROWING_SHIFT(16,vasrhbrndsat,fSETBYTE,b,h,:rnd:sat,fVSATB,fVROUND,0x7)
 
+#define NARROWING_VECTOR_SHIFT(ITERSIZE,TAG,DSTM,DSTTYPE,SRCTYPE,SRCTYPE2,SYNOPTS,SATFUNC,RNDFUNC,SHAMTMASK) \
+ITERATOR_INSN_SHIFT3_SLOT(ITERSIZE,TAG, \
+"Vd32." #DSTTYPE "=vasr(Vuu32." #SRCTYPE ",Vv32." #SRCTYPE2 ")" #SYNOPTS, \
+"Vector shift by vector right and shuffle", \
+    fHIDE(int )shamt = VvV.SRCTYPE2[2*i+0] & SHAMTMASK; \
+    DSTM(0,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VuuV.v[0].SRCTYPE[i],shamt) >> shamt)); \
+    shamt = VvV.SRCTYPE2[2*i+1] & SHAMTMASK; \
+    DSTM(1,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VuuV.v[1].SRCTYPE[i],shamt) >> shamt)))
+
+/* WORD TO HALF*/
+NARROWING_VECTOR_SHIFT(32,vasrvwuhsat,fSETHALF,uh,w,uh,:sat,fVSATUH,fVNOROUND,0xF)
+NARROWING_VECTOR_SHIFT(32,vasrvwuhrndsat,fSETHALF,uh,w,uh,:rnd:sat,fVSATUH,fVROUND,0xF)
+/* HALF TO BYTE*/
+NARROWING_VECTOR_SHIFT(16,vasrvuhubsat,fSETBYTE,ub,uh,ub,:sat,fVSATUB,fVNOROUND,0x7)
+NARROWING_VECTOR_SHIFT(16,vasrvuhubrndsat,fSETBYTE,ub,uh,ub,:rnd:sat,fVSATUB,fVROUND,0x7)
+
 NARROWING_SHIFT_NOV1(16,vasruhubsat,fSETBYTE,ub,uh,:sat,fVSATUB,fVNOROUND,0x7)
 NARROWING_SHIFT_NOV1(16,vasruhubrndsat,fSETBYTE,ub,uh,:rnd:sat,fVSATUB,fVROUND,0x7)
 
@@ -1360,6 +1383,9 @@
 
 
 
+ITERATOR_INSN_MPY_SLOT(16,vmpyuhvs, "Vd32.uh=vmpy(Vu32.uh,Vv32.uh):>>16",
+"Vector by Vector Unsigned Halfword Multiply with 16 bit rightshift",
+    VdV.uh[i] = fGETUHALF(1,fMPY16UU(VuV.uh[i],VvV.uh[i])))
 
 
 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhus, "Vdd32=vmpyhus(Vu32,Vv32)","Vdd32.w=vmpy(Vu32.h,Vv32.uh)",
@@ -2038,6 +2064,24 @@
 
 ///////////////////////////////////////////////////////////////////////////
 
+EXTINSN(V6_vcombine_tmp, "Vdd32.tmp=vcombine(Vu32,Vv32)",    ATTRIBS(A_EXTENSION,A_CVI,A_CVI_REMAP,A_CVI_TMP,A_NO_INTRINSIC),
+"Vector assign tmp, Any two to Vector Pair ",
+{
+   fHIDE(int i;)
+    fVFOREACH(8, i) {
+           VddV.v[0].ub[i] = VvV.ub[i];
+           VddV.v[1].ub[i] = VuV.ub[i];
+    }
+})
+
+EXTINSN(V6_vassign_tmp, "Vd32.tmp=Vu32",    ATTRIBS(A_EXTENSION,A_CVI,A_CVI_REMAP,A_CVI_TMP,A_NO_INTRINSIC),
+"Vector assign tmp, Any two to Vector Pair ",
+{
+   fHIDE(int i;)
+    fVFOREACH(32, i) {
+           VdV.w[i]=VuV.w[i];
+    }
+})
 
 /*********************************************************
 * GENERAL PERMUTE NETWORKS
@@ -2507,6 +2551,281 @@
 })
 
 
+ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_VX_FWD(32, v6mpyvubs10_vxx, "Vxx32.w+=v6mpy(Vuu32.ub,Vvv32.b,#u2):v", "",
+    fHIDE(size2s_t c00;)
+    fGET10BIT(c00, VvvV.v[0].uw[i], 0)
+    fHIDE(size2s_t c01;)
+    fGET10BIT(c01, VvvV.v[0].uw[i], 1)
+    fHIDE(size2s_t c02;)
+    fGET10BIT(c02, VvvV.v[0].uw[i], 2)
+
+	fHIDE(size2s_t c10;)
+    fGET10BIT(c10, VvvV.v[1].uw[i], 0)
+    fHIDE(size2s_t c11;)
+    fGET10BIT(c11, VvvV.v[1].uw[i], 1)
+    fHIDE(size2s_t c12;)
+    fGET10BIT(c12, VvvV.v[1].uw[i], 2)
+
+    if (uiV == 0) {
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c10);
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c11);
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c12);
+
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c00);
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01);
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c02);
+
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c10);
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c11);
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c12);
+
+    } else if (uiV == 1) {
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c00);
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c01);
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c02);
+
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c10);
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c11);
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c12);
+
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c00);
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01);
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c02);
+
+    } else if (uiV == 2) {
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c10);
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11);
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c12);
+
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c00);
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c01);
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c02);
+
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c10);
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c11);
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c12);
+
+    } else if (uiV == 3) {
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c00);
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c01);
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c02);
+
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c10);
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11);
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c12);
+
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c00);
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c01);
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c02);
+    }
+)
+ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_VX_FWD(32, v6mpyhubs10_vxx, "Vxx32.w+=v6mpy(Vuu32.ub,Vvv32.b,#u2):h", "",
+    fHIDE(size2s_t c00;)
+    fGET10BIT(c00, VvvV.v[0].uw[i], 0)
+    fHIDE(size2s_t c01;)
+    fGET10BIT(c01, VvvV.v[0].uw[i], 1)
+    fHIDE(size2s_t c02;)
+    fGET10BIT(c02, VvvV.v[0].uw[i], 2)
+    fHIDE(size2s_t c10;)
+    fGET10BIT(c10, VvvV.v[1].uw[i], 0)
+    fHIDE(size2s_t c11;)
+    fGET10BIT(c11, VvvV.v[1].uw[i], 1)
+    fHIDE(size2s_t c12;)
+    fGET10BIT(c12, VvvV.v[1].uw[i], 2)
+
+    if (uiV == 0) {
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c10);
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c11);
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c12);
+
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c00);
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01);
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c02);
+
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c10);
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c11);
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c12);
+
+    } else if (uiV == 1) {
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c00);
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c01);
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c02);
+
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c10);
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c11);
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c12);
+
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c00);
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01);
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c02);
+
+    }  else if (uiV == 2) {
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c10);
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11);
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c12);
+
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c00);
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c01);
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c02);
+
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c10);
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c11);
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c12);
+
+    } else if (uiV == 3) {
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c00);
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c01);
+        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c02);
+
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c10);
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11);
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c12);
+
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c00);
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c01);
+        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c02);
+    }
+)
+
+
+ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32, v6mpyvubs10, "Vdd32.w=v6mpy(Vuu32.ub,Vvv32.b,#u2):v", "",
+    fHIDE(short c00;)
+    fGET10BIT(c00, VvvV.v[0].uw[i], 0)
+    fHIDE(short c01;)
+    fGET10BIT(c01, VvvV.v[0].uw[i], 1)
+    fHIDE(short c02;)
+    fGET10BIT(c02, VvvV.v[0].uw[i], 2)
+    fHIDE(short c10;)
+    fGET10BIT(c10, VvvV.v[1].uw[i], 0)
+    fHIDE(short c11;)
+    fGET10BIT(c11, VvvV.v[1].uw[i], 1)
+    fHIDE(short c12;)
+    fGET10BIT(c12, VvvV.v[1].uw[i], 2)
+
+
+
+    if (uiV == 0) {
+        VddV.v[1].w[i]  = fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c10);
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c11);
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c12);
+
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c00);
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01);
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c02);
+
+        VddV.v[0].w[i]  = fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c10);
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c11);
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c12);
+
+    }  else if (uiV == 1) {
+        VddV.v[1].w[i]  = fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c00);
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c01);
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c02);
+
+        VddV.v[0].w[i]  = fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c10);
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c11);
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c12);
+
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c00);
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01);
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c02);
+
+    }  else if (uiV == 2) {
+        VddV.v[1].w[i]  = fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c10);
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11);
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c12);
+
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c00);
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c01);
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c02);
+
+        VddV.v[0].w[i]  = fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c10);
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c11);
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c12);
+
+    } else if (uiV == 3) {
+        VddV.v[1].w[i]  = fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c00);
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c01);
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c02);
+
+        VddV.v[0].w[i]  = fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c10);
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11);
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c12);
+
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c00);
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c01);
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c02);
+    }
+)
+
+ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32, v6mpyhubs10, "Vdd32.w=v6mpy(Vuu32.ub,Vvv32.b,#u2):h", "",
+    fHIDE(short c00;)
+    fGET10BIT(c00, VvvV.v[0].uw[i], 0)
+    fHIDE(short c01;)
+    fGET10BIT(c01, VvvV.v[0].uw[i], 1)
+    fHIDE(short c02;)
+    fGET10BIT(c02, VvvV.v[0].uw[i], 2)
+    fHIDE(short c10;)
+    fGET10BIT(c10, VvvV.v[1].uw[i], 0)
+    fHIDE(short c11;)
+    fGET10BIT(c11, VvvV.v[1].uw[i], 1)
+    fHIDE(short c12;)
+    fGET10BIT(c12, VvvV.v[1].uw[i], 2)
+
+    if (uiV == 0) {
+        VddV.v[1].w[i]  = fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c10);
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c11);
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c12);
+
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c00);
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01);
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c02);
+
+        VddV.v[0].w[i]  = fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c10);
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c11);
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c12);
+
+    }  else if (uiV == 1) {
+        VddV.v[1].w[i]  = fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c00);
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c01);
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c02);
+
+        VddV.v[0].w[i]  = fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c10);
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c11);
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c12);
+
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c00);
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01);
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c02);
+
+    }  else if (uiV == 2) {
+        VddV.v[1].w[i]  = fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c10);
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11);
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c12);
+
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c00);
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c01);
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c02);
+
+        VddV.v[0].w[i]  = fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c10);
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c11);
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c12);
+
+    } else if (uiV == 3) {
+        VddV.v[1].w[i]  = fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c00);
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c01);
+        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c02);
+
+        VddV.v[0].w[i]  = fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c10);
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11);
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c12);
+
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c00);
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c01);
+        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c02);
+    }
+)
+
 
 EXTINSN(V6_vscattermhwq,  "if (Qs4) vscatter(Rt32,Mu2,Vvv32.w).h=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA_DV,A_CVI_VM,A_MEMLIKE), "Scatter halfwords conditional",
 {
diff --git a/target/hexagon/internal.h b/target/hexagon/internal.h
index b1bfadc..d732b6b 100644
--- a/target/hexagon/internal.h
+++ b/target/hexagon/internal.h
@@ -33,6 +33,8 @@
 
 int hexagon_gdb_read_register(CPUState *cpu, GByteArray *buf, int reg);
 int hexagon_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg);
+int hexagon_hvx_gdb_read_register(CPUHexagonState *env, GByteArray *mem_buf, int n);
+int hexagon_hvx_gdb_write_register(CPUHexagonState *env, uint8_t *mem_buf, int n);
 
 void hexagon_debug_vreg(CPUHexagonState *env, int regnum);
 void hexagon_debug_qreg(CPUHexagonState *env, int regnum);
diff --git a/target/hexagon/macros.h b/target/hexagon/macros.h
index 760630d..5451b06 100644
--- a/target/hexagon/macros.h
+++ b/target/hexagon/macros.h
@@ -44,8 +44,17 @@
                    reg_field_info[FIELD].offset)
 
 #define SET_USR_FIELD(FIELD, VAL) \
-    fINSERT_BITS(env->new_value[HEX_REG_USR], reg_field_info[FIELD].width, \
-                 reg_field_info[FIELD].offset, (VAL))
+    do { \
+        if (pkt_need_commit) { \
+            fINSERT_BITS(env->new_value_usr, \
+                        reg_field_info[FIELD].width, \
+                        reg_field_info[FIELD].offset, (VAL)); \
+        } else { \
+            fINSERT_BITS(env->gpr[HEX_REG_USR], \
+                        reg_field_info[FIELD].width, \
+                        reg_field_info[FIELD].offset, (VAL)); \
+        } \
+    } while (0)
 #endif
 
 #ifdef QEMU_GENERATE
@@ -164,14 +173,14 @@
 #define MEM_STORE8(VA, DATA, SLOT) \
     MEM_STORE8_FUNC(DATA)(cpu_env, VA, DATA, SLOT)
 #else
-#define MEM_LOAD1s(VA) ((int8_t)mem_load1(env, slot, VA))
-#define MEM_LOAD1u(VA) ((uint8_t)mem_load1(env, slot, VA))
-#define MEM_LOAD2s(VA) ((int16_t)mem_load2(env, slot, VA))
-#define MEM_LOAD2u(VA) ((uint16_t)mem_load2(env, slot, VA))
-#define MEM_LOAD4s(VA) ((int32_t)mem_load4(env, slot, VA))
-#define MEM_LOAD4u(VA) ((uint32_t)mem_load4(env, slot, VA))
-#define MEM_LOAD8s(VA) ((int64_t)mem_load8(env, slot, VA))
-#define MEM_LOAD8u(VA) ((uint64_t)mem_load8(env, slot, VA))
+#define MEM_LOAD1s(VA) ((int8_t)mem_load1(env, pkt_has_store_s1, slot, VA))
+#define MEM_LOAD1u(VA) ((uint8_t)mem_load1(env, pkt_has_store_s1, slot, VA))
+#define MEM_LOAD2s(VA) ((int16_t)mem_load2(env, pkt_has_store_s1, slot, VA))
+#define MEM_LOAD2u(VA) ((uint16_t)mem_load2(env, pkt_has_store_s1, slot, VA))
+#define MEM_LOAD4s(VA) ((int32_t)mem_load4(env, pkt_has_store_s1, slot, VA))
+#define MEM_LOAD4u(VA) ((uint32_t)mem_load4(env, pkt_has_store_s1, slot, VA))
+#define MEM_LOAD8s(VA) ((int64_t)mem_load8(env, pkt_has_store_s1, slot, VA))
+#define MEM_LOAD8u(VA) ((uint64_t)mem_load8(env, pkt_has_store_s1, slot, VA))
 
 #define MEM_STORE1(VA, DATA, SLOT) log_store32(env, VA, DATA, 1, SLOT)
 #define MEM_STORE2(VA, DATA, SLOT) log_store32(env, VA, DATA, 2, SLOT)
@@ -227,12 +236,8 @@
 
 #ifdef QEMU_GENERATE
 #define fLSBNEW(PVAL)   tcg_gen_andi_tl(LSB, (PVAL), 1)
-#define fLSBNEW0        tcg_gen_andi_tl(LSB, hex_new_pred_value[0], 1)
-#define fLSBNEW1        tcg_gen_andi_tl(LSB, hex_new_pred_value[1], 1)
 #else
 #define fLSBNEW(PVAL)   ((PVAL) & 1)
-#define fLSBNEW0        (env->new_pred_value[0] & 1)
-#define fLSBNEW1        (env->new_pred_value[1] & 1)
 #endif
 
 #ifdef QEMU_GENERATE
@@ -347,10 +352,6 @@
 
 #define fREAD_LR() (env->gpr[HEX_REG_LR])
 
-#define fWRITE_LR(A) log_reg_write(env, HEX_REG_LR, A)
-#define fWRITE_FP(A) log_reg_write(env, HEX_REG_FP, A)
-#define fWRITE_SP(A) log_reg_write(env, HEX_REG_SP, A)
-
 #define fREAD_SP() (env->gpr[HEX_REG_SP])
 #define fREAD_LC0 (env->gpr[HEX_REG_LC0])
 #define fREAD_LC1 (env->gpr[HEX_REG_LC1])
@@ -375,24 +376,10 @@
 #define fBRANCH(LOC, TYPE)          fWRITE_NPC(LOC)
 #define fJUMPR(REGNO, TARGET, TYPE) fBRANCH(TARGET, COF_TYPE_JUMPR)
 #define fHINTJR(TARGET) { /* Not modelled in qemu */}
-#define fWRITE_LOOP_REGS0(START, COUNT) \
-    do { \
-        log_reg_write(env, HEX_REG_LC0, COUNT);  \
-        log_reg_write(env, HEX_REG_SA0, START); \
-    } while (0)
-#define fWRITE_LOOP_REGS1(START, COUNT) \
-    do { \
-        log_reg_write(env, HEX_REG_LC1, COUNT);  \
-        log_reg_write(env, HEX_REG_SA1, START);\
-    } while (0)
 
 #define fSET_OVERFLOW() SET_USR_FIELD(USR_OVF, 1)
 #define fSET_LPCFG(VAL) SET_USR_FIELD(USR_LPCFG, (VAL))
 #define fGET_LPCFG (GET_USR_FIELD(USR_LPCFG))
-#define fWRITE_P0(VAL) log_pred_write(env, 0, VAL)
-#define fWRITE_P1(VAL) log_pred_write(env, 1, VAL)
-#define fWRITE_P2(VAL) log_pred_write(env, 2, VAL)
-#define fWRITE_P3(VAL) log_pred_write(env, 3, VAL)
 #define fPART1(WORK) if (part1) { WORK; return; }
 #define fCAST4u(A) ((uint32_t)(A))
 #define fCAST4s(A) ((int32_t)(A))
@@ -661,7 +648,11 @@
                    reg_field_info[FIELD].offset)
 
 #ifdef QEMU_GENERATE
-#define fDCZEROA(REG) tcg_gen_mov_tl(hex_dczero_addr, (REG))
+#define fDCZEROA(REG) \
+    do { \
+        ctx->dczero_addr = tcg_temp_new(); \
+        tcg_gen_mov_tl(ctx->dczero_addr, (REG)); \
+    } while (0)
 #endif
 
 #define fBRANCH_SPECULATE_STALL(DOTNEWVAL, JUMP_COND, SPEC_DIR, HINTBITNUM, \
diff --git a/target/hexagon/mmvec/macros.h b/target/hexagon/mmvec/macros.h
index 1201d77..a655634 100644
--- a/target/hexagon/mmvec/macros.h
+++ b/target/hexagon/mmvec/macros.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright(c) 2019-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -346,4 +346,11 @@
 #define fUARCH_NOTE_PUMP_2X()
 
 #define IV1DEAD()
+
+#define fGET10BIT(COE, VAL, POS) \
+    do { \
+        COE = (sextract32(VAL, 24 + 2 * POS, 2) << 8) | \
+               extract32(VAL, POS * 8, 8); \
+    } while (0);
+
 #endif
diff --git a/target/hexagon/op_helper.c b/target/hexagon/op_helper.c
index 3cc71b6..12967ac 100644
--- a/target/hexagon/op_helper.c
+++ b/target/hexagon/op_helper.c
@@ -52,38 +52,6 @@
     do_raise_exception_err(env, excp, 0);
 }
 
-void log_reg_write(CPUHexagonState *env, int rnum,
-                   target_ulong val)
-{
-    HEX_DEBUG_LOG("log_reg_write[%d] = " TARGET_FMT_ld " (0x" TARGET_FMT_lx ")",
-                  rnum, val, val);
-    if (val == env->gpr[rnum]) {
-        HEX_DEBUG_LOG(" NO CHANGE");
-    }
-    HEX_DEBUG_LOG("\n");
-
-    env->new_value[rnum] = val;
-    if (HEX_DEBUG) {
-        /* Do this so HELPER(debug_commit_end) will know */
-        env->reg_written[rnum] = 1;
-    }
-}
-
-static void log_pred_write(CPUHexagonState *env, int pnum, target_ulong val)
-{
-    HEX_DEBUG_LOG("log_pred_write[%d] = " TARGET_FMT_ld
-                  " (0x" TARGET_FMT_lx ")\n",
-                  pnum, val, val);
-
-    /* Multiple writes to the same preg are and'ed together */
-    if (env->pred_written & (1 << pnum)) {
-        env->new_pred_value[pnum] &= val & 0xff;
-    } else {
-        env->new_pred_value[pnum] = val & 0xff;
-        env->pred_written |= 1 << pnum;
-    }
-}
-
 void log_store32(CPUHexagonState *env, target_ulong addr,
                  target_ulong val, int width, int slot)
 {
@@ -235,14 +203,14 @@
 }
 
 /* This function is a handy place to set a breakpoint */
-void HELPER(debug_commit_end)(CPUHexagonState *env, int has_st0, int has_st1)
+void HELPER(debug_commit_end)(CPUHexagonState *env, uint32_t this_PC,
+                              int pred_written, int has_st0, int has_st1)
 {
     bool reg_printed = false;
     bool pred_printed = false;
     int i;
 
-    HEX_DEBUG_LOG("Packet committed: pc = 0x" TARGET_FMT_lx "\n",
-                  env->this_PC);
+    HEX_DEBUG_LOG("Packet committed: pc = 0x" TARGET_FMT_lx "\n", this_PC);
     HEX_DEBUG_LOG("slot_cancelled = %d\n", env->slot_cancelled);
 
     for (i = 0; i < TOTAL_PER_THREAD_REGS; i++) {
@@ -252,18 +220,18 @@
                 reg_printed = true;
             }
             HEX_DEBUG_LOG("\tr%d = " TARGET_FMT_ld " (0x" TARGET_FMT_lx ")\n",
-                          i, env->new_value[i], env->new_value[i]);
+                          i, env->gpr[i], env->gpr[i]);
         }
     }
 
     for (i = 0; i < NUM_PREGS; i++) {
-        if (env->pred_written & (1 << i)) {
+        if (pred_written & (1 << i)) {
             if (!pred_printed) {
                 HEX_DEBUG_LOG("Predicates written\n");
                 pred_printed = true;
             }
             HEX_DEBUG_LOG("\tp%d = 0x" TARGET_FMT_lx "\n",
-                          i, env->new_pred_value[i]);
+                          i, env->pred[i]);
         }
     }
 
@@ -384,7 +352,8 @@
 }
 
 int64_t HELPER(vacsh_val)(CPUHexagonState *env,
-                           int64_t RxxV, int64_t RssV, int64_t RttV)
+                           int64_t RxxV, int64_t RssV, int64_t RttV,
+                           uint32_t pkt_need_commit)
 {
     for (int i = 0; i < 4; i++) {
         int xv = sextract64(RxxV, i * 16, 16);
@@ -416,6 +385,87 @@
     return PeV;
 }
 
+int64_t HELPER(cabacdecbin_val)(int64_t RssV, int64_t RttV)
+{
+    int64_t RddV = 0;
+    size4u_t state;
+    size4u_t valMPS;
+    size4u_t bitpos;
+    size4u_t range;
+    size4u_t offset;
+    size4u_t rLPS;
+    size4u_t rMPS;
+
+    state =  fEXTRACTU_RANGE(fGETWORD(1, RttV), 5, 0);
+    valMPS = fEXTRACTU_RANGE(fGETWORD(1, RttV), 8, 8);
+    bitpos = fEXTRACTU_RANGE(fGETWORD(0, RttV), 4, 0);
+    range =  fGETWORD(0, RssV);
+    offset = fGETWORD(1, RssV);
+
+    /* calculate rLPS */
+    range <<= bitpos;
+    offset <<= bitpos;
+    rLPS = rLPS_table_64x4[state][(range >> 29) & 3];
+    rLPS  = rLPS << 23;   /* left aligned */
+
+    /* calculate rMPS */
+    rMPS = (range & 0xff800000) - rLPS;
+
+    /* most probable region */
+    if (offset < rMPS) {
+        RddV = AC_next_state_MPS_64[state];
+        fINSERT_RANGE(RddV, 8, 8, valMPS);
+        fINSERT_RANGE(RddV, 31, 23, (rMPS >> 23));
+        fSETWORD(1, RddV, offset);
+    }
+    /* least probable region */
+    else {
+        RddV = AC_next_state_LPS_64[state];
+        fINSERT_RANGE(RddV, 8, 8, ((!state) ? (1 - valMPS) : (valMPS)));
+        fINSERT_RANGE(RddV, 31, 23, (rLPS >> 23));
+        fSETWORD(1, RddV, (offset - rMPS));
+    }
+    return RddV;
+}
+
+int32_t HELPER(cabacdecbin_pred)(int64_t RssV, int64_t RttV)
+{
+    int32_t p0 = 0;
+    size4u_t state;
+    size4u_t valMPS;
+    size4u_t bitpos;
+    size4u_t range;
+    size4u_t offset;
+    size4u_t rLPS;
+    size4u_t rMPS;
+
+    state =  fEXTRACTU_RANGE(fGETWORD(1, RttV), 5, 0);
+    valMPS = fEXTRACTU_RANGE(fGETWORD(1, RttV), 8, 8);
+    bitpos = fEXTRACTU_RANGE(fGETWORD(0, RttV), 4, 0);
+    range =  fGETWORD(0, RssV);
+    offset = fGETWORD(1, RssV);
+
+    /* calculate rLPS */
+    range <<= bitpos;
+    offset <<= bitpos;
+    rLPS = rLPS_table_64x4[state][(range >> 29) & 3];
+    rLPS  = rLPS << 23;   /* left aligned */
+
+    /* calculate rMPS */
+    rMPS = (range & 0xff800000) - rLPS;
+
+    /* most probable region */
+    if (offset < rMPS) {
+        p0 = valMPS;
+
+    }
+    /* least probable region */
+    else {
+        p0 = valMPS ^ 1;
+    }
+    return p0;
+}
+
 static void probe_store(CPUHexagonState *env, int slot, int mmu_idx,
                         bool is_predicated)
 {
@@ -516,41 +566,45 @@
  * If the load is in slot 0 and there is a store in slot1 (that
  * wasn't cancelled), we have to do the store first.
  */
-static void check_noshuf(CPUHexagonState *env, uint32_t slot,
-                         target_ulong vaddr, int size)
+static void check_noshuf(CPUHexagonState *env, bool pkt_has_store_s1,
+                         uint32_t slot, target_ulong vaddr, int size)
 {
-    if (slot == 0 && env->pkt_has_store_s1 &&
+    if (slot == 0 && pkt_has_store_s1 &&
         ((env->slot_cancelled & (1 << 1)) == 0)) {
         HELPER(probe_noshuf_load)(env, vaddr, size, MMU_USER_IDX);
         HELPER(commit_store)(env, 1);
     }
 }
 
-uint8_t mem_load1(CPUHexagonState *env, uint32_t slot, target_ulong vaddr)
+uint8_t mem_load1(CPUHexagonState *env, bool pkt_has_store_s1,
+                  uint32_t slot, target_ulong vaddr)
 {
     uintptr_t ra = GETPC();
-    check_noshuf(env, slot, vaddr, 1);
+    check_noshuf(env, pkt_has_store_s1, slot, vaddr, 1);
     return cpu_ldub_data_ra(env, vaddr, ra);
 }
 
-uint16_t mem_load2(CPUHexagonState *env, uint32_t slot, target_ulong vaddr)
+uint16_t mem_load2(CPUHexagonState *env, bool pkt_has_store_s1,
+                   uint32_t slot, target_ulong vaddr)
 {
     uintptr_t ra = GETPC();
-    check_noshuf(env, slot, vaddr, 2);
+    check_noshuf(env, pkt_has_store_s1, slot, vaddr, 2);
     return cpu_lduw_data_ra(env, vaddr, ra);
 }
 
-uint32_t mem_load4(CPUHexagonState *env, uint32_t slot, target_ulong vaddr)
+uint32_t mem_load4(CPUHexagonState *env, bool pkt_has_store_s1,
+                   uint32_t slot, target_ulong vaddr)
 {
     uintptr_t ra = GETPC();
-    check_noshuf(env, slot, vaddr, 4);
+    check_noshuf(env, pkt_has_store_s1, slot, vaddr, 4);
     return cpu_ldl_data_ra(env, vaddr, ra);
 }
 
-uint64_t mem_load8(CPUHexagonState *env, uint32_t slot, target_ulong vaddr)
+uint64_t mem_load8(CPUHexagonState *env, bool pkt_has_store_s1,
+                   uint32_t slot, target_ulong vaddr)
 {
     uintptr_t ra = GETPC();
-    check_noshuf(env, slot, vaddr, 8);
+    check_noshuf(env, pkt_has_store_s1, slot, vaddr, 8);
     return cpu_ldq_data_ra(env, vaddr, ra);
 }
 
diff --git a/target/hexagon/op_helper.h b/target/hexagon/op_helper.h
index db22b54..8f3764d 100644
--- a/target/hexagon/op_helper.h
+++ b/target/hexagon/op_helper.h
@@ -19,15 +19,15 @@
 #define HEXAGON_OP_HELPER_H
 
 /* Misc functions */
-void write_new_pc(CPUHexagonState *env, bool pkt_has_multi_cof, target_ulong addr);
+uint8_t mem_load1(CPUHexagonState *env, bool pkt_has_store_s1,
+                  uint32_t slot, target_ulong vaddr);
+uint16_t mem_load2(CPUHexagonState *env, bool pkt_has_store_s1,
+                   uint32_t slot, target_ulong vaddr);
+uint32_t mem_load4(CPUHexagonState *env, bool pkt_has_store_s1,
+                   uint32_t slot, target_ulong vaddr);
+uint64_t mem_load8(CPUHexagonState *env, bool pkt_has_store_s1,
+                   uint32_t slot, target_ulong vaddr);
 
-uint8_t mem_load1(CPUHexagonState *env, uint32_t slot, target_ulong vaddr);
-uint16_t mem_load2(CPUHexagonState *env, uint32_t slot, target_ulong vaddr);
-uint32_t mem_load4(CPUHexagonState *env, uint32_t slot, target_ulong vaddr);
-uint64_t mem_load8(CPUHexagonState *env, uint32_t slot, target_ulong vaddr);
-
-void log_reg_write(CPUHexagonState *env, int rnum,
-                   target_ulong val);
 void log_store64(CPUHexagonState *env, target_ulong addr,
                  int64_t val, int width, int slot);
 void log_store32(CPUHexagonState *env, target_ulong addr,
diff --git a/target/hexagon/translate.c b/target/hexagon/translate.c
index cddd7c5..b18f1a9 100644
--- a/target/hexagon/translate.c
+++ b/target/hexagon/translate.c
@@ -27,6 +27,7 @@
 #include "insn.h"
 #include "decode.h"
 #include "translate.h"
+#include "genptr.h"
 #include "printinsn.h"
 
 #include "analyze_funcs_generated.c.inc"
@@ -40,19 +41,13 @@
 
 TCGv hex_gpr[TOTAL_PER_THREAD_REGS];
 TCGv hex_pred[NUM_PREGS];
-TCGv hex_this_PC;
 TCGv hex_slot_cancelled;
-TCGv hex_branch_taken;
-TCGv hex_new_value[TOTAL_PER_THREAD_REGS];
+TCGv hex_new_value_usr;
 TCGv hex_reg_written[TOTAL_PER_THREAD_REGS];
-TCGv hex_new_pred_value[NUM_PREGS];
-TCGv hex_pred_written;
 TCGv hex_store_addr[STORES_MAX];
 TCGv hex_store_width[STORES_MAX];
 TCGv hex_store_val32[STORES_MAX];
 TCGv_i64 hex_store_val64[STORES_MAX];
-TCGv hex_pkt_has_store_s1;
-TCGv hex_dczero_addr;
 TCGv hex_llsc_addr;
 TCGv hex_llsc_val;
 TCGv_i64 hex_llsc_val_i64;
@@ -69,6 +64,10 @@
 {
     intptr_t offset;
 
+    if (!ctx->need_commit) {
+        return offsetof(CPUHexagonState, VRegs[regnum]);
+    }
+
     /* See if it is already allocated */
     for (int i = 0; i < ctx->future_vregs_idx; i++) {
         if (ctx->future_vregs_num[i] == regnum) {
@@ -154,7 +153,7 @@
     if (ctx->branch_cond != TCG_COND_NEVER) {
         if (ctx->branch_cond != TCG_COND_ALWAYS) {
             TCGLabel *skip = gen_new_label();
-            tcg_gen_brcondi_tl(ctx->branch_cond, hex_branch_taken, 0, skip);
+            tcg_gen_brcondi_tl(ctx->branch_cond, ctx->branch_taken, 0, skip);
             gen_goto_tb(ctx, 0, ctx->branch_dest, true);
             gen_set_label(skip);
             gen_goto_tb(ctx, 1, ctx->next_PC, false);
@@ -262,11 +261,6 @@
     return false;
 }
 
-static bool need_pred_written(Packet *pkt)
-{
-    return check_for_attrib(pkt, A_WRITES_PRED_REG);
-}
-
 static bool need_next_PC(DisasContext *ctx)
 {
     Packet *pkt = ctx->pkt;
@@ -341,10 +335,131 @@
     mark_implicit_pred_write(ctx, A_IMPLICIT_WRITES_P3, 3);
 }
 
+static bool pkt_raises_exception(Packet *pkt)
+{
+    if (check_for_attrib(pkt, A_LOAD) ||
+        check_for_attrib(pkt, A_STORE)) {
+        return true;
+    }
+    return false;
+}
+
+static bool need_commit(DisasContext *ctx)
+{
+    Packet *pkt = ctx->pkt;
+
+    /*
+     * If the short-circuit property is set to false, we'll always do the commit
+     */
+    if (!ctx->short_circuit) {
+        return true;
+    }
+
+    if (pkt_raises_exception(pkt)) {
+        return true;
+    }
+
+    /* Registers with immutability flags require new_value */
+    for (int i = 0; i < ctx->reg_log_idx; i++) {
+        int rnum = ctx->reg_log[i];
+        if (reg_immut_masks[rnum]) {
+            return true;
+        }
+    }
+
+    /* Floating point instructions are hard-coded to use new_value */
+    if (check_for_attrib(pkt, A_FPOP)) {
+        return true;
+    }
+
+    if (pkt->num_insns == 1) {
+        if (pkt->pkt_has_hvx) {
+            /*
+             * The HVX instructions with generated helpers use
+             * pass-by-reference, so they need the read/write overlap
+             * check below.
+             * The HVX instructions with overrides are OK.
+             */
+            if (!ctx->has_hvx_helper) {
+                return false;
+            }
+        } else {
+            return false;
+        }
+    }
+
+    /* Check for overlap between register reads and writes */
+    for (int i = 0; i < ctx->reg_log_idx; i++) {
+        int rnum = ctx->reg_log[i];
+        if (test_bit(rnum, ctx->regs_read)) {
+            return true;
+        }
+    }
+
+    /* Check for overlap between predicate reads and writes */
+    for (int i = 0; i < ctx->preg_log_idx; i++) {
+        int pnum = ctx->preg_log[i];
+        if (test_bit(pnum, ctx->pregs_read)) {
+            return true;
+        }
+    }
+
+    /* Check for overlap between HVX reads and writes */
+    for (int i = 0; i < ctx->vreg_log_idx; i++) {
+        int vnum = ctx->vreg_log[i];
+        if (test_bit(vnum, ctx->vregs_read)) {
+            return true;
+        }
+    }
+    if (!bitmap_empty(ctx->vregs_updated_tmp, NUM_VREGS)) {
+        int i = find_first_bit(ctx->vregs_updated_tmp, NUM_VREGS);
+        while (i < NUM_VREGS) {
+            if (test_bit(i, ctx->vregs_read)) {
+                return true;
+            }
+            i = find_next_bit(ctx->vregs_updated_tmp, NUM_VREGS, i + 1);
+        }
+    }
+    if (!bitmap_empty(ctx->vregs_select, NUM_VREGS)) {
+        int i = find_first_bit(ctx->vregs_select, NUM_VREGS);
+        while (i < NUM_VREGS) {
+            if (test_bit(i, ctx->vregs_read)) {
+                return true;
+            }
+            i = find_next_bit(ctx->vregs_select, NUM_VREGS, i + 1);
+        }
+    }
+
+    /* Check for overlap between HVX predicate reads and writes */
+    for (int i = 0; i < ctx->qreg_log_idx; i++) {
+        int qnum = ctx->qreg_log[i];
+        if (test_bit(qnum, ctx->qregs_read)) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+static void mark_implicit_pred_read(DisasContext *ctx, int attrib, int pnum)
+{
+    if (GET_ATTRIB(ctx->insn->opcode, attrib)) {
+        ctx_log_pred_read(ctx, pnum);
+    }
+}
+
+static void mark_implicit_pred_reads(DisasContext *ctx)
+{
+    mark_implicit_pred_read(ctx, A_IMPLICIT_READS_P0, 0);
+    mark_implicit_pred_read(ctx, A_IMPLICIT_READS_P1, 1);
+    mark_implicit_pred_read(ctx, A_IMPLICIT_READS_P3, 2);
+    mark_implicit_pred_read(ctx, A_IMPLICIT_READS_P3, 3);
+}
+
 static void analyze_packet(DisasContext *ctx)
 {
     Packet *pkt = ctx->pkt;
-    ctx->need_pkt_has_store_s1 = false;
+    ctx->has_hvx_helper = false;
     for (int i = 0; i < pkt->num_insns; i++) {
         Insn *insn = &pkt->insn[i];
         ctx->insn = insn;
@@ -353,7 +468,10 @@
         }
         mark_implicit_reg_writes(ctx);
         mark_implicit_pred_writes(ctx);
+        mark_implicit_pred_reads(ctx);
     }
+
+    ctx->need_commit = need_commit(ctx);
 }
 
 static void gen_start_packet(DisasContext *ctx)
@@ -366,9 +484,11 @@
     ctx->next_PC = next_PC;
     ctx->reg_log_idx = 0;
     bitmap_zero(ctx->regs_written, TOTAL_PER_THREAD_REGS);
+    bitmap_zero(ctx->regs_read, TOTAL_PER_THREAD_REGS);
     bitmap_zero(ctx->predicated_regs, TOTAL_PER_THREAD_REGS);
     ctx->preg_log_idx = 0;
     bitmap_zero(ctx->pregs_written, NUM_PREGS);
+    bitmap_zero(ctx->pregs_read, NUM_PREGS);
     ctx->future_vregs_idx = 0;
     ctx->tmp_vregs_idx = 0;
     ctx->vreg_log_idx = 0;
@@ -377,19 +497,23 @@
     bitmap_zero(ctx->vregs_select, NUM_VREGS);
     bitmap_zero(ctx->predicated_future_vregs, NUM_VREGS);
     bitmap_zero(ctx->predicated_tmp_vregs, NUM_VREGS);
+    bitmap_zero(ctx->vregs_read, NUM_VREGS);
+    bitmap_zero(ctx->qregs_read, NUM_QREGS);
     ctx->qreg_log_idx = 0;
     for (i = 0; i < STORES_MAX; i++) {
         ctx->store_width[i] = 0;
     }
     ctx->s1_store_processed = false;
     ctx->pre_commit = true;
+    for (i = 0; i < TOTAL_PER_THREAD_REGS; i++) {
+        ctx->new_value[i] = NULL;
+    }
+    for (i = 0; i < NUM_PREGS; i++) {
+        ctx->new_pred_value[i] = NULL;
+    }
 
     analyze_packet(ctx);
 
-    if (ctx->need_pkt_has_store_s1) {
-        tcg_gen_movi_tl(hex_pkt_has_store_s1, pkt->pkt_has_store_s1);
-    }
-
     /*
      * pregs_written is used both in the analyze phase as well as the code
      * gen phase, so clear it again.
@@ -399,35 +523,50 @@
     if (HEX_DEBUG) {
         /* Handy place to set a breakpoint before the packet executes */
         gen_helper_debug_start_packet(cpu_env);
-        tcg_gen_movi_tl(hex_this_PC, ctx->base.pc_next);
     }
 
     /* Initialize the runtime state for packet semantics */
     if (need_slot_cancelled(pkt)) {
         tcg_gen_movi_tl(hex_slot_cancelled, 0);
     }
+    ctx->branch_taken = NULL;
     if (pkt->pkt_has_cof) {
+        ctx->branch_taken = tcg_temp_new();
         if (pkt->pkt_has_multi_cof) {
-            tcg_gen_movi_tl(hex_branch_taken, 0);
+            tcg_gen_movi_tl(ctx->branch_taken, 0);
         }
         if (need_next_PC(ctx)) {
             tcg_gen_movi_tl(hex_gpr[HEX_REG_PC], next_PC);
         }
     }
-    if (need_pred_written(pkt)) {
-        tcg_gen_movi_tl(hex_pred_written, 0);
+    if (HEX_DEBUG) {
+        ctx->pred_written = tcg_temp_new();
+        tcg_gen_movi_tl(ctx->pred_written, 0);
     }
 
-    /* Preload the predicated registers into hex_new_value[i] */
-    if (!bitmap_empty(ctx->predicated_regs, TOTAL_PER_THREAD_REGS)) {
+    /* Preload the predicated registers into get_result_gpr(ctx, i) */
+    if (ctx->need_commit &&
+        !bitmap_empty(ctx->predicated_regs, TOTAL_PER_THREAD_REGS)) {
         int i = find_first_bit(ctx->predicated_regs, TOTAL_PER_THREAD_REGS);
         while (i < TOTAL_PER_THREAD_REGS) {
-            tcg_gen_mov_tl(hex_new_value[i], hex_gpr[i]);
+            tcg_gen_mov_tl(get_result_gpr(ctx, i), hex_gpr[i]);
             i = find_next_bit(ctx->predicated_regs, TOTAL_PER_THREAD_REGS,
                               i + 1);
         }
     }
 
+    /*
+     * Preload the predicated pred registers into hex_new_pred_value[pred_num]
+     * Only endloop instructions conditionally write to pred registers
+     */
+    if (ctx->need_commit && pkt->pkt_has_endloop) {
+        for (int i = 0; i < ctx->preg_log_idx; i++) {
+            int pred_num = ctx->preg_log[i];
+            ctx->new_pred_value[pred_num] = tcg_temp_new();
+            tcg_gen_mov_tl(ctx->new_pred_value[pred_num], hex_pred[pred_num]);
+        }
+    }
+
     /* Preload the predicated HVX registers into future_VRegs and tmp_VRegs */
     if (!bitmap_empty(ctx->predicated_future_vregs, NUM_VREGS)) {
         int i = find_first_bit(ctx->predicated_future_vregs, NUM_VREGS);
@@ -481,6 +620,9 @@
     uint8_t width = 0;
 
     if (GET_ATTRIB(opcode, A_SCALAR_STORE)) {
+        if (GET_ATTRIB(opcode, A_MEMSIZE_0B)) {
+            return;
+        }
         if (GET_ATTRIB(opcode, A_MEMSIZE_1B)) {
             width |= 1;
         }
@@ -515,10 +657,15 @@
 {
     int i;
 
+    /* Early exit if not needed */
+    if (!ctx->need_commit) {
+        return;
+    }
+
     for (i = 0; i < ctx->reg_log_idx; i++) {
         int reg_num = ctx->reg_log[i];
 
-        tcg_gen_mov_tl(hex_gpr[reg_num], hex_new_value[reg_num]);
+        tcg_gen_mov_tl(hex_gpr[reg_num], get_result_gpr(ctx, reg_num));
 
         /*
          * ctx->is_tight_loop is set when SA0 points to the beginning of the TB.
@@ -532,41 +679,14 @@
 
 static void gen_pred_writes(DisasContext *ctx)
 {
-    int i;
-
-    /* Early exit if the log is empty */
-    if (!ctx->preg_log_idx) {
+    /* Early exit if not needed or the log is empty */
+    if (!ctx->need_commit || !ctx->preg_log_idx) {
         return;
     }
 
-    /*
-     * Only endloop instructions will conditionally
-     * write a predicate.  If there are no endloop
-     * instructions, we can use the non-conditional
-     * write of the predicates.
-     */
-    if (ctx->pkt->pkt_has_endloop) {
-        TCGv zero = tcg_constant_tl(0);
-        TCGv pred_written = tcg_temp_new();
-        for (i = 0; i < ctx->preg_log_idx; i++) {
-            int pred_num = ctx->preg_log[i];
-
-            tcg_gen_andi_tl(pred_written, hex_pred_written, 1 << pred_num);
-            tcg_gen_movcond_tl(TCG_COND_NE, hex_pred[pred_num],
-                               pred_written, zero,
-                               hex_new_pred_value[pred_num],
-                               hex_pred[pred_num]);
-        }
-    } else {
-        for (i = 0; i < ctx->preg_log_idx; i++) {
-            int pred_num = ctx->preg_log[i];
-            tcg_gen_mov_tl(hex_pred[pred_num], hex_new_pred_value[pred_num]);
-            if (HEX_DEBUG) {
-                /* Do this so HELPER(debug_commit_end) will know */
-                tcg_gen_ori_tl(hex_pred_written, hex_pred_written,
-                               1 << pred_num);
-            }
-        }
+    for (int i = 0; i < ctx->preg_log_idx; i++) {
+        int pred_num = ctx->preg_log[i];
+        tcg_gen_mov_tl(hex_pred[pred_num], ctx->new_pred_value[pred_num]);
     }
 }
 
@@ -692,7 +812,7 @@
         TCGv addr = tcg_temp_new();
         TCGv_i64 zero = tcg_constant_i64(0);
 
-        tcg_gen_andi_tl(addr, hex_dczero_addr, ~0x1f);
+        tcg_gen_andi_tl(addr, ctx->dczero_addr, ~0x1f);
         tcg_gen_qemu_st_i64(zero, addr, ctx->mem_idx, MO_UQ);
         tcg_gen_addi_tl(addr, addr, 8);
         tcg_gen_qemu_st_i64(zero, addr, ctx->mem_idx, MO_UQ);
@@ -719,6 +839,12 @@
 {
     int i;
 
+    /* Early exit if not needed */
+    if (!ctx->need_commit) {
+        g_assert(!pkt_has_hvx_store(ctx->pkt));
+        return;
+    }
+
     /*
      *    for (i = 0; i < ctx->vreg_log_idx; i++) {
      *        int rnum = ctx->vreg_log[i];
@@ -873,7 +999,8 @@
             tcg_constant_tl(pkt->pkt_has_store_s1 && !pkt->pkt_has_dczeroa);
 
         /* Handy place to set a breakpoint at the end of execution */
-        gen_helper_debug_commit_end(cpu_env, has_st0, has_st1);
+        gen_helper_debug_commit_end(cpu_env, tcg_constant_tl(ctx->pkt->pc),
+                                    ctx->pred_written, has_st0, has_st1);
     }
 
     if (pkt->vhist_insn != NULL) {
@@ -920,6 +1047,7 @@
                                           CPUState *cs)
 {
     DisasContext *ctx = container_of(dcbase, DisasContext, base);
+    HexagonCPU *hex_cpu = env_archcpu(cs->env_ptr);
     uint32_t hex_flags = dcbase->tb->flags;
 
     ctx->mem_idx = MMU_USER_IDX;
@@ -928,6 +1056,7 @@
     ctx->num_hvx_insns = 0;
     ctx->branch_cond = TCG_COND_NEVER;
     ctx->is_tight_loop = FIELD_EX32(hex_flags, TB_FLAGS, IS_TIGHT_LOOP);
+    ctx->short_circuit = hex_cpu->short_circuit;
 }
 
 static void hexagon_tr_tb_start(DisasContextBase *db, CPUState *cpu)
@@ -1028,9 +1157,7 @@
 }
 
 #define NAME_LEN               64
-static char new_value_names[TOTAL_PER_THREAD_REGS][NAME_LEN];
 static char reg_written_names[TOTAL_PER_THREAD_REGS][NAME_LEN];
-static char new_pred_value_names[NUM_PREGS][NAME_LEN];
 static char store_addr_names[STORES_MAX][NAME_LEN];
 static char store_width_names[STORES_MAX][NAME_LEN];
 static char store_val32_names[STORES_MAX][NAME_LEN];
@@ -1050,11 +1177,6 @@
             offsetof(CPUHexagonState, gpr[i]),
             hexagon_regnames[i]);
 
-        snprintf(new_value_names[i], NAME_LEN, "new_%s", hexagon_regnames[i]);
-        hex_new_value[i] = tcg_global_mem_new(cpu_env,
-            offsetof(CPUHexagonState, new_value[i]),
-            new_value_names[i]);
-
         if (HEX_DEBUG) {
             snprintf(reg_written_names[i], NAME_LEN, "reg_written_%s",
                      hexagon_regnames[i]);
@@ -1063,29 +1185,16 @@
                 reg_written_names[i]);
         }
     }
+    hex_new_value_usr = tcg_global_mem_new(cpu_env,
+        offsetof(CPUHexagonState, new_value_usr), "new_value_usr");
+
     for (i = 0; i < NUM_PREGS; i++) {
         hex_pred[i] = tcg_global_mem_new(cpu_env,
             offsetof(CPUHexagonState, pred[i]),
             hexagon_prednames[i]);
-
-        snprintf(new_pred_value_names[i], NAME_LEN, "new_pred_%s",
-                 hexagon_prednames[i]);
-        hex_new_pred_value[i] = tcg_global_mem_new(cpu_env,
-            offsetof(CPUHexagonState, new_pred_value[i]),
-            new_pred_value_names[i]);
     }
-    hex_pred_written = tcg_global_mem_new(cpu_env,
-        offsetof(CPUHexagonState, pred_written), "pred_written");
-    hex_this_PC = tcg_global_mem_new(cpu_env,
-        offsetof(CPUHexagonState, this_PC), "this_PC");
     hex_slot_cancelled = tcg_global_mem_new(cpu_env,
         offsetof(CPUHexagonState, slot_cancelled), "slot_cancelled");
-    hex_branch_taken = tcg_global_mem_new(cpu_env,
-        offsetof(CPUHexagonState, branch_taken), "branch_taken");
-    hex_pkt_has_store_s1 = tcg_global_mem_new(cpu_env,
-        offsetof(CPUHexagonState, pkt_has_store_s1), "pkt_has_store_s1");
-    hex_dczero_addr = tcg_global_mem_new(cpu_env,
-        offsetof(CPUHexagonState, dczero_addr), "dczero_addr");
     hex_llsc_addr = tcg_global_mem_new(cpu_env,
         offsetof(CPUHexagonState, llsc_addr), "llsc_addr");
     hex_llsc_val = tcg_global_mem_new(cpu_env,
diff --git a/target/hexagon/translate.h b/target/hexagon/translate.h
index 4b9f21c..4dd59c6 100644
--- a/target/hexagon/translate.h
+++ b/target/hexagon/translate.h
@@ -38,10 +38,12 @@
     int reg_log[REG_WRITES_MAX];
     int reg_log_idx;
     DECLARE_BITMAP(regs_written, TOTAL_PER_THREAD_REGS);
+    DECLARE_BITMAP(regs_read, TOTAL_PER_THREAD_REGS);
     DECLARE_BITMAP(predicated_regs, TOTAL_PER_THREAD_REGS);
     int preg_log[PRED_WRITES_MAX];
     int preg_log_idx;
     DECLARE_BITMAP(pregs_written, NUM_PREGS);
+    DECLARE_BITMAP(pregs_read, NUM_PREGS);
     uint8_t store_width[STORES_MAX];
     bool s1_store_processed;
     int future_vregs_idx;
@@ -55,13 +57,22 @@
     DECLARE_BITMAP(vregs_select, NUM_VREGS);
     DECLARE_BITMAP(predicated_future_vregs, NUM_VREGS);
     DECLARE_BITMAP(predicated_tmp_vregs, NUM_VREGS);
+    DECLARE_BITMAP(vregs_read, NUM_VREGS);
     int qreg_log[NUM_QREGS];
     int qreg_log_idx;
+    DECLARE_BITMAP(qregs_read, NUM_QREGS);
     bool pre_commit;
+    bool need_commit;
     TCGCond branch_cond;
     target_ulong branch_dest;
     bool is_tight_loop;
-    bool need_pkt_has_store_s1;
+    bool short_circuit;
+    bool has_hvx_helper;
+    TCGv new_value[TOTAL_PER_THREAD_REGS];
+    TCGv new_pred_value[NUM_PREGS];
+    TCGv pred_written;
+    TCGv branch_taken;
+    TCGv dczero_addr;
 } DisasContext;
 
 static inline void ctx_log_pred_write(DisasContext *ctx, int pnum)
@@ -73,6 +84,11 @@
     }
 }
 
+static inline void ctx_log_pred_read(DisasContext *ctx, int pnum)
+{
+    set_bit(pnum, ctx->pregs_read);
+}
+
 static inline void ctx_log_reg_write(DisasContext *ctx, int rnum,
                                      bool is_predicated)
 {
@@ -99,6 +115,17 @@
     ctx_log_reg_write(ctx, rnum + 1, is_predicated);
 }
 
+static inline void ctx_log_reg_read(DisasContext *ctx, int rnum)
+{
+    set_bit(rnum, ctx->regs_read);
+}
+
+static inline void ctx_log_reg_read_pair(DisasContext *ctx, int rnum)
+{
+    ctx_log_reg_read(ctx, rnum);
+    ctx_log_reg_read(ctx, rnum + 1);
+}
+
 intptr_t ctx_future_vreg_off(DisasContext *ctx, int regnum,
                              int num, bool alloc_ok);
 intptr_t ctx_tmp_vreg_off(DisasContext *ctx, int regnum,
@@ -139,6 +166,17 @@
     ctx_log_vreg_write(ctx, rnum ^ 1, type, is_predicated);
 }
 
+static inline void ctx_log_vreg_read(DisasContext *ctx, int rnum)
+{
+    set_bit(rnum, ctx->vregs_read);
+}
+
+static inline void ctx_log_vreg_read_pair(DisasContext *ctx, int rnum)
+{
+    ctx_log_vreg_read(ctx, rnum ^ 0);
+    ctx_log_vreg_read(ctx, rnum ^ 1);
+}
+
 static inline void ctx_log_qreg_write(DisasContext *ctx,
                                       int rnum)
 {
@@ -146,20 +184,20 @@
     ctx->qreg_log_idx++;
 }
 
+static inline void ctx_log_qreg_read(DisasContext *ctx, int qnum)
+{
+    set_bit(qnum, ctx->qregs_read);
+}
+
 extern TCGv hex_gpr[TOTAL_PER_THREAD_REGS];
 extern TCGv hex_pred[NUM_PREGS];
-extern TCGv hex_this_PC;
 extern TCGv hex_slot_cancelled;
-extern TCGv hex_branch_taken;
-extern TCGv hex_new_value[TOTAL_PER_THREAD_REGS];
+extern TCGv hex_new_value_usr;
 extern TCGv hex_reg_written[TOTAL_PER_THREAD_REGS];
-extern TCGv hex_new_pred_value[NUM_PREGS];
-extern TCGv hex_pred_written;
 extern TCGv hex_store_addr[STORES_MAX];
 extern TCGv hex_store_width[STORES_MAX];
 extern TCGv hex_store_val32[STORES_MAX];
 extern TCGv_i64 hex_store_val64[STORES_MAX];
-extern TCGv hex_dczero_addr;
 extern TCGv hex_llsc_addr;
 extern TCGv hex_llsc_val;
 extern TCGv_i64 hex_llsc_val_i64;
diff --git a/tests/data/acpi/microvm/APIC b/tests/data/acpi/microvm/APIC
index 68dbd44..672764e 100644
--- a/tests/data/acpi/microvm/APIC
+++ b/tests/data/acpi/microvm/APIC
Binary files differ
diff --git a/tests/data/acpi/microvm/APIC.ioapic2 b/tests/data/acpi/microvm/APIC.ioapic2
index 3063c52..6f24fdb 100644
--- a/tests/data/acpi/microvm/APIC.ioapic2
+++ b/tests/data/acpi/microvm/APIC.ioapic2
Binary files differ
diff --git a/tests/data/acpi/microvm/APIC.pcie b/tests/data/acpi/microvm/APIC.pcie
index 4e8f6ed..2239ca7 100644
--- a/tests/data/acpi/microvm/APIC.pcie
+++ b/tests/data/acpi/microvm/APIC.pcie
Binary files differ
diff --git a/tests/data/acpi/pc/APIC b/tests/data/acpi/pc/APIC
index 208331d..868a343 100644
--- a/tests/data/acpi/pc/APIC
+++ b/tests/data/acpi/pc/APIC
Binary files differ
diff --git a/tests/data/acpi/pc/APIC.acpihmat b/tests/data/acpi/pc/APIC.acpihmat
index 812c460..125d1ff 100644
--- a/tests/data/acpi/pc/APIC.acpihmat
+++ b/tests/data/acpi/pc/APIC.acpihmat
Binary files differ
diff --git a/tests/data/acpi/pc/APIC.cphp b/tests/data/acpi/pc/APIC.cphp
index 65cc4f4..a2c2a24 100644
--- a/tests/data/acpi/pc/APIC.cphp
+++ b/tests/data/acpi/pc/APIC.cphp
Binary files differ
diff --git a/tests/data/acpi/pc/APIC.dimmpxm b/tests/data/acpi/pc/APIC.dimmpxm
index d904d4a..9b5922b 100644
--- a/tests/data/acpi/pc/APIC.dimmpxm
+++ b/tests/data/acpi/pc/APIC.dimmpxm
Binary files differ
diff --git a/tests/data/acpi/q35/APIC b/tests/data/acpi/q35/APIC
index 208331d..868a343 100644
--- a/tests/data/acpi/q35/APIC
+++ b/tests/data/acpi/q35/APIC
Binary files differ
diff --git a/tests/data/acpi/q35/APIC.acpihmat b/tests/data/acpi/q35/APIC.acpihmat
index 812c460..125d1ff 100644
--- a/tests/data/acpi/q35/APIC.acpihmat
+++ b/tests/data/acpi/q35/APIC.acpihmat
Binary files differ
diff --git a/tests/data/acpi/q35/APIC.acpihmat-noinitiator b/tests/data/acpi/q35/APIC.acpihmat-noinitiator
index d904d4a..9b5922b 100644
--- a/tests/data/acpi/q35/APIC.acpihmat-noinitiator
+++ b/tests/data/acpi/q35/APIC.acpihmat-noinitiator
Binary files differ
diff --git a/tests/data/acpi/q35/APIC.core-count2 b/tests/data/acpi/q35/APIC.core-count2
index a255082..f5da2eb 100644
--- a/tests/data/acpi/q35/APIC.core-count2
+++ b/tests/data/acpi/q35/APIC.core-count2
Binary files differ
diff --git a/tests/data/acpi/q35/APIC.cphp b/tests/data/acpi/q35/APIC.cphp
index 65cc4f4..a2c2a24 100644
--- a/tests/data/acpi/q35/APIC.cphp
+++ b/tests/data/acpi/q35/APIC.cphp
Binary files differ
diff --git a/tests/data/acpi/q35/APIC.dimmpxm b/tests/data/acpi/q35/APIC.dimmpxm
index d904d4a..9b5922b 100644
--- a/tests/data/acpi/q35/APIC.dimmpxm
+++ b/tests/data/acpi/q35/APIC.dimmpxm
Binary files differ
diff --git a/tests/data/acpi/q35/APIC.xapic b/tests/data/acpi/q35/APIC.xapic
index c1969c3..83bd283 100644
--- a/tests/data/acpi/q35/APIC.xapic
+++ b/tests/data/acpi/q35/APIC.xapic
Binary files differ
diff --git a/tests/guest-debug/run-test.py b/tests/guest-debug/run-test.py
index d865e46..de6106a 100755
--- a/tests/guest-debug/run-test.py
+++ b/tests/guest-debug/run-test.py
@@ -26,11 +26,12 @@
     parser.add_argument("--qargs", help="Qemu arguments for test")
     parser.add_argument("--binary", help="Binary to debug",
                         required=True)
-    parser.add_argument("--test", help="GDB test script",
-                        required=True)
+    parser.add_argument("--test", help="GDB test script")
     parser.add_argument("--gdb", help="The gdb binary to use",
                         default=None)
+    parser.add_argument("--gdb-args", help="Additional gdb arguments")
     parser.add_argument("--output", help="A file to redirect output to")
+    parser.add_argument("--stderr", help="A file to redirect stderr to")
 
     return parser.parse_args()
 
@@ -58,6 +59,10 @@
         output = open(args.output, "w")
     else:
         output = None
+    if args.stderr:
+        stderr = open(args.stderr, "w")
+    else:
+        stderr = None
 
     socket_dir = TemporaryDirectory("qemu-gdbstub")
     socket_name = os.path.join(socket_dir.name, "gdbstub.socket")
@@ -77,6 +82,8 @@
 
     # Now launch gdb with our test and collect the result
     gdb_cmd = "%s %s" % (args.gdb, args.binary)
+    if args.gdb_args:
+        gdb_cmd += " %s" % (args.gdb_args)
     # run quietly and ignore .gdbinit
     gdb_cmd += " -q -n -batch"
     # disable prompts in case of crash
@@ -84,13 +91,14 @@
     # connect to remote
     gdb_cmd += " -ex 'target remote %s'" % (socket_name)
     # finally the test script itself
-    gdb_cmd += " -x %s" % (args.test)
+    if args.test:
+        gdb_cmd += " -x %s" % (args.test)
 
 
     sleep(1)
     log(output, "GDB CMD: %s" % (gdb_cmd))
 
-    result = subprocess.call(gdb_cmd, shell=True, stdout=output)
+    result = subprocess.call(gdb_cmd, shell=True, stdout=output, stderr=stderr)
 
     # A result of greater than 128 indicates a fatal signal (likely a
     # crash due to gdb internal failure). That's a problem for GDB and
diff --git a/tests/qtest/bios-tables-test.c b/tests/qtest/bios-tables-test.c
index 7fd88b0..159e4ed 100644
--- a/tests/qtest/bios-tables-test.c
+++ b/tests/qtest/bios-tables-test.c
@@ -1867,13 +1867,13 @@
                              " -device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1"
                              " -device pxb-cxl,bus_nr=222,bus=pcie.0,id=cxl.2"
                              " -device cxl-rp,port=0,bus=cxl.1,id=rp1,chassis=0,slot=2"
-                             " -device cxl-type3,bus=rp1,memdev=cxl-mem1,lsa=lsa1"
+                             " -device cxl-type3,bus=rp1,persistent-memdev=cxl-mem1,lsa=lsa1"
                              " -device cxl-rp,port=1,bus=cxl.1,id=rp2,chassis=0,slot=3"
-                             " -device cxl-type3,bus=rp2,memdev=cxl-mem2,lsa=lsa2"
+                             " -device cxl-type3,bus=rp2,persistent-memdev=cxl-mem2,lsa=lsa2"
                              " -device cxl-rp,port=0,bus=cxl.2,id=rp3,chassis=0,slot=5"
-                             " -device cxl-type3,bus=rp3,memdev=cxl-mem3,lsa=lsa3"
+                             " -device cxl-type3,bus=rp3,persistent-memdev=cxl-mem3,lsa=lsa3"
                              " -device cxl-rp,port=1,bus=cxl.2,id=rp4,chassis=0,slot=6"
-                             " -device cxl-type3,bus=rp4,memdev=cxl-mem4,lsa=lsa4"
+                             " -device cxl-type3,bus=rp4,persistent-memdev=cxl-mem4,lsa=lsa4"
                              " -M cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.size=4G,cxl-fmw.0.interleave-granularity=8k,"
                              "cxl-fmw.1.targets.0=cxl.1,cxl-fmw.1.targets.1=cxl.2,cxl-fmw.1.size=4G,cxl-fmw.1.interleave-granularity=8k",
                              tmp_path, tmp_path, tmp_path, tmp_path,
diff --git a/tests/qtest/cxl-test.c b/tests/qtest/cxl-test.c
index 61f25a7..edcad4a 100644
--- a/tests/qtest/cxl-test.c
+++ b/tests/qtest/cxl-test.c
@@ -8,50 +8,72 @@
 #include "qemu/osdep.h"
 #include "libqtest-single.h"
 
-#define QEMU_PXB_CMD "-machine q35,cxl=on " \
-                     "-device pxb-cxl,id=cxl.0,bus=pcie.0,bus_nr=52 "  \
-                     "-M cxl-fmw.0.targets.0=cxl.0,cxl-fmw.0.size=4G "
+#define QEMU_PXB_CMD \
+    "-machine q35,cxl=on " \
+    "-device pxb-cxl,id=cxl.0,bus=pcie.0,bus_nr=52 " \
+    "-M cxl-fmw.0.targets.0=cxl.0,cxl-fmw.0.size=4G "
 
-#define QEMU_2PXB_CMD "-machine q35,cxl=on "                            \
-                      "-device pxb-cxl,id=cxl.0,bus=pcie.0,bus_nr=52 "  \
-                      "-device pxb-cxl,id=cxl.1,bus=pcie.0,bus_nr=53 " \
-                      "-M cxl-fmw.0.targets.0=cxl.0,cxl-fmw.0.targets.1=cxl.1,cxl-fmw.0.size=4G "
+#define QEMU_2PXB_CMD \
+    "-machine q35,cxl=on " \
+    "-device pxb-cxl,id=cxl.0,bus=pcie.0,bus_nr=52 " \
+    "-device pxb-cxl,id=cxl.1,bus=pcie.0,bus_nr=53 " \
+    "-M cxl-fmw.0.targets.0=cxl.0,cxl-fmw.0.targets.1=cxl.1,cxl-fmw.0.size=4G "
 
-#define QEMU_RP "-device cxl-rp,id=rp0,bus=cxl.0,chassis=0,slot=0 "
+#define QEMU_RP \
+    "-device cxl-rp,id=rp0,bus=cxl.0,chassis=0,slot=0 "
 
 /* Dual ports on first pxb */
-#define QEMU_2RP "-device cxl-rp,id=rp0,bus=cxl.0,chassis=0,slot=0 " \
-                 "-device cxl-rp,id=rp1,bus=cxl.0,chassis=0,slot=1 "
+#define QEMU_2RP \
+    "-device cxl-rp,id=rp0,bus=cxl.0,chassis=0,slot=0 " \
+    "-device cxl-rp,id=rp1,bus=cxl.0,chassis=0,slot=1 "
 
 /* Dual ports on each of the pxb instances */
-#define QEMU_4RP "-device cxl-rp,id=rp0,bus=cxl.0,chassis=0,slot=0 " \
-                 "-device cxl-rp,id=rp1,bus=cxl.0,chassis=0,slot=1 " \
-                 "-device cxl-rp,id=rp2,bus=cxl.1,chassis=0,slot=2 " \
-                 "-device cxl-rp,id=rp3,bus=cxl.1,chassis=0,slot=3 "
+#define QEMU_4RP \
+    "-device cxl-rp,id=rp0,bus=cxl.0,chassis=0,slot=0 " \
+    "-device cxl-rp,id=rp1,bus=cxl.0,chassis=0,slot=1 " \
+    "-device cxl-rp,id=rp2,bus=cxl.1,chassis=0,slot=2 " \
+    "-device cxl-rp,id=rp3,bus=cxl.1,chassis=0,slot=3 "
 
-#define QEMU_T3D "-object memory-backend-file,id=cxl-mem0,mem-path=%s,size=256M " \
-                 "-object memory-backend-file,id=lsa0,mem-path=%s,size=256M "    \
-                 "-device cxl-type3,bus=rp0,memdev=cxl-mem0,lsa=lsa0,id=cxl-pmem0 "
+#define QEMU_T3D_DEPRECATED \
+    "-object memory-backend-file,id=cxl-mem0,mem-path=%s,size=256M " \
+    "-object memory-backend-file,id=lsa0,mem-path=%s,size=256M " \
+    "-device cxl-type3,bus=rp0,memdev=cxl-mem0,lsa=lsa0,id=cxl-pmem0 "
 
-#define QEMU_2T3D "-object memory-backend-file,id=cxl-mem0,mem-path=%s,size=256M "    \
-                  "-object memory-backend-file,id=lsa0,mem-path=%s,size=256M "    \
-                  "-device cxl-type3,bus=rp0,memdev=cxl-mem0,lsa=lsa0,id=cxl-pmem0 " \
-                  "-object memory-backend-file,id=cxl-mem1,mem-path=%s,size=256M "    \
-                  "-object memory-backend-file,id=lsa1,mem-path=%s,size=256M "    \
-                  "-device cxl-type3,bus=rp1,memdev=cxl-mem1,lsa=lsa1,id=cxl-pmem1 "
+#define QEMU_T3D_PMEM \
+    "-object memory-backend-file,id=cxl-mem0,mem-path=%s,size=256M " \
+    "-object memory-backend-file,id=lsa0,mem-path=%s,size=256M " \
+    "-device cxl-type3,bus=rp0,persistent-memdev=cxl-mem0,lsa=lsa0,id=pmem0 "
 
-#define QEMU_4T3D "-object memory-backend-file,id=cxl-mem0,mem-path=%s,size=256M " \
-                  "-object memory-backend-file,id=lsa0,mem-path=%s,size=256M "    \
-                  "-device cxl-type3,bus=rp0,memdev=cxl-mem0,lsa=lsa0,id=cxl-pmem0 " \
-                  "-object memory-backend-file,id=cxl-mem1,mem-path=%s,size=256M "    \
-                  "-object memory-backend-file,id=lsa1,mem-path=%s,size=256M "    \
-                  "-device cxl-type3,bus=rp1,memdev=cxl-mem1,lsa=lsa1,id=cxl-pmem1 " \
-                  "-object memory-backend-file,id=cxl-mem2,mem-path=%s,size=256M "    \
-                  "-object memory-backend-file,id=lsa2,mem-path=%s,size=256M "    \
-                  "-device cxl-type3,bus=rp2,memdev=cxl-mem2,lsa=lsa2,id=cxl-pmem2 " \
-                  "-object memory-backend-file,id=cxl-mem3,mem-path=%s,size=256M "    \
-                  "-object memory-backend-file,id=lsa3,mem-path=%s,size=256M "    \
-                  "-device cxl-type3,bus=rp3,memdev=cxl-mem3,lsa=lsa3,id=cxl-pmem3 "
+#define QEMU_T3D_VMEM \
+    "-object memory-backend-ram,id=cxl-mem0,size=256M " \
+    "-device cxl-type3,bus=rp0,volatile-memdev=cxl-mem0,id=mem0 "
+
+#define QEMU_T3D_VMEM_LSA \
+    "-object memory-backend-ram,id=cxl-mem0,size=256M " \
+    "-object memory-backend-file,id=lsa0,mem-path=%s,size=256M " \
+    "-device cxl-type3,bus=rp0,volatile-memdev=cxl-mem0,lsa=lsa0,id=mem0 "
+
+#define QEMU_2T3D \
+    "-object memory-backend-file,id=cxl-mem0,mem-path=%s,size=256M " \
+    "-object memory-backend-file,id=lsa0,mem-path=%s,size=256M " \
+    "-device cxl-type3,bus=rp0,persistent-memdev=cxl-mem0,lsa=lsa0,id=pmem0 " \
+    "-object memory-backend-file,id=cxl-mem1,mem-path=%s,size=256M " \
+    "-object memory-backend-file,id=lsa1,mem-path=%s,size=256M " \
+    "-device cxl-type3,bus=rp1,persistent-memdev=cxl-mem1,lsa=lsa1,id=pmem1 "
+
+#define QEMU_4T3D \
+    "-object memory-backend-file,id=cxl-mem0,mem-path=%s,size=256M " \
+    "-object memory-backend-file,id=lsa0,mem-path=%s,size=256M " \
+    "-device cxl-type3,bus=rp0,persistent-memdev=cxl-mem0,lsa=lsa0,id=pmem0 " \
+    "-object memory-backend-file,id=cxl-mem1,mem-path=%s,size=256M " \
+    "-object memory-backend-file,id=lsa1,mem-path=%s,size=256M " \
+    "-device cxl-type3,bus=rp1,persistent-memdev=cxl-mem1,lsa=lsa1,id=pmem1 " \
+    "-object memory-backend-file,id=cxl-mem2,mem-path=%s,size=256M " \
+    "-object memory-backend-file,id=lsa2,mem-path=%s,size=256M " \
+    "-device cxl-type3,bus=rp2,persistent-memdev=cxl-mem2,lsa=lsa2,id=pmem2 " \
+    "-object memory-backend-file,id=cxl-mem3,mem-path=%s,size=256M " \
+    "-object memory-backend-file,id=lsa3,mem-path=%s,size=256M " \
+    "-device cxl-type3,bus=rp3,persistent-memdev=cxl-mem3,lsa=lsa3,id=pmem3 "
 
 static void cxl_basic_hb(void)
 {
@@ -90,14 +112,53 @@
 }
 
 #ifdef CONFIG_POSIX
-static void cxl_t3d(void)
+static void cxl_t3d_deprecated(void)
 {
     g_autoptr(GString) cmdline = g_string_new(NULL);
     g_autofree const char *tmpfs = NULL;
 
     tmpfs = g_dir_make_tmp("cxl-test-XXXXXX", NULL);
 
-    g_string_printf(cmdline, QEMU_PXB_CMD QEMU_RP QEMU_T3D, tmpfs, tmpfs);
+    g_string_printf(cmdline, QEMU_PXB_CMD QEMU_RP QEMU_T3D_DEPRECATED,
+                    tmpfs, tmpfs);
+
+    qtest_start(cmdline->str);
+    qtest_end();
+}
+
+static void cxl_t3d_persistent(void)
+{
+    g_autoptr(GString) cmdline = g_string_new(NULL);
+    g_autofree const char *tmpfs = NULL;
+
+    tmpfs = g_dir_make_tmp("cxl-test-XXXXXX", NULL);
+
+    g_string_printf(cmdline, QEMU_PXB_CMD QEMU_RP QEMU_T3D_PMEM,
+                    tmpfs, tmpfs);
+
+    qtest_start(cmdline->str);
+    qtest_end();
+}
+
+static void cxl_t3d_volatile(void)
+{
+    g_autoptr(GString) cmdline = g_string_new(NULL);
+
+    g_string_printf(cmdline, QEMU_PXB_CMD QEMU_RP QEMU_T3D_VMEM);
+
+    qtest_start(cmdline->str);
+    qtest_end();
+}
+
+static void cxl_t3d_volatile_lsa(void)
+{
+    g_autoptr(GString) cmdline = g_string_new(NULL);
+    g_autofree const char *tmpfs = NULL;
+
+    tmpfs = g_dir_make_tmp("cxl-test-XXXXXX", NULL);
+
+    g_string_printf(cmdline, QEMU_PXB_CMD QEMU_RP QEMU_T3D_VMEM_LSA,
+                    tmpfs);
 
     qtest_start(cmdline->str);
     qtest_end();
@@ -147,7 +208,10 @@
     qtest_add_func("/pci/cxl/rp", cxl_root_port);
     qtest_add_func("/pci/cxl/rp_x2", cxl_2root_port);
 #ifdef CONFIG_POSIX
-    qtest_add_func("/pci/cxl/type3_device", cxl_t3d);
+    qtest_add_func("/pci/cxl/type3_device", cxl_t3d_deprecated);
+    qtest_add_func("/pci/cxl/type3_device_pmem", cxl_t3d_persistent);
+    qtest_add_func("/pci/cxl/type3_device_vmem", cxl_t3d_volatile);
+    qtest_add_func("/pci/cxl/type3_device_vmem_lsa", cxl_t3d_volatile_lsa);
     qtest_add_func("/pci/cxl/rp_x2_type3_x2", cxl_1pxb_2rp_2t3d);
     qtest_add_func("/pci/cxl/pxb_x2_root_port_x4_type3_x4", cxl_2pxb_4rp_4t3d);
 #endif
diff --git a/tests/tcg/hexagon/Makefile.target b/tests/tcg/hexagon/Makefile.target
index 7c94db4..890ccee 100644
--- a/tests/tcg/hexagon/Makefile.target
+++ b/tests/tcg/hexagon/Makefile.target
@@ -45,10 +45,18 @@
 HEX_TESTS += overflow
 HEX_TESTS += signal_context
 HEX_TESTS += reg_mut
+HEX_TESTS += read_write_overlap
 HEX_TESTS += vector_add_int
 HEX_TESTS += scatter_gather
 HEX_TESTS += hvx_misc
 HEX_TESTS += hvx_histogram
+HEX_TESTS += invalid-slots
+
+run-and-check-exception = $(call run-test,$2,$3 2>$2.stderr; \
+	test $$? -eq 1 && grep -q "exception $(strip $1)" $2.stderr)
+
+run-invalid-slots: invalid-slots
+	$(call run-and-check-exception, 0x15, $@, $(QEMU) $(QEMU_OPTS) $<)
 
 HEX_TESTS += test_abs
 HEX_TESTS += test_bitcnt
@@ -76,17 +84,30 @@
 HEX_TESTS += test_vpmpyh
 HEX_TESTS += test_vspliceb
 
+HEX_TESTS += v68_scalar
+HEX_TESTS += v68_hvx
+HEX_TESTS += v69_hvx
+HEX_TESTS += v73_scalar
+
 TESTS += $(HEX_TESTS)
 
 # This test has to be compiled for the -mv67t target
 usr: usr.c
 	$(CC) $(CFLAGS) -mv67t -O2 -Wno-inline-asm -Wno-expansion-to-defined $< -o $@ $(LDFLAGS)
 
+# Build this test with -mv71 to exercise the CABAC instruction
+misc: misc.c
+	$(CC) $(CFLAGS) -mv71 -O2 $< -o $@ $(LDFLAGS)
 scatter_gather: CFLAGS += -mhvx
 vector_add_int: CFLAGS += -mhvx -fvectorize
 hvx_misc: hvx_misc.c hvx_misc.h
 hvx_misc: CFLAGS += -mhvx
 hvx_histogram: CFLAGS += -mhvx -Wno-gnu-folding-constant
+v68_hvx: v68_hvx.c hvx_misc.h v6mpy_ref.c.inc
+v68_hvx: CFLAGS += -mhvx -Wno-unused-function
+v69_hvx: v69_hvx.c hvx_misc.h
+v69_hvx: CFLAGS += -mhvx -Wno-unused-function
+v73_scalar: CFLAGS += -Wno-unused-function
 
 hvx_histogram: hvx_histogram.c hvx_histogram_row.S
 	$(CC) $(CFLAGS) $(CROSS_CC_GUEST_CFLAGS) $^ -o $@ $(LDFLAGS)
diff --git a/tests/tcg/hexagon/fpstuff.c b/tests/tcg/hexagon/fpstuff.c
index 90ce9a6..28f9397 100644
--- a/tests/tcg/hexagon/fpstuff.c
+++ b/tests/tcg/hexagon/fpstuff.c
@@ -20,6 +20,7 @@
  */
 
 #include <stdio.h>
+#include <float.h>
 
 const int FPINVF_BIT = 1;                 /* Invalid */
 const int FPINVF = 1 << FPINVF_BIT;
@@ -706,6 +707,57 @@
     check_fpstatus(usr, FPINVF);
 }
 
+static void check_float_consts(void)
+{
+    int res32;
+    unsigned long long res64;
+
+    asm("%0 = sfmake(#%1):neg\n\t" : "=r"(res32) : "i"(0xf));
+    check32(res32, 0xbc9e0000);
+
+    asm("%0 = sfmake(#%1):pos\n\t" : "=r"(res32) : "i"(0xf));
+    check32(res32, 0x3c9e0000);
+
+    asm("%0 = dfmake(#%1):neg\n\t" : "=r"(res64) : "i"(0xf));
+    check64(res64, 0xbf93c00000000000ULL);
+
+    asm("%0 = dfmake(#%1):pos\n\t" : "=r"(res64) : "i"(0xf));
+    check64(res64, 0x3f93c00000000000ULL);
+}
+
+static inline unsigned long long dfmpyll(double x, double y)
+{
+    unsigned long long res64;
+    asm("%0 = dfmpyll(%1, %2)" : "=r"(res64) : "r"(x), "r"(y));
+    return res64;
+}
+
+static inline unsigned long long dfmpylh(double acc, double x, double y)
+{
+    unsigned long long res64 = *(unsigned long long *)&acc;
+    asm("%0 += dfmpylh(%1, %2)" : "+r"(res64) : "r"(x), "r"(y));
+    return res64;
+}
+
+static void check_dfmpyxx(void)
+{
+    unsigned long long res64;
+
+    res64 = dfmpyll(DBL_MIN, DBL_MIN);
+    check64(res64, 0ULL);
+    res64 = dfmpyll(-1.0, DBL_MIN);
+    check64(res64, 0ULL);
+    res64 = dfmpyll(DBL_MAX, DBL_MAX);
+    check64(res64, 0x1fffffffdULL);
+
+    res64 = dfmpylh(DBL_MIN, DBL_MIN, DBL_MIN);
+    check64(res64, 0x10000000000000ULL);
+    res64 = dfmpylh(-1.0, DBL_MAX, DBL_MIN);
+    check64(res64, 0xc00fffffffe00000ULL);
+    res64 = dfmpylh(DBL_MAX, 0.0, -1.0);
+    check64(res64, 0x7fefffffffffffffULL);
+}
+
 int main()
 {
     check_compare_exception();
@@ -718,6 +770,8 @@
     check_sffixupd();
     check_sffms();
     check_float2int_convs();
+    check_float_consts();
+    check_dfmpyxx();
 
     puts(err ? "FAIL" : "PASS");
     return err ? 1 : 0;
diff --git a/tests/tcg/hexagon/hvx_misc.c b/tests/tcg/hexagon/hvx_misc.c
index d0e64e0..09dec8d 100644
--- a/tests/tcg/hexagon/hvx_misc.c
+++ b/tests/tcg/hexagon/hvx_misc.c
@@ -342,49 +342,6 @@
     check_output_w(__LINE__, 2);
 }
 
-static void test_vshuff(void)
-{
-    /* Test that vshuff works when the two operands are the same register */
-    const uint32_t splat = 0x089be55c;
-    const uint32_t shuff = 0x454fa926;
-    MMVector v0, v1;
-
-    memset(expect, 0x12, sizeof(MMVector));
-    memset(output, 0x34, sizeof(MMVector));
-
-    asm volatile("v25 = vsplat(%0)\n\t"
-                 "vshuff(v25, v25, %1)\n\t"
-                 "vmem(%2 + #0) = v25\n\t"
-                 : /* no outputs */
-                 : "r"(splat), "r"(shuff), "r"(output)
-                 : "v25", "memory");
-
-    /*
-     * The semantics of Hexagon are the operands are pass-by-value, so create
-     * two copies of the vsplat result.
-     */
-    for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
-        v0.uw[i] = splat;
-        v1.uw[i] = splat;
-    }
-    /* Do the vshuff operation */
-    for (int offset = 1; offset < MAX_VEC_SIZE_BYTES; offset <<= 1) {
-        if (shuff & offset) {
-            for (int k = 0; k < MAX_VEC_SIZE_BYTES; k++) {
-                if (!(k & offset)) {
-                    uint8_t tmp = v0.ub[k];
-                    v0.ub[k] = v1.ub[k + offset];
-                    v1.ub[k + offset] = tmp;
-                }
-            }
-        }
-    }
-    /* Put the result in the expect buffer for verification */
-    expect[0] = v1;
-
-    check_output_b(__LINE__, 1);
-}
-
 static void test_load_tmp_predicated(void)
 {
     void *p0 = buffer0;
@@ -454,6 +411,25 @@
     check_output_w(__LINE__, BUFSIZE);
 }
 
+static void test_vcombine(void)
+{
+    for (int i = 0; i < BUFSIZE / 2; i++) {
+        asm volatile("v2 = vsplat(%0)\n\t"
+                     "v3 = vsplat(%1)\n\t"
+                     "v3:2 = vcombine(v2, v3)\n\t"
+                     "vmem(%2+#0) = v2\n\t"
+                     "vmem(%2+#1) = v3\n\t"
+                     :
+                     : "r"(2 * i), "r"(2 * i + 1), "r"(&output[2 * i])
+                     : "v2", "v3", "memory");
+        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
+            expect[2 * i].w[j] = 2 * i + 1;
+            expect[2 * i + 1].w[j] = 2 * i;
+        }
+    }
+    check_output_w(__LINE__, BUFSIZE);
+}
+
 int main()
 {
     init_buffers();
@@ -489,11 +465,11 @@
     test_vadduwsat();
     test_vsubuwsat_dv();
 
-    test_vshuff();
-
     test_load_tmp_predicated();
     test_load_cur_predicated();
 
+    test_vcombine();
+
     puts(err ? "FAIL" : "PASS");
     return err ? 1 : 0;
 }
diff --git a/tests/tcg/hexagon/invalid-slots.c b/tests/tcg/hexagon/invalid-slots.c
new file mode 100644
index 0000000..366ce4f
--- /dev/null
+++ b/tests/tcg/hexagon/invalid-slots.c
@@ -0,0 +1,29 @@
+/*
+ *  Copyright(c) 2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+char mem[8] __attribute__((aligned(8)));
+
+int main()
+{
+    asm volatile(
+        "r0 = #mem\n"
+        /* Invalid packet (2 instructions at slot 0): */
+        ".word 0xa1804100\n" /* { memw(r0) = r1;      */
+        ".word 0x28032804\n" /*   r3 = #0; r4 = #0 }  */
+        : : : "r0", "r3", "r4", "memory");
+    return 0;
+}
diff --git a/tests/tcg/hexagon/misc.c b/tests/tcg/hexagon/misc.c
index e126751..cfdda3f 100644
--- a/tests/tcg/hexagon/misc.c
+++ b/tests/tcg/hexagon/misc.c
@@ -18,6 +18,8 @@
 #include <stdio.h>
 #include <string.h>
 
+#define CORE_HAS_CABAC            (__HEXAGON_ARCH__ <= 71)
+
 typedef unsigned char uint8_t;
 typedef unsigned short uint16_t;
 typedef unsigned int uint32_t;
@@ -245,6 +247,7 @@
     }
 }
 
+#if CORE_HAS_CABAC
 static void check64(long long val, long long expect)
 {
     if (val != expect) {
@@ -252,6 +255,7 @@
         err++;
     }
 }
+#endif
 
 uint32_t init[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
 uint32_t array[10];
@@ -286,6 +290,7 @@
     return retval;
 }
 
+#if CORE_HAS_CABAC
 static long long decbin(long long x, long long y, int *pred)
 {
     long long retval;
@@ -295,6 +300,7 @@
          : "r"(x), "r"(y));
     return retval;
 }
+#endif
 
 /* Check that predicates are auto-and'ed in a packet */
 static int auto_and(void)
@@ -385,11 +391,46 @@
     check(ct1p(0xffffff0fffffffffULL), 36);
 }
 
+static inline int dpmpyss_rnd_s0(int x, int y)
+{
+    int res;
+    asm("%0 = mpy(%1, %2):rnd\n\t" : "=r"(res) : "r"(x), "r"(y));
+    return res;
+}
+
+void test_dpmpyss_rnd_s0(void)
+{
+    check(dpmpyss_rnd_s0(-1, 0x80000000), 1);
+    check(dpmpyss_rnd_s0(0, 0x80000000), 0);
+    check(dpmpyss_rnd_s0(1, 0x80000000), 0);
+    check(dpmpyss_rnd_s0(0x7fffffff, 0x80000000), 0xc0000001);
+    check(dpmpyss_rnd_s0(0x80000000, -1), 1);
+    check(dpmpyss_rnd_s0(-1, -1), 0);
+    check(dpmpyss_rnd_s0(0, -1), 0);
+    check(dpmpyss_rnd_s0(1, -1), 0);
+    check(dpmpyss_rnd_s0(0x7fffffff, -1), 0);
+    check(dpmpyss_rnd_s0(0x80000000, 0), 0);
+    check(dpmpyss_rnd_s0(-1, 0), 0);
+    check(dpmpyss_rnd_s0(0, 0), 0);
+    check(dpmpyss_rnd_s0(1, 0), 0);
+    check(dpmpyss_rnd_s0(-1, -1), 0);
+    check(dpmpyss_rnd_s0(0, -1), 0);
+    check(dpmpyss_rnd_s0(1, -1), 0);
+    check(dpmpyss_rnd_s0(0x7fffffff, 1), 0);
+    check(dpmpyss_rnd_s0(0x80000000, 0x7fffffff), 0xc0000001);
+    check(dpmpyss_rnd_s0(-1, 0x7fffffff), 0);
+    check(dpmpyss_rnd_s0(0, 0x7fffffff),  0);
+    check(dpmpyss_rnd_s0(1, 0x7fffffff),  0);
+    check(dpmpyss_rnd_s0(0x7fffffff, 0x7fffffff), 0x3fffffff);
+}
+
 int main()
 {
     int res;
+#if CORE_HAS_CABAC
     long long res64;
     int pred;
+#endif
 
     memcpy(array, init, sizeof(array));
     S4_storerhnew_rr(array, 4, 0xffff);
@@ -505,6 +546,7 @@
     res = test_clrtnew(2, 7);
     check(res, 7);
 
+#if CORE_HAS_CABAC
     res64 = decbin(0xf0f1f2f3f4f5f6f7LL, 0x7f6f5f4f3f2f1f0fLL, &pred);
     check64(res64, 0x357980003700010cLL);
     check(pred, 0);
@@ -512,6 +554,9 @@
     res64 = decbin(0xfLL, 0x1bLL, &pred);
     check64(res64, 0x78000100LL);
     check(pred, 1);
+#else
+    puts("Skipping cabac tests");
+#endif
 
     res = auto_and();
     check(res, 0);
@@ -522,6 +567,8 @@
 
     test_count_trailing_zeros_ones();
 
+    test_dpmpyss_rnd_s0();
+
     puts(err ? "FAIL" : "PASS");
     return err;
 }
diff --git a/tests/tcg/hexagon/read_write_overlap.c b/tests/tcg/hexagon/read_write_overlap.c
new file mode 100644
index 0000000..a75fc11
--- /dev/null
+++ b/tests/tcg/hexagon/read_write_overlap.c
@@ -0,0 +1,136 @@
+/*
+ *  Copyright(c) 2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Test instructions where the semantics write to the destination
+ * before all the operand reads have been completed.
+ *
+ * These instructions are problematic when we short-circuit the
+ * register writes because the destination and source operands could
+ * be the same TCGv.
+ *
+ * We test by forcing the read and write to be register r7.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+int err;
+
+static void __check(const char *filename, int line, int x, int expect)
+{
+    if (x != expect) {
+        printf("ERROR %s:%d - 0x%08x != 0x%08x\n",
+               filename, line, x, expect);
+        err++;
+    }
+}
+
+#define check(x, expect) __check(__FILE__, __LINE__, (x), (expect))
+
+#define insert(RES, X, WIDTH, OFFSET) \
+    asm("r7 = %1\n\t" \
+        "r7 = insert(r7, #" #WIDTH ", #" #OFFSET ")\n\t" \
+        "%0 = r7\n\t" \
+        : "=r"(RES) : "r"(X) : "r7")
+
+static void test_insert(void)
+{
+    uint32_t res;
+
+    insert(res, 0x12345678, 8, 1);
+    check(res, 0x123456f0);
+    insert(res, 0x12345678, 0, 1);
+    check(res, 0x12345678);
+    insert(res, 0x12345678, 20, 16);
+    check(res, 0x56785678);
+}
+
+static inline uint32_t insert_rp(uint32_t x, uint32_t width, uint32_t offset)
+{
+    uint64_t width_offset = (uint64_t)width << 32 | offset;
+    uint32_t res;
+    asm("r7 = %1\n\t"
+        "r7 = insert(r7, %2)\n\t"
+        "%0 = r7\n\t"
+        : "=r"(res) : "r"(x), "r"(width_offset) : "r7");
+    return res;
+
+}
+
+static void test_insert_rp(void)
+{
+    check(insert_rp(0x12345678,   8,  1), 0x123456f0);
+    check(insert_rp(0x12345678,  63,  8), 0x34567878);
+    check(insert_rp(0x12345678, 127,  8), 0x34567878);
+    check(insert_rp(0x12345678,   8, 24), 0x78345678);
+    check(insert_rp(0x12345678,   8, 63), 0x12345678);
+    check(insert_rp(0x12345678,   8, 64), 0x00000000);
+}
+
+static inline uint32_t asr_r_svw_trun(uint64_t x, uint32_t y)
+{
+    uint32_t res;
+    asm("r7 = %2\n\t"
+        "r7 = vasrw(%1, r7)\n\t"
+        "%0 = r7\n\t"
+        : "=r"(res) : "r"(x), "r"(y) : "r7");
+    return res;
+}
+
+static void test_asr_r_svw_trun(void)
+{
+    check(asr_r_svw_trun(0x1111111122222222ULL, 5),
+          0x88881111);
+    check(asr_r_svw_trun(0x1111111122222222ULL, 63),
+          0x00000000);
+    check(asr_r_svw_trun(0x1111111122222222ULL, 64),
+          0x00000000);
+    check(asr_r_svw_trun(0x1111111122222222ULL, 127),
+          0x22224444);
+    check(asr_r_svw_trun(0x1111111122222222ULL, 128),
+          0x11112222);
+    check(asr_r_svw_trun(0xffffffff22222222ULL, 128),
+          0xffff2222);
+}
+
+static inline uint32_t swiz(uint32_t x)
+{
+    uint32_t res;
+    asm("r7 = %1\n\t"
+        "r7 = swiz(r7)\n\t"
+        "%0 = r7\n\t"
+        : "=r"(res) : "r"(x) : "r7");
+    return res;
+}
+
+static void test_swiz(void)
+{
+    check(swiz(0x11223344), 0x44332211);
+}
+
+int main()
+{
+    test_insert();
+    test_insert_rp();
+    test_asr_r_svw_trun();
+    test_swiz();
+
+    puts(err ? "FAIL" : "PASS");
+    return err ? EXIT_FAILURE : EXIT_SUCCESS;
+}
diff --git a/tests/tcg/hexagon/v68_hvx.c b/tests/tcg/hexagon/v68_hvx.c
new file mode 100644
index 0000000..0271872
--- /dev/null
+++ b/tests/tcg/hexagon/v68_hvx.c
@@ -0,0 +1,90 @@
+/*
+ *  Copyright(c) 2022-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <limits.h>
+
+int err;
+
+#include "hvx_misc.h"
+
+MMVector v6mpy_buffer0[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
+MMVector v6mpy_buffer1[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
+
+static void init_v6mpy_buffers(void)
+{
+    int counter0 = 0;
+    int counter1 = 17;
+    for (int i = 0; i < BUFSIZE; i++) {
+        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
+            v6mpy_buffer0[i].w[j] = counter0++;
+            v6mpy_buffer1[i].w[j] = counter1++;
+        }
+    }
+}
+
+int v6mpy_ref[BUFSIZE][MAX_VEC_SIZE_BYTES / 4] = {
+#include "v6mpy_ref.c.inc"
+};
+
+static void test_v6mpy(void)
+{
+    void *p00 = buffer0;
+    void *p01 = v6mpy_buffer0;
+    void *p10 = buffer1;
+    void *p11 = v6mpy_buffer1;
+    void *pout = output;
+
+    memset(expect, 0xff, sizeof(expect));
+    memset(output, 0xff, sizeof(expect));
+
+    for (int i = 0; i < BUFSIZE; i++) {
+        asm("v2 = vmem(%0 + #0)\n\t"
+            "v3 = vmem(%1 + #0)\n\t"
+            "v4 = vmem(%2 + #0)\n\t"
+            "v5 = vmem(%3 + #0)\n\t"
+            "v5:4.w = v6mpy(v5:4.ub, v3:2.b, #1):v\n\t"
+            "vmem(%4 + #0) = v4\n\t"
+            : : "r"(p00), "r"(p01), "r"(p10), "r"(p11), "r"(pout)
+            : "v2", "v3", "v4", "v5", "memory");
+        p00 += sizeof(MMVector);
+        p01 += sizeof(MMVector);
+        p10 += sizeof(MMVector);
+        p11 += sizeof(MMVector);
+        pout += sizeof(MMVector);
+
+        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
+            expect[i].w[j] = v6mpy_ref[i][j];
+        }
+    }
+
+    check_output_w(__LINE__, BUFSIZE);
+}
+
+int main()
+{
+    init_buffers();
+    init_v6mpy_buffers();
+
+    test_v6mpy();
+
+    puts(err ? "FAIL" : "PASS");
+    return err ? 1 : 0;
+}
diff --git a/tests/tcg/hexagon/v68_scalar.c b/tests/tcg/hexagon/v68_scalar.c
new file mode 100644
index 0000000..7a8adb1
--- /dev/null
+++ b/tests/tcg/hexagon/v68_scalar.c
@@ -0,0 +1,186 @@
+/*
+ *  Copyright(c) 2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+/*
+ *  Test the scalar core instructions that are new in v68
+ */
+
+int err;
+
+static int buffer32[] = { 1, 2, 3, 4 };
+static long long buffer64[] = { 5, 6, 7, 8 };
+
+static void __check32(int line, uint32_t result, uint32_t expect)
+{
+    if (result != expect) {
+        printf("ERROR at line %d: 0x%08x != 0x%08x\n",
+               line, result, expect);
+        err++;
+    }
+}
+
+#define check32(RES, EXP) __check32(__LINE__, RES, EXP)
+
+static void __check64(int line, uint64_t result, uint64_t expect)
+{
+    if (result != expect) {
+        printf("ERROR at line %d: 0x%016llx != 0x%016llx\n",
+               line, result, expect);
+        err++;
+    }
+}
+
+#define check64(RES, EXP) __check64(__LINE__, RES, EXP)
+
+static inline int loadw_aq(int *p)
+{
+    int res;
+    asm volatile("%0 = memw_aq(%1)\n\t"
+                 : "=r"(res) : "r"(p));
+    return res;
+}
+
+static void test_loadw_aq(void)
+{
+    int res;
+
+    res = loadw_aq(&buffer32[0]);
+    check32(res, 1);
+    res = loadw_aq(&buffer32[1]);
+    check32(res, 2);
+}
+
+static inline long long loadd_aq(long long *p)
+{
+    long long res;
+    asm volatile("%0 = memd_aq(%1)\n\t"
+                 : "=r"(res) : "r"(p));
+    return res;
+}
+
+static void test_loadd_aq(void)
+{
+    long long res;
+
+    res = loadd_aq(&buffer64[2]);
+    check64(res, 7);
+    res = loadd_aq(&buffer64[3]);
+    check64(res, 8);
+}
+
+static inline void release_at(int *p)
+{
+    asm volatile("release(%0):at\n\t"
+                 : : "r"(p));
+}
+
+static void test_release_at(void)
+{
+    release_at(&buffer32[2]);
+    check64(buffer32[2], 3);
+    release_at(&buffer32[3]);
+    check64(buffer32[3], 4);
+}
+
+static inline void release_st(int *p)
+{
+    asm volatile("release(%0):st\n\t"
+                 : : "r"(p));
+}
+
+static void test_release_st(void)
+{
+    release_st(&buffer32[2]);
+    check64(buffer32[2], 3);
+    release_st(&buffer32[3]);
+    check64(buffer32[3], 4);
+}
+
+static inline void storew_rl_at(int *p, int val)
+{
+    asm volatile("memw_rl(%0):at = %1\n\t"
+                 : : "r"(p), "r"(val) : "memory");
+}
+
+static void test_storew_rl_at(void)
+{
+    storew_rl_at(&buffer32[2], 9);
+    check64(buffer32[2], 9);
+    storew_rl_at(&buffer32[3], 10);
+    check64(buffer32[3], 10);
+}
+
+static inline void stored_rl_at(long long *p, long long val)
+{
+    asm volatile("memd_rl(%0):at = %1\n\t"
+                 : : "r"(p), "r"(val) : "memory");
+}
+
+static void test_stored_rl_at(void)
+{
+    stored_rl_at(&buffer64[2], 11);
+    check64(buffer64[2], 11);
+    stored_rl_at(&buffer64[3], 12);
+    check64(buffer64[3], 12);
+}
+
+static inline void storew_rl_st(int *p, int val)
+{
+    asm volatile("memw_rl(%0):st = %1\n\t"
+                 : : "r"(p), "r"(val) : "memory");
+}
+
+static void test_storew_rl_st(void)
+{
+    storew_rl_st(&buffer32[0], 13);
+    check64(buffer32[0], 13);
+    storew_rl_st(&buffer32[1], 14);
+    check64(buffer32[1], 14);
+}
+
+static inline void stored_rl_st(long long *p, long long val)
+{
+    asm volatile("memd_rl(%0):st = %1\n\t"
+                 : : "r"(p), "r"(val) : "memory");
+}
+
+static void test_stored_rl_st(void)
+{
+    stored_rl_st(&buffer64[0], 15);
+    check64(buffer64[0], 15);
+    stored_rl_st(&buffer64[1], 15);
+    check64(buffer64[1], 15);
+}
+
+int main()
+{
+    test_loadw_aq();
+    test_loadd_aq();
+    test_release_at();
+    test_release_st();
+    test_storew_rl_at();
+    test_stored_rl_at();
+    test_storew_rl_st();
+    test_stored_rl_st();
+
+    puts(err ? "FAIL" : "PASS");
+    return err ? 1 : 0;
+}
diff --git a/tests/tcg/hexagon/v69_hvx.c b/tests/tcg/hexagon/v69_hvx.c
new file mode 100644
index 0000000..a0d567d
--- /dev/null
+++ b/tests/tcg/hexagon/v69_hvx.c
@@ -0,0 +1,318 @@
+/*
+ *  Copyright(c) 2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <limits.h>
+
+int err;
+
+#include "hvx_misc.h"
+
+#define fVROUND(VAL, SHAMT) \
+    ((VAL) + (((SHAMT) > 0) ? (1LL << ((SHAMT) - 1)) : 0))
+
+#define fVSATUB(VAL) \
+    ((((VAL) & 0xffLL) == (VAL)) ? \
+        (VAL) : \
+        ((((int32_t)(VAL)) < 0) ? 0 : 0xff))
+
+#define fVSATUH(VAL) \
+    ((((VAL) & 0xffffLL) == (VAL)) ? \
+        (VAL) : \
+        ((((int32_t)(VAL)) < 0) ? 0 : 0xffff))
+
+static void test_vasrvuhubrndsat(void)
+{
+    void *p0 = buffer0;
+    void *p1 = buffer1;
+    void *pout = output;
+
+    memset(expect, 0xaa, sizeof(expect));
+    memset(output, 0xbb, sizeof(output));
+
+    for (int i = 0; i < BUFSIZE / 2; i++) {
+        asm("v4 = vmem(%0 + #0)\n\t"
+            "v5 = vmem(%0 + #1)\n\t"
+            "v6 = vmem(%1 + #0)\n\t"
+            "v5.ub = vasr(v5:4.uh, v6.ub):rnd:sat\n\t"
+            "vmem(%2) = v5\n\t"
+            : : "r"(p0), "r"(p1), "r"(pout)
+            : "v4", "v5", "v6", "memory");
+        p0 += sizeof(MMVector) * 2;
+        p1 += sizeof(MMVector);
+        pout += sizeof(MMVector);
+
+        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 2; j++) {
+            int shamt;
+            uint8_t byte0;
+            uint8_t byte1;
+
+            shamt = buffer1[i].ub[2 * j + 0] & 0x7;
+            byte0 = fVSATUB(fVROUND(buffer0[2 * i + 0].uh[j], shamt) >> shamt);
+            shamt = buffer1[i].ub[2 * j + 1] & 0x7;
+            byte1 = fVSATUB(fVROUND(buffer0[2 * i + 1].uh[j], shamt) >> shamt);
+            expect[i].uh[j] = (byte1 << 8) | (byte0 & 0xff);
+        }
+    }
+
+    check_output_h(__LINE__, BUFSIZE / 2);
+}
+
+static void test_vasrvuhubsat(void)
+{
+    void *p0 = buffer0;
+    void *p1 = buffer1;
+    void *pout = output;
+
+    memset(expect, 0xaa, sizeof(expect));
+    memset(output, 0xbb, sizeof(output));
+
+    for (int i = 0; i < BUFSIZE / 2; i++) {
+        asm("v4 = vmem(%0 + #0)\n\t"
+            "v5 = vmem(%0 + #1)\n\t"
+            "v6 = vmem(%1 + #0)\n\t"
+            "v5.ub = vasr(v5:4.uh, v6.ub):sat\n\t"
+            "vmem(%2) = v5\n\t"
+            : : "r"(p0), "r"(p1), "r"(pout)
+            : "v4", "v5", "v6", "memory");
+        p0 += sizeof(MMVector) * 2;
+        p1 += sizeof(MMVector);
+        pout += sizeof(MMVector);
+
+        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 2; j++) {
+            int shamt;
+            uint8_t byte0;
+            uint8_t byte1;
+
+            shamt = buffer1[i].ub[2 * j + 0] & 0x7;
+            byte0 = fVSATUB(buffer0[2 * i + 0].uh[j] >> shamt);
+            shamt = buffer1[i].ub[2 * j + 1] & 0x7;
+            byte1 = fVSATUB(buffer0[2 * i + 1].uh[j] >> shamt);
+            expect[i].uh[j] = (byte1 << 8) | (byte0 & 0xff);
+        }
+    }
+
+    check_output_h(__LINE__, BUFSIZE / 2);
+}
+
+static void test_vasrvwuhrndsat(void)
+{
+    void *p0 = buffer0;
+    void *p1 = buffer1;
+    void *pout = output;
+
+    memset(expect, 0xaa, sizeof(expect));
+    memset(output, 0xbb, sizeof(output));
+
+    for (int i = 0; i < BUFSIZE / 2; i++) {
+        asm("v4 = vmem(%0 + #0)\n\t"
+            "v5 = vmem(%0 + #1)\n\t"
+            "v6 = vmem(%1 + #0)\n\t"
+            "v5.uh = vasr(v5:4.w, v6.uh):rnd:sat\n\t"
+            "vmem(%2) = v5\n\t"
+            : : "r"(p0), "r"(p1), "r"(pout)
+            : "v4", "v5", "v6", "memory");
+        p0 += sizeof(MMVector) * 2;
+        p1 += sizeof(MMVector);
+        pout += sizeof(MMVector);
+
+        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
+            int shamt;
+            uint16_t half0;
+            uint16_t half1;
+
+            shamt = buffer1[i].uh[2 * j + 0] & 0xf;
+            half0 = fVSATUH(fVROUND(buffer0[2 * i + 0].w[j], shamt) >> shamt);
+            shamt = buffer1[i].uh[2 * j + 1] & 0xf;
+            half1 = fVSATUH(fVROUND(buffer0[2 * i + 1].w[j], shamt) >> shamt);
+            expect[i].w[j] = (half1 << 16) | (half0 & 0xffff);
+        }
+    }
+
+    check_output_w(__LINE__, BUFSIZE / 2);
+}
+
+static void test_vasrvwuhsat(void)
+{
+    void *p0 = buffer0;
+    void *p1 = buffer1;
+    void *pout = output;
+
+    memset(expect, 0xaa, sizeof(expect));
+    memset(output, 0xbb, sizeof(output));
+
+    for (int i = 0; i < BUFSIZE / 2; i++) {
+        asm("v4 = vmem(%0 + #0)\n\t"
+            "v5 = vmem(%0 + #1)\n\t"
+            "v6 = vmem(%1 + #0)\n\t"
+            "v5.uh = vasr(v5:4.w, v6.uh):sat\n\t"
+            "vmem(%2) = v5\n\t"
+            : : "r"(p0), "r"(p1), "r"(pout)
+            : "v4", "v5", "v6", "memory");
+        p0 += sizeof(MMVector) * 2;
+        p1 += sizeof(MMVector);
+        pout += sizeof(MMVector);
+
+        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
+            int shamt;
+            uint16_t half0;
+            uint16_t half1;
+
+            shamt = buffer1[i].uh[2 * j + 0] & 0xf;
+            half0 = fVSATUH(buffer0[2 * i + 0].w[j] >> shamt);
+            shamt = buffer1[i].uh[2 * j + 1] & 0xf;
+            half1 = fVSATUH(buffer0[2 * i + 1].w[j] >> shamt);
+            expect[i].w[j] = (half1 << 16) | (half0 & 0xffff);
+        }
+    }
+
+    check_output_w(__LINE__, BUFSIZE / 2);
+}
+
+static void test_vassign_tmp(void)
+{
+    void *p0 = buffer0;
+    void *pout = output;
+
+    memset(expect, 0xaa, sizeof(expect));
+    memset(output, 0xbb, sizeof(output));
+
+    for (int i = 0; i < BUFSIZE; i++) {
+        /*
+         * Assign into v12 as .tmp, then use it in the next packet
+         * Should get the new value within the same packet and
+         * the old value in the next packet
+         */
+        asm("v3 = vmem(%0 + #0)\n\t"
+            "r1 = #1\n\t"
+            "v12 = vsplat(r1)\n\t"
+            "r1 = #2\n\t"
+            "v13 = vsplat(r1)\n\t"
+            "{\n\t"
+            "    v12.tmp = v13\n\t"
+            "    v4.w = vadd(v12.w, v3.w)\n\t"
+            "}\n\t"
+            "v4.w = vadd(v4.w, v12.w)\n\t"
+            "vmem(%1 + #0) = v4\n\t"
+            : : "r"(p0), "r"(pout)
+            : "r1", "v3", "v4", "v12", "v13", "memory");
+        p0 += sizeof(MMVector);
+        pout += sizeof(MMVector);
+
+        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
+            expect[i].w[j] = buffer0[i].w[j] + 3;
+        }
+    }
+
+    check_output_w(__LINE__, BUFSIZE);
+}
+
+static void test_vcombine_tmp(void)
+{
+    void *p0 = buffer0;
+    void *p1 = buffer1;
+    void *pout = output;
+
+    memset(expect, 0xaa, sizeof(expect));
+    memset(output, 0xbb, sizeof(output));
+
+    for (int i = 0; i < BUFSIZE; i++) {
+        /*
+         * Combine into v13:12 as .tmp, then use it in the next packet
+         * Should get the new value within the same packet and
+         * the old value in the next packet
+         */
+        asm("v3 = vmem(%0 + #0)\n\t"
+            "r1 = #1\n\t"
+            "v12 = vsplat(r1)\n\t"
+            "r1 = #2\n\t"
+            "v13 = vsplat(r1)\n\t"
+            "r1 = #3\n\t"
+            "v14 = vsplat(r1)\n\t"
+            "r1 = #4\n\t"
+            "v15 = vsplat(r1)\n\t"
+            "{\n\t"
+            "    v13:12.tmp = vcombine(v15, v14)\n\t"
+            "    v4.w = vadd(v12.w, v3.w)\n\t"
+            "    v16 = v13\n\t"
+            "}\n\t"
+            "v4.w = vadd(v4.w, v12.w)\n\t"
+            "v4.w = vadd(v4.w, v13.w)\n\t"
+            "v4.w = vadd(v4.w, v16.w)\n\t"
+            "vmem(%2 + #0) = v4\n\t"
+            : : "r"(p0), "r"(p1), "r"(pout)
+            : "r1", "v3", "v4", "v12", "v13", "v14", "v15", "v16", "memory");
+        p0 += sizeof(MMVector);
+        p1 += sizeof(MMVector);
+        pout += sizeof(MMVector);
+
+        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
+            expect[i].w[j] = buffer0[i].w[j] + 10;
+        }
+    }
+
+    check_output_w(__LINE__, BUFSIZE);
+}
+
+static void test_vmpyuhvs(void)
+{
+    void *p0 = buffer0;
+    void *p1 = buffer1;
+    void *pout = output;
+
+    memset(expect, 0xaa, sizeof(expect));
+    memset(output, 0xbb, sizeof(output));
+
+    for (int i = 0; i < BUFSIZE; i++) {
+        asm("v4 = vmem(%0 + #0)\n\t"
+            "v5 = vmem(%1 + #0)\n\t"
+            "v4.uh = vmpy(V4.uh, v5.uh):>>16\n\t"
+            "vmem(%2) = v4\n\t"
+            : : "r"(p0), "r"(p1), "r"(pout)
+            : "v4", "v5", "memory");
+        p0 += sizeof(MMVector);
+        p1 += sizeof(MMVector);
+        pout += sizeof(MMVector);
+
+        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 2; j++) {
+            expect[i].uh[j] = (buffer0[i].uh[j] * buffer1[i].uh[j]) >> 16;
+        }
+    }
+
+    check_output_h(__LINE__, BUFSIZE);
+}
+
+int main()
+{
+    init_buffers();
+
+    test_vasrvuhubrndsat();
+    test_vasrvuhubsat();
+    test_vasrvwuhrndsat();
+    test_vasrvwuhsat();
+
+    test_vassign_tmp();
+    test_vcombine_tmp();
+
+    test_vmpyuhvs();
+
+    puts(err ? "FAIL" : "PASS");
+    return err ? 1 : 0;
+}
diff --git a/tests/tcg/hexagon/v6mpy_ref.c.inc b/tests/tcg/hexagon/v6mpy_ref.c.inc
new file mode 100644
index 0000000..8258cdd
--- /dev/null
+++ b/tests/tcg/hexagon/v6mpy_ref.c.inc
@@ -0,0 +1,161 @@
+/*
+ *  Copyright(c) 2021-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+{ 0xffffee11, 0xfffffcca, 0xffffc1b3, 0xffffd0cc,
+  0xffffe215, 0xfffff58e, 0xffffaf37, 0xffffc310,
+  0xffffd919, 0xfffff152, 0xffff9fbb, 0xffffb854,
+  0xffffd31d, 0xfffff016, 0xffff933f, 0xffffb098,
+  0xffffd021, 0xfffff1da, 0xffff89c3, 0xffffabdc,
+  0xffffd025, 0xfffff69e, 0xffff8347, 0xffffaa20,
+  0xffffd329, 0xfffffe62, 0xffff7fcb, 0xffffab64,
+  0xffffd92d, 0x00000926, 0xffff7f4f, 0xffffafa8,
+  },
+{ 0xffffe231, 0x000016ea, 0xffff81d3, 0xffffb6ec,
+  0xffffee35, 0x000027ae, 0xffff8757, 0xffffc130,
+  0xfffffd39, 0x00003b72, 0xffff8fdb, 0xffffce74,
+  0x00000f3d, 0x00005236, 0xffff9b5f, 0xffffdeb8,
+  0x00002441, 0x00006bfa, 0xffffa9e3, 0xfffff1fc,
+  0x00003c45, 0x000088be, 0xffffbb67, 0x00000840,
+  0x00005749, 0x0000a882, 0xffffcfeb, 0xffffe684,
+  0x0000494d, 0x00009a46, 0xffffb16f, 0x000002c8,
+  },
+{ 0xfffff351, 0x0000440a, 0xffff4af3, 0xffff9c0c,
+  0xffffef55, 0x000044ce, 0xffff4077, 0xffff9650,
+  0xffffee59, 0x00004892, 0xffff38fb, 0xffff9394,
+  0xfffff05d, 0x00004f56, 0xffff347f, 0xffff93d8,
+  0xfffff561, 0x0000591a, 0xffff3303, 0xffff971c,
+  0xfffffd65, 0x000065de, 0xffff3487, 0xffff9d60,
+  0x00000869, 0x000075a2, 0xffff390b, 0xffffa6a4,
+  0x0000166d, 0x00008866, 0xffff408f, 0xffffb2e8,
+  },
+{ 0x00002771, 0x00009e2a, 0xffff4b13, 0xffffc22c,
+  0x00003b75, 0x0000b6ee, 0xffff5897, 0xffffd470,
+  0x00005279, 0x0000d2b2, 0xffff691b, 0xffffe9b4,
+  0x00006c7d, 0x0000f176, 0xffff7c9f, 0x000001f8,
+  0x00008981, 0x0001133a, 0xffff9323, 0x00001d3c,
+  0x0000a985, 0x000137fe, 0xffffaca7, 0x00003b80,
+  0x0000cc89, 0x00015fc2, 0xffffc92b, 0xffffe1c4,
+  0x0000868d, 0x00011986, 0xffff72af, 0x00000608,
+  },
+{ 0xfffff891, 0x00008b4a, 0xfffed433, 0xffff674c,
+  0xfffffc95, 0x0000940e, 0xfffed1b7, 0xffff6990,
+  0x00000399, 0x00009fd2, 0xfffed23b, 0xffff6ed4,
+  0x00000d9d, 0x0000ae96, 0xfffed5bf, 0xffff7718,
+  0x00001aa1, 0x0000c05a, 0xfffedc43, 0xffff825c,
+  0x00002aa5, 0x0000d51e, 0xfffee5c7, 0xffff90a0,
+  0x00003da9, 0x0000ece2, 0xfffef24b, 0xffffa1e4,
+  0x000053ad, 0x000107a6, 0xffff01cf, 0xffffb628,
+  },
+{ 0x00006cb1, 0x0001256a, 0xffff1453, 0xffffcd6c,
+  0x000088b5, 0x0001462e, 0xffff29d7, 0xffffe7b0,
+  0x0000a7b9, 0x000169f2, 0xffff425b, 0x000004f4,
+  0x0000c9bd, 0x000190b6, 0xffff5ddf, 0x00002538,
+  0x0000eec1, 0x0001ba7a, 0xffff7c63, 0x0000487c,
+  0x000116c5, 0x0001e73e, 0xffff9de7, 0x00006ec0,
+  0x000141c9, 0x00021702, 0xffffc26b, 0xffffdd04,
+  0x0000c3cd, 0x000198c6, 0xffff33ef, 0x00000948,
+  },
+{ 0xfffffdd1, 0x0000d28a, 0xfffe5d73, 0xffff328c,
+  0x000009d5, 0x0000e34e, 0xfffe62f7, 0xffff3cd0,
+  0x000018d9, 0x0000f712, 0xfffe6b7b, 0xffff4a14,
+  0x00002add, 0x00010dd6, 0xfffe76ff, 0xffff5a58,
+  0x00003fe1, 0x0001279a, 0xfffe8583, 0xffff6d9c,
+  0x000057e5, 0x0001445e, 0xfffe9707, 0xffff83e0,
+  0x000072e9, 0x00016422, 0xfffeab8b, 0xffff9d24,
+  0x000090ed, 0x000186e6, 0xfffec30f, 0xffffb968,
+  },
+{ 0x0000b1f1, 0x0001acaa, 0xfffedd93, 0xffffd8ac,
+  0x0000d5f5, 0x0001d56e, 0xfffefb17, 0xfffffaf0,
+  0x0000fcf9, 0x00020132, 0xffff1b9b, 0x00002034,
+  0x000126fd, 0x00022ff6, 0xffff3f1f, 0x00008b36,
+  0x000093c3, 0x00009d80, 0x00009d6d, 0x0000a78a,
+  0x0000b4d7, 0x0000c354, 0x0000b801, 0x0000c6de,
+  0x0000d4eb, 0x0000e828, 0x0000d195, 0xffffea32,
+  0x00000fff, 0x000022fc, 0xfffffc29, 0x00000f86,
+  },
+{ 0xffffee13, 0xfffffcd0, 0xffffc1bd, 0xffffd0da,
+  0xffffe327, 0xfffff6a4, 0xffffb051, 0xffffc42e,
+  0xffffd73b, 0xffffef78, 0xffff9de5, 0xffffb682,
+  0xffffd24f, 0xffffef4c, 0xffff9279, 0xffffafd6,
+  0xffffd063, 0xfffff220, 0xffff8a0d, 0xffffac2a,
+  0xffffd177, 0xfffff7f4, 0xffff84a1, 0xffffab7e,
+  0xffffd18b, 0xfffffcc8, 0xffff7e35, 0xffffa9d2,
+  0xffffd89f, 0x0000089c, 0xffff7ec9, 0xffffaf26,
+  },
+{ 0xffffe2b3, 0x00001770, 0xffff825d, 0xffffb77a,
+  0xffffefc7, 0x00002944, 0xffff88f1, 0xffffc2ce,
+  0xfffffbdb, 0x00003a18, 0xffff8e85, 0xffffcd22,
+  0x00000eef, 0x000051ec, 0xffff9b19, 0xffffde76,
+  0x00002503, 0x00006cc0, 0xffffaaad, 0xfffff2ca,
+  0x00003e17, 0x00008a94, 0xffffbd41, 0x00000a1e,
+  0x0000562b, 0x0000a768, 0xffffced5, 0xffffe572,
+  0x0000493f, 0x00009a3c, 0xffffb169, 0x000002c6,
+  },
+{ 0xfffff353, 0x00004410, 0xffff4afd, 0xffff9c1a,
+  0xfffff067, 0x000045e4, 0xffff4191, 0xffff976e,
+  0xffffec7b, 0x000046b8, 0xffff3725, 0xffff91c2,
+  0xffffef8f, 0x00004e8c, 0xffff33b9, 0xffff9316,
+  0xfffff5a3, 0x00005960, 0xffff334d, 0xffff976a,
+  0xfffffeb7, 0x00006734, 0xffff35e1, 0xffff9ebe,
+  0x000006cb, 0x00007408, 0xffff3775, 0xffffa512,
+  0x000015df, 0x000087dc, 0xffff4009, 0xffffb266,
+  },
+{ 0x000027f3, 0x00009eb0, 0xffff4b9d, 0xffffc2ba,
+  0x00003d07, 0x0000b884, 0xffff5a31, 0xffffd60e,
+  0x0000511b, 0x0000d158, 0xffff67c5, 0xffffe862,
+  0x00006c2f, 0x0000f12c, 0xffff7c59, 0x000001b6,
+  0x00008a43, 0x00011400, 0xffff93ed, 0x00001e0a,
+  0x0000ab57, 0x000139d4, 0xffffae81, 0x00003d5e,
+  0x0000cb6b, 0x00015ea8, 0xffffc815, 0xffffe0b2,
+  0x0000867f, 0x0001197c, 0xffff72a9, 0x00000606,
+  },
+{ 0xfffff893, 0x00008b50, 0xfffed43d, 0xffff675a,
+  0xfffffda7, 0x00009524, 0xfffed2d1, 0xffff6aae,
+  0x000001bb, 0x00009df8, 0xfffed065, 0xffff6d02,
+  0x00000ccf, 0x0000adcc, 0xfffed4f9, 0xffff7656,
+  0x00001ae3, 0x0000c0a0, 0xfffedc8d, 0xffff82aa,
+  0x00002bf7, 0x0000d674, 0xfffee721, 0xffff91fe,
+  0x00003c0b, 0x0000eb48, 0xfffef0b5, 0xffffa052,
+  0x0000531f, 0x0001071c, 0xffff0149, 0xffffb5a6,
+  },
+{ 0x00006d33, 0x000125f0, 0xffff14dd, 0xffffcdfa,
+  0x00008a47, 0x000147c4, 0xffff2b71, 0xffffe94e,
+  0x0000a65b, 0x00016898, 0xffff4105, 0x000003a2,
+  0x0000c96f, 0x0001906c, 0xffff5d99, 0x000024f6,
+  0x0000ef83, 0x0001bb40, 0xffff7d2d, 0x0000494a,
+  0x00011897, 0x0001e914, 0xffff9fc1, 0x0000709e,
+  0x000140ab, 0x000215e8, 0xffffc155, 0xffffdbf2,
+  0x0000c3bf, 0x000198bc, 0xffff33e9, 0x00000946,
+  },
+{ 0xfffffdd3, 0x0000d290, 0xfffe5d7d, 0xffff329a,
+  0x00000ae7, 0x0000e464, 0xfffe6411, 0xffff3dee,
+  0x000016fb, 0x0000f538, 0xfffe69a5, 0xffff4842,
+  0x00002a0f, 0x00010d0c, 0xfffe7639, 0xffff5996,
+  0x00004023, 0x000127e0, 0xfffe85cd, 0xffff6dea,
+  0x00005937, 0x000145b4, 0xfffe9861, 0xffff853e,
+  0x0000714b, 0x00016288, 0xfffea9f5, 0xffff9b92,
+  0x0000905f, 0x0001865c, 0xfffec289, 0xffffb8e6,
+  },
+{ 0x0000b273, 0x0001ad30, 0xfffede1d, 0xffffd93a,
+  0x0000d787, 0x0001d704, 0xfffefcb1, 0xfffffc8e,
+  0x0000fb9b, 0x0001ffd8, 0xffff1a45, 0x00001ee2,
+  0x000126af, 0x00022fac, 0xffff3ed9, 0x00008af4,
+  0x00009485, 0x00009e46, 0x00009e37, 0x0000a858,
+  0x0000b6a9, 0x0000c52a, 0x0000b9db, 0x0000c8bc,
+  0x0000d3cd, 0x0000e70e, 0x0000d07f, 0xffffe920,
+  0x00000ff1, 0x000022f2, 0xfffffc23, 0x00000f84,
+  },
diff --git a/tests/tcg/hexagon/v73_scalar.c b/tests/tcg/hexagon/v73_scalar.c
new file mode 100644
index 0000000..fee67fc
--- /dev/null
+++ b/tests/tcg/hexagon/v73_scalar.c
@@ -0,0 +1,96 @@
+/*
+ *  Copyright(c) 2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+/*
+ *  Test the scalar core instructions that are new in v73
+ */
+
+int err;
+
+static void __check32(int line, uint32_t result, uint32_t expect)
+{
+    if (result != expect) {
+        printf("ERROR at line %d: 0x%08x != 0x%08x\n",
+               line, result, expect);
+        err++;
+    }
+}
+
+#define check32(RES, EXP) __check32(__LINE__, RES, EXP)
+
+static void __check64(int line, uint64_t result, uint64_t expect)
+{
+    if (result != expect) {
+        printf("ERROR at line %d: 0x%016llx != 0x%016llx\n",
+               line, result, expect);
+        err++;
+    }
+}
+
+#define check64(RES, EXP) __check64(__LINE__, RES, EXP)
+
+static bool my_func_called;
+
+static void my_func(void)
+{
+    my_func_called = true;
+}
+
+static inline void callrh(void *func)
+{
+    asm volatile("callrh %0\n\t"
+                 : : "r"(func)
+                 /* Mark the caller-save registers as clobbered */
+                 : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9",
+                   "r10", "r11", "r12", "r13", "r14", "r15", "r28",
+                   "p0", "p1", "p2", "p3");
+}
+
+static void test_callrh(void)
+{
+    my_func_called = false;
+    callrh(&my_func);
+    check32(my_func_called, true);
+}
+
+static void test_jumprh(void)
+{
+    uint32_t res;
+    asm ("%0 = #5\n\t"
+         "r0 = ##1f\n\t"
+         "jumprh r0\n\t"
+         "%0 = #3\n\t"
+         "jump 2f\n\t"
+         "1:\n\t"
+         "%0 = #1\n\t"
+         "2:\n\t"
+         : "=r"(res) : : "r0");
+    check32(res, 1);
+}
+
+int main()
+{
+    test_callrh();
+    test_jumprh();
+
+    puts(err ? "FAIL" : "PASS");
+    return err ? 1 : 0;
+}
diff --git a/tests/tcg/multiarch/system/Makefile.softmmu-target b/tests/tcg/multiarch/system/Makefile.softmmu-target
index 5f432c9..fe40195 100644
--- a/tests/tcg/multiarch/system/Makefile.softmmu-target
+++ b/tests/tcg/multiarch/system/Makefile.softmmu-target
@@ -27,6 +27,20 @@
 		"-monitor none -display none -chardev file$(COMMA)path=$<.out$(COMMA)id=output $(QEMU_OPTS)" \
 		--bin $< --test $(MULTIARCH_SRC)/gdbstub/memory.py, \
 	softmmu gdbstub support)
+
+run-gdbstub-untimely-packet: hello
+	$(call run-test, $@, $(GDB_SCRIPT) \
+		--gdb $(HAVE_GDB_BIN) \
+		--gdb-args "-ex 'set debug remote 1'" \
+		--output untimely-packet.gdb.out \
+		--stderr untimely-packet.gdb.err \
+		--qemu $(QEMU) \
+		--bin $< --qargs \
+		"-monitor none -display none -chardev file$(COMMA)path=untimely-packet.out$(COMMA)id=output $(QEMU_OPTS)", \
+	"softmmu gdbstub untimely packets")
+	$(call quiet-command, \
+		(! grep -Fq 'Packet instead of Ack, ignoring it' untimely-packet.gdb.err), \
+		"GREP", "file  untimely-packet.gdb.err")
 else
 run-gdbstub-%:
 	$(call skip-test, "gdbstub test $*", "no guest arch support")
@@ -36,4 +50,4 @@
 	$(call skip-test, "gdbstub test $*", "need working gdb")
 endif
 
-MULTIARCH_RUNS += run-gdbstub-memory
+MULTIARCH_RUNS += run-gdbstub-memory run-gdbstub-untimely-packet