Merge tag 'pull-request-2025-05-14' of https://gitlab.com/thuth/qemu into staging

* Removal of obsolete s390x machines
* Fix a memleak in s390x code
* Skip some functional tests if the corresponding feature is not available

# -----BEGIN PGP SIGNATURE-----
#
# iQJFBAABCAAvFiEEJ7iIR+7gJQEY8+q5LtnXdP5wLbUFAmgkfWURHHRodXRoQHJl
# ZGhhdC5jb20ACgkQLtnXdP5wLbXaKA/+K/buSKZWNcvrXtU4AqEyIjicvUsbY79S
# BGmwTjO46uDzlqTIOxGJ2uBAocXSlNJ7YsvH75vBHWHF3Vy6LB1zPWDgaYTz7XkA
# K9GqtrmRdlPArKa1Q7ot0tJ/wu7lzQuccieJJwNJhotMC3C4dl1HSpp+u/rmk7gG
# vG9l5Cdi34BWXp2QCKPdrNs++4mOudLSJtYhBlSpxIaBe6h2LoHmKJNEmD9x4Xcg
# SWTqalpWUhJW4L3zCj1JXWv6HAyR6GG7+7FLr5FkorSDG/sMX7+09GLE1/BLlD87
# KtZlTBkcbXs+eXmP4y+qtskI0ca4dLaZnfIq8/v0wqCXvfOUM4Xi0E2HvGmHeI4u
# rvC/ZhK2RztMZbVMFXHSmCFJvpi2sGgH+sIHt18BJzkAC+nx0ZdCz81fgKVERHhJ
# 1ZnsRiMcf7dI6yEgbJ89vZihv3WbyCcwlnyLDN+lovZzCYTvxPLn5SRH0LEm4kN5
# N/qRwTTlPM4xCGCSc3JEGJVDDy36ojVfvGMFt4ZcFehcpkfcLznw7QYjk3QDwI2N
# 58FImsf2VVEl4sdpzpi6zfutMhFuL1N0m/kXb8GBonekXYTPtyBMqHsmhyRe5xXN
# vP9paghpU0xBuDMtmZWyq4RCubZNESA7wAbSf0+VcC/1Uhjc3QS5820kV7/WVwsU
# VwObtSEAG1c=
# =zUob
# -----END PGP SIGNATURE-----
# gpg: Signature made Wed 14 May 2025 07:24:21 EDT
# gpg:                using RSA key 27B88847EEE0250118F3EAB92ED9D774FE702DB5
# gpg:                issuer "thuth@redhat.com"
# gpg: Good signature from "Thomas Huth <th.huth@gmx.de>" [full]
# gpg:                 aka "Thomas Huth <thuth@redhat.com>" [full]
# gpg:                 aka "Thomas Huth <huth@tuxfamily.org>" [full]
# gpg:                 aka "Thomas Huth <th.huth@posteo.de>" [unknown]
# Primary key fingerprint: 27B8 8847 EEE0 2501 18F3  EAB9 2ED9 D774 FE70 2DB5

* tag 'pull-request-2025-05-14' of https://gitlab.com/thuth/qemu:
  tests/functional: Skip the screendump tests if the command is not available
  tests/functional/test_s390x_tuxrun: Check whether the machine is available
  include/hw/dma/xlnx_dpdma: Remove dependency on console.h
  s390x: Fix leak in machine_set_loadparm
  hw/s390x/s390-virtio-ccw: Remove the deprecated 4.0 machine type
  hw/s390x/s390-virtio-ccw: Remove the deprecated 3.1 machine type
  hw/s390x: Remove the obsolete hpage_1m_allowed switch
  hw/s390x/s390-virtio-ccw: Remove the deprecated 3.0 machine type
  hw/s390x/s390-virtio-ccw: Remove the deprecated 2.12 machine type
  target/s390x: Rename the qemu_V2_11 feature set to qemu_MIN
  hw/s390x/event-facility: Remove the obsolete "allow_all_mask_sizes" code
  hw/s390x/s390-virtio-ccw: Remove the deprecated 2.11 machine type
  hw/s390x/s390-virtio-ccw: Remove the deprecated 2.10 machine type

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
diff --git a/accel/tcg/meson.build b/accel/tcg/meson.build
index d6f533f..97d5e5a 100644
--- a/accel/tcg/meson.build
+++ b/accel/tcg/meson.build
@@ -1,4 +1,4 @@
-if not get_option('tcg').allowed()
+if not have_tcg
    subdir_done()
 endif
 
diff --git a/block/backup.c b/block/backup.c
index 79652bf..0151e84 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -361,6 +361,7 @@
                   BackupPerf *perf,
                   BlockdevOnError on_source_error,
                   BlockdevOnError on_target_error,
+                  OnCbwError on_cbw_error,
                   int creation_flags,
                   BlockCompletionFunc *cb, void *opaque,
                   JobTxn *txn, Error **errp)
@@ -458,7 +459,7 @@
     }
 
     cbw = bdrv_cbw_append(bs, target, filter_node_name, discard_source,
-                          perf->min_cluster_size, &bcs, errp);
+                          perf->min_cluster_size, &bcs, on_cbw_error, errp);
     if (!cbw) {
         goto error;
     }
diff --git a/block/commit.c b/block/commit.c
index 5df3d05..7cc8c0f 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -15,6 +15,8 @@
 #include "qemu/osdep.h"
 #include "qemu/cutils.h"
 #include "trace.h"
+#include "block/block-common.h"
+#include "block/coroutines.h"
 #include "block/block_int.h"
 #include "block/blockjob_int.h"
 #include "qapi/error.h"
@@ -126,6 +128,84 @@
     blk_unref(s->top);
 }
 
+static int commit_iteration(CommitBlockJob *s, int64_t offset,
+                            int64_t *requested_bytes, void *buf)
+{
+    BlockErrorAction action;
+    int64_t bytes = *requested_bytes;
+    int ret = 0;
+    bool error_in_source = true;
+
+    /* Copy if allocated above the base */
+    WITH_GRAPH_RDLOCK_GUARD() {
+        ret = bdrv_co_common_block_status_above(blk_bs(s->top),
+            s->base_overlay, true, true, offset, COMMIT_BUFFER_SIZE,
+            &bytes, NULL, NULL, NULL);
+    }
+
+    trace_commit_one_iteration(s, offset, bytes, ret);
+
+    if (ret < 0) {
+        goto fail;
+    }
+
+    if (ret & BDRV_BLOCK_ALLOCATED) {
+        if (ret & BDRV_BLOCK_ZERO) {
+            /*
+             * If the top (sub)clusters are smaller than the base
+             * (sub)clusters, this will not unmap unless the underlying device
+             * does some tracking of these requests. Ideally, we would find
+             * the maximal extent of the zero clusters.
+             */
+            ret = blk_co_pwrite_zeroes(s->base, offset, bytes,
+                                       BDRV_REQ_MAY_UNMAP);
+            if (ret < 0) {
+                error_in_source = false;
+                goto fail;
+            }
+        } else {
+            assert(bytes < SIZE_MAX);
+
+            ret = blk_co_pread(s->top, offset, bytes, buf, 0);
+            if (ret < 0) {
+                goto fail;
+            }
+
+            ret = blk_co_pwrite(s->base, offset, bytes, buf, 0);
+            if (ret < 0) {
+                error_in_source = false;
+                goto fail;
+            }
+        }
+
+        /*
+         * Whether zeroes actually end up on disk depends on the details of
+         * the underlying driver. Therefore, this might rate limit more than
+         * is necessary.
+         */
+        block_job_ratelimit_processed_bytes(&s->common, bytes);
+    }
+
+    /* Publish progress */
+
+    job_progress_update(&s->common.job, bytes);
+
+    *requested_bytes = bytes;
+
+    return 0;
+
+fail:
+    action = block_job_error_action(&s->common, s->on_error,
+                                    error_in_source, -ret);
+    if (action == BLOCK_ERROR_ACTION_REPORT) {
+        return ret;
+    }
+
+    *requested_bytes = 0;
+
+    return 0;
+}
+
 static int coroutine_fn commit_run(Job *job, Error **errp)
 {
     CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
@@ -156,9 +236,6 @@
     buf = blk_blockalign(s->top, COMMIT_BUFFER_SIZE);
 
     for (offset = 0; offset < len; offset += n) {
-        bool copy;
-        bool error_in_source = true;
-
         /* Note that even when no rate limit is applied we need to yield
          * with no pending I/O here so that bdrv_drain_all() returns.
          */
@@ -166,38 +243,11 @@
         if (job_is_cancelled(&s->common.job)) {
             break;
         }
-        /* Copy if allocated above the base */
-        ret = blk_co_is_allocated_above(s->top, s->base_overlay, true,
-                                        offset, COMMIT_BUFFER_SIZE, &n);
-        copy = (ret > 0);
-        trace_commit_one_iteration(s, offset, n, ret);
-        if (copy) {
-            assert(n < SIZE_MAX);
 
-            ret = blk_co_pread(s->top, offset, n, buf, 0);
-            if (ret >= 0) {
-                ret = blk_co_pwrite(s->base, offset, n, buf, 0);
-                if (ret < 0) {
-                    error_in_source = false;
-                }
-            }
-        }
+        ret = commit_iteration(s, offset, &n, buf);
+
         if (ret < 0) {
-            BlockErrorAction action =
-                block_job_error_action(&s->common, s->on_error,
-                                       error_in_source, -ret);
-            if (action == BLOCK_ERROR_ACTION_REPORT) {
-                return ret;
-            } else {
-                n = 0;
-                continue;
-            }
-        }
-        /* Publish progress */
-        job_progress_update(&s->common.job, n);
-
-        if (copy) {
-            block_job_ratelimit_processed_bytes(&s->common, n);
+            return ret;
         }
     }
 
diff --git a/block/copy-before-write.c b/block/copy-before-write.c
index fd470f5..00af0b1 100644
--- a/block/copy-before-write.c
+++ b/block/copy-before-write.c
@@ -551,6 +551,7 @@
                                   bool discard_source,
                                   uint64_t min_cluster_size,
                                   BlockCopyState **bcs,
+                                  OnCbwError on_cbw_error,
                                   Error **errp)
 {
     BDRVCopyBeforeWriteState *state;
@@ -568,6 +569,7 @@
     }
     qdict_put_str(opts, "file", bdrv_get_node_name(source));
     qdict_put_str(opts, "target", bdrv_get_node_name(target));
+    qdict_put_str(opts, "on-cbw-error", OnCbwError_str(on_cbw_error));
 
     if (min_cluster_size > INT64_MAX) {
         error_setg(errp, "min-cluster-size too large: %" PRIu64 " > %" PRIi64,
diff --git a/block/copy-before-write.h b/block/copy-before-write.h
index 2a5d4ba..eb93364 100644
--- a/block/copy-before-write.h
+++ b/block/copy-before-write.h
@@ -42,6 +42,7 @@
                                   bool discard_source,
                                   uint64_t min_cluster_size,
                                   BlockCopyState **bcs,
+                                  OnCbwError on_cbw_error,
                                   Error **errp);
 void bdrv_cbw_drop(BlockDriverState *bs);
 
diff --git a/block/replication.c b/block/replication.c
index d6625c5..07f274d 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -583,7 +583,9 @@
                                 0, MIRROR_SYNC_MODE_NONE, NULL, 0, false, false,
                                 NULL, &perf,
                                 BLOCKDEV_ON_ERROR_REPORT,
-                                BLOCKDEV_ON_ERROR_REPORT, JOB_INTERNAL,
+                                BLOCKDEV_ON_ERROR_REPORT,
+                                ON_CBW_ERROR_BREAK_GUEST_WRITE,
+                                JOB_INTERNAL,
                                 backup_job_completed, bs, NULL, &local_err);
         if (local_err) {
             error_propagate(errp, local_err);
diff --git a/blockdev.c b/blockdev.c
index 1d1f27c..818ec42 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2641,6 +2641,7 @@
     BdrvDirtyBitmap *bmap = NULL;
     BackupPerf perf = { .max_workers = 64 };
     int job_flags = JOB_DEFAULT;
+    OnCbwError on_cbw_error = ON_CBW_ERROR_BREAK_GUEST_WRITE;
 
     if (!backup->has_speed) {
         backup->speed = 0;
@@ -2745,6 +2746,10 @@
         job_flags |= JOB_MANUAL_DISMISS;
     }
 
+    if (backup->has_on_cbw_error) {
+        on_cbw_error = backup->on_cbw_error;
+    }
+
     job = backup_job_create(backup->job_id, bs, target_bs, backup->speed,
                             backup->sync, bmap, backup->bitmap_mode,
                             backup->compress, backup->discard_source,
@@ -2752,6 +2757,7 @@
                             &perf,
                             backup->on_source_error,
                             backup->on_target_error,
+                            on_cbw_error,
                             job_flags, NULL, NULL, txn, errp);
     return job;
 }
diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst
index 1a1b423..44d3427 100644
--- a/docs/about/deprecated.rst
+++ b/docs/about/deprecated.rst
@@ -148,6 +148,37 @@
 ``blockdev-add`` calls. See :doc:`/interop/live-block-operations` for
 details.
 
+``block-job-pause`` (since 10.1)
+''''''''''''''''''''''''''''''''
+
+Use ``job-pause`` instead. The only difference is that ``job-pause``
+always reports GenericError on failure when ``block-job-pause`` reports
+DeviceNotActive when block-job is not found.
+
+``block-job-resume`` (since 10.1)
+'''''''''''''''''''''''''''''''''
+
+Use ``job-resume`` instead. The only difference is that ``job-resume``
+always reports GenericError on failure when ``block-job-resume`` reports
+DeviceNotActive when block-job is not found.
+
+``block-job-complete`` (since 10.1)
+'''''''''''''''''''''''''''''''''''
+
+Use ``job-complete`` instead. The only difference is that ``job-complete``
+always reports GenericError on failure when ``block-job-complete`` reports
+DeviceNotActive when block-job is not found.
+
+``block-job-dismiss`` (since 10.1)
+''''''''''''''''''''''''''''''''''
+
+Use ``job-dismiss`` instead.
+
+``block-job-finalize`` (since 10.1)
+'''''''''''''''''''''''''''''''''''
+
+Use ``job-finalize`` instead.
+
 ``query-migrationthreads`` (since 9.2)
 ''''''''''''''''''''''''''''''''''''''
 
diff --git a/docs/devel/rust.rst b/docs/devel/rust.rst
index 4de8637..171d908 100644
--- a/docs/devel/rust.rst
+++ b/docs/devel/rust.rst
@@ -119,7 +119,7 @@
   for the ``hw/char/pl011.c`` and ``hw/timer/hpet.c`` files.
 
 .. [#issues] The ``pl011`` crate is synchronized with ``hw/char/pl011.c``
-   as of commit 02b1f7f61928.  The ``hpet`` crate is synchronized as of
+   as of commit 3e0f118f82.  The ``hpet`` crate is synchronized as of
    commit 1433e38cc8.  Both are lacking tracing functionality.
 
 This section explains how to work with them.
diff --git a/hw/audio/cs4231a.c b/hw/audio/cs4231a.c
index 06b44da..eb9a458 100644
--- a/hw/audio/cs4231a.c
+++ b/hw/audio/cs4231a.c
@@ -682,6 +682,10 @@
         return;
     }
 
+    if (s->irq >= ISA_NUM_IRQS) {
+        error_setg(errp, "Invalid IRQ %d (max %d)", s->irq, ISA_NUM_IRQS - 1);
+        return;
+    }
     s->pic = isa_bus_get_irq(bus, s->irq);
     k = ISADMA_GET_CLASS(s->isa_dma);
     k->register_channel(s->isa_dma, s->dma, cs_dma_read, s);
diff --git a/hw/intc/loongarch_pch_pic.c b/hw/intc/loongarch_pch_pic.c
index 8340962..cbba2fc 100644
--- a/hw/intc/loongarch_pch_pic.c
+++ b/hw/intc/loongarch_pch_pic.c
@@ -7,6 +7,7 @@
 
 #include "qemu/osdep.h"
 #include "qemu/bitops.h"
+#include "qemu/log.h"
 #include "hw/irq.h"
 #include "hw/intc/loongarch_pch_pic.h"
 #include "trace.h"
@@ -71,285 +72,181 @@
     pch_pic_update_irq(s, mask, level);
 }
 
-static uint64_t loongarch_pch_pic_low_readw(void *opaque, hwaddr addr,
-                                            unsigned size)
+static uint64_t pch_pic_read(void *opaque, hwaddr addr, uint64_t field_mask)
 {
     LoongArchPICCommonState *s = LOONGARCH_PIC_COMMON(opaque);
     uint64_t val = 0;
-    uint32_t offset = addr & 0xfff;
+    uint32_t offset;
 
-    switch (offset) {
-    case PCH_PIC_INT_ID_LO:
-        val = PCH_PIC_INT_ID_VAL;
+    offset = addr & 7;
+    addr -= offset;
+    switch (addr) {
+    case PCH_PIC_INT_ID:
+        val = s->id.data;
         break;
-    case PCH_PIC_INT_ID_HI:
-        /*
-         * With 7A1000 manual
-         *   bit  0-15 pch irqchip version
-         *   bit 16-31 irq number supported with pch irqchip
-         */
-        val = deposit32(PCH_PIC_INT_ID_VER, 16, 16, s->irq_num - 1);
+    case PCH_PIC_INT_MASK:
+        val = s->int_mask;
         break;
-    case PCH_PIC_INT_MASK_LO:
-        val = (uint32_t)s->int_mask;
+    case PCH_PIC_INT_EDGE:
+        val = s->intedge;
         break;
-    case PCH_PIC_INT_MASK_HI:
-        val = s->int_mask >> 32;
+    case PCH_PIC_HTMSI_EN:
+        val = s->htmsi_en;
         break;
-    case PCH_PIC_INT_EDGE_LO:
-        val = (uint32_t)s->intedge;
+    case PCH_PIC_AUTO_CTRL0:
+    case PCH_PIC_AUTO_CTRL1:
+        /* PCH PIC connect to EXTIOI always, discard auto_ctrl access */
         break;
-    case PCH_PIC_INT_EDGE_HI:
-        val = s->intedge >> 32;
+    case PCH_PIC_INT_STATUS:
+        val = s->intisr & (~s->int_mask);
         break;
-    case PCH_PIC_HTMSI_EN_LO:
-        val = (uint32_t)s->htmsi_en;
+    case PCH_PIC_INT_POL:
+        val = s->int_polarity;
         break;
-    case PCH_PIC_HTMSI_EN_HI:
-        val = s->htmsi_en >> 32;
+    case PCH_PIC_HTMSI_VEC ... PCH_PIC_HTMSI_VEC_END:
+        val = *(uint64_t *)(s->htmsi_vector + addr - PCH_PIC_HTMSI_VEC);
         break;
-    case PCH_PIC_AUTO_CTRL0_LO:
-    case PCH_PIC_AUTO_CTRL0_HI:
-    case PCH_PIC_AUTO_CTRL1_LO:
-    case PCH_PIC_AUTO_CTRL1_HI:
+    case PCH_PIC_ROUTE_ENTRY ... PCH_PIC_ROUTE_ENTRY_END:
+        val = *(uint64_t *)(s->route_entry + addr - PCH_PIC_ROUTE_ENTRY);
         break;
     default:
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "pch_pic_read: Bad address 0x%"PRIx64"\n", addr);
         break;
     }
 
-    trace_loongarch_pch_pic_low_readw(size, addr, val);
-    return val;
+    return (val >> (offset * 8)) & field_mask;
 }
 
-static uint64_t get_writew_val(uint64_t value, uint32_t target, bool hi)
-{
-    uint64_t mask = 0xffffffff00000000;
-    uint64_t data = target;
-
-    return hi ? (value & ~mask) | (data << 32) : (value & mask) | data;
-}
-
-static void loongarch_pch_pic_low_writew(void *opaque, hwaddr addr,
-                                         uint64_t value, unsigned size)
+static void pch_pic_write(void *opaque, hwaddr addr, uint64_t value,
+                          uint64_t field_mask)
 {
     LoongArchPICCommonState *s = LOONGARCH_PIC_COMMON(opaque);
-    uint32_t offset, old_valid, data = (uint32_t)value;
-    uint64_t old, int_mask;
-    offset = addr & 0xfff;
+    uint32_t offset;
+    uint64_t old, mask, data, *ptemp;
 
-    trace_loongarch_pch_pic_low_writew(size, addr, data);
+    offset = addr & 7;
+    addr -= offset;
+    mask = field_mask << (offset * 8);
+    data = (value & field_mask) << (offset * 8);
+    switch (addr) {
+    case PCH_PIC_INT_MASK:
+        old = s->int_mask;
+        s->int_mask = (old & ~mask) | data;
+        if (old & ~data) {
+            pch_pic_update_irq(s, old & ~data, 1);
+        }
 
-    switch (offset) {
-    case PCH_PIC_INT_MASK_LO:
-        old = s->int_mask;
-        s->int_mask = get_writew_val(old, data, 0);
-        old_valid = (uint32_t)old;
-        if (old_valid & ~data) {
-            pch_pic_update_irq(s, (old_valid & ~data), 1);
-        }
-        if (~old_valid & data) {
-            pch_pic_update_irq(s, (~old_valid & data), 0);
+        if (~old & data) {
+            pch_pic_update_irq(s, ~old & data, 0);
         }
         break;
-    case PCH_PIC_INT_MASK_HI:
-        old = s->int_mask;
-        s->int_mask = get_writew_val(old, data, 1);
-        old_valid = (uint32_t)(old >> 32);
-        int_mask = old_valid & ~data;
-        if (int_mask) {
-            pch_pic_update_irq(s, int_mask << 32, 1);
-        }
-        int_mask = ~old_valid & data;
-        if (int_mask) {
-            pch_pic_update_irq(s, int_mask << 32, 0);
-        }
+    case PCH_PIC_INT_EDGE:
+        s->intedge = (s->intedge & ~mask) | data;
         break;
-    case PCH_PIC_INT_EDGE_LO:
-        s->intedge = get_writew_val(s->intedge, data, 0);
-        break;
-    case PCH_PIC_INT_EDGE_HI:
-        s->intedge = get_writew_val(s->intedge, data, 1);
-        break;
-    case PCH_PIC_INT_CLEAR_LO:
+    case PCH_PIC_INT_CLEAR:
         if (s->intedge & data) {
-            s->intirr &= (~data);
+            s->intirr &= ~data;
             pch_pic_update_irq(s, data, 0);
-            s->intisr &= (~data);
+            s->intisr &= ~data;
         }
         break;
-    case PCH_PIC_INT_CLEAR_HI:
-        value <<= 32;
-        if (s->intedge & value) {
-            s->intirr &= (~value);
-            pch_pic_update_irq(s, value, 0);
-            s->intisr &= (~value);
-        }
+    case PCH_PIC_HTMSI_EN:
+        s->htmsi_en = (s->htmsi_en & ~mask) | data;
         break;
-    case PCH_PIC_HTMSI_EN_LO:
-        s->htmsi_en = get_writew_val(s->htmsi_en, data, 0);
+    case PCH_PIC_AUTO_CTRL0:
+    case PCH_PIC_AUTO_CTRL1:
+        /* Discard auto_ctrl access */
         break;
-    case PCH_PIC_HTMSI_EN_HI:
-        s->htmsi_en = get_writew_val(s->htmsi_en, data, 1);
+    case PCH_PIC_INT_POL:
+        s->int_polarity = (s->int_polarity & ~mask) | data;
         break;
-    case PCH_PIC_AUTO_CTRL0_LO:
-    case PCH_PIC_AUTO_CTRL0_HI:
-    case PCH_PIC_AUTO_CTRL1_LO:
-    case PCH_PIC_AUTO_CTRL1_HI:
+    case PCH_PIC_HTMSI_VEC ... PCH_PIC_HTMSI_VEC_END:
+        ptemp = (uint64_t *)(s->htmsi_vector + addr - PCH_PIC_HTMSI_VEC);
+        *ptemp = (*ptemp & ~mask) | data;
+        break;
+    case PCH_PIC_ROUTE_ENTRY ... PCH_PIC_ROUTE_ENTRY_END:
+        ptemp = (uint64_t *)(s->route_entry + addr - PCH_PIC_ROUTE_ENTRY);
+        *ptemp = (*ptemp & ~mask) | data;
         break;
     default:
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "pch_pic_write: Bad address 0x%"PRIx64"\n", addr);
         break;
     }
 }
 
-static uint64_t loongarch_pch_pic_high_readw(void *opaque, hwaddr addr,
-                                        unsigned size)
+static uint64_t loongarch_pch_pic_read(void *opaque, hwaddr addr,
+                                       unsigned size)
 {
-    LoongArchPICCommonState *s = LOONGARCH_PIC_COMMON(opaque);
     uint64_t val = 0;
-    uint32_t offset = addr & 0xfff;
 
-    switch (offset) {
-    case STATUS_LO_START:
-        val = (uint32_t)(s->intisr & (~s->int_mask));
+    switch (size) {
+    case 1:
+        val = pch_pic_read(opaque, addr, UCHAR_MAX);
         break;
-    case STATUS_HI_START:
-        val = (s->intisr & (~s->int_mask)) >> 32;
+    case 2:
+        val = pch_pic_read(opaque, addr, USHRT_MAX);
         break;
-    case POL_LO_START:
-        val = (uint32_t)s->int_polarity;
+    case 4:
+        val = pch_pic_read(opaque, addr, UINT_MAX);
         break;
-    case POL_HI_START:
-        val = s->int_polarity >> 32;
+    case 8:
+        val = pch_pic_read(opaque, addr, UINT64_MAX);
         break;
     default:
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "loongarch_pch_pic_read: Bad size %d\n", size);
         break;
     }
 
-    trace_loongarch_pch_pic_high_readw(size, addr, val);
+    trace_loongarch_pch_pic_read(size, addr, val);
     return val;
 }
 
-static void loongarch_pch_pic_high_writew(void *opaque, hwaddr addr,
-                                     uint64_t value, unsigned size)
+static void loongarch_pch_pic_write(void *opaque, hwaddr addr,
+                                    uint64_t value, unsigned size)
 {
-    LoongArchPICCommonState *s = LOONGARCH_PIC_COMMON(opaque);
-    uint32_t offset, data = (uint32_t)value;
-    offset = addr & 0xfff;
+    trace_loongarch_pch_pic_write(size, addr, value);
 
-    trace_loongarch_pch_pic_high_writew(size, addr, data);
-
-    switch (offset) {
-    case STATUS_LO_START:
-        s->intisr = get_writew_val(s->intisr, data, 0);
+    switch (size) {
+    case 1:
+        pch_pic_write(opaque, addr, value, UCHAR_MAX);
         break;
-    case STATUS_HI_START:
-        s->intisr = get_writew_val(s->intisr, data, 1);
+    case 2:
+        pch_pic_write(opaque, addr, value, USHRT_MAX);
         break;
-    case POL_LO_START:
-        s->int_polarity = get_writew_val(s->int_polarity, data, 0);
         break;
-    case POL_HI_START:
-        s->int_polarity = get_writew_val(s->int_polarity, data, 1);
+    case 4:
+        pch_pic_write(opaque, addr, value, UINT_MAX);
+        break;
+    case 8:
+        pch_pic_write(opaque, addr, value, UINT64_MAX);
         break;
     default:
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "loongarch_pch_pic_write: Bad size %d\n", size);
         break;
     }
 }
 
-static uint64_t loongarch_pch_pic_readb(void *opaque, hwaddr addr,
-                                        unsigned size)
-{
-    LoongArchPICCommonState *s = LOONGARCH_PIC_COMMON(opaque);
-    uint64_t val = 0;
-    uint32_t offset = (addr & 0xfff) + PCH_PIC_ROUTE_ENTRY_OFFSET;
-    int64_t offset_tmp;
-
-    switch (offset) {
-    case PCH_PIC_HTMSI_VEC_OFFSET ... PCH_PIC_HTMSI_VEC_END:
-        offset_tmp = offset - PCH_PIC_HTMSI_VEC_OFFSET;
-        if (offset_tmp >= 0 && offset_tmp < 64) {
-            val = s->htmsi_vector[offset_tmp];
-        }
-        break;
-    case PCH_PIC_ROUTE_ENTRY_OFFSET ... PCH_PIC_ROUTE_ENTRY_END:
-        offset_tmp = offset - PCH_PIC_ROUTE_ENTRY_OFFSET;
-        if (offset_tmp >= 0 && offset_tmp < 64) {
-            val = s->route_entry[offset_tmp];
-        }
-        break;
-    default:
-        break;
-    }
-
-    trace_loongarch_pch_pic_readb(size, addr, val);
-    return val;
-}
-
-static void loongarch_pch_pic_writeb(void *opaque, hwaddr addr,
-                                     uint64_t data, unsigned size)
-{
-    LoongArchPICCommonState *s = LOONGARCH_PIC_COMMON(opaque);
-    int32_t offset_tmp;
-    uint32_t offset = (addr & 0xfff) + PCH_PIC_ROUTE_ENTRY_OFFSET;
-
-    trace_loongarch_pch_pic_writeb(size, addr, data);
-
-    switch (offset) {
-    case PCH_PIC_HTMSI_VEC_OFFSET ... PCH_PIC_HTMSI_VEC_END:
-        offset_tmp = offset - PCH_PIC_HTMSI_VEC_OFFSET;
-        if (offset_tmp >= 0 && offset_tmp < 64) {
-            s->htmsi_vector[offset_tmp] = (uint8_t)(data & 0xff);
-        }
-        break;
-    case PCH_PIC_ROUTE_ENTRY_OFFSET ... PCH_PIC_ROUTE_ENTRY_END:
-        offset_tmp = offset - PCH_PIC_ROUTE_ENTRY_OFFSET;
-        if (offset_tmp >= 0 && offset_tmp < 64) {
-            s->route_entry[offset_tmp] = (uint8_t)(data & 0xff);
-        }
-        break;
-    default:
-        break;
-    }
-}
-
-static const MemoryRegionOps loongarch_pch_pic_reg32_low_ops = {
-    .read = loongarch_pch_pic_low_readw,
-    .write = loongarch_pch_pic_low_writew,
-    .valid = {
-        .min_access_size = 4,
-        .max_access_size = 8,
-    },
-    .impl = {
-        .min_access_size = 4,
-        .max_access_size = 4,
-    },
-    .endianness = DEVICE_LITTLE_ENDIAN,
-};
-
-static const MemoryRegionOps loongarch_pch_pic_reg32_high_ops = {
-    .read = loongarch_pch_pic_high_readw,
-    .write = loongarch_pch_pic_high_writew,
-    .valid = {
-        .min_access_size = 4,
-        .max_access_size = 8,
-    },
-    .impl = {
-        .min_access_size = 4,
-        .max_access_size = 4,
-    },
-    .endianness = DEVICE_LITTLE_ENDIAN,
-};
-
-static const MemoryRegionOps loongarch_pch_pic_reg8_ops = {
-    .read = loongarch_pch_pic_readb,
-    .write = loongarch_pch_pic_writeb,
+static const MemoryRegionOps loongarch_pch_pic_ops = {
+    .read = loongarch_pch_pic_read,
+    .write = loongarch_pch_pic_write,
     .valid = {
         .min_access_size = 1,
-        .max_access_size = 1,
+        .max_access_size = 8,
+        /*
+         * PCH PIC device would not work correctly if the guest was doing
+         * unaligned access. This might not be a limitation on the real
+         * device but in practice there is no reason for a guest to access
+         * this device unaligned.
+         */
+        .unaligned = false,
     },
     .impl = {
         .min_access_size = 1,
-        .max_access_size = 1,
+        .max_access_size = 8,
     },
     .endianness = DEVICE_LITTLE_ENDIAN,
 };
@@ -378,18 +275,10 @@
 
     qdev_init_gpio_out(dev, s->parent_irq, s->irq_num);
     qdev_init_gpio_in(dev, pch_pic_irq_handler, s->irq_num);
-    memory_region_init_io(&s->iomem32_low, OBJECT(dev),
-                          &loongarch_pch_pic_reg32_low_ops,
-                          s, PCH_PIC_NAME(.reg32_part1), 0x100);
-    memory_region_init_io(&s->iomem8, OBJECT(dev), &loongarch_pch_pic_reg8_ops,
-                          s, PCH_PIC_NAME(.reg8), 0x2a0);
-    memory_region_init_io(&s->iomem32_high, OBJECT(dev),
-                          &loongarch_pch_pic_reg32_high_ops,
-                          s, PCH_PIC_NAME(.reg32_part2), 0xc60);
-    sysbus_init_mmio(sbd, &s->iomem32_low);
-    sysbus_init_mmio(sbd, &s->iomem8);
-    sysbus_init_mmio(sbd, &s->iomem32_high);
-
+    memory_region_init_io(&s->iomem, OBJECT(dev),
+                          &loongarch_pch_pic_ops,
+                          s, TYPE_LOONGARCH_PIC, VIRT_PCH_REG_SIZE);
+    sysbus_init_mmio(sbd, &s->iomem);
 }
 
 static void loongarch_pic_class_init(ObjectClass *klass, const void *data)
diff --git a/hw/intc/loongarch_pic_common.c b/hw/intc/loongarch_pic_common.c
index 6dccacc..de17050 100644
--- a/hw/intc/loongarch_pic_common.c
+++ b/hw/intc/loongarch_pic_common.c
@@ -49,6 +49,19 @@
     LoongArchPICCommonState *s = LOONGARCH_PIC_COMMON(obj);
     int i;
 
+    /*
+     * With Loongson 7A1000 user manual
+     * Chapter 5.2 "Description of Interrupt-related Registers"
+     *
+     * Interrupt controller identification register 1
+     *   Bit 24-31 Interrupt Controller ID
+     * Interrupt controller identification register 2
+     *   Bit  0-7  Interrupt Controller version number
+     *   Bit 16-23 The number of interrupt sources supported
+     */
+    s->id.desc.id = PCH_PIC_INT_ID_VAL;
+    s->id.desc.version = PCH_PIC_INT_ID_VER;
+    s->id.desc.irq_num = s->irq_num - 1;
     s->int_mask = UINT64_MAX;
     s->htmsi_en = 0x0;
     s->intedge  = 0x0;
diff --git a/hw/intc/trace-events b/hw/intc/trace-events
index 0ba9a02..334aa6a 100644
--- a/hw/intc/trace-events
+++ b/hw/intc/trace-events
@@ -314,12 +314,8 @@
 loongson_ipi_write(unsigned size, uint64_t addr, uint64_t val) "size: %u addr: 0x%"PRIx64 "val: 0x%"PRIx64
 # loongarch_pch_pic.c
 loongarch_pch_pic_irq_handler(int irq, int level) "irq %d level %d"
-loongarch_pch_pic_low_readw(unsigned size, uint64_t addr, uint64_t val) "size: %u addr: 0x%"PRIx64 "val: 0x%" PRIx64
-loongarch_pch_pic_low_writew(unsigned size, uint64_t addr, uint64_t val) "size: %u addr: 0x%"PRIx64 "val: 0x%" PRIx64
-loongarch_pch_pic_high_readw(unsigned size, uint64_t addr, uint64_t val) "size: %u addr: 0x%"PRIx64 "val: 0x%" PRIx64
-loongarch_pch_pic_high_writew(unsigned size, uint64_t addr, uint64_t val) "size: %u addr: 0x%"PRIx64 "val: 0x%" PRIx64
-loongarch_pch_pic_readb(unsigned size, uint64_t addr, uint64_t val) "size: %u addr: 0x%"PRIx64 "val: 0x%" PRIx64
-loongarch_pch_pic_writeb(unsigned size, uint64_t addr, uint64_t val) "size: %u addr: 0x%"PRIx64 "val: 0x%" PRIx64
+loongarch_pch_pic_read(unsigned size, uint64_t addr, uint64_t val) "size: %u addr: 0x%"PRIx64 "val: 0x%" PRIx64
+loongarch_pch_pic_write(unsigned size, uint64_t addr, uint64_t val) "size: %u addr: 0x%"PRIx64 "val: 0x%" PRIx64
 
 # loongarch_pch_msi.c
 loongarch_msi_set_irq(int irq_num) "set msi irq %d"
diff --git a/hw/loongarch/boot.c b/hw/loongarch/boot.c
index 0324d6a..9b6292e 100644
--- a/hw/loongarch/boot.c
+++ b/hw/loongarch/boot.c
@@ -235,6 +235,45 @@
     return size;
 }
 
+static ram_addr_t alloc_initrd_memory(struct loongarch_boot_info *info,
+                uint64_t advice_start, ssize_t rd_size)
+{
+    hwaddr base, ram_size, gap, low_end;
+    ram_addr_t initrd_end, initrd_start;
+
+    base = VIRT_LOWMEM_BASE;
+    gap = VIRT_LOWMEM_SIZE;
+    initrd_start = advice_start;
+    initrd_end = initrd_start + rd_size;
+
+    ram_size = info->ram_size;
+    low_end = base + MIN(ram_size, gap);
+    if (initrd_end <= low_end) {
+        return initrd_start;
+    }
+
+    if (ram_size <= gap) {
+        error_report("The low memory too small for initial ram disk '%s',"
+             "You need to expand the ram",
+             info->initrd_filename);
+        exit(1);
+    }
+
+    /*
+     * Try to load initrd in the high memory
+     */
+    ram_size -= gap;
+    initrd_start = VIRT_HIGHMEM_BASE;
+    if (rd_size <= ram_size) {
+        return initrd_start;
+    }
+
+    error_report("The high memory too small for initial ram disk '%s',"
+         "You need to expand the ram",
+         info->initrd_filename);
+    exit(1);
+}
+
 static int64_t load_kernel_info(struct loongarch_boot_info *info)
 {
     uint64_t kernel_entry, kernel_low, kernel_high;
@@ -263,15 +302,10 @@
         initrd_size = get_image_size(info->initrd_filename);
         if (initrd_size > 0) {
             initrd_offset = ROUND_UP(kernel_high + 4 * kernel_size, 64 * KiB);
-
-            if (initrd_offset + initrd_size > info->ram_size) {
-                error_report("memory too small for initial ram disk '%s'",
-                             info->initrd_filename);
-                exit(1);
-            }
-
-            initrd_size = load_image_targphys(info->initrd_filename, initrd_offset,
-                                              info->ram_size - initrd_offset);
+            initrd_offset = alloc_initrd_memory(info, initrd_offset,
+                                                initrd_size);
+            initrd_size = load_image_targphys(info->initrd_filename,
+                                              initrd_offset, initrd_size);
         }
 
         if (initrd_size == (target_ulong)-1) {
diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c
index 7ad7fb6..1b50404 100644
--- a/hw/loongarch/virt.c
+++ b/hw/loongarch/virt.c
@@ -429,12 +429,6 @@
     sysbus_realize_and_unref(d, &error_fatal);
     memory_region_add_subregion(get_system_memory(), VIRT_IOAPIC_REG_BASE,
                             sysbus_mmio_get_region(d, 0));
-    memory_region_add_subregion(get_system_memory(),
-                            VIRT_IOAPIC_REG_BASE + PCH_PIC_ROUTE_ENTRY_OFFSET,
-                            sysbus_mmio_get_region(d, 1));
-    memory_region_add_subregion(get_system_memory(),
-                            VIRT_IOAPIC_REG_BASE + PCH_PIC_INT_STATUS_LO,
-                            sysbus_mmio_get_region(d, 2));
 
     /* Connect pch_pic irqs to extioi */
     for (i = 0; i < num; i++) {
diff --git a/include/block/block_int-global-state.h b/include/block/block_int-global-state.h
index eb2d92a..0d93783 100644
--- a/include/block/block_int-global-state.h
+++ b/include/block/block_int-global-state.h
@@ -179,6 +179,7 @@
  *        all ".has_*" fields are ignored.
  * @on_source_error: The action to take upon error reading from the source.
  * @on_target_error: The action to take upon error writing to the target.
+ * @on_cbw_error: The action to take upon error in copy-before-write operations.
  * @creation_flags: Flags that control the behavior of the Job lifetime.
  *                  See @BlockJobCreateFlags
  * @cb: Completion function for the job.
@@ -198,6 +199,7 @@
                             BackupPerf *perf,
                             BlockdevOnError on_source_error,
                             BlockdevOnError on_target_error,
+                            OnCbwError on_cbw_error,
                             int creation_flags,
                             BlockCompletionFunc *cb, void *opaque,
                             JobTxn *txn, Error **errp);
diff --git a/include/hw/intc/loongarch_pic_common.h b/include/hw/intc/loongarch_pic_common.h
index d301377..9349a05 100644
--- a/include/hw/intc/loongarch_pic_common.h
+++ b/include/hw/intc/loongarch_pic_common.h
@@ -10,44 +10,43 @@
 #include "hw/pci-host/ls7a.h"
 #include "hw/sysbus.h"
 
-#define PCH_PIC_INT_ID_VAL              0x7000000UL
-#define PCH_PIC_INT_ID_VER              0x1UL
-#define PCH_PIC_INT_ID_LO               0x00
-#define PCH_PIC_INT_ID_HI               0x04
-#define PCH_PIC_INT_MASK_LO             0x20
-#define PCH_PIC_INT_MASK_HI             0x24
-#define PCH_PIC_HTMSI_EN_LO             0x40
-#define PCH_PIC_HTMSI_EN_HI             0x44
-#define PCH_PIC_INT_EDGE_LO             0x60
-#define PCH_PIC_INT_EDGE_HI             0x64
-#define PCH_PIC_INT_CLEAR_LO            0x80
-#define PCH_PIC_INT_CLEAR_HI            0x84
-#define PCH_PIC_AUTO_CTRL0_LO           0xc0
-#define PCH_PIC_AUTO_CTRL0_HI           0xc4
-#define PCH_PIC_AUTO_CTRL1_LO           0xe0
-#define PCH_PIC_AUTO_CTRL1_HI           0xe4
-#define PCH_PIC_ROUTE_ENTRY_OFFSET      0x100
+#define PCH_PIC_INT_ID                  0x00
+#define  PCH_PIC_INT_ID_VAL             0x7
+#define  PCH_PIC_INT_ID_VER             0x1
+#define PCH_PIC_INT_MASK                0x20
+#define PCH_PIC_HTMSI_EN                0x40
+#define PCH_PIC_INT_EDGE                0x60
+#define PCH_PIC_INT_CLEAR               0x80
+#define PCH_PIC_AUTO_CTRL0              0xc0
+#define PCH_PIC_AUTO_CTRL1              0xe0
+#define PCH_PIC_ROUTE_ENTRY             0x100
 #define PCH_PIC_ROUTE_ENTRY_END         0x13f
-#define PCH_PIC_HTMSI_VEC_OFFSET        0x200
+#define PCH_PIC_HTMSI_VEC               0x200
 #define PCH_PIC_HTMSI_VEC_END           0x23f
-#define PCH_PIC_INT_STATUS_LO           0x3a0
-#define PCH_PIC_INT_STATUS_HI           0x3a4
-#define PCH_PIC_INT_POL_LO              0x3e0
-#define PCH_PIC_INT_POL_HI              0x3e4
-
-#define STATUS_LO_START                 0
-#define STATUS_HI_START                 0x4
-#define POL_LO_START                    0x40
-#define POL_HI_START                    0x44
+#define PCH_PIC_INT_STATUS              0x3a0
+#define PCH_PIC_INT_POL                 0x3e0
 
 #define TYPE_LOONGARCH_PIC_COMMON "loongarch_pic_common"
 OBJECT_DECLARE_TYPE(LoongArchPICCommonState,
                     LoongArchPICCommonClass, LOONGARCH_PIC_COMMON)
 
+union LoongArchPIC_ID {
+    struct {
+        uint8_t _reserved_0[3];
+        uint8_t id;
+        uint8_t version;
+        uint8_t _reserved_1;
+        uint8_t irq_num;
+        uint8_t _reserved_2;
+    } QEMU_PACKED desc;
+    uint64_t data;
+};
+
 struct LoongArchPICCommonState {
     SysBusDevice parent_obj;
 
     qemu_irq parent_irq[64];
+    union LoongArchPIC_ID id; /* 0x00  interrupt ID register */
     uint64_t int_mask;        /* 0x020 interrupt mask register */
     uint64_t htmsi_en;        /* 0x040 1=msi */
     uint64_t intedge;         /* 0x060 edge=1 level=0 */
@@ -66,9 +65,7 @@
     uint8_t route_entry[64];  /* 0x100 - 0x138 */
     uint8_t htmsi_vector[64]; /* 0x200 - 0x238 */
 
-    MemoryRegion iomem32_low;
-    MemoryRegion iomem32_high;
-    MemoryRegion iomem8;
+    MemoryRegion iomem;
     unsigned int irq_num;
 };
 
diff --git a/linux-headers/asm-x86/kvm.h b/linux-headers/asm-x86/kvm.h
index dc591fb..7fb57cc 100644
--- a/linux-headers/asm-x86/kvm.h
+++ b/linux-headers/asm-x86/kvm.h
@@ -439,6 +439,7 @@
 #define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS	(1 << 6)
 #define KVM_X86_QUIRK_SLOT_ZAP_ALL		(1 << 7)
 #define KVM_X86_QUIRK_STUFF_FEATURE_MSRS	(1 << 8)
+#define KVM_X86_QUIRK_IGNORE_GUEST_PAT		(1 << 9)
 
 #define KVM_STATE_NESTED_FORMAT_VMX	0
 #define KVM_STATE_NESTED_FORMAT_SVM	1
@@ -928,4 +929,74 @@
 #define KVM_X86_SNP_VM		4
 #define KVM_X86_TDX_VM		5
 
+/* Trust Domain eXtension sub-ioctl() commands. */
+enum kvm_tdx_cmd_id {
+	KVM_TDX_CAPABILITIES = 0,
+	KVM_TDX_INIT_VM,
+	KVM_TDX_INIT_VCPU,
+	KVM_TDX_INIT_MEM_REGION,
+	KVM_TDX_FINALIZE_VM,
+	KVM_TDX_GET_CPUID,
+
+	KVM_TDX_CMD_NR_MAX,
+};
+
+struct kvm_tdx_cmd {
+	/* enum kvm_tdx_cmd_id */
+	__u32 id;
+	/* flags for sub-commend. If sub-command doesn't use this, set zero. */
+	__u32 flags;
+	/*
+	 * data for each sub-command. An immediate or a pointer to the actual
+	 * data in process virtual address.  If sub-command doesn't use it,
+	 * set zero.
+	 */
+	__u64 data;
+	/*
+	 * Auxiliary error code.  The sub-command may return TDX SEAMCALL
+	 * status code in addition to -Exxx.
+	 */
+	__u64 hw_error;
+};
+
+struct kvm_tdx_capabilities {
+	__u64 supported_attrs;
+	__u64 supported_xfam;
+	__u64 reserved[254];
+
+	/* Configurable CPUID bits for userspace */
+	struct kvm_cpuid2 cpuid;
+};
+
+struct kvm_tdx_init_vm {
+	__u64 attributes;
+	__u64 xfam;
+	__u64 mrconfigid[6];	/* sha384 digest */
+	__u64 mrowner[6];	/* sha384 digest */
+	__u64 mrownerconfig[6];	/* sha384 digest */
+
+	/* The total space for TD_PARAMS before the CPUIDs is 256 bytes */
+	__u64 reserved[12];
+
+	/*
+	 * Call KVM_TDX_INIT_VM before vcpu creation, thus before
+	 * KVM_SET_CPUID2.
+	 * This configuration supersedes KVM_SET_CPUID2s for VCPUs because the
+	 * TDX module directly virtualizes those CPUIDs without VMM.  The user
+	 * space VMM, e.g. qemu, should make KVM_SET_CPUID2 consistent with
+	 * those values.  If it doesn't, KVM may have wrong idea of vCPUIDs of
+	 * the guest, and KVM may wrongly emulate CPUIDs or MSRs that the TDX
+	 * module doesn't virtualize.
+	 */
+	struct kvm_cpuid2 cpuid;
+};
+
+#define KVM_TDX_MEASURE_MEMORY_REGION   _BITULL(0)
+
+struct kvm_tdx_init_mem_region {
+	__u64 source_addr;
+	__u64 gpa;
+	__u64 nr_pages;
+};
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index e5f3e8b..99cc82a 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -369,6 +369,7 @@
 #define KVM_SYSTEM_EVENT_WAKEUP         4
 #define KVM_SYSTEM_EVENT_SUSPEND        5
 #define KVM_SYSTEM_EVENT_SEV_TERM       6
+#define KVM_SYSTEM_EVENT_TDX_FATAL      7
 			__u32 type;
 			__u32 ndata;
 			union {
diff --git a/meson.build b/meson.build
index e819a70..7f91500 100644
--- a/meson.build
+++ b/meson.build
@@ -247,6 +247,8 @@
 have_vhost_net_kernel = have_vhost_kernel and get_option('vhost_net').allowed()
 have_vhost_net = have_vhost_net_kernel or have_vhost_net_user or have_vhost_net_vdpa
 
+have_tcg = get_option('tcg').allowed() and (have_system or have_user)
+
 have_tools = get_option('tools') \
   .disable_auto_if(not have_system) \
   .allowed()
@@ -863,7 +865,7 @@
             cc.find_library('network'),
             cc.find_library('bsd')]
 elif host_os == 'openbsd'
-  if get_option('tcg').allowed() and target_dirs.length() > 0
+  if have_tcg
     # Disable OpenBSD W^X if available
     emulator_link_args = cc.get_supported_link_arguments('-Wl,-z,wxneeded')
   endif
@@ -904,7 +906,7 @@
 endif
 
 tcg_arch = host_arch
-if get_option('tcg').allowed()
+if have_tcg
   if host_arch == 'unknown'
     if not get_option('tcg_interpreter')
       error('Unsupported CPU @0@, try --enable-tcg-interpreter'.format(cpu))
@@ -2534,7 +2536,7 @@
 config_host_data.set('CONFIG_SLIRP', slirp.found())
 config_host_data.set('CONFIG_SNAPPY', snappy.found())
 config_host_data.set('CONFIG_SOLARIS', host_os == 'sunos')
-if get_option('tcg').allowed()
+if have_tcg
   config_host_data.set('CONFIG_TCG', 1)
   config_host_data.set('CONFIG_TCG_INTERPRETER', tcg_arch == 'tci')
 endif
@@ -3097,22 +3099,16 @@
                      cc.has_header_symbol('asm/hwprobe.h',
                                           'RISCV_HWPROBE_EXT_ZBA'))
 
-config_host_data.set('CONFIG_AVX2_OPT', get_option('avx2') \
-  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX2') \
-  .require(cc.links('''
-    #include <cpuid.h>
+if have_cpuid_h
+  have_avx2 = cc.links('''
     #include <immintrin.h>
     static int __attribute__((target("avx2"))) bar(void *a) {
       __m256i x = *(__m256i *)a;
       return _mm256_testz_si256(x, x);
     }
     int main(int argc, char *argv[]) { return bar(argv[argc - 1]); }
-  '''), error_message: 'AVX2 not available').allowed())
-
-config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
-  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512BW') \
-  .require(cc.links('''
-    #include <cpuid.h>
+  ''')
+  have_avx512bw = cc.links('''
     #include <immintrin.h>
     static int __attribute__((target("avx512bw"))) bar(void *a) {
       __m512i *x = a;
@@ -3120,7 +3116,21 @@
       return res[1];
     }
     int main(int argc, char *argv[]) { return bar(argv[0]); }
-  '''), error_message: 'AVX512BW not available').allowed())
+  ''')
+  if get_option('x86_version') >= '3' and not have_avx2
+    error('Cannot enable AVX optimizations due to missing intrinsics')
+  elif get_option('x86_version') >= '4' and not have_avx512bw
+    error('Cannot enable AVX512 optimizations due to missing intrinsics')
+  endif
+else
+  have_avx2 = false
+  have_avx512bw = false
+  if get_option('x86_version') >= '3'
+    error('Cannot enable AVX optimizations due to missing cpuid.h')
+  endif
+endif
+config_host_data.set('CONFIG_AVX2_OPT', have_avx2)
+config_host_data.set('CONFIG_AVX512BW_OPT', have_avx512bw)
 
 # For both AArch64 and AArch32, detect if builtins are available.
 config_host_data.set('CONFIG_ARM_AES_BUILTIN', cc.compiles('''
@@ -3893,16 +3903,11 @@
                     install: true,
                     install_dir: qemu_moddir)
       if module_ss.sources() != []
-        # FIXME: Should use sl.extract_all_objects(recursive: true) as
-        # input. Sources can be used multiple times but objects are
-        # unique when it comes to lookup in compile_commands.json.
-        # Depnds on a mesion version with
-        # https://github.com/mesonbuild/meson/pull/8900
         modinfo_files += custom_target(d + '-' + m + '.modinfo',
                                        output: d + '-' + m + '.modinfo',
-                                       input: module_ss.sources() + genh,
+                                       input: sl.extract_all_objects(recursive: true),
                                        capture: true,
-                                       command: [modinfo_collect, module_ss.sources()])
+                                       command: [modinfo_collect, '@INPUT@'])
       endif
     else
       if d == 'block'
@@ -3941,12 +3946,11 @@
                     dependencies: target_module_ss.dependencies(),
                     install: true,
                     install_dir: qemu_moddir)
-            # FIXME: Should use sl.extract_all_objects(recursive: true) too.
             modinfo_files += custom_target(module_name + '.modinfo',
                                            output: module_name + '.modinfo',
-                                           input: target_module_ss.sources() + genh,
+                                           input: sl.extract_all_objects(recursive: true),
                                            capture: true,
-                                           command: [modinfo_collect, '--target', target, target_module_ss.sources()])
+                                           command: [modinfo_collect, '--target', target, '@INPUT@'])
           endif
         endif
       endforeach
@@ -4951,7 +4955,7 @@
   message('compile or work on this host CPU. You can help by volunteering')
   message('to maintain it and providing a build host for our continuous')
   message('integration setup.')
-  if get_option('tcg').allowed() and target_dirs.length() > 0
+  if have_tcg
     message()
     message('configure has succeeded and you can continue to build, but')
     message('QEMU will use a slow interpreter to emulate the target CPU.')
diff --git a/meson_options.txt b/meson_options.txt
index cc66b46..a442be2 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -123,10 +123,6 @@
 option('membarrier', type: 'feature', value: 'disabled',
        description: 'membarrier system call (for Linux 4.14+ or Windows')
 
-option('avx2', type: 'feature', value: 'auto',
-       description: 'AVX2 optimizations')
-option('avx512bw', type: 'feature', value: 'auto',
-       description: 'AVX512BW optimizations')
 option('keyring', type: 'feature', value: 'auto',
        description: 'Linux keyring support')
 option('libkeyutils', type: 'feature', value: 'auto',
diff --git a/qapi/block-core.json b/qapi/block-core.json
index b193778..91c70e2 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -1602,6 +1602,9 @@
 #     default 'report' (no limitations, since this applies to a
 #     different block device than @device).
 #
+# @on-cbw-error: policy defining behavior on I/O errors in
+#     copy-before-write jobs; defaults to break-guest-write.  (Since 10.1)
+#
 # @auto-finalize: When false, this job will wait in a PENDING state
 #     after it has finished its work, waiting for @block-job-finalize
 #     before making any block graph changes.  When true, this job will
@@ -1641,6 +1644,7 @@
             '*compress': 'bool',
             '*on-source-error': 'BlockdevOnError',
             '*on-target-error': 'BlockdevOnError',
+            '*on-cbw-error': 'OnCbwError',
             '*auto-finalize': 'bool', '*auto-dismiss': 'bool',
             '*filter-node-name': 'str',
             '*discard-source': 'bool',
@@ -2956,18 +2960,24 @@
 #
 # Pause an active background block operation.
 #
-# This command returns immediately after marking the active background
-# block operation for pausing.  It is an error to call this command if
-# no operation is in progress or if the job is already paused.
+# This command returns immediately after marking the active job for
+# pausing.  Pausing an already paused job is an error.
 #
-# The operation will pause as soon as possible.  No event is emitted
-# when the operation is actually paused.  Cancelling a paused job
-# automatically resumes it.
+# The job will pause as soon as possible, which means transitioning
+# into the PAUSED state if it was RUNNING, or into STANDBY if it was
+# READY.  The corresponding JOB_STATUS_CHANGE event will be emitted.
+#
+# Cancelling a paused job automatically resumes it.
 #
 # @device: The job identifier.  This used to be a device name (hence
 #     the name of the parameter), but since QEMU 2.7 it can have other
 #     values.
 #
+# Features:
+#
+# @deprecated: This command is deprecated.  Use @job-pause
+#     instead.
+#
 # Errors:
 #     - If no background operation is active on this device,
 #       DeviceNotActive
@@ -2975,6 +2985,7 @@
 # Since: 1.3
 ##
 { 'command': 'block-job-pause', 'data': { 'device': 'str' },
+  'features': ['deprecated'],
   'allow-preconfig': true }
 
 ##
@@ -2982,9 +2993,8 @@
 #
 # Resume an active background block operation.
 #
-# This command returns immediately after resuming a paused background
-# block operation.  It is an error to call this command if no
-# operation is in progress or if the job is not paused.
+# This command returns immediately after resuming a paused job.
+# Resuming an already running job is an error.
 #
 # This command also clears the error status of the job.
 #
@@ -2992,6 +3002,11 @@
 #     the name of the parameter), but since QEMU 2.7 it can have other
 #     values.
 #
+# Features:
+#
+# @deprecated: This command is deprecated.  Use @job-resume
+#     instead.
+#
 # Errors:
 #     - If no background operation is active on this device,
 #       DeviceNotActive
@@ -2999,15 +3014,21 @@
 # Since: 1.3
 ##
 { 'command': 'block-job-resume', 'data': { 'device': 'str' },
+  'features': ['deprecated'],
   'allow-preconfig': true }
 
 ##
 # @block-job-complete:
 #
-# Manually trigger completion of an active background block operation.
-# This is supported for drive mirroring, where it also switches the
-# device to write to the target path only.  The ability to complete is
-# signaled with a BLOCK_JOB_READY event.
+# Manually trigger completion of an active job in the READY or STANDBY
+# state.  Completing the job in any other state is an error.
+#
+# This is supported only for drive mirroring, where it also switches
+# the device to write to the target path only. Note that drive
+# mirroring includes drive-mirror, blockdev-mirror and block-commit
+# job (only in case of "active commit", when the node being commited
+# is used by the guest). The ability to complete is signaled with a
+# BLOCK_JOB_READY event.
 #
 # This command completes an active background block operation
 # synchronously.  The ordering of this command's return with the
@@ -3017,12 +3038,15 @@
 # rerror/werror arguments that were specified when starting the
 # operation.
 #
-# A cancelled or paused job cannot be completed.
-#
 # @device: The job identifier.  This used to be a device name (hence
 #     the name of the parameter), but since QEMU 2.7 it can have other
 #     values.
 #
+# Features:
+#
+# @deprecated: This command is deprecated.  Use @job-complete
+#     instead.
+#
 # Errors:
 #     - If no background operation is active on this device,
 #       DeviceNotActive
@@ -3030,15 +3054,19 @@
 # Since: 1.3
 ##
 { 'command': 'block-job-complete', 'data': { 'device': 'str' },
+  'features': ['deprecated'],
   'allow-preconfig': true }
 
 ##
 # @block-job-dismiss:
 #
-# For jobs that have already concluded, remove them from the
-# block-job-query list.  This command only needs to be run for jobs
-# which were started with QEMU 2.12+ job lifetime management
-# semantics.
+# Deletes a job that is in the CONCLUDED state.  This command only
+# needs to be run explicitly for jobs that don't have automatic
+# dismiss enabled. In turn, automatic dismiss may be enabled only
+# for jobs that have @auto-dismiss option, which are drive-backup,
+# blockdev-backup, drive-mirror, blockdev-mirror, block-commit and
+# block-stream. @auto-dismiss is enabled by default for these
+# jobs.
 #
 # This command will refuse to operate on any job that has not yet
 # reached its terminal state, JOB_STATUS_CONCLUDED.  For jobs that
@@ -3047,26 +3075,43 @@
 #
 # @id: The job identifier.
 #
+# Features:
+#
+# @deprecated: This command is deprecated.  Use @job-dismiss
+#     instead.
+#
 # Since: 2.12
 ##
 { 'command': 'block-job-dismiss', 'data': { 'id': 'str' },
+  'features': ['deprecated'],
   'allow-preconfig': true }
 
 ##
 # @block-job-finalize:
 #
-# Once a job that has manual=true reaches the pending state, it can be
-# instructed to finalize any graph changes and do any necessary
-# cleanup via this command.  For jobs in a transaction, instructing
-# one job to finalize will force ALL jobs in the transaction to
-# finalize, so it is only necessary to instruct a single member job to
-# finalize.
+# Instructs all jobs in a transaction (or a single job if it is not
+# part of any transaction) to finalize any graph changes and do any
+# necessary cleanup.  This command requires that all involved jobs are
+# in the PENDING state.
+#
+# For jobs in a transaction, instructing one job to finalize will
+# force ALL jobs in the transaction to finalize, so it is only
+# necessary to instruct a single member job to finalize.
+#
+# The command is applicable only to jobs which have @auto-finalize option
+# and only when this option is set to false.
 #
 # @id: The job identifier.
 #
+# Features:
+#
+# @deprecated: This command is deprecated.  Use @job-finalize
+#     instead.
+#
 # Since: 2.12
 ##
 { 'command': 'block-job-finalize', 'data': { 'id': 'str' },
+  'features': ['deprecated'],
   'allow-preconfig': true }
 
 ##
diff --git a/qapi/job.json b/qapi/job.json
index cfc3bee..b03f80b 100644
--- a/qapi/job.json
+++ b/qapi/job.json
@@ -156,6 +156,9 @@
 # This command returns immediately after resuming a paused job.
 # Resuming an already running job is an error.
 #
+# This command also clears the error status for block-jobs (stream,
+# commit, mirror, backup).
+#
 # @id: The job identifier.
 #
 # Since: 3.0
@@ -184,7 +187,23 @@
 ##
 # @job-complete:
 #
-# Manually trigger completion of an active job in the READY state.
+# Manually trigger completion of an active job in the READY or STANDBY
+# state.  Completing the job in any other state is an error.
+#
+# This is supported only for drive mirroring, where it also switches
+# the device to write to the target path only. Note that drive
+# mirroring includes drive-mirror, blockdev-mirror and block-commit
+# job (only in case of "active commit", when the node being commited
+# is used by the guest). The ability to complete is signaled with a
+# BLOCK_JOB_READY event.
+#
+# This command completes an active background block operation
+# synchronously.  The ordering of this command's return with the
+# BLOCK_JOB_COMPLETED event is not defined.  Note that if an I/O error
+# occurs during the processing of this command: 1) the command itself
+# will fail; 2) the error will be processed according to the
+# rerror/werror arguments that were specified when starting the
+# operation.
 #
 # @id: The job identifier.
 #
@@ -197,7 +216,11 @@
 #
 # Deletes a job that is in the CONCLUDED state.  This command only
 # needs to be run explicitly for jobs that don't have automatic
-# dismiss enabled.
+# dismiss enabled. In turn, automatic dismiss may be enabled only
+# for jobs that have @auto-dismiss option, which are drive-backup,
+# blockdev-backup, drive-mirror, blockdev-mirror, block-commit and
+# block-stream. @auto-dismiss is enabled by default for these
+# jobs.
 #
 # This command will refuse to operate on any job that has not yet
 # reached its terminal state, JOB_STATUS_CONCLUDED.  For jobs that
@@ -222,6 +245,9 @@
 # force ALL jobs in the transaction to finalize, so it is only
 # necessary to instruct a single member job to finalize.
 #
+# The command is applicable only to jobs which have @auto-finalize option
+# and only when this option is set to false.
+#
 # @id: The identifier of any job in the transaction, or of a job that
 #     is not part of any transaction.
 #
diff --git a/rust/hw/char/pl011/src/device.rs b/rust/hw/char/pl011/src/device.rs
index 7c563ad..bde3be6 100644
--- a/rust/hw/char/pl011/src/device.rs
+++ b/rust/hw/char/pl011/src/device.rs
@@ -329,7 +329,7 @@ fn loopback_tx(&mut self, value: registers::Data) -> bool {
         // hardware flow-control is enabled.
         //
         // For simplicity, the above described is not emulated.
-        self.loopback_enabled() && self.put_fifo(value)
+        self.loopback_enabled() && self.fifo_rx_put(value)
     }
 
     #[must_use]
@@ -439,7 +439,7 @@ pub fn fifo_depth(&self) -> u32 {
     }
 
     #[must_use]
-    pub fn put_fifo(&mut self, value: registers::Data) -> bool {
+    pub fn fifo_rx_put(&mut self, value: registers::Data) -> bool {
         let depth = self.fifo_depth();
         assert!(depth > 0);
         let slot = (self.read_pos + self.read_count) & (depth - 1);
@@ -580,19 +580,26 @@ fn write(&self, offset: hwaddr, value: u64, _size: u32) {
     fn can_receive(&self) -> u32 {
         let regs = self.regs.borrow();
         // trace_pl011_can_receive(s->lcr, s->read_count, r);
-        u32::from(regs.read_count < regs.fifo_depth())
+        regs.fifo_depth() - regs.read_count
     }
 
     fn receive(&self, buf: &[u8]) {
-        if buf.is_empty() {
+        let mut regs = self.regs.borrow_mut();
+        if regs.loopback_enabled() {
+            // In loopback mode, the RX input signal is internally disconnected
+            // from the entire receiving logics; thus, all inputs are ignored,
+            // and BREAK detection on RX input signal is also not performed.
             return;
         }
-        let mut regs = self.regs.borrow_mut();
-        let c: u32 = buf[0].into();
-        let update_irq = !regs.loopback_enabled() && regs.put_fifo(c.into());
+
+        let mut update_irq = false;
+        for &c in buf {
+            let c: u32 = c.into();
+            update_irq |= regs.fifo_rx_put(c.into());
+        }
+
         // Release the BqlRefCell before calling self.update()
         drop(regs);
-
         if update_irq {
             self.update();
         }
@@ -602,7 +609,7 @@ fn event(&self, event: Event) {
         let mut update_irq = false;
         let mut regs = self.regs.borrow_mut();
         if event == Event::CHR_EVENT_BREAK && !regs.loopback_enabled() {
-            update_irq = regs.put_fifo(registers::Data::BREAK);
+            update_irq = regs.fifo_rx_put(registers::Data::BREAK);
         }
         // Release the BqlRefCell before calling self.update()
         drop(regs);
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index 8a67a14..f09ef96 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -97,8 +97,6 @@
   printf "%s\n" '  alsa            ALSA sound support'
   printf "%s\n" '  attr            attr/xattr support'
   printf "%s\n" '  auth-pam        PAM access control'
-  printf "%s\n" '  avx2            AVX2 optimizations'
-  printf "%s\n" '  avx512bw        AVX512BW optimizations'
   printf "%s\n" '  blkio           libblkio block device driver'
   printf "%s\n" '  bochs           bochs image format support'
   printf "%s\n" '  bpf             eBPF support'
@@ -244,10 +242,6 @@
     --audio-drv-list=*) quote_sh "-Daudio_drv_list=$2" ;;
     --enable-auth-pam) printf "%s" -Dauth_pam=enabled ;;
     --disable-auth-pam) printf "%s" -Dauth_pam=disabled ;;
-    --enable-avx2) printf "%s" -Davx2=enabled ;;
-    --disable-avx2) printf "%s" -Davx2=disabled ;;
-    --enable-avx512bw) printf "%s" -Davx512bw=enabled ;;
-    --disable-avx512bw) printf "%s" -Davx512bw=disabled ;;
     --enable-gcov) printf "%s" -Db_coverage=true ;;
     --disable-gcov) printf "%s" -Db_coverage=false ;;
     --enable-lto) printf "%s" -Db_lto=true ;;
diff --git a/scripts/modinfo-collect.py b/scripts/modinfo-collect.py
index 4e7584d..48bd92b 100644
--- a/scripts/modinfo-collect.py
+++ b/scripts/modinfo-collect.py
@@ -7,15 +7,6 @@
 import shlex
 import subprocess
 
-def find_command(src, target, compile_commands):
-    for command in compile_commands:
-        if command['file'] != src:
-            continue
-        if target != '' and command['command'].find(target) == -1:
-            continue
-        return command['command']
-    return 'false'
-
 def process_command(src, command):
     skip = False
     out = []
@@ -43,14 +34,22 @@ def main(args):
         print("MODINFO_DEBUG target %s" % target)
         arch = target[:-8] # cut '-softmmu'
         print("MODINFO_START arch \"%s\" MODINFO_END" % arch)
+
     with open('compile_commands.json') as f:
-        compile_commands = json.load(f)
-    for src in args:
+        compile_commands_json = json.load(f)
+    compile_commands = { x['output']: x for x in compile_commands_json }
+
+    for obj in args:
+        entry = compile_commands.get(obj, None)
+        if not entry:
+            sys.stderr.print('modinfo: Could not find object file', obj)
+            sys.exit(1)
+        src = entry['file']
         if not src.endswith('.c'):
             print("MODINFO_DEBUG skip %s" % src)
             continue
+        command = entry['command']
         print("MODINFO_DEBUG src %s" % src)
-        command = find_command(src, target, compile_commands)
         cmdline = process_command(src, command)
         print("MODINFO_DEBUG cmd", cmdline)
         result = subprocess.run(cmdline, stdout = subprocess.PIPE,
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 1ca6307..ec908d7 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -922,6 +922,17 @@
 #define TCG_8000_0008_EBX  (CPUID_8000_0008_EBX_XSAVEERPTR | \
           CPUID_8000_0008_EBX_WBNOINVD | CPUID_8000_0008_EBX_KERNEL_FEATURES)
 
+#if defined CONFIG_USER_ONLY
+#define CPUID_8000_0021_EAX_KERNEL_FEATURES CPUID_8000_0021_EAX_AUTO_IBRS
+#else
+#define CPUID_8000_0021_EAX_KERNEL_FEATURES 0
+#endif
+
+#define TCG_8000_0021_EAX_FEATURES ( \
+            CPUID_8000_0021_EAX_NO_NESTED_DATA_BP | \
+            CPUID_8000_0021_EAX_NULL_SEL_CLR_BASE | \
+            CPUID_8000_0021_EAX_KERNEL_FEATURES)
+
 FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
     [FEAT_1_EDX] = {
         .type = CPUID_FEATURE_WORD,
@@ -1249,7 +1260,7 @@
             "ibpb-brtype", "srso-no", "srso-user-kernel-no", NULL,
         },
         .cpuid = { .eax = 0x80000021, .reg = R_EAX, },
-        .tcg_features = 0,
+        .tcg_features = TCG_8000_0021_EAX_FEATURES,
         .unmigratable_flags = 0,
     },
     [FEAT_8000_0021_EBX] = {
@@ -1372,6 +1383,14 @@
             "bhi-no", NULL, NULL, NULL,
             "pbrsb-no", NULL, "gds-no", "rfds-no",
             "rfds-clear", NULL, NULL, NULL,
+            NULL, NULL, NULL, NULL,
+            NULL, NULL, NULL, NULL,
+            NULL, NULL, NULL, NULL,
+            NULL, NULL, NULL, NULL,
+            NULL, NULL, NULL, NULL,
+            NULL, NULL, NULL, NULL,
+            NULL, NULL, NULL, NULL,
+            NULL, NULL, "its-no", NULL,
         },
         .msr = {
             .index = MSR_IA32_ARCH_CAPABILITIES,
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 4f8ed88..c51e0a4 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1805,11 +1805,6 @@
         CPUCacheInfo *l3_cache;
 } CPUCaches;
 
-typedef struct X86LazyFlags {
-    target_ulong result;
-    target_ulong auxbits;
-} X86LazyFlags;
-
 typedef struct CPUArchState {
     /* standard registers */
     target_ulong regs[CPU_NB_REGS];
@@ -2102,7 +2097,6 @@
     QemuMutex xen_timers_lock;
 #endif
 #if defined(CONFIG_HVF)
-    X86LazyFlags lflags;
     void *emu_mmio_buf;
 #endif
 
diff --git a/target/i386/emulate/x86_decode.c b/target/i386/emulate/x86_decode.c
index 88be947..2eca398 100644
--- a/target/i386/emulate/x86_decode.c
+++ b/target/i386/emulate/x86_decode.c
@@ -109,8 +109,8 @@
 {
     op->type = X86_VAR_REG;
     op->reg = decode->modrm.reg;
-    op->ptr = get_reg_ref(env, op->reg, decode->rex.rex, decode->rex.r,
-                          decode->operand_size);
+    op->regptr = get_reg_ref(env, op->reg, decode->rex.rex, decode->rex.r,
+                             decode->operand_size);
 }
 
 static void decode_rax(CPUX86State *env, struct x86_decode *decode,
@@ -119,8 +119,8 @@
     op->type = X86_VAR_REG;
     op->reg = R_EAX;
     /* Since reg is always AX, REX prefix has no impact. */
-    op->ptr = get_reg_ref(env, op->reg, false, 0,
-                          decode->operand_size);
+    op->regptr = get_reg_ref(env, op->reg, false, 0,
+                             decode->operand_size);
 }
 
 static inline void decode_immediate(CPUX86State *env, struct x86_decode *decode,
@@ -262,16 +262,16 @@
 {
     decode->op[0].type = X86_VAR_REG;
     decode->op[0].reg = decode->opcode[0] - 0x40;
-    decode->op[0].ptr = get_reg_ref(env, decode->op[0].reg, decode->rex.rex,
-                                    decode->rex.b, decode->operand_size);
+    decode->op[0].regptr = get_reg_ref(env, decode->op[0].reg, decode->rex.rex,
+                                       decode->rex.b, decode->operand_size);
 }
 
 static void decode_decgroup(CPUX86State *env, struct x86_decode *decode)
 {
     decode->op[0].type = X86_VAR_REG;
     decode->op[0].reg = decode->opcode[0] - 0x48;
-    decode->op[0].ptr = get_reg_ref(env, decode->op[0].reg, decode->rex.rex,
-                                    decode->rex.b, decode->operand_size);
+    decode->op[0].regptr = get_reg_ref(env, decode->op[0].reg, decode->rex.rex,
+                                       decode->rex.b, decode->operand_size);
 }
 
 static void decode_incgroup2(CPUX86State *env, struct x86_decode *decode)
@@ -287,16 +287,16 @@
 {
     decode->op[0].type = X86_VAR_REG;
     decode->op[0].reg = decode->opcode[0] - 0x50;
-    decode->op[0].ptr = get_reg_ref(env, decode->op[0].reg, decode->rex.rex,
-                                    decode->rex.b, decode->operand_size);
+    decode->op[0].regptr = get_reg_ref(env, decode->op[0].reg, decode->rex.rex,
+                                       decode->rex.b, decode->operand_size);
 }
 
 static void decode_popgroup(CPUX86State *env, struct x86_decode *decode)
 {
     decode->op[0].type = X86_VAR_REG;
     decode->op[0].reg = decode->opcode[0] - 0x58;
-    decode->op[0].ptr = get_reg_ref(env, decode->op[0].reg, decode->rex.rex,
-                                    decode->rex.b, decode->operand_size);
+    decode->op[0].regptr = get_reg_ref(env, decode->op[0].reg, decode->rex.rex,
+                                       decode->rex.b, decode->operand_size);
 }
 
 static void decode_jxx(CPUX86State *env, struct x86_decode *decode)
@@ -377,16 +377,16 @@
 {
     decode->op[0].type = X86_VAR_REG;
     decode->op[0].reg = decode->opcode[0] - 0x90;
-    decode->op[0].ptr = get_reg_ref(env, decode->op[0].reg, decode->rex.rex,
-                                    decode->rex.b, decode->operand_size);
+    decode->op[0].regptr = get_reg_ref(env, decode->op[0].reg, decode->rex.rex,
+                                       decode->rex.b, decode->operand_size);
 }
 
 static void decode_movgroup(CPUX86State *env, struct x86_decode *decode)
 {
     decode->op[0].type = X86_VAR_REG;
     decode->op[0].reg = decode->opcode[0] - 0xb8;
-    decode->op[0].ptr = get_reg_ref(env, decode->op[0].reg, decode->rex.rex,
-                                    decode->rex.b, decode->operand_size);
+    decode->op[0].regptr = get_reg_ref(env, decode->op[0].reg, decode->rex.rex,
+                                       decode->rex.b, decode->operand_size);
     decode_immediate(env, decode, &decode->op[1], decode->operand_size);
 }
 
@@ -394,15 +394,15 @@
                         struct x86_decode_op *op)
 {
     op->type = X86_VAR_OFFSET;
-    op->ptr = decode_bytes(env, decode, decode->addressing_size);
+    op->addr = decode_bytes(env, decode, decode->addressing_size);
 }
 
 static void decode_movgroup8(CPUX86State *env, struct x86_decode *decode)
 {
     decode->op[0].type = X86_VAR_REG;
     decode->op[0].reg = decode->opcode[0] - 0xb0;
-    decode->op[0].ptr = get_reg_ref(env, decode->op[0].reg, decode->rex.rex,
-                                    decode->rex.b, decode->operand_size);
+    decode->op[0].regptr = get_reg_ref(env, decode->op[0].reg, decode->rex.rex,
+                                       decode->rex.b, decode->operand_size);
     decode_immediate(env, decode, &decode->op[1], decode->operand_size);
 }
 
@@ -411,8 +411,8 @@
 {
     op->type = X86_VAR_REG;
     op->reg = R_ECX;
-    op->ptr = get_reg_ref(env, op->reg, decode->rex.rex, decode->rex.b,
-                          decode->operand_size);
+    op->regptr = get_reg_ref(env, op->reg, decode->rex.rex, decode->rex.b,
+                             decode->operand_size);
 }
 
 struct decode_tbl {
@@ -631,8 +631,8 @@
 {
     decode->op[0].type = X86_VAR_REG;
     decode->op[0].reg = decode->opcode[1] - 0xc8;
-    decode->op[0].ptr = get_reg_ref(env, decode->op[0].reg, decode->rex.rex,
-                                    decode->rex.b, decode->operand_size);
+    decode->op[0].regptr = get_reg_ref(env, decode->op[0].reg, decode->rex.rex,
+                                       decode->rex.b, decode->operand_size);
 }
 
 static void decode_d9_4(CPUX86State *env, struct x86_decode *decode)
@@ -1656,16 +1656,16 @@
     }
 calc_addr:
     if (X86_DECODE_CMD_LEA == decode->cmd) {
-        op->ptr = (uint16_t)ptr;
+        op->addr = (uint16_t)ptr;
     } else {
-        op->ptr = decode_linear_addr(env, decode, (uint16_t)ptr, seg);
+        op->addr = decode_linear_addr(env, decode, (uint16_t)ptr, seg);
     }
 }
 
-target_ulong get_reg_ref(CPUX86State *env, int reg, int rex_present,
+void *get_reg_ref(CPUX86State *env, int reg, int rex_present,
                          int is_extended, int size)
 {
-    target_ulong ptr = 0;
+    void *ptr = NULL;
 
     if (is_extended) {
         reg |= R_R8;
@@ -1674,13 +1674,13 @@
     switch (size) {
     case 1:
         if (is_extended || reg < 4 || rex_present) {
-            ptr = (target_ulong)&RL(env, reg);
+            ptr = &RL(env, reg);
         } else {
-            ptr = (target_ulong)&RH(env, reg - 4);
+            ptr = &RH(env, reg - 4);
         }
         break;
     default:
-        ptr = (target_ulong)&RRX(env, reg);
+        ptr = &RRX(env, reg);
         break;
     }
     return ptr;
@@ -1691,7 +1691,7 @@
 {
     target_ulong val = 0;
     memcpy(&val,
-           (void *)get_reg_ref(env, reg, rex_present, is_extended, size),
+           get_reg_ref(env, reg, rex_present, is_extended, size),
            size);
     return val;
 }
@@ -1758,9 +1758,9 @@
     }
 
     if (X86_DECODE_CMD_LEA == decode->cmd) {
-        op->ptr = (uint32_t)ptr;
+        op->addr = (uint32_t)ptr;
     } else {
-        op->ptr = decode_linear_addr(env, decode, (uint32_t)ptr, seg);
+        op->addr = decode_linear_addr(env, decode, (uint32_t)ptr, seg);
     }
 }
 
@@ -1788,9 +1788,9 @@
     }
 
     if (X86_DECODE_CMD_LEA == decode->cmd) {
-        op->ptr = ptr;
+        op->addr = ptr;
     } else {
-        op->ptr = decode_linear_addr(env, decode, ptr, seg);
+        op->addr = decode_linear_addr(env, decode, ptr, seg);
     }
 }
 
@@ -1801,8 +1801,8 @@
     if (3 == decode->modrm.mod) {
         op->reg = decode->modrm.reg;
         op->type = X86_VAR_REG;
-        op->ptr = get_reg_ref(env, decode->modrm.rm, decode->rex.rex,
-                              decode->rex.b, decode->operand_size);
+        op->regptr = get_reg_ref(env, decode->modrm.rm, decode->rex.rex,
+                                 decode->rex.b, decode->operand_size);
         return;
     }
 
diff --git a/target/i386/emulate/x86_decode.h b/target/i386/emulate/x86_decode.h
index 87cc728..927645a 100644
--- a/target/i386/emulate/x86_decode.h
+++ b/target/i386/emulate/x86_decode.h
@@ -266,7 +266,10 @@
     int reg;
     target_ulong val;
 
-    target_ulong ptr;
+    union {
+        target_ulong addr;
+        void *regptr;
+    };
 } x86_decode_op;
 
 typedef struct x86_decode {
@@ -301,8 +304,8 @@
 
 uint32_t decode_instruction(CPUX86State *env, struct x86_decode *decode);
 
-target_ulong get_reg_ref(CPUX86State *env, int reg, int rex_present,
-                         int is_extended, int size);
+void *get_reg_ref(CPUX86State *env, int reg, int rex_present,
+                  int is_extended, int size);
 target_ulong get_reg_val(CPUX86State *env, int reg, int rex_present,
                          int is_extended, int size);
 void calc_modrm_operand(CPUX86State *env, struct x86_decode *decode,
diff --git a/target/i386/emulate/x86_emu.c b/target/i386/emulate/x86_emu.c
index 7773b51..4890e0a 100644
--- a/target/i386/emulate/x86_emu.c
+++ b/target/i386/emulate/x86_emu.c
@@ -52,7 +52,7 @@
         uint8_t v2 = (uint8_t)decode->op[1].val;    \
         uint8_t diff = v1 cmd v2;                   \
         if (save_res) {                              \
-            write_val_ext(env, decode->op[0].ptr, diff, 1);  \
+            write_val_ext(env, &decode->op[0], diff, 1);  \
         } \
         FLAGS_FUNC##8(env, v1, v2, diff);           \
         break;                                      \
@@ -63,7 +63,7 @@
         uint16_t v2 = (uint16_t)decode->op[1].val;  \
         uint16_t diff = v1 cmd v2;                  \
         if (save_res) {                              \
-            write_val_ext(env, decode->op[0].ptr, diff, 2); \
+            write_val_ext(env, &decode->op[0], diff, 2); \
         } \
         FLAGS_FUNC##16(env, v1, v2, diff);          \
         break;                                      \
@@ -74,7 +74,7 @@
         uint32_t v2 = (uint32_t)decode->op[1].val;  \
         uint32_t diff = v1 cmd v2;                  \
         if (save_res) {                              \
-            write_val_ext(env, decode->op[0].ptr, diff, 4); \
+            write_val_ext(env, &decode->op[0], diff, 4); \
         } \
         FLAGS_FUNC##32(env, v1, v2, diff);          \
         break;                                      \
@@ -121,7 +121,7 @@
     }
 }
 
-target_ulong read_val_from_reg(target_ulong reg_ptr, int size)
+target_ulong read_val_from_reg(void *reg_ptr, int size)
 {
     target_ulong val;
     
@@ -144,7 +144,7 @@
     return val;
 }
 
-void write_val_to_reg(target_ulong reg_ptr, target_ulong val, int size)
+void write_val_to_reg(void *reg_ptr, target_ulong val, int size)
 {
     switch (size) {
     case 1:
@@ -164,18 +164,18 @@
     }
 }
 
-static bool is_host_reg(CPUX86State *env, target_ulong ptr)
+static void write_val_to_mem(CPUX86State *env, target_ulong ptr, target_ulong val, int size)
 {
-    return (ptr - (target_ulong)&env->regs[0]) < sizeof(env->regs);
+    emul_ops->write_mem(env_cpu(env), &val, ptr, size);
 }
 
-void write_val_ext(CPUX86State *env, target_ulong ptr, target_ulong val, int size)
+void write_val_ext(CPUX86State *env, struct x86_decode_op *decode, target_ulong val, int size)
 {
-    if (is_host_reg(env, ptr)) {
-        write_val_to_reg(ptr, val, size);
-        return;
+    if (decode->type == X86_VAR_REG) {
+        write_val_to_reg(decode->regptr, val, size);
+    } else {
+        write_val_to_mem(env, decode->addr, val, size);
     }
-    emul_ops->write_mem(env_cpu(env), &val, ptr, size);
 }
 
 uint8_t *read_mmio(CPUX86State *env, target_ulong ptr, int bytes)
@@ -185,15 +185,11 @@
 }
 
 
-target_ulong read_val_ext(CPUX86State *env, target_ulong ptr, int size)
+static target_ulong read_val_from_mem(CPUX86State *env, target_long ptr, int size)
 {
     target_ulong val;
     uint8_t *mmio_ptr;
 
-    if (is_host_reg(env, ptr)) {
-        return read_val_from_reg(ptr, size);
-    }
-
     mmio_ptr = read_mmio(env, ptr, size);
     switch (size) {
     case 1:
@@ -215,6 +211,15 @@
     return val;
 }
 
+target_ulong read_val_ext(CPUX86State *env, struct x86_decode_op *decode, int size)
+{
+    if (decode->type == X86_VAR_REG) {
+        return read_val_from_reg(decode->regptr, size);
+    } else {
+        return read_val_from_mem(env, decode->addr, size);
+    }
+}
+
 static void fetch_operands(CPUX86State *env, struct x86_decode *decode,
                            int n, bool val_op0, bool val_op1, bool val_op2)
 {
@@ -226,25 +231,25 @@
         case X86_VAR_IMMEDIATE:
             break;
         case X86_VAR_REG:
-            VM_PANIC_ON(!decode->op[i].ptr);
+            VM_PANIC_ON(!decode->op[i].regptr);
             if (calc_val[i]) {
-                decode->op[i].val = read_val_from_reg(decode->op[i].ptr,
+                decode->op[i].val = read_val_from_reg(decode->op[i].regptr,
                                                       decode->operand_size);
             }
             break;
         case X86_VAR_RM:
             calc_modrm_operand(env, decode, &decode->op[i]);
             if (calc_val[i]) {
-                decode->op[i].val = read_val_ext(env, decode->op[i].ptr,
+                decode->op[i].val = read_val_ext(env, &decode->op[i],
                                                  decode->operand_size);
             }
             break;
         case X86_VAR_OFFSET:
-            decode->op[i].ptr = decode_linear_addr(env, decode,
-                                                   decode->op[i].ptr,
-                                                   R_DS);
+            decode->op[i].addr = decode_linear_addr(env, decode,
+                                                    decode->op[i].addr,
+                                                    R_DS);
             if (calc_val[i]) {
-                decode->op[i].val = read_val_ext(env, decode->op[i].ptr,
+                decode->op[i].val = read_val_ext(env, &decode->op[i],
                                                  decode->operand_size);
             }
             break;
@@ -257,7 +262,7 @@
 static void exec_mov(CPUX86State *env, struct x86_decode *decode)
 {
     fetch_operands(env, decode, 2, false, true, false);
-    write_val_ext(env, decode->op[0].ptr, decode->op[1].val,
+    write_val_ext(env, &decode->op[0], decode->op[1].val,
                   decode->operand_size);
 
     env->eip += decode->len;
@@ -312,7 +317,7 @@
     fetch_operands(env, decode, 2, true, true, false);
 
     val = 0 - sign(decode->op[1].val, decode->operand_size);
-    write_val_ext(env, decode->op[1].ptr, val, decode->operand_size);
+    write_val_ext(env, &decode->op[1], val, decode->operand_size);
 
     if (4 == decode->operand_size) {
         SET_FLAGS_OSZAPC_SUB32(env, 0, 0 - val, val);
@@ -363,7 +368,7 @@
 {
     fetch_operands(env, decode, 1, true, false, false);
 
-    write_val_ext(env, decode->op[0].ptr, ~decode->op[0].val,
+    write_val_ext(env, &decode->op[0], ~decode->op[0].val,
                   decode->operand_size);
     env->eip += decode->len;
 }
@@ -382,8 +387,8 @@
     }
     decode->operand_size = src_op_size;
     calc_modrm_operand(env, decode, &decode->op[1]);
-    decode->op[1].val = read_val_ext(env, decode->op[1].ptr, src_op_size);
-    write_val_ext(env, decode->op[0].ptr, decode->op[1].val, op_size);
+    decode->op[1].val = read_val_ext(env, &decode->op[1], src_op_size);
+    write_val_ext(env, &decode->op[0], decode->op[1].val, op_size);
 
     env->eip += decode->len;
 }
@@ -469,10 +474,10 @@
     while (rcx--) {
         func(env, decode);
         write_reg(env, R_ECX, rcx, decode->addressing_size);
-        if ((PREFIX_REP == rep) && !get_ZF(env)) {
+        if ((PREFIX_REP == rep) && !env->cc_dst) {
             break;
         }
-        if ((PREFIX_REPN == rep) && get_ZF(env)) {
+        if ((PREFIX_REPN == rep) && env->cc_dst) {
             break;
         }
     }
@@ -535,8 +540,8 @@
     dst_addr = linear_addr_size(env_cpu(env), RDI(env),
                                 decode->addressing_size, R_ES);
 
-    val = read_val_ext(env, src_addr, decode->operand_size);
-    write_val_ext(env, dst_addr, val, decode->operand_size);
+    val = read_val_from_mem(env, src_addr, decode->operand_size);
+    write_val_to_mem(env, dst_addr, val, decode->operand_size);
 
     string_increment_reg(env, R_ESI, decode);
     string_increment_reg(env, R_EDI, decode);
@@ -563,9 +568,9 @@
                                 decode->addressing_size, R_ES);
 
     decode->op[0].type = X86_VAR_IMMEDIATE;
-    decode->op[0].val = read_val_ext(env, src_addr, decode->operand_size);
+    decode->op[0].val = read_val_from_mem(env, src_addr, decode->operand_size);
     decode->op[1].type = X86_VAR_IMMEDIATE;
-    decode->op[1].val = read_val_ext(env, dst_addr, decode->operand_size);
+    decode->op[1].val = read_val_from_mem(env, dst_addr, decode->operand_size);
 
     EXEC_2OP_FLAGS_CMD(env, decode, -, SET_FLAGS_OSZAPC_SUB, false);
 
@@ -697,15 +702,15 @@
     if (decode->op[0].type != X86_VAR_REG) {
         if (4 == decode->operand_size) {
             displacement = ((int32_t) (decode->op[1].val & 0xffffffe0)) / 32;
-            decode->op[0].ptr += 4 * displacement;
+            decode->op[0].addr += 4 * displacement;
         } else if (2 == decode->operand_size) {
             displacement = ((int16_t) (decode->op[1].val & 0xfff0)) / 16;
-            decode->op[0].ptr += 2 * displacement;
+            decode->op[0].addr += 2 * displacement;
         } else {
             VM_PANIC("bt 64bit\n");
         }
     }
-    decode->op[0].val = read_val_ext(env, decode->op[0].ptr,
+    decode->op[0].val = read_val_ext(env, &decode->op[0],
                                      decode->operand_size);
     cf = (decode->op[0].val >> index) & 0x01;
 
@@ -723,7 +728,7 @@
         decode->op[0].val &= ~(1u << index);
         break;
     }
-    write_val_ext(env, decode->op[0].ptr, decode->op[0].val,
+    write_val_ext(env, &decode->op[0], decode->op[0].val,
                   decode->operand_size);
     set_CF(env, cf);
 }
@@ -775,7 +780,7 @@
             of = cf ^ (res >> 7);
         }
 
-        write_val_ext(env, decode->op[0].ptr, res, 1);
+        write_val_ext(env, &decode->op[0], res, 1);
         SET_FLAGS_OSZAPC_LOGIC8(env, 0, 0, res);
         SET_FLAGS_OxxxxC(env, of, cf);
         break;
@@ -791,7 +796,7 @@
             of = cf ^ (res >> 15); /* of = cf ^ result15 */
         }
 
-        write_val_ext(env, decode->op[0].ptr, res, 2);
+        write_val_ext(env, &decode->op[0], res, 2);
         SET_FLAGS_OSZAPC_LOGIC16(env, 0, 0, res);
         SET_FLAGS_OxxxxC(env, of, cf);
         break;
@@ -800,7 +805,7 @@
     {
         uint32_t res = decode->op[0].val << count;
 
-        write_val_ext(env, decode->op[0].ptr, res, 4);
+        write_val_ext(env, &decode->op[0], res, 4);
         SET_FLAGS_OSZAPC_LOGIC32(env, 0, 0, res);
         cf = (decode->op[0].val >> (32 - count)) & 0x1;
         of = cf ^ (res >> 31); /* of = cf ^ result31 */
@@ -831,10 +836,10 @@
 
     decode->operand_size = src_op_size;
     calc_modrm_operand(env, decode, &decode->op[1]);
-    decode->op[1].val = sign(read_val_ext(env, decode->op[1].ptr, src_op_size),
+    decode->op[1].val = sign(read_val_ext(env, &decode->op[1], src_op_size),
                              src_op_size);
 
-    write_val_ext(env, decode->op[0].ptr, decode->op[1].val, op_size);
+    write_val_ext(env, &decode->op[0], decode->op[1].val, op_size);
 
     env->eip += decode->len;
 }
@@ -862,7 +867,7 @@
             count &= 0x7; /* use only bottom 3 bits */
             res = ((uint8_t)decode->op[0].val >> count) |
                    ((uint8_t)decode->op[0].val << (8 - count));
-            write_val_ext(env, decode->op[0].ptr, res, 1);
+            write_val_ext(env, &decode->op[0], res, 1);
             bit6 = (res >> 6) & 1;
             bit7 = (res >> 7) & 1;
             /* set eflags: ROR count affects the following flags: C, O */
@@ -886,7 +891,7 @@
             count &= 0x0f;  /* use only 4 LSB's */
             res = ((uint16_t)decode->op[0].val >> count) |
                    ((uint16_t)decode->op[0].val << (16 - count));
-            write_val_ext(env, decode->op[0].ptr, res, 2);
+            write_val_ext(env, &decode->op[0], res, 2);
 
             bit14 = (res >> 14) & 1;
             bit15 = (res >> 15) & 1;
@@ -904,7 +909,7 @@
         if (count) {
             res = ((uint32_t)decode->op[0].val >> count) |
                    ((uint32_t)decode->op[0].val << (32 - count));
-            write_val_ext(env, decode->op[0].ptr, res, 4);
+            write_val_ext(env, &decode->op[0], res, 4);
 
             bit31 = (res >> 31) & 1;
             bit30 = (res >> 30) & 1;
@@ -941,7 +946,7 @@
             res = ((uint8_t)decode->op[0].val << count) |
                    ((uint8_t)decode->op[0].val >> (8 - count));
 
-            write_val_ext(env, decode->op[0].ptr, res, 1);
+            write_val_ext(env, &decode->op[0], res, 1);
             /* set eflags:
              * ROL count affects the following flags: C, O
              */
@@ -968,7 +973,7 @@
             res = ((uint16_t)decode->op[0].val << count) |
                    ((uint16_t)decode->op[0].val >> (16 - count));
 
-            write_val_ext(env, decode->op[0].ptr, res, 2);
+            write_val_ext(env, &decode->op[0], res, 2);
             bit0  = (res & 0x1);
             bit15 = (res >> 15);
             /* of = cf ^ result15 */
@@ -986,7 +991,7 @@
             res = ((uint32_t)decode->op[0].val << count) |
                    ((uint32_t)decode->op[0].val >> (32 - count));
 
-            write_val_ext(env, decode->op[0].ptr, res, 4);
+            write_val_ext(env, &decode->op[0], res, 4);
             bit0  = (res & 0x1);
             bit31 = (res >> 31);
             /* of = cf ^ result31 */
@@ -1024,7 +1029,7 @@
                    (op1_8 >> (9 - count));
         }
 
-        write_val_ext(env, decode->op[0].ptr, res, 1);
+        write_val_ext(env, &decode->op[0], res, 1);
 
         cf = (op1_8 >> (8 - count)) & 0x01;
         of = cf ^ (res >> 7); /* of = cf ^ result7 */
@@ -1050,7 +1055,7 @@
                    (op1_16 >> (17 - count));
         }
 
-        write_val_ext(env, decode->op[0].ptr, res, 2);
+        write_val_ext(env, &decode->op[0], res, 2);
 
         cf = (op1_16 >> (16 - count)) & 0x1;
         of = cf ^ (res >> 15); /* of = cf ^ result15 */
@@ -1073,7 +1078,7 @@
                    (op1_32 >> (33 - count));
         }
 
-        write_val_ext(env, decode->op[0].ptr, res, 4);
+        write_val_ext(env, &decode->op[0], res, 4);
 
         cf = (op1_32 >> (32 - count)) & 0x1;
         of = cf ^ (res >> 31); /* of = cf ^ result31 */
@@ -1105,7 +1110,7 @@
         res = (op1_8 >> count) | (get_CF(env) << (8 - count)) |
                (op1_8 << (9 - count));
 
-        write_val_ext(env, decode->op[0].ptr, res, 1);
+        write_val_ext(env, &decode->op[0], res, 1);
 
         cf = (op1_8 >> (count - 1)) & 0x1;
         of = (((res << 1) ^ res) >> 7) & 0x1; /* of = result6 ^ result7 */
@@ -1124,7 +1129,7 @@
         res = (op1_16 >> count) | (get_CF(env) << (16 - count)) |
                (op1_16 << (17 - count));
 
-        write_val_ext(env, decode->op[0].ptr, res, 2);
+        write_val_ext(env, &decode->op[0], res, 2);
 
         cf = (op1_16 >> (count - 1)) & 0x1;
         of = ((uint16_t)((res << 1) ^ res) >> 15) & 0x1; /* of = result15 ^
@@ -1148,7 +1153,7 @@
                    (op1_32 << (33 - count));
         }
 
-        write_val_ext(env, decode->op[0].ptr, res, 4);
+        write_val_ext(env, &decode->op[0], res, 4);
 
         cf = (op1_32 >> (count - 1)) & 0x1;
         of = ((res << 1) ^ res) >> 31; /* of = result30 ^ result31 */
@@ -1163,9 +1168,9 @@
 {
     fetch_operands(env, decode, 2, true, true, false);
 
-    write_val_ext(env, decode->op[0].ptr, decode->op[1].val,
+    write_val_ext(env, &decode->op[0], decode->op[1].val,
                   decode->operand_size);
-    write_val_ext(env, decode->op[1].ptr, decode->op[0].val,
+    write_val_ext(env, &decode->op[1], decode->op[0].val,
                   decode->operand_size);
 
     env->eip += decode->len;
@@ -1174,7 +1179,7 @@
 static void exec_xadd(CPUX86State *env, struct x86_decode *decode)
 {
     EXEC_2OP_FLAGS_CMD(env, decode, +, SET_FLAGS_OSZAPC_ADD, true);
-    write_val_ext(env, decode->op[1].ptr, decode->op[0].val,
+    write_val_ext(env, &decode->op[1], decode->op[0].val,
                   decode->operand_size);
 
     env->eip += decode->len;
diff --git a/target/i386/emulate/x86_emu.h b/target/i386/emulate/x86_emu.h
index 555b567..a1a9612 100644
--- a/target/i386/emulate/x86_emu.h
+++ b/target/i386/emulate/x86_emu.h
@@ -42,11 +42,11 @@
 
 target_ulong read_reg(CPUX86State *env, int reg, int size);
 void write_reg(CPUX86State *env, int reg, target_ulong val, int size);
-target_ulong read_val_from_reg(target_ulong reg_ptr, int size);
-void write_val_to_reg(target_ulong reg_ptr, target_ulong val, int size);
-void write_val_ext(CPUX86State *env, target_ulong ptr, target_ulong val, int size);
+target_ulong read_val_from_reg(void *reg_ptr, int size);
+void write_val_to_reg(void *reg_ptr, target_ulong val, int size);
+void write_val_ext(CPUX86State *env, struct x86_decode_op *decode, target_ulong val, int size);
 uint8_t *read_mmio(CPUX86State *env, target_ulong ptr, int bytes);
-target_ulong read_val_ext(CPUX86State *env, target_ulong ptr, int size);
+target_ulong read_val_ext(CPUX86State *env, struct x86_decode_op *decode, int size);
 
 void exec_movzx(CPUX86State *env, struct x86_decode *decode);
 void exec_shl(CPUX86State *env, struct x86_decode *decode);
diff --git a/target/i386/emulate/x86_flags.c b/target/i386/emulate/x86_flags.c
index 84e2736..47bc197 100644
--- a/target/i386/emulate/x86_flags.c
+++ b/target/i386/emulate/x86_flags.c
@@ -29,41 +29,50 @@
 #include "x86.h"
 
 
-/* this is basically bocsh code */
+/*
+ * The algorithms here are similar to those in Bochs.  After an ALU
+ * operation, CC_DST can be used to compute ZF, SF and PF, whereas
+ * CC_SRC is used to compute AF, CF and OF.  In reality, SF and PF are the
+ * XOR of the value computed from CC_DST and the value found in bits 7 and 2
+ * of CC_SRC; this way the same logic can be used to compute the flags
+ * both before and after an ALU operation.
+ *
+ * Compared to the TCG CC_OP codes, this avoids conditionals when converting
+ * to and from the RFLAGS representation.
+ */
 
-#define LF_SIGN_BIT     31
+#define LF_SIGN_BIT    (TARGET_LONG_BITS - 1)
 
-#define LF_BIT_SD      (0)          /* lazy Sign Flag Delta            */
-#define LF_BIT_AF      (3)          /* lazy Adjust flag                */
-#define LF_BIT_PDB     (8)          /* lazy Parity Delta Byte (8 bits) */
-#define LF_BIT_CF      (31)         /* lazy Carry Flag                 */
-#define LF_BIT_PO      (30)         /* lazy Partial Overflow = CF ^ OF */
+#define LF_BIT_PD      (2)          /* lazy Parity Delta, same bit as PF */
+#define LF_BIT_AF      (3)          /* lazy Adjust flag */
+#define LF_BIT_SD      (7)          /* lazy Sign Flag Delta, same bit as SF */
+#define LF_BIT_CF      (TARGET_LONG_BITS - 1) /* lazy Carry Flag */
+#define LF_BIT_PO      (TARGET_LONG_BITS - 2) /* lazy Partial Overflow = CF ^ OF */
 
-#define LF_MASK_SD     (0x01 << LF_BIT_SD)
-#define LF_MASK_AF     (0x01 << LF_BIT_AF)
-#define LF_MASK_PDB    (0xFF << LF_BIT_PDB)
-#define LF_MASK_CF     (0x01 << LF_BIT_CF)
-#define LF_MASK_PO     (0x01 << LF_BIT_PO)
+#define LF_MASK_PD     ((target_ulong)0x01 << LF_BIT_PD)
+#define LF_MASK_AF     ((target_ulong)0x01 << LF_BIT_AF)
+#define LF_MASK_SD     ((target_ulong)0x01 << LF_BIT_SD)
+#define LF_MASK_CF     ((target_ulong)0x01 << LF_BIT_CF)
+#define LF_MASK_PO     ((target_ulong)0x01 << LF_BIT_PO)
 
 /* ******************* */
 /* OSZAPC */
 /* ******************* */
 
-/* size, carries, result */
+/* use carries to fill in AF, PO and CF, while ensuring PD and SD are clear.
+ * for full-word operations just clear PD and SD; for smaller operand
+ * sizes only keep AF in the low byte and shift the carries left to
+ * place PO and CF in the top two bits.
+ */
 #define SET_FLAGS_OSZAPC_SIZE(size, lf_carries, lf_result) { \
-    target_ulong temp = ((lf_carries) & (LF_MASK_AF)) | \
-    (((lf_carries) >> (size - 2)) << LF_BIT_PO); \
-    env->lflags.result = (target_ulong)(int##size##_t)(lf_result); \
-    if ((size) == 32) { \
-        temp = ((lf_carries) & ~(LF_MASK_PDB | LF_MASK_SD)); \
-    } else if ((size) == 16) { \
-        temp = ((lf_carries) & (LF_MASK_AF)) | ((lf_carries) << 16); \
-    } else if ((size) == 8)  { \
-        temp = ((lf_carries) & (LF_MASK_AF)) | ((lf_carries) << 24); \
+    env->cc_dst = (target_ulong)(int##size##_t)(lf_result); \
+    target_ulong temp = (lf_carries); \
+    if ((size) == TARGET_LONG_BITS) { \
+        temp = temp & ~(LF_MASK_PD | LF_MASK_SD); \
     } else { \
-        VM_PANIC("unimplemented");  \
+        temp = (temp & LF_MASK_AF) | (temp << (TARGET_LONG_BITS - (size))); \
     } \
-    env->lflags.auxbits = (target_ulong)(uint32_t)temp; \
+    env->cc_src = temp; \
 }
 
 /* carries, result */
@@ -77,23 +86,18 @@
 /* ******************* */
 /* OSZAP */
 /* ******************* */
-/* size, carries, result */
+/* same as setting OSZAPC, but preserve CF and flip PO if the old value of CF
+ * did not match the high bit of lf_carries. */
 #define SET_FLAGS_OSZAP_SIZE(size, lf_carries, lf_result) { \
-    target_ulong temp = ((lf_carries) & (LF_MASK_AF)) | \
-    (((lf_carries) >> (size - 2)) << LF_BIT_PO); \
-    if ((size) == 32) { \
-        temp = ((lf_carries) & ~(LF_MASK_PDB | LF_MASK_SD)); \
-    } else if ((size) == 16) { \
-        temp = ((lf_carries) & (LF_MASK_AF)) | ((lf_carries) << 16); \
-    } else if ((size) == 8) { \
-        temp = ((lf_carries) & (LF_MASK_AF)) | ((lf_carries) << 24); \
+    env->cc_dst = (target_ulong)(int##size##_t)(lf_result); \
+    target_ulong temp = (lf_carries); \
+    if ((size) == TARGET_LONG_BITS) { \
+        temp = (temp & ~(LF_MASK_PD | LF_MASK_SD)); \
     } else { \
-        VM_PANIC("unimplemented");      \
+        temp = (temp & LF_MASK_AF) | (temp << (TARGET_LONG_BITS - (size))); \
     } \
-    env->lflags.result = (target_ulong)(int##size##_t)(lf_result); \
-    target_ulong delta_c = (env->lflags.auxbits ^ temp) & LF_MASK_CF; \
-    delta_c ^= (delta_c >> 1); \
-    env->lflags.auxbits = (target_ulong)(uint32_t)(temp ^ delta_c); \
+    target_ulong cf_changed = ((target_long)(env->cc_src ^ temp)) < 0; \
+    env->cc_src = temp ^ (cf_changed * (LF_MASK_PO | LF_MASK_CF)); \
 }
 
 /* carries, result */
@@ -104,11 +108,11 @@
 #define SET_FLAGS_OSZAP_32(carries, result) \
     SET_FLAGS_OSZAP_SIZE(32, carries, result)
 
-void SET_FLAGS_OxxxxC(CPUX86State *env, uint32_t new_of, uint32_t new_cf)
+void SET_FLAGS_OxxxxC(CPUX86State *env, bool new_of, bool new_cf)
 {
-    uint32_t temp_po = new_of ^ new_cf;
-    env->lflags.auxbits &= ~(LF_MASK_PO | LF_MASK_CF);
-    env->lflags.auxbits |= (temp_po << LF_BIT_PO) | (new_cf << LF_BIT_CF);
+    env->cc_src &= ~(LF_MASK_PO | LF_MASK_CF);
+    env->cc_src |= (-(target_ulong)new_cf << LF_BIT_PO);
+    env->cc_src ^= ((target_ulong)new_of << LF_BIT_PO);
 }
 
 void SET_FLAGS_OSZAPC_SUB32(CPUX86State *env, uint32_t v1, uint32_t v2,
@@ -202,104 +206,68 @@
     SET_FLAGS_OSZAPC_8(0, diff);
 }
 
-bool get_PF(CPUX86State *env)
+static inline uint32_t get_PF(CPUX86State *env)
 {
-    uint32_t temp = (255 & env->lflags.result);
-    temp = temp ^ (255 & (env->lflags.auxbits >> LF_BIT_PDB));
-    temp = (temp ^ (temp >> 4)) & 0x0F;
-    return (0x9669U >> temp) & 1;
+    return ((parity8(env->cc_dst) - 1) ^ env->cc_src) & CC_P;
 }
 
-void set_PF(CPUX86State *env, bool val)
+static inline uint32_t get_OF(CPUX86State *env)
 {
-    uint32_t temp = (255 & env->lflags.result) ^ (!val);
-    env->lflags.auxbits &= ~(LF_MASK_PDB);
-    env->lflags.auxbits |= (temp << LF_BIT_PDB);
-}
-
-bool get_OF(CPUX86State *env)
-{
-    return ((env->lflags.auxbits + (1U << LF_BIT_PO)) >> LF_BIT_CF) & 1;
+    return ((env->cc_src >> (LF_BIT_CF - 11)) + CC_O / 2) & CC_O;
 }
 
 bool get_CF(CPUX86State *env)
 {
-    return (env->lflags.auxbits >> LF_BIT_CF) & 1;
-}
-
-void set_OF(CPUX86State *env, bool val)
-{
-    bool old_cf = get_CF(env);
-    SET_FLAGS_OxxxxC(env, val, old_cf);
+    return ((target_long)env->cc_src) < 0;
 }
 
 void set_CF(CPUX86State *env, bool val)
 {
-    bool old_of = get_OF(env);
-    SET_FLAGS_OxxxxC(env, old_of, val);
+    /* If CF changes, flip PO and CF */
+    target_ulong temp = -(target_ulong)val;
+    target_ulong cf_changed = ((target_long)(env->cc_src ^ temp)) < 0;
+    env->cc_src ^= cf_changed * (LF_MASK_PO | LF_MASK_CF);
 }
 
-bool get_AF(CPUX86State *env)
+static inline uint32_t get_ZF(CPUX86State *env)
 {
-    return (env->lflags.auxbits >> LF_BIT_AF) & 1;
+    return env->cc_dst ? 0 : CC_Z;
 }
 
-void set_AF(CPUX86State *env, bool val)
+static inline uint32_t get_SF(CPUX86State *env)
 {
-    env->lflags.auxbits &= ~(LF_MASK_AF);
-    env->lflags.auxbits |= val << LF_BIT_AF;
-}
-
-bool get_ZF(CPUX86State *env)
-{
-    return !env->lflags.result;
-}
-
-void set_ZF(CPUX86State *env, bool val)
-{
-    if (val) {
-        env->lflags.auxbits ^=
-         (((env->lflags.result >> LF_SIGN_BIT) & 1) << LF_BIT_SD);
-        /* merge the parity bits into the Parity Delta Byte */
-        uint32_t temp_pdb = (255 & env->lflags.result);
-        env->lflags.auxbits ^= (temp_pdb << LF_BIT_PDB);
-        /* now zero the .result value */
-        env->lflags.result = 0;
-    } else {
-        env->lflags.result |= (1 << 8);
-    }
-}
-
-bool get_SF(CPUX86State *env)
-{
-    return ((env->lflags.result >> LF_SIGN_BIT) ^
-            (env->lflags.auxbits >> LF_BIT_SD)) & 1;
-}
-
-void set_SF(CPUX86State *env, bool val)
-{
-    bool temp_sf = get_SF(env);
-    env->lflags.auxbits ^= (temp_sf ^ val) << LF_BIT_SD;
+    return ((env->cc_dst >> (LF_SIGN_BIT - LF_BIT_SD)) ^
+            env->cc_src) & CC_S;
 }
 
 void lflags_to_rflags(CPUX86State *env)
 {
     env->eflags &= ~(CC_C|CC_P|CC_A|CC_Z|CC_S|CC_O);
-    env->eflags |= get_CF(env) ? CC_C : 0;
-    env->eflags |= get_PF(env) ? CC_P : 0;
-    env->eflags |= get_AF(env) ? CC_A : 0;
-    env->eflags |= get_ZF(env) ? CC_Z : 0;
-    env->eflags |= get_SF(env) ? CC_S : 0;
-    env->eflags |= get_OF(env) ? CC_O : 0;
+    /* rotate left by one to move carry-out bits into CF and AF */
+    env->eflags |= (
+        (env->cc_src << 1) |
+        (env->cc_src >> (TARGET_LONG_BITS - 1))) & (CC_C | CC_A);
+    env->eflags |= get_SF(env);
+    env->eflags |= get_PF(env);
+    env->eflags |= get_ZF(env);
+    env->eflags |= get_OF(env);
 }
 
 void rflags_to_lflags(CPUX86State *env)
 {
-    env->lflags.auxbits = env->lflags.result = 0;
-    set_OF(env, env->eflags & CC_O);
-    set_SF(env, env->eflags & CC_S);
-    set_ZF(env, env->eflags & CC_Z);
-    set_AF(env, env->eflags & CC_A);
-    set_PF(env, env->eflags & CC_P);
-    set_CF(env, env->eflags & CC_C);
+    target_ulong cf_xor_of;
+
+    env->cc_src = CC_P;
+    env->cc_src ^= env->eflags & (CC_S | CC_P);
+
+    /* rotate right by one to move CF and AF into the carry-out positions */
+    env->cc_src |= (
+        (env->eflags >> 1) |
+        (env->eflags << (TARGET_LONG_BITS - 1))) & (CC_C | CC_A);
+
+    cf_xor_of = (env->eflags & (CC_C | CC_O)) + (CC_O - CC_C);
+    env->cc_src |= -cf_xor_of & LF_MASK_PO;
+
+    /* Leave the low byte zero so that parity is not affected.  */
+    env->cc_dst = !(env->eflags & CC_Z) << 8;
 }
diff --git a/target/i386/emulate/x86_flags.h b/target/i386/emulate/x86_flags.h
index 6c17500..28b008e 100644
--- a/target/i386/emulate/x86_flags.h
+++ b/target/i386/emulate/x86_flags.h
@@ -28,20 +28,10 @@
 void lflags_to_rflags(CPUX86State *env);
 void rflags_to_lflags(CPUX86State *env);
 
-bool get_PF(CPUX86State *env);
-void set_PF(CPUX86State *env, bool val);
 bool get_CF(CPUX86State *env);
 void set_CF(CPUX86State *env, bool val);
-bool get_AF(CPUX86State *env);
-void set_AF(CPUX86State *env, bool val);
-bool get_ZF(CPUX86State *env);
-void set_ZF(CPUX86State *env, bool val);
-bool get_SF(CPUX86State *env);
-void set_SF(CPUX86State *env, bool val);
-bool get_OF(CPUX86State *env);
-void set_OF(CPUX86State *env, bool val);
 
-void SET_FLAGS_OxxxxC(CPUX86State *env, uint32_t new_of, uint32_t new_cf);
+void SET_FLAGS_OxxxxC(CPUX86State *env, bool new_of, bool new_cf);
 
 void SET_FLAGS_OSZAPC_SUB32(CPUX86State *env, uint32_t v1, uint32_t v2,
                             uint32_t diff);
diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index cda32ee..55216e0 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -2542,7 +2542,13 @@
     s->has_modrm = false;
     s->prefix = 0;
 
- next_byte:
+ next_byte:;
+#ifdef TARGET_X86_64
+    /* clear any REX prefix followed by other prefixes.  */
+    int rex;
+    rex = -1;
+ next_byte_rex:
+#endif
     b = x86_ldub_code(env, s);
 
     /* Collect prefixes.  */
@@ -2585,13 +2591,12 @@
 #ifdef TARGET_X86_64
     case 0x40 ... 0x4f:
         if (CODE64(s)) {
-            /* REX prefix */
-            s->prefix |= PREFIX_REX;
-            s->vex_w = (b >> 3) & 1;
-            s->rex_r = (b & 0x4) << 1;
-            s->rex_x = (b & 0x2) << 2;
-            s->rex_b = (b & 0x1) << 3;
-            goto next_byte;
+            /*
+             * REX prefix; ignored unless it is the last prefix, so
+             * for now just stash it
+             */
+            rex = b;
+            goto next_byte_rex;
         }
         break;
 #endif
@@ -2618,10 +2623,13 @@
 
             /* 4.1.1-4.1.3: No preceding lock, 66, f2, f3, or rex prefixes. */
             if (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ
-                             | PREFIX_LOCK | PREFIX_DATA | PREFIX_REX)) {
+                             | PREFIX_LOCK | PREFIX_DATA)) {
                 goto illegal_op;
             }
 #ifdef TARGET_X86_64
+            if (rex != -1) {
+                goto illegal_op;
+            }
             s->rex_r = (~vex2 >> 4) & 8;
 #endif
             if (b == 0xc5) {
@@ -2661,6 +2669,16 @@
 
     /* Post-process prefixes.  */
     if (CODE64(s)) {
+#ifdef TARGET_X86_64
+        if (rex != -1) {
+            s->prefix |= PREFIX_REX;
+            s->vex_w = (rex >> 3) & 1;
+            s->rex_r = (rex & 0x4) << 1;
+            s->rex_x = (rex & 0x2) << 2;
+            s->rex_b = (rex & 0x1) << 3;
+        }
+#endif
+
         /*
          * In 64-bit mode, the default data size is 32-bit.  Select 64-bit
          * data with rex_w, and 16-bit data with 0x66; rex_w takes precedence
diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c
index 0ca081b..071f3fb 100644
--- a/target/i386/tcg/seg_helper.c
+++ b/target/i386/tcg/seg_helper.c
@@ -326,10 +326,10 @@
 #define SWITCH_TSS_IRET 1
 #define SWITCH_TSS_CALL 2
 
-/* return 0 if switching to a 16-bit selector */
-static int switch_tss_ra(CPUX86State *env, int tss_selector,
-                         uint32_t e1, uint32_t e2, int source,
-                         uint32_t next_eip, uintptr_t retaddr)
+static void switch_tss_ra(CPUX86State *env, int tss_selector,
+                          uint32_t e1, uint32_t e2, int source,
+                          uint32_t next_eip, bool has_error_code,
+                          uint32_t error_code, uintptr_t retaddr)
 {
     int tss_limit, tss_limit_max, type, old_tss_limit_max, old_type, i;
     target_ulong tss_base;
@@ -473,10 +473,6 @@
         new_segs[R_GS] = 0;
         new_trap = 0;
     }
-    /* XXX: avoid a compiler warning, see
-     http://support.amd.com/us/Processor_TechDocs/24593.pdf
-     chapters 12.2.5 and 13.2.4 on how to implement TSS Trap bit */
-    (void)new_trap;
 
     /* clear busy bit (it is restartable) */
     if (source == SWITCH_TSS_JMP || source == SWITCH_TSS_IRET) {
@@ -599,14 +595,43 @@
         cpu_x86_update_dr7(env, env->dr[7] & ~DR7_LOCAL_BP_MASK);
     }
 #endif
-    return type >> 3;
+
+    if (has_error_code) {
+        int cpl = env->hflags & HF_CPL_MASK;
+        StackAccess sa;
+
+        /* push the error code */
+        sa.env = env;
+        sa.ra = retaddr;
+        sa.mmu_index = x86_mmu_index_pl(env, cpl);
+        sa.sp = env->regs[R_ESP];
+        if (env->segs[R_SS].flags & DESC_B_MASK) {
+            sa.sp_mask = 0xffffffff;
+        } else {
+            sa.sp_mask = 0xffff;
+        }
+        sa.ss_base = env->segs[R_SS].base;
+        if (type & 8) {
+            pushl(&sa, error_code);
+        } else {
+            pushw(&sa, error_code);
+        }
+        SET_ESP(sa.sp, sa.sp_mask);
+    }
+
+    if (new_trap) {
+        env->dr[6] |= DR6_BT;
+        raise_exception_ra(env, EXCP01_DB, retaddr);
+    }
 }
 
-static int switch_tss(CPUX86State *env, int tss_selector,
-                      uint32_t e1, uint32_t e2, int source,
-                      uint32_t next_eip)
+static void switch_tss(CPUX86State *env, int tss_selector,
+                       uint32_t e1, uint32_t e2, int source,
+                       uint32_t next_eip, bool has_error_code,
+                       int error_code)
 {
-    return switch_tss_ra(env, tss_selector, e1, e2, source, next_eip, 0);
+    switch_tss_ra(env, tss_selector, e1, e2, source, next_eip,
+                  has_error_code, error_code, 0);
 }
 
 static inline unsigned int get_sp_mask(unsigned int e2)
@@ -719,25 +744,8 @@
         if (!(e2 & DESC_P_MASK)) {
             raise_exception_err(env, EXCP0B_NOSEG, intno * 8 + 2);
         }
-        shift = switch_tss(env, intno * 8, e1, e2, SWITCH_TSS_CALL, old_eip);
-        if (has_error_code) {
-            /* push the error code on the destination stack */
-            cpl = env->hflags & HF_CPL_MASK;
-            sa.mmu_index = x86_mmu_index_pl(env, cpl);
-            if (env->segs[R_SS].flags & DESC_B_MASK) {
-                sa.sp_mask = 0xffffffff;
-            } else {
-                sa.sp_mask = 0xffff;
-            }
-            sa.sp = env->regs[R_ESP];
-            sa.ss_base = env->segs[R_SS].base;
-            if (shift) {
-                pushl(&sa, error_code);
-            } else {
-                pushw(&sa, error_code);
-            }
-            SET_ESP(sa.sp, sa.sp_mask);
-        }
+        switch_tss(env, intno * 8, e1, e2, SWITCH_TSS_CALL, old_eip,
+                   has_error_code, error_code);
         return;
     }
 
@@ -1533,7 +1541,8 @@
             if (dpl < cpl || dpl < rpl) {
                 raise_exception_err_ra(env, EXCP0D_GPF, new_cs & 0xfffc, GETPC());
             }
-            switch_tss_ra(env, new_cs, e1, e2, SWITCH_TSS_JMP, next_eip, GETPC());
+            switch_tss_ra(env, new_cs, e1, e2, SWITCH_TSS_JMP, next_eip,
+                          false, 0, GETPC());
             break;
         case 4: /* 286 call gate */
         case 12: /* 386 call gate */
@@ -1745,7 +1754,8 @@
             if (dpl < cpl || dpl < rpl) {
                 raise_exception_err_ra(env, EXCP0D_GPF, new_cs & 0xfffc, GETPC());
             }
-            switch_tss_ra(env, new_cs, e1, e2, SWITCH_TSS_CALL, next_eip, GETPC());
+            switch_tss_ra(env, new_cs, e1, e2, SWITCH_TSS_CALL, next_eip,
+                          false, 0, GETPC());
             return;
         case 4: /* 286 call gate */
         case 12: /* 386 call gate */
@@ -2256,7 +2266,8 @@
         if (type != 3) {
             raise_exception_err_ra(env, EXCP0A_TSS, tss_selector & 0xfffc, GETPC());
         }
-        switch_tss_ra(env, tss_selector, e1, e2, SWITCH_TSS_IRET, next_eip, GETPC());
+        switch_tss_ra(env, tss_selector, e1, e2, SWITCH_TSS_IRET, next_eip,
+                      false, 0, GETPC());
     } else {
         helper_ret_protected(env, shift, 1, 0, GETPC());
     }
diff --git a/tcg/meson.build b/tcg/meson.build
index 7df378d..bd2821e 100644
--- a/tcg/meson.build
+++ b/tcg/meson.build
@@ -1,4 +1,4 @@
-if not get_option('tcg').allowed()
+if not have_tcg
    subdir_done()
 endif
 
diff --git a/tests/qemu-iotests/tests/commit-zero-blocks b/tests/qemu-iotests/tests/commit-zero-blocks
new file mode 100755
index 0000000..de00273
--- /dev/null
+++ b/tests/qemu-iotests/tests/commit-zero-blocks
@@ -0,0 +1,96 @@
+#!/usr/bin/env bash
+# group: rw quick
+#
+# Test for commit of discarded blocks
+#
+# This tests committing a live snapshot where some of the blocks that
+# are present in the base image are discarded in the intermediate image.
+# This intends to check that these blocks are also discarded in the base
+# image after the commit.
+#
+# Copyright (C) 2024 Vincent Vanlaer.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# creator
+owner=libvirt-e6954efa@volkihar.be
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+status=1	# failure is the default!
+
+_cleanup()
+{
+    _cleanup_qemu
+    _rm_test_img "${TEST_IMG}.base"
+    _rm_test_img "${TEST_IMG}.mid"
+    _cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+cd ..
+. ./common.rc
+. ./common.filter
+. ./common.qemu
+
+_supported_fmt qcow2
+_supported_proto file
+
+size="1M"
+
+TEST_IMG="$TEST_IMG.base" _make_test_img $size
+TEST_IMG="$TEST_IMG.mid" _make_test_img -b "$TEST_IMG.base" -F $IMGFMT $size
+_make_test_img -b "${TEST_IMG}.mid" -F $IMGFMT $size
+
+$QEMU_IO -c "write -P 0x01 64k 128k" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "discard 64k 64k" "$TEST_IMG.mid" | _filter_qemu_io
+
+echo
+echo "=== Base image info before commit ==="
+TEST_IMG="${TEST_IMG}.base" _img_info | _filter_img_info
+$QEMU_IMG map --output=json "$TEST_IMG.base" | _filter_qemu_img_map
+
+echo
+echo "=== Middle image info before commit ==="
+TEST_IMG="${TEST_IMG}.mid" _img_info | _filter_img_info
+$QEMU_IMG map --output=json "$TEST_IMG.mid" | _filter_qemu_img_map
+
+echo
+echo === Running QEMU Live Commit Test ===
+echo
+
+qemu_comm_method="qmp"
+_launch_qemu -drive file="${TEST_IMG}",if=virtio,id=test
+h=$QEMU_HANDLE
+
+_send_qemu_cmd $h "{ 'execute': 'qmp_capabilities' }" "return"
+
+_send_qemu_cmd $h "{ 'execute': 'block-commit',
+                                 'arguments': { 'device': 'test',
+                                 'top': '"${TEST_IMG}.mid"',
+                                 'base': '"${TEST_IMG}.base"'} }" '"status": "null"'
+
+_cleanup_qemu
+
+echo
+echo "=== Base image info after commit ==="
+TEST_IMG="${TEST_IMG}.base" _img_info | _filter_img_info
+$QEMU_IMG map --output=json "$TEST_IMG.base" | _filter_qemu_img_map
+
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0
diff --git a/tests/qemu-iotests/tests/commit-zero-blocks.out b/tests/qemu-iotests/tests/commit-zero-blocks.out
new file mode 100644
index 0000000..85bdc46
--- /dev/null
+++ b/tests/qemu-iotests/tests/commit-zero-blocks.out
@@ -0,0 +1,54 @@
+QA output created by commit-zero-blocks
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=1048576
+Formatting 'TEST_DIR/t.IMGFMT.mid', fmt=IMGFMT size=1048576 backing_file=TEST_DIR/t.IMGFMT.base backing_fmt=IMGFMT
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 backing_file=TEST_DIR/t.IMGFMT.mid backing_fmt=IMGFMT
+wrote 131072/131072 bytes at offset 65536
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+discard 65536/65536 bytes at offset 65536
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+=== Base image info before commit ===
+image: TEST_DIR/t.IMGFMT.base
+file format: IMGFMT
+virtual size: 1 MiB (1048576 bytes)
+[{ "start": 0, "length": 65536, "depth": 0, "present": false, "zero": true, "data": false, "compressed": false},
+{ "start": 65536, "length": 131072, "depth": 0, "present": true, "zero": false, "data": true, "compressed": false, "offset": OFFSET},
+{ "start": 196608, "length": 851968, "depth": 0, "present": false, "zero": true, "data": false, "compressed": false}]
+
+=== Middle image info before commit ===
+image: TEST_DIR/t.IMGFMT.mid
+file format: IMGFMT
+virtual size: 1 MiB (1048576 bytes)
+backing file: TEST_DIR/t.IMGFMT.base
+backing file format: IMGFMT
+[{ "start": 0, "length": 65536, "depth": 1, "present": false, "zero": true, "data": false, "compressed": false},
+{ "start": 65536, "length": 65536, "depth": 0, "present": true, "zero": true, "data": false, "compressed": false},
+{ "start": 131072, "length": 65536, "depth": 1, "present": true, "zero": false, "data": true, "compressed": false, "offset": OFFSET},
+{ "start": 196608, "length": 851968, "depth": 1, "present": false, "zero": true, "data": false, "compressed": false}]
+
+=== Running QEMU Live Commit Test ===
+
+{ 'execute': 'qmp_capabilities' }
+{"return": {}}
+{ 'execute': 'block-commit',
+                                 'arguments': { 'device': 'test',
+                                 'top': 'TEST_DIR/t.IMGFMT.mid',
+                                 'base': 'TEST_DIR/t.IMGFMT.base'} }
+{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "test"}}
+{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "test"}}
+{"return": {}}
+{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "waiting", "id": "test"}}
+{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "pending", "id": "test"}}
+{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "test", "len": 1048576, "offset": 1048576, "speed": 0, "type": "commit"}}
+{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "test"}}
+{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "test"}}
+
+=== Base image info after commit ===
+image: TEST_DIR/t.IMGFMT.base
+file format: IMGFMT
+virtual size: 1 MiB (1048576 bytes)
+[{ "start": 0, "length": 65536, "depth": 0, "present": false, "zero": true, "data": false, "compressed": false},
+{ "start": 65536, "length": 65536, "depth": 0, "present": true, "zero": true, "data": false, "compressed": false},
+{ "start": 131072, "length": 65536, "depth": 0, "present": true, "zero": false, "data": true, "compressed": false, "offset": OFFSET},
+{ "start": 196608, "length": 851968, "depth": 0, "present": false, "zero": true, "data": false, "compressed": false}]
+*** done
diff --git a/tests/qemu-iotests/tests/copy-before-write b/tests/qemu-iotests/tests/copy-before-write
index 498c558..236cb8a 100755
--- a/tests/qemu-iotests/tests/copy-before-write
+++ b/tests/qemu-iotests/tests/copy-before-write
@@ -99,6 +99,68 @@
         log = iotests.filter_qemu_io(log)
         return log
 
+    def do_cbw_error_via_blockdev_backup(self, on_cbw_error=None):
+        self.vm.cmd('blockdev-add', {
+            'node-name': 'source',
+            'driver': iotests.imgfmt,
+            'file': {
+                'driver': 'file',
+                'filename': source_img
+            }
+        })
+
+        self.vm.cmd('blockdev-add', {
+            'node-name': 'target',
+            'driver': iotests.imgfmt,
+            'file': {
+                'driver': 'blkdebug',
+                'image': {
+                    'driver': 'file',
+                    'filename': temp_img
+                },
+                'inject-error': [
+                    {
+                        'event': 'write_aio',
+                        'errno': 5,
+                        'immediately': False,
+                        'once': True
+                    }
+                ]
+            }
+        })
+
+        blockdev_backup_options = {
+            'device': 'source',
+            'target': 'target',
+            'sync': 'none',
+            'job-id': 'job-id',
+            'filter-node-name': 'cbw'
+        }
+
+        if on_cbw_error:
+            blockdev_backup_options['on-cbw-error'] = on_cbw_error
+
+        self.vm.cmd('blockdev-backup', blockdev_backup_options)
+
+        self.vm.cmd('blockdev-add', {
+            'node-name': 'access',
+            'driver': 'snapshot-access',
+            'file': 'cbw'
+        })
+
+        result = self.vm.qmp('human-monitor-command',
+                             command_line='qemu-io cbw "write 0 1M"')
+        self.assert_qmp(result, 'return', '')
+
+        result = self.vm.qmp('human-monitor-command',
+                             command_line='qemu-io access "read 0 1M"')
+        self.assert_qmp(result, 'return', '')
+
+        self.vm.shutdown()
+        log = self.vm.get_log()
+        log = iotests.filter_qemu_io(log)
+        return log
+
     def test_break_snapshot_on_cbw_error(self):
         """break-snapshot behavior:
         Guest write succeed, but further snapshot-read fails, as snapshot is
@@ -125,6 +187,39 @@
 1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 """)
 
+    def test_break_snapshot_policy_forwarding(self):
+        """Ensure CBW filter accepts break-snapshot policy
+        specified in blockdev-backup QMP command.
+        """
+        log = self.do_cbw_error_via_blockdev_backup('break-snapshot')
+        self.assertEqual(log, """\
+wrote 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read failed: Permission denied
+""")
+
+    def test_break_guest_write_policy_forwarding(self):
+        """Ensure CBW filter accepts break-guest-write policy
+        specified in blockdev-backup QMP command.
+        """
+        log = self.do_cbw_error_via_blockdev_backup('break-guest-write')
+        self.assertEqual(log, """\
+write failed: Input/output error
+read 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+""")
+
+    def test_default_on_cbw_error_policy_forwarding(self):
+        """Ensure break-guest-write policy is used by default when
+        on-cbw-error is not explicitly specified.
+        """
+        log = self.do_cbw_error_via_blockdev_backup()
+        self.assertEqual(log, """\
+write failed: Input/output error
+read 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+""")
+
     def do_cbw_timeout(self, on_cbw_error):
         self.vm.cmd('object-add', {
             'qom-type': 'throttle-group',
diff --git a/tests/qemu-iotests/tests/copy-before-write.out b/tests/qemu-iotests/tests/copy-before-write.out
index 89968f3..2f7d390 100644
--- a/tests/qemu-iotests/tests/copy-before-write.out
+++ b/tests/qemu-iotests/tests/copy-before-write.out
@@ -1,5 +1,5 @@
-....
+.......
 ----------------------------------------------------------------------
-Ran 4 tests
+Ran 7 tests
 
 OK
diff --git a/ui/meson.build b/ui/meson.build
index 35fb04c..6371422 100644
--- a/ui/meson.build
+++ b/ui/meson.build
@@ -1,7 +1,4 @@
 system_ss.add(pixman)
-specific_ss.add(when: ['CONFIG_SYSTEM_ONLY'], if_true: pixman)   # for the include path
-specific_ss.add(when: ['CONFIG_SYSTEM_ONLY'], if_true: opengl)   # for the include path
-
 system_ss.add(png)
 system_ss.add(files(
   'clipboard.c',