Merge tag 'pull-qapi-2024-03-04' of https://repo.or.cz/qemu/armbru into staging
QAPI patches for 2024-03-04
# -----BEGIN PGP SIGNATURE-----
#
# iQJGBAABCAAwFiEENUvIs9frKmtoZ05fOHC0AOuRhlMFAmXlaSISHGFybWJydUBy
# ZWRoYXQuY29tAAoJEDhwtADrkYZTdZ8P/iMgqLoAFkCCjwfkUc/rqZUezK52Ynr7
# LYwOPI/xcYD7EnVogdRgFgjWFNoivQLP5yKsU/eRTk29pwdDzTscFm/0ztTQX/Gb
# ypWV+GBcu5J8mKbp1KF5w68aDD8Bat4WRfEgDQ1DV7v6CoMiUzTiF3CGXkYzqK5Y
# kYNq97vdEkBFvFdOl/7scs/XXN2jG27egDhMp68RTxnPHlXZiAO9/2Bul3uVe3x0
# fzQ2ViYv0qLnjE/PwENDqqE3Thv3Sxp5iEeQQ6GWi07EVh07UtHpOM3RYyrTU0Sb
# VrTApSrg0oxlkOuR0CBd9Fi+timtbokBL0DWyUpXNTfIEZfLtA9H+8riUg3EOcDp
# r7a4SI/27VdPxX6Kc6zA3bi+/j1o7CLTW2LGEwuZs52nmixoo1HTWPIFdyh13g/V
# QjNbun0fViHb0FVLiyDlXF/7Y+EWUWIyqwwGqbvve1DyUHQmo3CUQAKGOpkeKSBe
# 4eGciVDgpBoKhtw9Kv6LCDj2cwZKC8DxBMibf7GHkOnAsX2mnyuHcey7HvYNCoF+
# yYz7oIEXdlL2eWqg7CfBZK7lniCDln50RI4Ll1v+J4r1v1kRZGMLesTYXCdNc4ku
# yb4kpU4t22/RODffLE7K+fc3Onwze3fcfxlZMN66F+wFtk4KdPR2aQBE66bB8J99
# vuSKlTbT4cGL
# =s9AR
# -----END PGP SIGNATURE-----
# gpg: Signature made Mon 04 Mar 2024 06:24:34 GMT
# gpg: using RSA key 354BC8B3D7EB2A6B68674E5F3870B400EB918653
# gpg: issuer "armbru@redhat.com"
# gpg: Good signature from "Markus Armbruster <armbru@redhat.com>" [full]
# gpg: aka "Markus Armbruster <armbru@pond.sub.org>" [full]
# Primary key fingerprint: 354B C8B3 D7EB 2A6B 6867 4E5F 3870 B400 EB91 8653
* tag 'pull-qapi-2024-03-04' of https://repo.or.cz/qemu/armbru:
migration: simplify exec migration functions
qapi: New strv_from_str_list()
qapi: New QAPI_LIST_LENGTH()
docs/devel/writing-monitor-commands: Minor improvements
docs/devel/writing-monitor-commands: Repair a decade of rot
qapi: Reject "Returns" section when command doesn't return anything
qga/qapi-schema: Fix guest-set-memory-blocks documentation
qga/qapi-schema: Tweak documentation of fsfreeze commands
qga/qapi-schema: Clean up "Returns" sections
qga/qapi-schema: Delete useless "Returns" sections
qga/qapi-schema: Move error documentation to new "Errors" sections
qapi/yank: Tweak @yank's error description for consistency
qapi: Clean up "Returns" sections
qapi: Delete useless "Returns" sections
qapi: Move error documentation to new "Errors" sections
qapi: New documentation section tag "Errors"
qapi: Slightly clearer error message for invalid "Returns" section
qapi: Memorize since & returns sections
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
diff --git a/.gitlab-ci.d/cirrus.yml b/.gitlab-ci.d/cirrus.yml
index 64f2e25..b45f9de 100644
--- a/.gitlab-ci.d/cirrus.yml
+++ b/.gitlab-ci.d/cirrus.yml
@@ -52,7 +52,7 @@
NAME: freebsd-13
CIRRUS_VM_INSTANCE_TYPE: freebsd_instance
CIRRUS_VM_IMAGE_SELECTOR: image_family
- CIRRUS_VM_IMAGE_NAME: freebsd-13-2
+ CIRRUS_VM_IMAGE_NAME: freebsd-13-3
CIRRUS_VM_CPUS: 8
CIRRUS_VM_RAM: 8G
UPDATE_COMMAND: pkg update; pkg upgrade -y
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index 977576c..52239a4 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -396,6 +396,14 @@
uint64_t cs_base;
uint32_t flags, cflags;
+ /*
+ * By definition we've just finished a TB, so I/O is OK.
+ * Avoid the possibility of calling cpu_io_recompile() if
+ * a page table walk triggered by tb_lookup() calling
+ * probe_access_internal() happens to touch an MMIO device.
+ * The next TB, if we chain to it, will clear the flag again.
+ */
+ cpu->neg.can_do_io = true;
cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
cflags = curr_cflags(cpu);
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 047cd2c..6243bcb 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -2022,7 +2022,6 @@
MemoryRegion *mr;
hwaddr mr_offset;
MemTxAttrs attrs;
- uint64_t ret;
tcg_debug_assert(size > 0 && size <= 8);
@@ -2030,12 +2029,9 @@
section = io_prepare(&mr_offset, cpu, full->xlat_section, attrs, addr, ra);
mr = section->mr;
- bql_lock();
- ret = int_ld_mmio_beN(cpu, full, ret_be, addr, size, mmu_idx,
- type, ra, mr, mr_offset);
- bql_unlock();
-
- return ret;
+ BQL_LOCK_GUARD();
+ return int_ld_mmio_beN(cpu, full, ret_be, addr, size, mmu_idx,
+ type, ra, mr, mr_offset);
}
static Int128 do_ld16_mmio_beN(CPUState *cpu, CPUTLBEntryFull *full,
@@ -2054,13 +2050,11 @@
section = io_prepare(&mr_offset, cpu, full->xlat_section, attrs, addr, ra);
mr = section->mr;
- bql_lock();
+ BQL_LOCK_GUARD();
a = int_ld_mmio_beN(cpu, full, ret_be, addr, size - 8, mmu_idx,
MMU_DATA_LOAD, ra, mr, mr_offset);
b = int_ld_mmio_beN(cpu, full, ret_be, addr + size - 8, 8, mmu_idx,
MMU_DATA_LOAD, ra, mr, mr_offset + size - 8);
- bql_unlock();
-
return int128_make128(b, a);
}
@@ -2569,7 +2563,6 @@
hwaddr mr_offset;
MemoryRegion *mr;
MemTxAttrs attrs;
- uint64_t ret;
tcg_debug_assert(size > 0 && size <= 8);
@@ -2577,12 +2570,9 @@
section = io_prepare(&mr_offset, cpu, full->xlat_section, attrs, addr, ra);
mr = section->mr;
- bql_lock();
- ret = int_st_mmio_leN(cpu, full, val_le, addr, size, mmu_idx,
- ra, mr, mr_offset);
- bql_unlock();
-
- return ret;
+ BQL_LOCK_GUARD();
+ return int_st_mmio_leN(cpu, full, val_le, addr, size, mmu_idx,
+ ra, mr, mr_offset);
}
static uint64_t do_st16_mmio_leN(CPUState *cpu, CPUTLBEntryFull *full,
@@ -2593,7 +2583,6 @@
MemoryRegion *mr;
hwaddr mr_offset;
MemTxAttrs attrs;
- uint64_t ret;
tcg_debug_assert(size > 8 && size <= 16);
@@ -2601,14 +2590,11 @@
section = io_prepare(&mr_offset, cpu, full->xlat_section, attrs, addr, ra);
mr = section->mr;
- bql_lock();
+ BQL_LOCK_GUARD();
int_st_mmio_leN(cpu, full, int128_getlo(val_le), addr, 8,
mmu_idx, ra, mr, mr_offset);
- ret = int_st_mmio_leN(cpu, full, int128_gethi(val_le), addr + 8,
- size - 8, mmu_idx, ra, mr, mr_offset + 8);
- bql_unlock();
-
- return ret;
+ return int_st_mmio_leN(cpu, full, int128_gethi(val_le), addr + 8,
+ size - 8, mmu_idx, ra, mr, mr_offset + 8);
}
/*
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index 1c695ef..c1f57e8 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -256,7 +256,6 @@
void page_init(void)
{
- page_size_init();
page_table_config_init();
}
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
index 68b252c..3cac3a7 100644
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -651,16 +651,17 @@
{
PageFlagsNode *p;
target_ulong start, last;
+ int host_page_size = qemu_real_host_page_size();
int prot;
assert_memory_lock();
- if (qemu_host_page_size <= TARGET_PAGE_SIZE) {
+ if (host_page_size <= TARGET_PAGE_SIZE) {
start = address & TARGET_PAGE_MASK;
last = start + TARGET_PAGE_SIZE - 1;
} else {
- start = address & qemu_host_page_mask;
- last = start + qemu_host_page_size - 1;
+ start = address & -host_page_size;
+ last = start + host_page_size - 1;
}
p = pageflags_find(start, last);
@@ -671,7 +672,7 @@
if (unlikely(p->itree.last < last)) {
/* More than one protection region covers the one host page. */
- assert(TARGET_PAGE_SIZE < qemu_host_page_size);
+ assert(TARGET_PAGE_SIZE < host_page_size);
while ((p = pageflags_next(p, start, last)) != NULL) {
prot |= p->flags;
}
@@ -679,7 +680,7 @@
if (prot & PAGE_WRITE) {
pageflags_set_clear(start, last, 0, PAGE_WRITE);
- mprotect(g2h_untagged(start), qemu_host_page_size,
+ mprotect(g2h_untagged(start), last - start + 1,
prot & (PAGE_READ | PAGE_EXEC) ? PROT_READ : PROT_NONE);
}
}
@@ -725,18 +726,19 @@
}
#endif
} else {
+ int host_page_size = qemu_real_host_page_size();
target_ulong start, len, i;
int prot;
- if (qemu_host_page_size <= TARGET_PAGE_SIZE) {
+ if (host_page_size <= TARGET_PAGE_SIZE) {
start = address & TARGET_PAGE_MASK;
len = TARGET_PAGE_SIZE;
prot = p->flags | PAGE_WRITE;
pageflags_set_clear(start, start + len - 1, PAGE_WRITE, 0);
current_tb_invalidated = tb_invalidate_phys_page_unwind(start, pc);
} else {
- start = address & qemu_host_page_mask;
- len = qemu_host_page_size;
+ start = address & -host_page_size;
+ len = host_page_size;
prot = 0;
for (i = 0; i < len; i += TARGET_PAGE_SIZE) {
@@ -862,7 +864,7 @@
typedef struct TargetPageDataNode {
struct rcu_head rcu;
IntervalTreeNode itree;
- char data[TPD_PAGES][TARGET_PAGE_DATA_SIZE] __attribute__((aligned));
+ char data[] __attribute__((aligned));
} TargetPageDataNode;
static IntervalTreeRoot targetdata_root;
@@ -900,7 +902,8 @@
n_last = MIN(last, n->last);
p_len = (n_last + 1 - n_start) >> TARGET_PAGE_BITS;
- memset(t->data[p_ofs], 0, p_len * TARGET_PAGE_DATA_SIZE);
+ memset(t->data + p_ofs * TARGET_PAGE_DATA_SIZE, 0,
+ p_len * TARGET_PAGE_DATA_SIZE);
}
}
@@ -908,7 +911,7 @@
{
IntervalTreeNode *n;
TargetPageDataNode *t;
- target_ulong page, region;
+ target_ulong page, region, p_ofs;
page = address & TARGET_PAGE_MASK;
region = address & TBD_MASK;
@@ -924,7 +927,8 @@
mmap_lock();
n = interval_tree_iter_first(&targetdata_root, page, page);
if (!n) {
- t = g_new0(TargetPageDataNode, 1);
+ t = g_malloc0(sizeof(TargetPageDataNode)
+ + TPD_PAGES * TARGET_PAGE_DATA_SIZE);
n = &t->itree;
n->start = region;
n->last = region | ~TBD_MASK;
@@ -934,7 +938,8 @@
}
t = container_of(n, TargetPageDataNode, itree);
- return t->data[(page - region) >> TARGET_PAGE_BITS];
+ p_ofs = (page - region) >> TARGET_PAGE_BITS;
+ return t->data + p_ofs * TARGET_PAGE_DATA_SIZE;
}
#else
void page_reset_target_data(target_ulong start, target_ulong last) { }
diff --git a/bsd-user/main.c b/bsd-user/main.c
index e5efb7b..512d4ab 100644
--- a/bsd-user/main.c
+++ b/bsd-user/main.c
@@ -49,6 +49,13 @@
#include "host-os.h"
#include "target_arch_cpu.h"
+
+/*
+ * TODO: Remove these and rely only on qemu_real_host_page_size().
+ */
+uintptr_t qemu_host_page_size;
+intptr_t qemu_host_page_mask;
+
static bool opt_one_insn_per_tb;
uintptr_t guest_base;
bool have_guest_base;
@@ -307,6 +314,9 @@
(void) envlist_setenv(envlist, *wrk);
}
+ qemu_host_page_size = getpagesize();
+ qemu_host_page_size = MAX(qemu_host_page_size, TARGET_PAGE_SIZE);
+
cpu_model = NULL;
qemu_add_opts(&qemu_trace_opts);
@@ -364,11 +374,12 @@
} else if (!strcmp(r, "L")) {
interp_prefix = argv[optind++];
} else if (!strcmp(r, "p")) {
- qemu_host_page_size = atoi(argv[optind++]);
- if (qemu_host_page_size == 0 ||
- (qemu_host_page_size & (qemu_host_page_size - 1)) != 0) {
- fprintf(stderr, "page size must be a power of two\n");
- exit(1);
+ unsigned size, want = qemu_real_host_page_size();
+
+ r = argv[optind++];
+ if (qemu_strtoui(r, NULL, 10, &size) || size != want) {
+ warn_report("Deprecated page size option cannot "
+ "change host page size (%u)", want);
}
} else if (!strcmp(r, "g")) {
gdbstub = g_strdup(argv[optind++]);
@@ -403,6 +414,8 @@
}
}
+ qemu_host_page_mask = -qemu_host_page_size;
+
/* init debug */
{
int mask = 0;
diff --git a/bsd-user/qemu.h b/bsd-user/qemu.h
index dc842ff..c05c512 100644
--- a/bsd-user/qemu.h
+++ b/bsd-user/qemu.h
@@ -40,6 +40,13 @@
#include "qemu-os.h"
/*
+ * TODO: Remove these and rely only on qemu_real_host_page_size().
+ */
+extern uintptr_t qemu_host_page_size;
+extern intptr_t qemu_host_page_mask;
+#define HOST_PAGE_ALIGN(addr) ROUND_UP((addr), qemu_host_page_size)
+
+/*
* This struct is used to hold certain information about the image. Basically,
* it replicates in user space what would be certain task_struct fields in the
* kernel
diff --git a/cpu-target.c b/cpu-target.c
index 86444cc..4c0621b 100644
--- a/cpu-target.c
+++ b/cpu-target.c
@@ -45,9 +45,6 @@
#include "trace/trace-root.h"
#include "qemu/accel.h"
-uintptr_t qemu_host_page_size;
-intptr_t qemu_host_page_mask;
-
#ifndef CONFIG_USER_ONLY
static int cpu_common_post_load(void *opaque, int version_id)
{
@@ -474,16 +471,3 @@
{
return TARGET_NAME;
}
-
-void page_size_init(void)
-{
- /* NOTE: we can always suppose that qemu_host_page_size >=
- TARGET_PAGE_SIZE */
- if (qemu_host_page_size == 0) {
- qemu_host_page_size = qemu_real_host_page_size();
- }
- if (qemu_host_page_size < TARGET_PAGE_SIZE) {
- qemu_host_page_size = TARGET_PAGE_SIZE;
- }
- qemu_host_page_mask = -(intptr_t)qemu_host_page_size;
-}
diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst
index 36bd3e1..8565644 100644
--- a/docs/about/deprecated.rst
+++ b/docs/about/deprecated.rst
@@ -63,6 +63,16 @@
However, short-form booleans are deprecated and full explicit ``arg_name=on``
form is preferred.
+User-mode emulator command line arguments
+-----------------------------------------
+
+``-p`` (since 9.0)
+''''''''''''''''''
+
+The ``-p`` option pretends to control the host page size. However,
+it is not possible to change the host page size, and using the
+option only causes failures.
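+
+For example (illustrative; the exact message and page size depend on
+the host)::
+
+  $ qemu-x86_64 -p 8192 /bin/true
+  qemu-x86_64: warning: Deprecated page size option cannot change host page size (4096)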
+
QEMU Machine Protocol (QMP) commands
------------------------------------
diff --git a/docs/conf.py b/docs/conf.py
index e84a95e..1b2afa2 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -29,7 +29,6 @@
import os
import sys
import sphinx
-from distutils.version import LooseVersion
from sphinx.errors import ConfigError
# The per-manual conf.py will set qemu_docdir for a single-manual build;
@@ -165,11 +164,10 @@
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
-if LooseVersion(sphinx_rtd_theme.__version__) >= LooseVersion("0.4.3"):
- html_theme_options = {
- "style_nav_header_background": "#802400",
- "navigation_with_keys": True,
- }
+html_theme_options = {
+ "style_nav_header_background": "#802400",
+ "navigation_with_keys": True,
+}
html_logo = os.path.join(qemu_docdir, "../ui/icons/qemu_128x128.png")
diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst
index a9acaf6..9d1abd2 100644
--- a/docs/devel/migration/features.rst
+++ b/docs/devel/migration/features.rst
@@ -10,3 +10,4 @@
dirty-limit
vfio
virtio
+ mapped-ram
diff --git a/docs/devel/migration/mapped-ram.rst b/docs/devel/migration/mapped-ram.rst
new file mode 100644
index 0000000..fa4cefd
--- /dev/null
+++ b/docs/devel/migration/mapped-ram.rst
@@ -0,0 +1,138 @@
+Mapped-ram
+==========
+
+Mapped-ram is a new stream format for the RAM section designed to
+supplement the existing ``file:`` migration and make it compatible
+with ``multifd``. This enables parallel migration of a guest's RAM to
+a file.
+
+The core of the feature is to ensure that RAM pages are mapped
+directly to offsets in the resulting migration file. This enables the
+``multifd`` threads to write exclusively to those offsets even if the
+guest is constantly dirtying pages (i.e. live migration). Another
+benefit is that the resulting file will have a bounded size, since
+pages which are dirtied multiple times will always go to a fixed
+location in the file, rather than constantly being added to a
+sequential stream. Having the pages at fixed offsets also allows the
+use of O_DIRECT for save/restore of the migration stream, as the
+pages are guaranteed to be written respecting O_DIRECT alignment
+restrictions (direct-io support not yet implemented).
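+
+Conceptually, the file offset of a given page within a ramblock is
+fixed (a sketch of the idea, not the exact implementation)::
+
+  file_offset(page) = ramblock.pages_offset + page_index * page_size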
+
+Usage
+-----
+
+On both source and destination, enable the ``multifd`` and
+``mapped-ram`` capabilities:
+
+ ``migrate_set_capability multifd on``
+
+ ``migrate_set_capability mapped-ram on``
+
+Use a ``file:`` URL for migration:
+
+ ``migrate file:/path/to/migration/file``
+
+Mapped-ram migration is best done non-live, i.e. by stopping the VM on
+the source side before migrating.
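+
+The same can be done via QMP; an illustrative sketch (the capability
+names match the HMP commands above)::
+
+  {"execute": "migrate-set-capabilities",
+   "arguments": {"capabilities": [
+     {"capability": "multifd", "state": true},
+     {"capability": "mapped-ram", "state": true}]}}
+
+  {"execute": "migrate",
+   "arguments": {"uri": "file:/path/to/migration/file"}}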
+
+Use-cases
+---------
+
+The mapped-ram feature was designed for use cases where the migration
+stream will be directed to a file in the filesystem and not
+immediately restored on the destination VM [#]_. These could be
+thought of as snapshots. We can further categorize them into live and
+non-live.
+
+- Non-live snapshot
+
+If the use case requires a VM to be stopped before taking a snapshot,
+that's the ideal scenario for mapped-ram migration. Since it does not
+have to track dirty pages, the migration will write the RAM pages to
+disk as fast as it can.
+
+Note: if a snapshot is taken of a running VM, but the VM will be
+stopped after the snapshot by the admin, then consider stopping it
+right before the snapshot to benefit from the performance gains
+mentioned above.
+
+- Live snapshot
+
+If the use case requires that the VM keeps running during and after
+the snapshot operation, then mapped-ram migration can still be used,
+but will be less performant. Other strategies such as
+background-snapshot should be evaluated as well. One benefit of
+mapped-ram in this scenario is portability, since background-snapshot
+depends on async dirty tracking (KVM_GET_DIRTY_LOG), which is not
+supported outside of Linux.
+
+.. [#] While this same effect could be obtained with the usage of
+ snapshots or the ``file:`` migration alone, mapped-ram provides
+ a performance increase for VMs with larger RAM sizes (10s to
+       100s of GiBs), especially if the VM has been stopped beforehand.
+
+RAM section format
+------------------
+
+Instead of having a sequential stream of pages that follow the
+RAMBlock headers, the dirty pages for a RAMBlock follow its
+header. This ensures that each RAM page has a fixed offset in the
+resulting migration file.
+
+A bitmap is introduced to track which pages have been written to the
+migration file. Pages are written at a fixed location for every
+ramblock. Zero pages are ignored as they'd be zero on the destination
+as well.
+
+::
+
+ Without mapped-ram: With mapped-ram:
+
+ --------------------- --------------------------------
+ | ramblock 1 header | | ramblock 1 header |
+ --------------------- --------------------------------
+ | ramblock 2 header | | ramblock 1 mapped-ram header |
+ --------------------- --------------------------------
+ | ... | | padding to next 1MB boundary |
+ --------------------- | ... |
+ | ramblock n header | --------------------------------
+ --------------------- | ramblock 1 pages |
+ | RAM_SAVE_FLAG_EOS | | ... |
+ --------------------- --------------------------------
+ | stream of pages | | ramblock 2 header |
+ | (iter 1) | --------------------------------
+ | ... | | ramblock 2 mapped-ram header |
+ --------------------- --------------------------------
+ | RAM_SAVE_FLAG_EOS | | padding to next 1MB boundary |
+ --------------------- | ... |
+ | stream of pages | --------------------------------
+ | (iter 2) | | ramblock 2 pages |
+ | ... | | ... |
+ --------------------- --------------------------------
+ | ... | | ... |
+ --------------------- --------------------------------
+ | RAM_SAVE_FLAG_EOS |
+ --------------------------------
+ | ... |
+ --------------------------------
+
+where:
+ - ramblock header: the generic information for a ramblock, such as
+ idstr, used_len, etc.
+
+ - ramblock mapped-ram header: the information added by this feature:
+ bitmap of pages written, bitmap size and offset of pages in the
+ migration file.
+
+Restrictions
+------------
+
+Since pages are written to their relative offsets and out of order
+(due to the memory dirtying patterns), streaming channels such as
+sockets are not supported. A seekable channel such as a file is
+required. This can be verified in the QIOChannel by the presence of
+the QIO_CHANNEL_FEATURE_SEEKABLE feature.
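+
+A minimal sketch of such a check, using helpers that exist in the
+QIOChannel API::
+
+  if (!qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_SEEKABLE)) {
+      error_setg(errp, "mapped-ram migration requires a seekable channel");
+      return -1;
+  }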
+
+The improvements brought by this feature apply only to guest physical
+RAM. Other types of memory such as VRAM are migrated as part of device
+states.
diff --git a/docs/user/main.rst b/docs/user/main.rst
index 7e7ad07..d5fbb78 100644
--- a/docs/user/main.rst
+++ b/docs/user/main.rst
@@ -87,9 +87,6 @@
Activate logging of the specified items (use '-d help' for a list of
log items)
-``-p pagesize``
- Act as if the host page size was 'pagesize' bytes
-
``-g port``
Wait gdb connection to port
diff --git a/hw/tpm/tpm_ppi.c b/hw/tpm/tpm_ppi.c
index 7f74e26..f27ed6c 100644
--- a/hw/tpm/tpm_ppi.c
+++ b/hw/tpm/tpm_ppi.c
@@ -47,8 +47,10 @@
void tpm_ppi_init(TPMPPI *tpmppi, MemoryRegion *m,
hwaddr addr, Object *obj)
{
- tpmppi->buf = qemu_memalign(qemu_real_host_page_size(),
- HOST_PAGE_ALIGN(TPM_PPI_ADDR_SIZE));
+ size_t host_page_size = qemu_real_host_page_size();
+
+ tpmppi->buf = qemu_memalign(host_page_size,
+ ROUND_UP(TPM_PPI_ADDR_SIZE, host_page_size));
memory_region_init_ram_device_ptr(&tpmppi->ram, obj, "tpm-ppi",
TPM_PPI_ADDR_SIZE, tpmppi->buf);
vmstate_register_ram(&tpmppi->ram, DEVICE(obj));
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 9ead1be..6346df1 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -20,13 +20,6 @@
void cpu_exec_init_all(void);
void cpu_exec_step_atomic(CPUState *cpu);
-/* Using intptr_t ensures that qemu_*_page_mask is sign-extended even
- * when intptr_t is 32-bit and we are aligning a long long.
- */
-extern uintptr_t qemu_host_page_size;
-extern intptr_t qemu_host_page_mask;
-
-#define HOST_PAGE_ALIGN(addr) ROUND_UP((addr), qemu_host_page_size)
#define REAL_HOST_PAGE_ALIGN(addr) ROUND_UP((addr), qemu_real_host_page_size())
/* The CPU list lock nests outside page_(un)lock or mmap_(un)lock */
diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h
index 3eb7972..848915e 100644
--- a/include/exec/ramblock.h
+++ b/include/exec/ramblock.h
@@ -44,6 +44,19 @@
size_t page_size;
/* dirty bitmap used during migration */
unsigned long *bmap;
+
+ /*
+ * Below fields are only used by mapped-ram migration
+ */
+ /* bitmap of pages present in the migration file */
+ unsigned long *file_bmap;
+ /*
+     * offset in the file where the pages belonging to this ramblock
+     * are saved, used only during migration to a file.
+ */
+ off_t bitmap_offset;
+ uint64_t pages_offset;
+
/* bitmap of already received pages in postcopy */
unsigned long *receivedmap;
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index af1a295..d0e3454 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -1179,8 +1179,6 @@
const char *target_name(void);
-void page_size_init(void);
-
#ifdef NEED_CPU_H
#ifndef CONFIG_USER_ONLY
diff --git a/include/io/channel.h b/include/io/channel.h
index 5f9dbaa..7986c49 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -44,6 +44,7 @@
QIO_CHANNEL_FEATURE_LISTEN,
QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY,
QIO_CHANNEL_FEATURE_READ_MSG_PEEK,
+ QIO_CHANNEL_FEATURE_SEEKABLE,
};
@@ -130,6 +131,16 @@
Error **errp);
/* Optional callbacks */
+ ssize_t (*io_pwritev)(QIOChannel *ioc,
+ const struct iovec *iov,
+ size_t niov,
+ off_t offset,
+ Error **errp);
+ ssize_t (*io_preadv)(QIOChannel *ioc,
+ const struct iovec *iov,
+ size_t niov,
+ off_t offset,
+ Error **errp);
int (*io_shutdown)(QIOChannel *ioc,
QIOChannelShutdown how,
Error **errp);
@@ -529,6 +540,78 @@
Error **errp);
/**
+ * qio_channel_pwritev
+ * @ioc: the channel object
+ * @iov: the array of memory regions to write data from
+ * @niov: the length of the @iov array
+ * @offset: offset in the channel where writes should begin
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Not all implementations will support this facility, so may report
+ * an error. To avoid errors, the caller may check for the feature
+ * flag QIO_CHANNEL_FEATURE_SEEKABLE prior to calling this method.
+ *
+ * Behaves as qio_channel_writev_full, except that it does not
+ * support sending of file handles, and the write begins at the
+ * passed @offset.
+ *
+ */
+ssize_t qio_channel_pwritev(QIOChannel *ioc, const struct iovec *iov,
+ size_t niov, off_t offset, Error **errp);
+
+/**
+ * qio_channel_pwrite
+ * @ioc: the channel object
+ * @buf: the memory region to write data into
+ * @buflen: the number of bytes to @buf
+ * @offset: offset in the channel where writes should begin
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Not all implementations will support this facility, so may report
+ * an error. To avoid errors, the caller may check for the feature
+ * flag QIO_CHANNEL_FEATURE_SEEKABLE prior to calling this method.
+ *
+ */
+ssize_t qio_channel_pwrite(QIOChannel *ioc, char *buf, size_t buflen,
+ off_t offset, Error **errp);
+
+/**
+ * qio_channel_preadv
+ * @ioc: the channel object
+ * @iov: the array of memory regions to read data into
+ * @niov: the length of the @iov array
+ * @offset: offset in the channel where reads should begin
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Not all implementations will support this facility, so may report
+ * an error. To avoid errors, the caller may check for the feature
+ * flag QIO_CHANNEL_FEATURE_SEEKABLE prior to calling this method.
+ *
+ * Behaves as qio_channel_readv_full, except that it does not
+ * support receiving of file handles, and the read begins at the
+ * passed @offset.
+ *
+ */
+ssize_t qio_channel_preadv(QIOChannel *ioc, const struct iovec *iov,
+ size_t niov, off_t offset, Error **errp);
+
+/**
+ * qio_channel_pread
+ * @ioc: the channel object
+ * @buf: the memory region to read data into
+ * @buflen: the number of bytes to read into @buf
+ * @offset: offset in the channel where reads should begin
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Not all implementations will support this facility, so may report
+ * an error. To avoid errors, the caller may check for the feature
+ * flag QIO_CHANNEL_FEATURE_SEEKABLE prior to calling this method.
+ *
+ */
+ssize_t qio_channel_pread(QIOChannel *ioc, char *buf, size_t buflen,
+ off_t offset, Error **errp);
+
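+/*
+ * Illustrative use of the positioned I/O helpers (a sketch, not part
+ * of the API contract): read 512 bytes from offset 4096 of a channel
+ * that advertises QIO_CHANNEL_FEATURE_SEEKABLE.
+ *
+ *   char buf[512];
+ *   ssize_t n;
+ *
+ *   if (qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_SEEKABLE)) {
+ *       n = qio_channel_pread(ioc, buf, sizeof(buf), 4096, errp);
+ *       if (n < 0) {
+ *           return -1;
+ *       }
+ *   }
+ */
+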
+/**
* qio_channel_shutdown:
* @ioc: the channel object
* @how: the direction to shutdown
diff --git a/include/migration/qemu-file-types.h b/include/migration/qemu-file-types.h
index 9ba163f..adec5ab 100644
--- a/include/migration/qemu-file-types.h
+++ b/include/migration/qemu-file-types.h
@@ -50,6 +50,8 @@
unsigned int qemu_get_be32(QEMUFile *f);
uint64_t qemu_get_be64(QEMUFile *f);
+bool qemu_file_is_seekable(QEMUFile *f);
+
static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv)
{
qemu_put_be64(f, *pv);
diff --git a/include/qemu/bitops.h b/include/qemu/bitops.h
index cb3526d..2c0a2fe 100644
--- a/include/qemu/bitops.h
+++ b/include/qemu/bitops.h
@@ -68,6 +68,19 @@
}
/**
+ * clear_bit_atomic - Clears a bit in memory atomically
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ */
+static inline void clear_bit_atomic(long nr, unsigned long *addr)
+{
+ unsigned long mask = BIT_MASK(nr);
+ unsigned long *p = addr + BIT_WORD(nr);
+
+    qatomic_and(p, ~mask);
+}
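+
+/*
+ * Unlike clear_bit(), this is safe when other threads may concurrently
+ * set or clear bits in the same word, e.g. (illustrative):
+ *
+ *   clear_bit_atomic(nr, bitmap);
+ */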
+
+/**
* change_bit - Toggle a bit in memory
* @nr: Bit to change
* @addr: Address to start counting from
diff --git a/io/channel-file.c b/io/channel-file.c
index 4a12c61..d4706fa 100644
--- a/io/channel-file.c
+++ b/io/channel-file.c
@@ -36,6 +36,10 @@
ioc->fd = fd;
+ if (lseek(fd, 0, SEEK_CUR) != (off_t)-1) {
+ qio_channel_set_feature(QIO_CHANNEL(ioc), QIO_CHANNEL_FEATURE_SEEKABLE);
+ }
+
trace_qio_channel_file_new_fd(ioc, fd);
return ioc;
@@ -60,6 +64,10 @@
return NULL;
}
+ if (lseek(ioc->fd, 0, SEEK_CUR) != (off_t)-1) {
+ qio_channel_set_feature(QIO_CHANNEL(ioc), QIO_CHANNEL_FEATURE_SEEKABLE);
+ }
+
trace_qio_channel_file_new_path(ioc, path, flags, mode, ioc->fd);
return ioc;
@@ -138,6 +146,58 @@
return ret;
}
+#ifdef CONFIG_PREADV
+static ssize_t qio_channel_file_preadv(QIOChannel *ioc,
+ const struct iovec *iov,
+ size_t niov,
+ off_t offset,
+ Error **errp)
+{
+ QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc);
+ ssize_t ret;
+
+ retry:
+ ret = preadv(fioc->fd, iov, niov, offset);
+ if (ret < 0) {
+ if (errno == EAGAIN) {
+ return QIO_CHANNEL_ERR_BLOCK;
+ }
+ if (errno == EINTR) {
+ goto retry;
+ }
+
+ error_setg_errno(errp, errno, "Unable to read from file");
+ return -1;
+ }
+
+ return ret;
+}
+
+static ssize_t qio_channel_file_pwritev(QIOChannel *ioc,
+ const struct iovec *iov,
+ size_t niov,
+ off_t offset,
+ Error **errp)
+{
+ QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc);
+ ssize_t ret;
+
+ retry:
+ ret = pwritev(fioc->fd, iov, niov, offset);
+ if (ret <= 0) {
+ if (errno == EAGAIN) {
+ return QIO_CHANNEL_ERR_BLOCK;
+ }
+ if (errno == EINTR) {
+ goto retry;
+ }
+ error_setg_errno(errp, errno, "Unable to write to file");
+ return -1;
+ }
+ return ret;
+}
+#endif /* CONFIG_PREADV */
+
static int qio_channel_file_set_blocking(QIOChannel *ioc,
bool enabled,
Error **errp)
@@ -182,6 +242,11 @@
{
QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc);
+ if (qemu_fdatasync(fioc->fd) < 0) {
+ error_setg_errno(errp, errno,
+ "Unable to synchronize file data with storage device");
+ return -1;
+ }
if (qemu_close(fioc->fd) < 0) {
error_setg_errno(errp, errno,
"Unable to close file");
@@ -223,6 +288,10 @@
ioc_klass->io_writev = qio_channel_file_writev;
ioc_klass->io_readv = qio_channel_file_readv;
ioc_klass->io_set_blocking = qio_channel_file_set_blocking;
+#ifdef CONFIG_PREADV
+ ioc_klass->io_pwritev = qio_channel_file_pwritev;
+ ioc_klass->io_preadv = qio_channel_file_preadv;
+#endif
ioc_klass->io_seek = qio_channel_file_seek;
ioc_klass->io_close = qio_channel_file_close;
ioc_klass->io_create_watch = qio_channel_file_create_watch;
diff --git a/io/channel.c b/io/channel.c
index 86c5834..a1f12f8 100644
--- a/io/channel.c
+++ b/io/channel.c
@@ -454,6 +454,64 @@
}
+ssize_t qio_channel_pwritev(QIOChannel *ioc, const struct iovec *iov,
+ size_t niov, off_t offset, Error **errp)
+{
+ QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc);
+
+ if (!klass->io_pwritev) {
+ error_setg(errp, "Channel does not support pwritev");
+ return -1;
+ }
+
+ if (!qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_SEEKABLE)) {
+ error_setg_errno(errp, EINVAL, "Requested channel is not seekable");
+ return -1;
+ }
+
+ return klass->io_pwritev(ioc, iov, niov, offset, errp);
+}
+
+ssize_t qio_channel_pwrite(QIOChannel *ioc, char *buf, size_t buflen,
+ off_t offset, Error **errp)
+{
+ struct iovec iov = {
+ .iov_base = buf,
+ .iov_len = buflen
+ };
+
+ return qio_channel_pwritev(ioc, &iov, 1, offset, errp);
+}
+
+ssize_t qio_channel_preadv(QIOChannel *ioc, const struct iovec *iov,
+ size_t niov, off_t offset, Error **errp)
+{
+ QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc);
+
+ if (!klass->io_preadv) {
+ error_setg(errp, "Channel does not support preadv");
+ return -1;
+ }
+
+ if (!qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_SEEKABLE)) {
+ error_setg_errno(errp, EINVAL, "Requested channel is not seekable");
+ return -1;
+ }
+
+ return klass->io_preadv(ioc, iov, niov, offset, errp);
+}
+
+ssize_t qio_channel_pread(QIOChannel *ioc, char *buf, size_t buflen,
+ off_t offset, Error **errp)
+{
+ struct iovec iov = {
+ .iov_base = buf,
+ .iov_len = buflen
+ };
+
+ return qio_channel_preadv(ioc, &iov, 1, offset, errp);
+}
+
int qio_channel_shutdown(QIOChannel *ioc,
QIOChannelShutdown how,
Error **errp)
diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index b8eef89..0c299a7 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -460,6 +460,7 @@
static bool init_guest_commpage(void)
{
ARMCPU *cpu = ARM_CPU(thread_cpu);
+ int host_page_size = qemu_real_host_page_size();
abi_ptr commpage;
void *want;
void *addr;
@@ -472,10 +473,12 @@
return true;
}
- commpage = HI_COMMPAGE & -qemu_host_page_size;
+ commpage = HI_COMMPAGE & -host_page_size;
want = g2h_untagged(commpage);
- addr = mmap(want, qemu_host_page_size, PROT_READ | PROT_WRITE,
- MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0);
+ addr = mmap(want, host_page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE |
+ (commpage < reserved_va ? MAP_FIXED : MAP_FIXED_NOREPLACE),
+ -1, 0);
if (addr == MAP_FAILED) {
perror("Allocating guest commpage");
@@ -488,12 +491,12 @@
/* Set kernel helper versions; rest of page is 0. */
__put_user(5, (uint32_t *)g2h_untagged(0xffff0ffcu));
- if (mprotect(addr, qemu_host_page_size, PROT_READ)) {
+ if (mprotect(addr, host_page_size, PROT_READ)) {
perror("Protecting guest commpage");
exit(EXIT_FAILURE);
}
- page_set_flags(commpage, commpage | ~qemu_host_page_mask,
+ page_set_flags(commpage, commpage | (host_page_size - 1),
PAGE_READ | PAGE_EXEC | PAGE_VALID);
return true;
}
@@ -1532,10 +1535,14 @@
0x3a, 0x68, 0x3b, 0x00, /* trap 0 */
};
- void *want = g2h_untagged(LO_COMMPAGE & -qemu_host_page_size);
- void *addr = mmap(want, qemu_host_page_size, PROT_READ | PROT_WRITE,
- MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0);
+ int host_page_size = qemu_real_host_page_size();
+ void *want, *addr;
+ want = g2h_untagged(LO_COMMPAGE & -host_page_size);
+ addr = mmap(want, host_page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE |
+ (reserved_va ? MAP_FIXED : MAP_FIXED_NOREPLACE),
+ -1, 0);
if (addr == MAP_FAILED) {
perror("Allocating guest commpage");
exit(EXIT_FAILURE);
@@ -1544,9 +1551,9 @@
return false;
}
- memcpy(addr, kuser_page, sizeof(kuser_page));
+ memcpy(g2h_untagged(LO_COMMPAGE), kuser_page, sizeof(kuser_page));
- if (mprotect(addr, qemu_host_page_size, PROT_READ)) {
+ if (mprotect(addr, host_page_size, PROT_READ)) {
perror("Protecting guest commpage");
exit(EXIT_FAILURE);
}
@@ -1970,16 +1977,20 @@
static bool init_guest_commpage(void)
{
- void *want = g2h_untagged(LO_COMMPAGE);
- void *addr = mmap(want, qemu_host_page_size, PROT_NONE,
- MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0);
+ /* If reserved_va, then we have already mapped 0 page on the host. */
+ if (!reserved_va) {
+ void *want, *addr;
- if (addr == MAP_FAILED) {
- perror("Allocating guest commpage");
- exit(EXIT_FAILURE);
- }
- if (addr != want) {
- return false;
+ want = g2h_untagged(LO_COMMPAGE);
+ addr = mmap(want, TARGET_PAGE_SIZE, PROT_NONE,
+ MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED_NOREPLACE, -1, 0);
+ if (addr == MAP_FAILED) {
+ perror("Allocating guest commpage");
+ exit(EXIT_FAILURE);
+ }
+ if (addr != want) {
+ return false;
+ }
}
/*
@@ -2679,13 +2690,7 @@
NEW_AUX_ENT(AT_PHDR, (abi_ulong)(info->load_addr + exec->e_phoff));
NEW_AUX_ENT(AT_PHENT, (abi_ulong)(sizeof (struct elf_phdr)));
NEW_AUX_ENT(AT_PHNUM, (abi_ulong)(exec->e_phnum));
- if ((info->alignment & ~qemu_host_page_mask) != 0) {
- /* Target doesn't support host page size alignment */
- NEW_AUX_ENT(AT_PAGESZ, (abi_ulong)(TARGET_PAGE_SIZE));
- } else {
- NEW_AUX_ENT(AT_PAGESZ, (abi_ulong)(MAX(TARGET_PAGE_SIZE,
- qemu_host_page_size)));
- }
+ NEW_AUX_ENT(AT_PAGESZ, (abi_ulong)(TARGET_PAGE_SIZE));
NEW_AUX_ENT(AT_BASE, (abi_ulong)(interp_info ? interp_info->load_addr : 0));
NEW_AUX_ENT(AT_FLAGS, (abi_ulong)0);
NEW_AUX_ENT(AT_ENTRY, info->entry);
@@ -2893,7 +2898,7 @@
/* Add any HI_COMMPAGE not covered by reserved_va. */
if (reserved_va < HI_COMMPAGE) {
- ga->bounds[n][0] = HI_COMMPAGE & qemu_host_page_mask;
+ ga->bounds[n][0] = HI_COMMPAGE & qemu_real_host_page_mask();
ga->bounds[n][1] = HI_COMMPAGE + TARGET_PAGE_SIZE - 1;
n++;
}
@@ -3017,8 +3022,6 @@
uintptr_t brk, ret;
PGBAddrs ga;
- assert(QEMU_IS_ALIGNED(guest_loaddr, align));
-
/* Try the identity map first. */
if (pgb_addr_set(&ga, guest_loaddr, guest_hiaddr, true)) {
brk = (uintptr_t)sbrk(0);
@@ -3075,7 +3078,7 @@
abi_ulong guest_hiaddr)
{
/* In order to use host shmat, we must be able to honor SHMLBA. */
- uintptr_t align = MAX(SHMLBA, qemu_host_page_size);
+ uintptr_t align = MAX(SHMLBA, TARGET_PAGE_SIZE);
/* Sanity check the guest binary. */
if (reserved_va) {
@@ -3912,8 +3915,9 @@
and some applications "depend" upon this behavior. Since
we do not have the power to recompile these, we emulate
the SVr4 behavior. Sigh. */
- target_mmap(0, qemu_host_page_size, PROT_READ | PROT_EXEC,
- MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ target_mmap(0, TARGET_PAGE_SIZE, PROT_READ | PROT_EXEC,
+ MAP_FIXED_NOREPLACE | MAP_PRIVATE | MAP_ANONYMOUS,
+ -1, 0);
}
#ifdef TARGET_MIPS
info->interp_fp_abi = interp_info.fp_abi;
@@ -3963,6 +3967,8 @@
}
#ifdef USE_ELF_CORE_DUMP
+#include "exec/translate-all.h"
+
/*
* Definitions to generate Intel SVR4-like core files.
* These mostly have the same names as the SVR4 types with "target_elf_"
@@ -4002,18 +4008,6 @@
* Example for ARM target is provided in this file.
*/
-/* An ELF note in memory */
-struct memelfnote {
- const char *name;
- size_t namesz;
- size_t namesz_rounded;
- int type;
- size_t datasz;
- size_t datasz_rounded;
- void *data;
- size_t notesz;
-};
-
struct target_elf_siginfo {
abi_int si_signo; /* signal number */
abi_int si_code; /* extra code */
@@ -4053,77 +4047,6 @@
char pr_psargs[ELF_PRARGSZ]; /* initial part of arg list */
};
-/* Here is the structure in which status of each thread is captured. */
-struct elf_thread_status {
- QTAILQ_ENTRY(elf_thread_status) ets_link;
- struct target_elf_prstatus prstatus; /* NT_PRSTATUS */
-#if 0
- elf_fpregset_t fpu; /* NT_PRFPREG */
- struct task_struct *thread;
- elf_fpxregset_t xfpu; /* ELF_CORE_XFPREG_TYPE */
-#endif
- struct memelfnote notes[1];
- int num_notes;
-};
-
-struct elf_note_info {
- struct memelfnote *notes;
- struct target_elf_prstatus *prstatus; /* NT_PRSTATUS */
- struct target_elf_prpsinfo *psinfo; /* NT_PRPSINFO */
-
- QTAILQ_HEAD(, elf_thread_status) thread_list;
-#if 0
- /*
- * Current version of ELF coredump doesn't support
- * dumping fp regs etc.
- */
- elf_fpregset_t *fpu;
- elf_fpxregset_t *xfpu;
- int thread_status_size;
-#endif
- int notes_size;
- int numnote;
-};
-
-struct vm_area_struct {
- target_ulong vma_start; /* start vaddr of memory region */
- target_ulong vma_end; /* end vaddr of memory region */
- abi_ulong vma_flags; /* protection etc. flags for the region */
- QTAILQ_ENTRY(vm_area_struct) vma_link;
-};
-
-struct mm_struct {
- QTAILQ_HEAD(, vm_area_struct) mm_mmap;
- int mm_count; /* number of mappings */
-};
-
-static struct mm_struct *vma_init(void);
-static void vma_delete(struct mm_struct *);
-static int vma_add_mapping(struct mm_struct *, target_ulong,
- target_ulong, abi_ulong);
-static int vma_get_mapping_count(const struct mm_struct *);
-static struct vm_area_struct *vma_first(const struct mm_struct *);
-static struct vm_area_struct *vma_next(struct vm_area_struct *);
-static abi_ulong vma_dump_size(const struct vm_area_struct *);
-static int vma_walker(void *priv, target_ulong start, target_ulong end,
- unsigned long flags);
-
-static void fill_elf_header(struct elfhdr *, int, uint16_t, uint32_t);
-static void fill_note(struct memelfnote *, const char *, int,
- unsigned int, void *);
-static void fill_prstatus(struct target_elf_prstatus *, const TaskState *, int);
-static int fill_psinfo(struct target_elf_prpsinfo *, const TaskState *);
-static void fill_auxv_note(struct memelfnote *, const TaskState *);
-static void fill_elf_note_phdr(struct elf_phdr *, int, off_t);
-static size_t note_size(const struct memelfnote *);
-static void free_note_info(struct elf_note_info *);
-static int fill_note_info(struct elf_note_info *, long, const CPUArchState *);
-static void fill_thread_info(struct elf_note_info *, const CPUArchState *);
-
-static int dump_write(int, const void *, size_t);
-static int write_note(struct memelfnote *, int);
-static int write_note_info(struct elf_note_info *, int);
-
#ifdef BSWAP_NEEDED
static void bswap_prstatus(struct target_elf_prstatus *prstatus)
{
@@ -4166,145 +4089,66 @@
#endif /* BSWAP_NEEDED */
/*
- * Minimal support for linux memory regions. These are needed
- * when we are finding out what memory exactly belongs to
- * emulated process. No locks needed here, as long as
- * thread that received the signal is stopped.
- */
-
-static struct mm_struct *vma_init(void)
-{
- struct mm_struct *mm;
-
- if ((mm = g_malloc(sizeof (*mm))) == NULL)
- return (NULL);
-
- mm->mm_count = 0;
- QTAILQ_INIT(&mm->mm_mmap);
-
- return (mm);
-}
-
-static void vma_delete(struct mm_struct *mm)
-{
- struct vm_area_struct *vma;
-
- while ((vma = vma_first(mm)) != NULL) {
- QTAILQ_REMOVE(&mm->mm_mmap, vma, vma_link);
- g_free(vma);
- }
- g_free(mm);
-}
-
-static int vma_add_mapping(struct mm_struct *mm, target_ulong start,
- target_ulong end, abi_ulong flags)
-{
- struct vm_area_struct *vma;
-
- if ((vma = g_malloc0(sizeof (*vma))) == NULL)
- return (-1);
-
- vma->vma_start = start;
- vma->vma_end = end;
- vma->vma_flags = flags;
-
- QTAILQ_INSERT_TAIL(&mm->mm_mmap, vma, vma_link);
- mm->mm_count++;
-
- return (0);
-}
-
-static struct vm_area_struct *vma_first(const struct mm_struct *mm)
-{
- return (QTAILQ_FIRST(&mm->mm_mmap));
-}
-
-static struct vm_area_struct *vma_next(struct vm_area_struct *vma)
-{
- return (QTAILQ_NEXT(vma, vma_link));
-}
-
-static int vma_get_mapping_count(const struct mm_struct *mm)
-{
- return (mm->mm_count);
-}
-
-/*
* Calculate file (dump) size of given memory region.
*/
-static abi_ulong vma_dump_size(const struct vm_area_struct *vma)
+static size_t vma_dump_size(target_ulong start, target_ulong end,
+ unsigned long flags)
{
- /* if we cannot even read the first page, skip it */
- if (!access_ok_untagged(VERIFY_READ, vma->vma_start, TARGET_PAGE_SIZE))
- return (0);
+ /* The area must be readable. */
+ if (!(flags & PAGE_READ)) {
+ return 0;
+ }
/*
* Usually we don't dump executable pages as they contain
* non-writable code that debugger can read directly from
- * target library etc. However, thread stacks are marked
- * also executable so we read in first page of given region
- * and check whether it contains elf header. If there is
- * no elf header, we dump it.
+ * target library etc. If there is no elf header, we dump it.
*/
- if (vma->vma_flags & PROT_EXEC) {
- char page[TARGET_PAGE_SIZE];
-
- if (copy_from_user(page, vma->vma_start, sizeof (page))) {
- return 0;
- }
- if ((page[EI_MAG0] == ELFMAG0) &&
- (page[EI_MAG1] == ELFMAG1) &&
- (page[EI_MAG2] == ELFMAG2) &&
- (page[EI_MAG3] == ELFMAG3)) {
- /*
- * Mappings are possibly from ELF binary. Don't dump
- * them.
- */
- return (0);
- }
+ if (!(flags & PAGE_WRITE_ORG) &&
+ (flags & PAGE_EXEC) &&
+ memcmp(g2h_untagged(start), ELFMAG, SELFMAG) == 0) {
+ return 0;
}
- return (vma->vma_end - vma->vma_start);
+ return end - start;
}
-static int vma_walker(void *priv, target_ulong start, target_ulong end,
- unsigned long flags)
+static size_t size_note(const char *name, size_t datasz)
{
- struct mm_struct *mm = (struct mm_struct *)priv;
+ size_t namesz = strlen(name) + 1;
- vma_add_mapping(mm, start, end, flags);
- return (0);
+ namesz = ROUND_UP(namesz, 4);
+ datasz = ROUND_UP(datasz, 4);
+
+ return sizeof(struct elf_note) + namesz + datasz;
}
-static void fill_note(struct memelfnote *note, const char *name, int type,
- unsigned int sz, void *data)
+static void *fill_note(void **pptr, int type, const char *name, size_t datasz)
{
- unsigned int namesz;
+ void *ptr = *pptr;
+ struct elf_note *n = ptr;
+ size_t namesz = strlen(name) + 1;
- namesz = strlen(name) + 1;
- note->name = name;
- note->namesz = namesz;
- note->namesz_rounded = roundup(namesz, sizeof (int32_t));
- note->type = type;
- note->datasz = sz;
- note->datasz_rounded = roundup(sz, sizeof (int32_t));
+ n->n_namesz = namesz;
+ n->n_descsz = datasz;
+ n->n_type = type;
+ bswap_note(n);
- note->data = data;
+ ptr += sizeof(*n);
+ memcpy(ptr, name, namesz);
- /*
- * We calculate rounded up note size here as specified by
- * ELF document.
- */
- note->notesz = sizeof (struct elf_note) +
- note->namesz_rounded + note->datasz_rounded;
+ namesz = ROUND_UP(namesz, 4);
+ datasz = ROUND_UP(datasz, 4);
+
+ *pptr = ptr + namesz + datasz;
+ return ptr + namesz;
}
static void fill_elf_header(struct elfhdr *elf, int segs, uint16_t machine,
uint32_t flags)
{
- (void) memset(elf, 0, sizeof(*elf));
+ memcpy(elf->e_ident, ELFMAG, SELFMAG);
- (void) memcpy(elf->e_ident, ELFMAG, SELFMAG);
elf->e_ident[EI_CLASS] = ELF_CLASS;
elf->e_ident[EI_DATA] = ELF_DATA;
elf->e_ident[EI_VERSION] = EV_CURRENT;
@@ -4322,95 +4166,79 @@
bswap_ehdr(elf);
}
-static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, off_t offset)
+static void fill_elf_note_phdr(struct elf_phdr *phdr, size_t sz, off_t offset)
{
phdr->p_type = PT_NOTE;
phdr->p_offset = offset;
- phdr->p_vaddr = 0;
- phdr->p_paddr = 0;
phdr->p_filesz = sz;
- phdr->p_memsz = 0;
- phdr->p_flags = 0;
- phdr->p_align = 0;
bswap_phdr(phdr, 1);
}
-static size_t note_size(const struct memelfnote *note)
+static void fill_prstatus_note(void *data, const TaskState *ts,
+ CPUState *cpu, int signr)
{
- return (note->notesz);
+ /*
+ * Because note memory is only aligned to 4, and target_elf_prstatus
+ * may well have higher alignment requirements, fill locally and
+ * memcpy to the destination afterward.
+ */
+ struct target_elf_prstatus prstatus = {
+ .pr_info.si_signo = signr,
+ .pr_cursig = signr,
+ .pr_pid = ts->ts_tid,
+ .pr_ppid = getppid(),
+ .pr_pgrp = getpgrp(),
+ .pr_sid = getsid(0),
+ };
+
+ elf_core_copy_regs(&prstatus.pr_reg, cpu_env(cpu));
+ bswap_prstatus(&prstatus);
+ memcpy(data, &prstatus, sizeof(prstatus));
}
-static void fill_prstatus(struct target_elf_prstatus *prstatus,
- const TaskState *ts, int signr)
+static void fill_prpsinfo_note(void *data, const TaskState *ts)
{
- (void) memset(prstatus, 0, sizeof (*prstatus));
- prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
- prstatus->pr_pid = ts->ts_tid;
- prstatus->pr_ppid = getppid();
- prstatus->pr_pgrp = getpgrp();
- prstatus->pr_sid = getsid(0);
-
- bswap_prstatus(prstatus);
-}
-
-static int fill_psinfo(struct target_elf_prpsinfo *psinfo, const TaskState *ts)
-{
+ /*
+ * Because note memory is only aligned to 4, and target_elf_prpsinfo
+ * may well have higher alignment requirements, fill locally and
+ * memcpy to the destination afterward.
+ */
+ struct target_elf_prpsinfo psinfo;
char *base_filename;
- unsigned int i, len;
-
- (void) memset(psinfo, 0, sizeof (*psinfo));
+ size_t len;
len = ts->info->env_strings - ts->info->arg_strings;
- if (len >= ELF_PRARGSZ)
- len = ELF_PRARGSZ - 1;
- if (copy_from_user(&psinfo->pr_psargs, ts->info->arg_strings, len)) {
- return -EFAULT;
+ len = MIN(len, ELF_PRARGSZ);
+ memcpy(&psinfo.pr_psargs, g2h_untagged(ts->info->arg_strings), len);
+ for (size_t i = 0; i < len; i++) {
+ if (psinfo.pr_psargs[i] == 0) {
+ psinfo.pr_psargs[i] = ' ';
+ }
}
- for (i = 0; i < len; i++)
- if (psinfo->pr_psargs[i] == 0)
- psinfo->pr_psargs[i] = ' ';
- psinfo->pr_psargs[len] = 0;
- psinfo->pr_pid = getpid();
- psinfo->pr_ppid = getppid();
- psinfo->pr_pgrp = getpgrp();
- psinfo->pr_sid = getsid(0);
- psinfo->pr_uid = getuid();
- psinfo->pr_gid = getgid();
+ psinfo.pr_pid = getpid();
+ psinfo.pr_ppid = getppid();
+ psinfo.pr_pgrp = getpgrp();
+ psinfo.pr_sid = getsid(0);
+ psinfo.pr_uid = getuid();
+ psinfo.pr_gid = getgid();
base_filename = g_path_get_basename(ts->bprm->filename);
/*
* Using strncpy here is fine: at max-length,
* this field is not NUL-terminated.
*/
- (void) strncpy(psinfo->pr_fname, base_filename,
- sizeof(psinfo->pr_fname));
-
+ strncpy(psinfo.pr_fname, base_filename, sizeof(psinfo.pr_fname));
g_free(base_filename);
- bswap_psinfo(psinfo);
- return (0);
+
+ bswap_psinfo(&psinfo);
+ memcpy(data, &psinfo, sizeof(psinfo));
}
-static void fill_auxv_note(struct memelfnote *note, const TaskState *ts)
+static void fill_auxv_note(void *data, const TaskState *ts)
{
- elf_addr_t auxv = (elf_addr_t)ts->info->saved_auxv;
- elf_addr_t orig_auxv = auxv;
- void *ptr;
- int len = ts->info->auxv_len;
-
- /*
- * Auxiliary vector is stored in target process stack. It contains
- * {type, value} pairs that we need to dump into note. This is not
- * strictly necessary but we do it here for sake of completeness.
- */
-
- /* read in whole auxv vector and copy it to memelfnote */
- ptr = lock_user(VERIFY_READ, orig_auxv, len, 0);
- if (ptr != NULL) {
- fill_note(note, "CORE", NT_AUXV, len, ptr);
- unlock_user(ptr, auxv, len);
- }
+ memcpy(data, g2h_untagged(ts->info->saved_auxv), ts->info->auxv_len);
}
/*
@@ -4434,27 +4262,9 @@
{
const char *bufp = (const char *)ptr;
ssize_t bytes_written, bytes_left;
- struct rlimit dumpsize;
- off_t pos;
bytes_written = 0;
- getrlimit(RLIMIT_CORE, &dumpsize);
- if ((pos = lseek(fd, 0, SEEK_CUR))==-1) {
- if (errno == ESPIPE) { /* not a seekable stream */
- bytes_left = size;
- } else {
- return pos;
- }
- } else {
- if (dumpsize.rlim_cur <= pos) {
- return -1;
- } else if (dumpsize.rlim_cur == RLIM_INFINITY) {
- bytes_left = size;
- } else {
- size_t limit_left=dumpsize.rlim_cur - pos;
- bytes_left = limit_left >= size ? size : limit_left ;
- }
- }
+ bytes_left = size;
/*
* In normal conditions, single write(2) should do but
@@ -4476,135 +4286,76 @@
return (0);
}
-static int write_note(struct memelfnote *men, int fd)
+static int wmr_page_unprotect_regions(void *opaque, target_ulong start,
+ target_ulong end, unsigned long flags)
{
- struct elf_note en;
+ if ((flags & (PAGE_WRITE | PAGE_WRITE_ORG)) == PAGE_WRITE_ORG) {
+ size_t step = MAX(TARGET_PAGE_SIZE, qemu_real_host_page_size());
- en.n_namesz = men->namesz;
- en.n_type = men->type;
- en.n_descsz = men->datasz;
-
- bswap_note(&en);
-
- if (dump_write(fd, &en, sizeof(en)) != 0)
- return (-1);
- if (dump_write(fd, men->name, men->namesz_rounded) != 0)
- return (-1);
- if (dump_write(fd, men->data, men->datasz_rounded) != 0)
- return (-1);
-
- return (0);
-}
-
-static void fill_thread_info(struct elf_note_info *info, const CPUArchState *env)
-{
- CPUState *cpu = env_cpu((CPUArchState *)env);
- TaskState *ts = (TaskState *)cpu->opaque;
- struct elf_thread_status *ets;
-
- ets = g_malloc0(sizeof (*ets));
- ets->num_notes = 1; /* only prstatus is dumped */
- fill_prstatus(&ets->prstatus, ts, 0);
- elf_core_copy_regs(&ets->prstatus.pr_reg, env);
- fill_note(&ets->notes[0], "CORE", NT_PRSTATUS, sizeof (ets->prstatus),
- &ets->prstatus);
-
- QTAILQ_INSERT_TAIL(&info->thread_list, ets, ets_link);
-
- info->notes_size += note_size(&ets->notes[0]);
-}
-
-static void init_note_info(struct elf_note_info *info)
-{
- /* Initialize the elf_note_info structure so that it is at
- * least safe to call free_note_info() on it. Must be
- * called before calling fill_note_info().
- */
- memset(info, 0, sizeof (*info));
- QTAILQ_INIT(&info->thread_list);
-}
-
-static int fill_note_info(struct elf_note_info *info,
- long signr, const CPUArchState *env)
-{
-#define NUMNOTES 3
- CPUState *cpu = env_cpu((CPUArchState *)env);
- TaskState *ts = (TaskState *)cpu->opaque;
- int i;
-
- info->notes = g_new0(struct memelfnote, NUMNOTES);
- if (info->notes == NULL)
- return (-ENOMEM);
- info->prstatus = g_malloc0(sizeof (*info->prstatus));
- if (info->prstatus == NULL)
- return (-ENOMEM);
- info->psinfo = g_malloc0(sizeof (*info->psinfo));
- if (info->prstatus == NULL)
- return (-ENOMEM);
-
- /*
- * First fill in status (and registers) of current thread
- * including process info & aux vector.
- */
- fill_prstatus(info->prstatus, ts, signr);
- elf_core_copy_regs(&info->prstatus->pr_reg, env);
- fill_note(&info->notes[0], "CORE", NT_PRSTATUS,
- sizeof (*info->prstatus), info->prstatus);
- fill_psinfo(info->psinfo, ts);
- fill_note(&info->notes[1], "CORE", NT_PRPSINFO,
- sizeof (*info->psinfo), info->psinfo);
- fill_auxv_note(&info->notes[2], ts);
- info->numnote = 3;
-
- info->notes_size = 0;
- for (i = 0; i < info->numnote; i++)
- info->notes_size += note_size(&info->notes[i]);
-
- /* read and fill status of all threads */
- WITH_QEMU_LOCK_GUARD(&qemu_cpu_list_lock) {
- CPU_FOREACH(cpu) {
- if (cpu == thread_cpu) {
- continue;
+ while (1) {
+ page_unprotect(start, 0);
+ if (end - start <= step) {
+ break;
}
- fill_thread_info(info, cpu_env(cpu));
+ start += step;
}
}
-
- return (0);
+ return 0;
}
-static void free_note_info(struct elf_note_info *info)
+typedef struct {
+ unsigned count;
+ size_t size;
+} CountAndSizeRegions;
+
+static int wmr_count_and_size_regions(void *opaque, target_ulong start,
+ target_ulong end, unsigned long flags)
{
- struct elf_thread_status *ets;
+ CountAndSizeRegions *css = opaque;
- while (!QTAILQ_EMPTY(&info->thread_list)) {
- ets = QTAILQ_FIRST(&info->thread_list);
- QTAILQ_REMOVE(&info->thread_list, ets, ets_link);
- g_free(ets);
- }
-
- g_free(info->prstatus);
- g_free(info->psinfo);
- g_free(info->notes);
+ css->count++;
+ css->size += vma_dump_size(start, end, flags);
+ return 0;
}
-static int write_note_info(struct elf_note_info *info, int fd)
+typedef struct {
+ struct elf_phdr *phdr;
+ off_t offset;
+} FillRegionPhdr;
+
+static int wmr_fill_region_phdr(void *opaque, target_ulong start,
+ target_ulong end, unsigned long flags)
{
- struct elf_thread_status *ets;
- int i, error = 0;
+ FillRegionPhdr *d = opaque;
+ struct elf_phdr *phdr = d->phdr;
- /* write prstatus, psinfo and auxv for current thread */
- for (i = 0; i < info->numnote; i++)
- if ((error = write_note(&info->notes[i], fd)) != 0)
- return (error);
+ phdr->p_type = PT_LOAD;
+ phdr->p_vaddr = start;
+ phdr->p_paddr = 0;
+ phdr->p_filesz = vma_dump_size(start, end, flags);
+ phdr->p_offset = d->offset;
+ d->offset += phdr->p_filesz;
+ phdr->p_memsz = end - start;
+ phdr->p_flags = (flags & PAGE_READ ? PF_R : 0)
+ | (flags & PAGE_WRITE_ORG ? PF_W : 0)
+ | (flags & PAGE_EXEC ? PF_X : 0);
+ phdr->p_align = ELF_EXEC_PAGESIZE;
- /* write prstatus for each thread */
- QTAILQ_FOREACH(ets, &info->thread_list, ets_link) {
- if ((error = write_note(&ets->notes[0], fd)) != 0)
- return (error);
+ bswap_phdr(phdr, 1);
+ d->phdr = phdr + 1;
+ return 0;
+}
+
+static int wmr_write_region(void *opaque, target_ulong start,
+ target_ulong end, unsigned long flags)
+{
+ int fd = *(int *)opaque;
+ size_t size = vma_dump_size(start, end, flags);
+
+ if (!size) {
+ return 0;
}
-
- return (0);
+ return dump_write(fd, g2h_untagged(start), size);
}
/*
@@ -4654,151 +4405,125 @@
{
const CPUState *cpu = env_cpu((CPUArchState *)env);
const TaskState *ts = (const TaskState *)cpu->opaque;
- struct vm_area_struct *vma = NULL;
- g_autofree char *corefile = NULL;
- struct elf_note_info info;
- struct elfhdr elf;
- struct elf_phdr phdr;
struct rlimit dumpsize;
- struct mm_struct *mm = NULL;
- off_t offset = 0, data_offset = 0;
- int segs = 0;
+ CountAndSizeRegions css;
+ off_t offset, note_offset, data_offset;
+ size_t note_size;
+ int cpus, ret;
int fd = -1;
-
- init_note_info(&info);
-
- errno = 0;
+ CPUState *cpu_iter;
if (prctl(PR_GET_DUMPABLE) == 0) {
return 0;
}
- if (getrlimit(RLIMIT_CORE, &dumpsize) == 0 && dumpsize.rlim_cur == 0) {
+ if (getrlimit(RLIMIT_CORE, &dumpsize) < 0 || dumpsize.rlim_cur == 0) {
return 0;
}
- corefile = core_dump_filename(ts);
+ cpu_list_lock();
+ mmap_lock();
- if ((fd = open(corefile, O_WRONLY | O_CREAT,
- S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH)) < 0)
- return (-errno);
+ /* By unprotecting, we merge vmas that might be split. */
+ walk_memory_regions(NULL, wmr_page_unprotect_regions);
/*
* Walk through target process memory mappings and
- * set up structure containing this information. After
- * this point vma_xxx functions can be used.
+ * set up structure containing this information.
*/
- if ((mm = vma_init()) == NULL)
- goto out;
+ memset(&css, 0, sizeof(css));
+ walk_memory_regions(&css, wmr_count_and_size_regions);
- walk_memory_regions(mm, vma_walker);
- segs = vma_get_mapping_count(mm);
+ cpus = 0;
+ CPU_FOREACH(cpu_iter) {
+ cpus++;
+ }
+
+ offset = sizeof(struct elfhdr);
+ offset += (css.count + 1) * sizeof(struct elf_phdr);
+ note_offset = offset;
+
+ offset += size_note("CORE", ts->info->auxv_len);
+ offset += size_note("CORE", sizeof(struct target_elf_prpsinfo));
+ offset += size_note("CORE", sizeof(struct target_elf_prstatus)) * cpus;
+ note_size = offset - note_offset;
+ data_offset = ROUND_UP(offset, ELF_EXEC_PAGESIZE);
+
+ /* Do not dump if the corefile size exceeds the limit. */
+ if (dumpsize.rlim_cur != RLIM_INFINITY
+ && dumpsize.rlim_cur < data_offset + css.size) {
+ errno = 0;
+ goto out;
+ }
+
+ {
+ g_autofree char *corefile = core_dump_filename(ts);
+ fd = open(corefile, O_WRONLY | O_CREAT | O_TRUNC,
+ S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+ }
+ if (fd < 0) {
+ goto out;
+ }
/*
- * Construct valid coredump ELF header. We also
- * add one more segment for notes.
+ * There is a fair amount of alignment padding within the notes
+     * as well as preceding the process memory. Allocate a zeroed
+ * block to hold it all. Write all of the headers directly into
+ * this buffer and then write it out as a block.
*/
- fill_elf_header(&elf, segs + 1, ELF_MACHINE, 0);
- if (dump_write(fd, &elf, sizeof (elf)) != 0)
- goto out;
+ {
+ g_autofree void *header = g_malloc0(data_offset);
+ FillRegionPhdr frp;
+ void *hptr, *dptr;
- /* fill in the in-memory version of notes */
- if (fill_note_info(&info, signr, env) < 0)
- goto out;
+ /* Create elf file header. */
+ hptr = header;
+ fill_elf_header(hptr, css.count + 1, ELF_MACHINE, 0);
+ hptr += sizeof(struct elfhdr);
- offset += sizeof (elf); /* elf header */
- offset += (segs + 1) * sizeof (struct elf_phdr); /* program headers */
+ /* Create elf program headers. */
+ fill_elf_note_phdr(hptr, note_size, note_offset);
+ hptr += sizeof(struct elf_phdr);
- /* write out notes program header */
- fill_elf_note_phdr(&phdr, info.notes_size, offset);
+ frp.phdr = hptr;
+ frp.offset = data_offset;
+ walk_memory_regions(&frp, wmr_fill_region_phdr);
+ hptr = frp.phdr;
- offset += info.notes_size;
- if (dump_write(fd, &phdr, sizeof (phdr)) != 0)
- goto out;
+ /* Create the notes. */
+ dptr = fill_note(&hptr, NT_AUXV, "CORE", ts->info->auxv_len);
+ fill_auxv_note(dptr, ts);
- /*
- * ELF specification wants data to start at page boundary so
- * we align it here.
- */
- data_offset = offset = roundup(offset, ELF_EXEC_PAGESIZE);
+ dptr = fill_note(&hptr, NT_PRPSINFO, "CORE",
+ sizeof(struct target_elf_prpsinfo));
+ fill_prpsinfo_note(dptr, ts);
- /*
- * Write program headers for memory regions mapped in
- * the target process.
- */
- for (vma = vma_first(mm); vma != NULL; vma = vma_next(vma)) {
- (void) memset(&phdr, 0, sizeof (phdr));
+ CPU_FOREACH(cpu_iter) {
+ dptr = fill_note(&hptr, NT_PRSTATUS, "CORE",
+ sizeof(struct target_elf_prstatus));
+ fill_prstatus_note(dptr, ts, cpu_iter,
+ cpu_iter == cpu ? signr : 0);
+ }
- phdr.p_type = PT_LOAD;
- phdr.p_offset = offset;
- phdr.p_vaddr = vma->vma_start;
- phdr.p_paddr = 0;
- phdr.p_filesz = vma_dump_size(vma);
- offset += phdr.p_filesz;
- phdr.p_memsz = vma->vma_end - vma->vma_start;
- phdr.p_flags = vma->vma_flags & PROT_READ ? PF_R : 0;
- if (vma->vma_flags & PROT_WRITE)
- phdr.p_flags |= PF_W;
- if (vma->vma_flags & PROT_EXEC)
- phdr.p_flags |= PF_X;
- phdr.p_align = ELF_EXEC_PAGESIZE;
-
- bswap_phdr(&phdr, 1);
- if (dump_write(fd, &phdr, sizeof(phdr)) != 0) {
+ if (dump_write(fd, header, data_offset) < 0) {
goto out;
}
}
/*
- * Next we write notes just after program headers. No
- * alignment needed here.
+ * Finally write process memory into the corefile as well.
*/
- if (write_note_info(&info, fd) < 0)
+ if (walk_memory_regions(&fd, wmr_write_region) < 0) {
goto out;
-
- /* align data to page boundary */
- if (lseek(fd, data_offset, SEEK_SET) != data_offset)
- goto out;
-
- /*
- * Finally we can dump process memory into corefile as well.
- */
- for (vma = vma_first(mm); vma != NULL; vma = vma_next(vma)) {
- abi_ulong addr;
- abi_ulong end;
-
- end = vma->vma_start + vma_dump_size(vma);
-
- for (addr = vma->vma_start; addr < end;
- addr += TARGET_PAGE_SIZE) {
- char page[TARGET_PAGE_SIZE];
- int error;
-
- /*
- * Read in page from target process memory and
- * write it to coredump file.
- */
- error = copy_from_user(page, addr, sizeof (page));
- if (error != 0) {
- (void) fprintf(stderr, "unable to dump " TARGET_ABI_FMT_lx "\n",
- addr);
- errno = -error;
- goto out;
- }
- if (dump_write(fd, page, TARGET_PAGE_SIZE) < 0)
- goto out;
- }
}
+ errno = 0;
out:
- free_note_info(&info);
- if (mm != NULL)
- vma_delete(mm);
- (void) close(fd);
-
- if (errno != 0)
- return (-errno);
- return (0);
+ ret = -errno;
+ mmap_unlock();
+ cpu_list_unlock();
+ close(fd);
+ return ret;
}
#endif /* USE_ELF_CORE_DUMP */
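
For reference, the note-size arithmetic used when laying out the corefile
above follows the standard ELF rule: a note is an Nhdr followed by its name
and descriptor, each padded to 4 bytes. A minimal sketch of such a helper
(assuming <elf.h> types and QEMU's ROUND_UP; the patch's actual size_note()
may differ in detail):

    static size_t size_note(const char *name, size_t datasz)
    {
        size_t namesz = strlen(name) + 1;   /* NUL included */
        return sizeof(Elf64_Nhdr)           /* namesz/descsz/type */
               + ROUND_UP(namesz, 4)        /* padded name */
               + ROUND_UP(datasz, 4);       /* padded descriptor */
    }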
diff --git a/linux-user/loongarch64/target_syscall.h b/linux-user/loongarch64/target_syscall.h
index 8b5de52..39f229b 100644
--- a/linux-user/loongarch64/target_syscall.h
+++ b/linux-user/loongarch64/target_syscall.h
@@ -38,11 +38,4 @@
#define TARGET_MCL_FUTURE 2
#define TARGET_MCL_ONFAULT 4
-#define TARGET_FORCE_SHMLBA
-
-static inline abi_ulong target_shmlba(CPULoongArchState *env)
-{
- return 64 * KiB;
-}
-
#endif
diff --git a/linux-user/main.c b/linux-user/main.c
index 74b2fbb..551acf1 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -55,6 +55,7 @@
#include "loader.h"
#include "user-mmap.h"
#include "tcg/perf.h"
+#include "exec/page-vary.h"
#ifdef CONFIG_SEMIHOSTING
#include "semihosting/semihost.h"
@@ -332,11 +333,11 @@
static void handle_arg_pagesize(const char *arg)
{
- qemu_host_page_size = atoi(arg);
- if (qemu_host_page_size == 0 ||
- (qemu_host_page_size & (qemu_host_page_size - 1)) != 0) {
- fprintf(stderr, "page size must be a power of two\n");
- exit(EXIT_FAILURE);
+ unsigned size, want = qemu_real_host_page_size();
+
+ if (qemu_strtoui(arg, NULL, 10, &size) || size != want) {
+ warn_report("Deprecated page size option cannot "
+ "change host page size (%u)", want);
}
}
@@ -496,7 +497,7 @@
{"D", "QEMU_LOG_FILENAME", true, handle_arg_log_filename,
"logfile", "write logs to 'logfile' (default stderr)"},
{"p", "QEMU_PAGESIZE", true, handle_arg_pagesize,
- "pagesize", "set the host page size to 'pagesize'"},
+ "pagesize", "deprecated change to host page size"},
{"one-insn-per-tb",
"QEMU_ONE_INSN_PER_TB", false, handle_arg_one_insn_per_tb,
"", "run with one guest instruction per emulated TB"},
@@ -680,6 +681,7 @@
int i;
int ret;
int execfd;
+ int host_page_size;
unsigned long max_reserved_va;
bool preserve_argv0;
@@ -781,7 +783,7 @@
}
cpu_type = parse_cpu_option(cpu_model);
- /* init tcg before creating CPUs and to get qemu_host_page_size */
+ /* init tcg before creating CPUs */
{
AccelState *accel = current_accel();
AccelClass *ac = ACCEL_GET_CLASS(accel);
@@ -791,6 +793,16 @@
opt_one_insn_per_tb, &error_abort);
ac->init_machine(NULL);
}
+
+ /*
+ * Finalize page size before creating CPUs.
+ * This will do nothing if !TARGET_PAGE_BITS_VARY.
+ * The most efficient setting is to match the host.
+ */
+ host_page_size = qemu_real_host_page_size();
+ set_preferred_target_page_bits(ctz32(host_page_size));
+ finalize_target_page_bits();
+
cpu = cpu_create(cpu_type);
env = cpu_env(cpu);
cpu_reset(cpu);
@@ -804,8 +816,8 @@
*/
max_reserved_va = MAX_RESERVED_VA(cpu);
if (reserved_va != 0) {
- if ((reserved_va + 1) % qemu_host_page_size) {
- char *s = size_to_str(qemu_host_page_size);
+ if ((reserved_va + 1) % host_page_size) {
+ char *s = size_to_str(host_page_size);
fprintf(stderr, "Reserved virtual address not aligned mod %s\n", s);
g_free(s);
exit(EXIT_FAILURE);
@@ -889,7 +901,7 @@
if ((fp = fopen("/proc/sys/vm/mmap_min_addr", "r")) != NULL) {
unsigned long tmp;
if (fscanf(fp, "%lu", &tmp) == 1 && tmp != 0) {
- mmap_min_addr = tmp;
+ mmap_min_addr = MAX(tmp, host_page_size);
qemu_log_mask(CPU_LOG_PAGE, "host mmap_min_addr=0x%lx\n",
mmap_min_addr);
}
@@ -902,7 +914,7 @@
* If we're in a chroot with no /proc, fall back to 1 page.
*/
if (mmap_min_addr == 0) {
- mmap_min_addr = qemu_host_page_size;
+ mmap_min_addr = host_page_size;
qemu_log_mask(CPU_LOG_PAGE,
"host mmap_min_addr=0x%lx (fallback)\n",
mmap_min_addr);
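
On the page-size finalization added above: for a typical 4 KiB-page host,
qemu_real_host_page_size() returns 4096 and ctz32(4096) == 12, so a
TARGET_PAGE_BITS_VARY target gets pinned to 12-bit pages, matching the host
as the comment recommends. Illustrative values:

    int host_page_size = qemu_real_host_page_size();        /* 4096 */
    set_preferred_target_page_bits(ctz32(host_page_size));  /* 12 */
    finalize_target_page_bits();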
diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index 96c9433..4505fd7 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -165,6 +165,7 @@
/* NOTE: all the constants are the HOST ones, but addresses are target. */
int target_mprotect(abi_ulong start, abi_ulong len, int target_prot)
{
+ int host_page_size = qemu_real_host_page_size();
abi_ulong starts[3];
abi_ulong lens[3];
int prots[3];
@@ -189,13 +190,13 @@
}
last = start + len - 1;
- host_start = start & qemu_host_page_mask;
- host_last = HOST_PAGE_ALIGN(last) - 1;
+ host_start = start & -host_page_size;
+ host_last = ROUND_UP(last, host_page_size) - 1;
nranges = 0;
mmap_lock();
- if (host_last - host_start < qemu_host_page_size) {
+ if (host_last - host_start < host_page_size) {
/* Single host page contains all guest pages: sum the prot. */
prot1 = target_prot;
for (abi_ulong a = host_start; a < start; a += TARGET_PAGE_SIZE) {
@@ -205,7 +206,7 @@
prot1 |= page_get_flags(a + 1);
}
starts[nranges] = host_start;
- lens[nranges] = qemu_host_page_size;
+ lens[nranges] = host_page_size;
prots[nranges] = prot1;
nranges++;
} else {
@@ -218,10 +219,10 @@
/* If the resulting sum differs, create a new range. */
if (prot1 != target_prot) {
starts[nranges] = host_start;
- lens[nranges] = qemu_host_page_size;
+ lens[nranges] = host_page_size;
prots[nranges] = prot1;
nranges++;
- host_start += qemu_host_page_size;
+ host_start += host_page_size;
}
}
@@ -233,9 +234,9 @@
}
/* If the resulting sum differs, create a new range. */
if (prot1 != target_prot) {
- host_last -= qemu_host_page_size;
+ host_last -= host_page_size;
starts[nranges] = host_last + 1;
- lens[nranges] = qemu_host_page_size;
+ lens[nranges] = host_page_size;
prots[nranges] = prot1;
nranges++;
}
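
The qemu_host_page_mask replacements in this file lean on a power-of-two
idiom worth spelling out: for a power-of-two page size p, x & -p rounds x
down to a multiple of p, while ROUND_UP(x, p) rounds up. Illustrative
values, assuming p = 0x1000:

    abi_ulong start = 0x12345 & -0x1000;             /* 0x12000, rounded down */
    abi_ulong last  = ROUND_UP(0x12345, 0x1000) - 1; /* 0x12fff, end of page */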
@@ -266,10 +267,35 @@
return ret;
}
-/* map an incomplete host page */
+/*
+ * Perform munmap on behalf of the target, with host parameters.
+ * If reserved_va, we must replace the memory reservation.
+ */
+static int do_munmap(void *addr, size_t len)
+{
+ if (reserved_va) {
+ void *ptr = mmap(addr, len, PROT_NONE,
+ MAP_FIXED | MAP_ANONYMOUS
+ | MAP_PRIVATE | MAP_NORESERVE, -1, 0);
+ return ptr == addr ? 0 : -1;
+ }
+ return munmap(addr, len);
+}
+
+/*
+ * Map an incomplete host page.
+ *
+ * Here be dragons. This case will not work if there is an existing
+ * overlapping host page, which is file mapped, and for which the mapping
+ * is beyond the end of the file. In that case, we will see SIGBUS when
+ * trying to write a portion of this page.
+ *
+ * FIXME: Work around this with a temporary signal handler and longjmp.
+ */
static bool mmap_frag(abi_ulong real_start, abi_ulong start, abi_ulong last,
int prot, int flags, int fd, off_t offset)
{
+ int host_page_size = qemu_real_host_page_size();
abi_ulong real_last;
void *host_start;
int prot_old, prot_new;
@@ -286,7 +312,7 @@
return false;
}
- real_last = real_start + qemu_host_page_size - 1;
+ real_last = real_start + host_page_size - 1;
host_start = g2h_untagged(real_start);
/* Get the protection of the target pages outside the mapping. */
@@ -304,12 +330,12 @@
* outside of the fragment we need to map. Allocate a new host
* page to cover, discarding whatever else may have been present.
*/
- void *p = mmap(host_start, qemu_host_page_size,
+ void *p = mmap(host_start, host_page_size,
target_to_host_prot(prot),
flags | MAP_ANONYMOUS, -1, 0);
if (p != host_start) {
if (p != MAP_FAILED) {
- munmap(p, qemu_host_page_size);
+ do_munmap(p, host_page_size);
errno = EEXIST;
}
return false;
@@ -324,7 +350,7 @@
/* Adjust protection to be able to write. */
if (!(host_prot_old & PROT_WRITE)) {
host_prot_old |= PROT_WRITE;
- mprotect(host_start, qemu_host_page_size, host_prot_old);
+ mprotect(host_start, host_page_size, host_prot_old);
}
/* Read or zero the new guest pages. */
@@ -338,7 +364,7 @@
/* Put final protection */
if (host_prot_new != host_prot_old) {
- mprotect(host_start, qemu_host_page_size, host_prot_new);
+ mprotect(host_start, host_page_size, host_prot_new);
}
return true;
}
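
The hazard documented above mmap_frag() can be shown with plain mmap; this
sketch is illustrative only (hypothetical file, no error handling):

    /* A 100-byte file on a 4 KiB-page host, mapped across two pages. */
    int fd = open("tiny.dat", O_RDWR);
    char *p = mmap(NULL, 8192, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    p[50]   = 1;    /* ok: the partially backed page is zero-filled */
    p[4096] = 1;    /* SIGBUS: this page lies entirely beyond EOF */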
@@ -373,21 +399,21 @@
*/
abi_ulong mmap_find_vma(abi_ulong start, abi_ulong size, abi_ulong align)
{
+ int host_page_size = qemu_real_host_page_size();
void *ptr, *prev;
abi_ulong addr;
int wrapped, repeat;
- align = MAX(align, qemu_host_page_size);
+ align = MAX(align, host_page_size);
/* If 'start' == 0, then a default start address is used. */
if (start == 0) {
start = mmap_next_start;
} else {
- start &= qemu_host_page_mask;
+ start &= -host_page_size;
}
start = ROUND_UP(start, align);
-
- size = HOST_PAGE_ALIGN(size);
+ size = ROUND_UP(size, host_page_size);
if (reserved_va) {
return mmap_find_vma_reserved(start, size, align);
@@ -488,266 +514,14 @@
}
}
-/* NOTE: all the constants are the HOST ones */
-abi_long target_mmap(abi_ulong start, abi_ulong len, int target_prot,
- int flags, int fd, off_t offset)
+/*
+ * Record a successful mmap within the user-exec interval tree.
+ */
+static abi_long mmap_end(abi_ulong start, abi_ulong last,
+ abi_ulong passthrough_start,
+ abi_ulong passthrough_last,
+ int flags, int page_flags)
{
- abi_ulong ret, last, real_start, real_last, retaddr, host_len;
- abi_ulong passthrough_start = -1, passthrough_last = 0;
- int page_flags;
- off_t host_offset;
-
- mmap_lock();
- trace_target_mmap(start, len, target_prot, flags, fd, offset);
-
- if (!len) {
- errno = EINVAL;
- goto fail;
- }
-
- page_flags = validate_prot_to_pageflags(target_prot);
- if (!page_flags) {
- errno = EINVAL;
- goto fail;
- }
-
- /* Also check for overflows... */
- len = TARGET_PAGE_ALIGN(len);
- if (!len) {
- errno = ENOMEM;
- goto fail;
- }
-
- if (offset & ~TARGET_PAGE_MASK) {
- errno = EINVAL;
- goto fail;
- }
-
- /*
- * If we're mapping shared memory, ensure we generate code for parallel
- * execution and flush old translations. This will work up to the level
- * supported by the host -- anything that requires EXCP_ATOMIC will not
- * be atomic with respect to an external process.
- */
- if (flags & MAP_SHARED) {
- CPUState *cpu = thread_cpu;
- if (!(cpu->tcg_cflags & CF_PARALLEL)) {
- cpu->tcg_cflags |= CF_PARALLEL;
- tb_flush(cpu);
- }
- }
-
- real_start = start & qemu_host_page_mask;
- host_offset = offset & qemu_host_page_mask;
-
- /*
- * If the user is asking for the kernel to find a location, do that
- * before we truncate the length for mapping files below.
- */
- if (!(flags & (MAP_FIXED | MAP_FIXED_NOREPLACE))) {
- host_len = len + offset - host_offset;
- host_len = HOST_PAGE_ALIGN(host_len);
- start = mmap_find_vma(real_start, host_len, TARGET_PAGE_SIZE);
- if (start == (abi_ulong)-1) {
- errno = ENOMEM;
- goto fail;
- }
- }
-
- /*
- * When mapping files into a memory area larger than the file, accesses
- * to pages beyond the file size will cause a SIGBUS.
- *
- * For example, if mmaping a file of 100 bytes on a host with 4K pages
- * emulating a target with 8K pages, the target expects to be able to
- * access the first 8K. But the host will trap us on any access beyond
- * 4K.
- *
- * When emulating a target with a larger page-size than the hosts, we
- * may need to truncate file maps at EOF and add extra anonymous pages
- * up to the targets page boundary.
- */
- if ((qemu_real_host_page_size() < qemu_host_page_size) &&
- !(flags & MAP_ANONYMOUS)) {
- struct stat sb;
-
- if (fstat(fd, &sb) == -1) {
- goto fail;
- }
-
- /* Are we trying to create a map beyond EOF?. */
- if (offset + len > sb.st_size) {
- /*
- * If so, truncate the file map at eof aligned with
- * the hosts real pagesize. Additional anonymous maps
- * will be created beyond EOF.
- */
- len = REAL_HOST_PAGE_ALIGN(sb.st_size - offset);
- }
- }
-
- if (!(flags & (MAP_FIXED | MAP_FIXED_NOREPLACE))) {
- uintptr_t host_start;
- int host_prot;
- void *p;
-
- host_len = len + offset - host_offset;
- host_len = HOST_PAGE_ALIGN(host_len);
- host_prot = target_to_host_prot(target_prot);
-
- /*
- * Note: we prefer to control the mapping address. It is
- * especially important if qemu_host_page_size >
- * qemu_real_host_page_size.
- */
- p = mmap(g2h_untagged(start), host_len, host_prot,
- flags | MAP_FIXED | MAP_ANONYMOUS, -1, 0);
- if (p == MAP_FAILED) {
- goto fail;
- }
- /* update start so that it points to the file position at 'offset' */
- host_start = (uintptr_t)p;
- if (!(flags & MAP_ANONYMOUS)) {
- p = mmap(g2h_untagged(start), len, host_prot,
- flags | MAP_FIXED, fd, host_offset);
- if (p == MAP_FAILED) {
- munmap(g2h_untagged(start), host_len);
- goto fail;
- }
- host_start += offset - host_offset;
- }
- start = h2g(host_start);
- last = start + len - 1;
- passthrough_start = start;
- passthrough_last = last;
- } else {
- if (start & ~TARGET_PAGE_MASK) {
- errno = EINVAL;
- goto fail;
- }
- last = start + len - 1;
- real_last = HOST_PAGE_ALIGN(last) - 1;
-
- /*
- * Test if requested memory area fits target address space
- * It can fail only on 64-bit host with 32-bit target.
- * On any other target/host host mmap() handles this error correctly.
- */
- if (last < start || !guest_range_valid_untagged(start, len)) {
- errno = ENOMEM;
- goto fail;
- }
-
- if (flags & MAP_FIXED_NOREPLACE) {
- /* Validate that the chosen range is empty. */
- if (!page_check_range_empty(start, last)) {
- errno = EEXIST;
- goto fail;
- }
-
- /*
- * With reserved_va, the entire address space is mmaped in the
- * host to ensure it isn't accidentally used for something else.
- * We have just checked that the guest address is not mapped
- * within the guest, but need to replace the host reservation.
- *
- * Without reserved_va, despite the guest address check above,
- * keep MAP_FIXED_NOREPLACE so that the guest does not overwrite
- * any host address mappings.
- */
- if (reserved_va) {
- flags = (flags & ~MAP_FIXED_NOREPLACE) | MAP_FIXED;
- }
- }
-
- /*
- * worst case: we cannot map the file because the offset is not
- * aligned, so we read it
- */
- if (!(flags & MAP_ANONYMOUS) &&
- (offset & ~qemu_host_page_mask) != (start & ~qemu_host_page_mask)) {
- /*
- * msync() won't work here, so we return an error if write is
- * possible while it is a shared mapping
- */
- if ((flags & MAP_TYPE) == MAP_SHARED
- && (target_prot & PROT_WRITE)) {
- errno = EINVAL;
- goto fail;
- }
- retaddr = target_mmap(start, len, target_prot | PROT_WRITE,
- (flags & (MAP_FIXED | MAP_FIXED_NOREPLACE))
- | MAP_PRIVATE | MAP_ANONYMOUS,
- -1, 0);
- if (retaddr == -1) {
- goto fail;
- }
- if (pread(fd, g2h_untagged(start), len, offset) == -1) {
- goto fail;
- }
- if (!(target_prot & PROT_WRITE)) {
- ret = target_mprotect(start, len, target_prot);
- assert(ret == 0);
- }
- goto the_end;
- }
-
- /* handle the start of the mapping */
- if (start > real_start) {
- if (real_last == real_start + qemu_host_page_size - 1) {
- /* one single host page */
- if (!mmap_frag(real_start, start, last,
- target_prot, flags, fd, offset)) {
- goto fail;
- }
- goto the_end1;
- }
- if (!mmap_frag(real_start, start,
- real_start + qemu_host_page_size - 1,
- target_prot, flags, fd, offset)) {
- goto fail;
- }
- real_start += qemu_host_page_size;
- }
- /* handle the end of the mapping */
- if (last < real_last) {
- abi_ulong real_page = real_last - qemu_host_page_size + 1;
- if (!mmap_frag(real_page, real_page, last,
- target_prot, flags, fd,
- offset + real_page - start)) {
- goto fail;
- }
- real_last -= qemu_host_page_size;
- }
-
- /* map the middle (easier) */
- if (real_start < real_last) {
- void *p, *want_p;
- off_t offset1;
- size_t len1;
-
- if (flags & MAP_ANONYMOUS) {
- offset1 = 0;
- } else {
- offset1 = offset + real_start - start;
- }
- len1 = real_last - real_start + 1;
- want_p = g2h_untagged(real_start);
-
- p = mmap(want_p, len1, target_to_host_prot(target_prot),
- flags, fd, offset1);
- if (p != want_p) {
- if (p != MAP_FAILED) {
- munmap(p, len1);
- errno = EEXIST;
- }
- goto fail;
- }
- passthrough_start = real_start;
- passthrough_last = real_last;
- }
- }
- the_end1:
if (flags & MAP_ANONYMOUS) {
page_flags |= PAGE_ANON;
}
@@ -765,7 +539,6 @@
}
}
shm_region_rm_complete(start, last);
- the_end:
trace_target_mmap_complete(start);
if (qemu_loglevel_mask(CPU_LOG_PAGE)) {
FILE *f = qemu_log_trylock();
@@ -775,15 +548,429 @@
qemu_log_unlock(f);
}
}
- mmap_unlock();
return start;
-fail:
+}
+
+/*
+ * Special case host page size == target page size,
+ * where there are no edge conditions.
+ */
+static abi_long mmap_h_eq_g(abi_ulong start, abi_ulong len,
+ int host_prot, int flags, int page_flags,
+ int fd, off_t offset)
+{
+ void *p, *want_p = g2h_untagged(start);
+ abi_ulong last;
+
+ p = mmap(want_p, len, host_prot, flags, fd, offset);
+ if (p == MAP_FAILED) {
+ return -1;
+ }
+ /* If the host kernel does not support MAP_FIXED_NOREPLACE, emulate. */
+ if ((flags & MAP_FIXED_NOREPLACE) && p != want_p) {
+ do_munmap(p, len);
+ errno = EEXIST;
+ return -1;
+ }
+
+ start = h2g(p);
+ last = start + len - 1;
+ return mmap_end(start, last, start, last, flags, page_flags);
+}
+
+/*
+ * Special case host page size < target page size.
+ *
+ * The two special cases are increased guest alignment, and mapping
+ * past the end of a file.
+ *
+ * When mapping files into a memory area larger than the file,
+ * accesses to pages beyond the file size will cause a SIGBUS.
+ *
+ * For example, if mmapping a file of 100 bytes on a host with 4K
+ * pages emulating a target with 8K pages, the target expects to
+ * be able to access the first 8K. But the host will trap us on
+ * any access beyond 4K.
+ *
+ * When emulating a target with a larger page size than the host's,
+ * we may need to truncate file maps at EOF and add extra anonymous
+ * pages up to the target's page boundary.
+ *
+ * This workaround only works for files that do not change.
+ * If the file is later extended (e.g. ftruncate), the SIGBUS
+ * vanishes and the proper behaviour is that changes within the
+ * anon page should be reflected in the file.
+ *
+ * However, this case is rather common with executable images,
+ * so the workaround is important for even trivial tests, whereas
+ * the mmap of a file being extended is less common.
+ */
+static abi_long mmap_h_lt_g(abi_ulong start, abi_ulong len, int host_prot,
+ int mmap_flags, int page_flags, int fd,
+ off_t offset, int host_page_size)
+{
+ void *p, *want_p = g2h_untagged(start);
+ off_t fileend_adj = 0;
+ int flags = mmap_flags;
+ abi_ulong last, pass_last;
+
+ if (!(flags & MAP_ANONYMOUS)) {
+ struct stat sb;
+
+ if (fstat(fd, &sb) == -1) {
+ return -1;
+ }
+ if (offset >= sb.st_size) {
+ /*
+ * The entire map is beyond the end of the file.
+ * Transform it to an anonymous mapping.
+ */
+ flags |= MAP_ANONYMOUS;
+ fd = -1;
+ offset = 0;
+ } else if (offset + len > sb.st_size) {
+ /*
+ * A portion of the map is beyond the end of the file.
+ * Truncate the file portion of the allocation.
+ */
+ fileend_adj = offset + len - sb.st_size;
+ }
+ }
+
+ if (flags & (MAP_FIXED | MAP_FIXED_NOREPLACE)) {
+ if (fileend_adj) {
+ p = mmap(want_p, len, host_prot, flags | MAP_ANONYMOUS, -1, 0);
+ } else {
+ p = mmap(want_p, len, host_prot, flags, fd, offset);
+ }
+ if (p != want_p) {
+ if (p != MAP_FAILED) {
+ /* Host does not support MAP_FIXED_NOREPLACE: emulate. */
+ do_munmap(p, len);
+ errno = EEXIST;
+ }
+ return -1;
+ }
+
+ if (fileend_adj) {
+ void *t = mmap(p, len - fileend_adj, host_prot,
+ (flags & ~MAP_FIXED_NOREPLACE) | MAP_FIXED,
+ fd, offset);
+
+ if (t == MAP_FAILED) {
+ int save_errno = errno;
+
+ /*
+ * We failed a map over the top of the successful anonymous
+ * mapping above. The only failure mode is running out of VMAs,
+ * and there's nothing that we can do to detect that earlier.
+ * If we have replaced an existing mapping with MAP_FIXED,
+ * then we cannot properly recover. It's a coin toss whether
+ * it would be better to exit or continue here.
+ */
+ if (!(flags & MAP_FIXED_NOREPLACE) &&
+ !page_check_range_empty(start, start + len - 1)) {
+ qemu_log("QEMU target_mmap late failure: %s",
+ strerror(save_errno));
+ }
+
+ do_munmap(want_p, len);
+ errno = save_errno;
+ return -1;
+ }
+ }
+ } else {
+ size_t host_len, part_len;
+
+ /*
+ * Take care to align the host memory. Perform a larger anonymous
+ * allocation and extract the aligned portion. Remap the file on
+ * top of that.
+ */
+ host_len = len + TARGET_PAGE_SIZE - host_page_size;
+ p = mmap(want_p, host_len, host_prot, flags | MAP_ANONYMOUS, -1, 0);
+ if (p == MAP_FAILED) {
+ return -1;
+ }
+
+ part_len = (uintptr_t)p & (TARGET_PAGE_SIZE - 1);
+ if (part_len) {
+ part_len = TARGET_PAGE_SIZE - part_len;
+ do_munmap(p, part_len);
+ p += part_len;
+ host_len -= part_len;
+ }
+ if (len < host_len) {
+ do_munmap(p + len, host_len - len);
+ }
+
+ if (!(flags & MAP_ANONYMOUS)) {
+ void *t = mmap(p, len - fileend_adj, host_prot,
+ flags | MAP_FIXED, fd, offset);
+
+ if (t == MAP_FAILED) {
+ int save_errno = errno;
+ do_munmap(p, len);
+ errno = save_errno;
+ return -1;
+ }
+ }
+
+ start = h2g(p);
+ }
+
+ last = start + len - 1;
+ if (fileend_adj) {
+ pass_last = ROUND_UP(last - fileend_adj, host_page_size) - 1;
+ } else {
+ pass_last = last;
+ }
+ return mmap_end(start, last, start, pass_last, mmap_flags, page_flags);
+}
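+
+/*
+ * Worked example for the alignment dance above, with illustrative
+ * numbers: 4K host pages, 16K target pages, len = 16K.  Then
+ * host_len = 16K + 16K - 4K = 28K, and any 28K anonymous block is
+ * guaranteed to contain a 16K-aligned window of 16K: at most
+ * TARGET_PAGE_SIZE - host_page_size = 12K is trimmed from the
+ * unaligned head (part_len), anything past len is trimmed from the
+ * tail, and the file is then remapped MAP_FIXED over what remains.
+ */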
+
+/*
+ * Special case host page size > target page size.
+ *
+ * The two special cases are addresses and file offsets that are
+ * valid for the guest but cannot be directly represented by the host.
+ */
+static abi_long mmap_h_gt_g(abi_ulong start, abi_ulong len,
+ int target_prot, int host_prot,
+ int flags, int page_flags, int fd,
+ off_t offset, int host_page_size)
+{
+ void *p, *want_p = g2h_untagged(start);
+ off_t host_offset = offset & -host_page_size;
+ abi_ulong last, real_start, real_last;
+ bool misaligned_offset = false;
+ size_t host_len;
+
+ if (!(flags & (MAP_FIXED | MAP_FIXED_NOREPLACE))) {
+ /*
+ * Adjust the offset to something representable on the host.
+ */
+ host_len = len + offset - host_offset;
+ p = mmap(want_p, host_len, host_prot, flags, fd, host_offset);
+ if (p == MAP_FAILED) {
+ return -1;
+ }
+
+ /* Update start to the file position at offset. */
+ p += offset - host_offset;
+
+ start = h2g(p);
+ last = start + len - 1;
+ return mmap_end(start, last, start, last, flags, page_flags);
+ }
+
+ if (!(flags & MAP_ANONYMOUS)) {
+ misaligned_offset = (start ^ offset) & (host_page_size - 1);
+
+ /*
+ * The fallback for misalignment is a private mapping + read.
+ * This carries none of the semantics required of MAP_SHARED.
+ */
+ if (misaligned_offset && (flags & MAP_TYPE) != MAP_PRIVATE) {
+ errno = EINVAL;
+ return -1;
+ }
+ }
+
+ last = start + len - 1;
+ real_start = start & -host_page_size;
+ real_last = ROUND_UP(last, host_page_size) - 1;
+
+ /*
+ * Handle the start and end of the mapping.
+ */
+ if (real_start < start) {
+ abi_ulong real_page_last = real_start + host_page_size - 1;
+ if (last <= real_page_last) {
+ /* Entire allocation a subset of one host page. */
+ if (!mmap_frag(real_start, start, last, target_prot,
+ flags, fd, offset)) {
+ return -1;
+ }
+ return mmap_end(start, last, -1, 0, flags, page_flags);
+ }
+
+ if (!mmap_frag(real_start, start, real_page_last, target_prot,
+ flags, fd, offset)) {
+ return -1;
+ }
+ real_start = real_page_last + 1;
+ }
+
+ if (last < real_last) {
+ abi_ulong real_page_start = real_last - host_page_size + 1;
+ if (!mmap_frag(real_page_start, real_page_start, last,
+ target_prot, flags, fd,
+ offset + real_page_start - start)) {
+ return -1;
+ }
+ real_last = real_page_start - 1;
+ }
+
+ if (real_start > real_last) {
+ return mmap_end(start, last, -1, 0, flags, page_flags);
+ }
+
+ /*
+ * Handle the middle of the mapping.
+ */
+
+ host_len = real_last - real_start + 1;
+ want_p += real_start - start;
+
+ if (flags & MAP_ANONYMOUS) {
+ p = mmap(want_p, host_len, host_prot, flags, -1, 0);
+ } else if (!misaligned_offset) {
+ p = mmap(want_p, host_len, host_prot, flags, fd,
+ offset + real_start - start);
+ } else {
+ p = mmap(want_p, host_len, host_prot | PROT_WRITE,
+ flags | MAP_ANONYMOUS, -1, 0);
+ }
+ if (p != want_p) {
+ if (p != MAP_FAILED) {
+ do_munmap(p, host_len);
+ errno = EEXIST;
+ }
+ return -1;
+ }
+
+ if (misaligned_offset) {
+ /* TODO: The read could be short. */
+ if (pread(fd, p, host_len, offset + real_start - start) != host_len) {
+ do_munmap(p, host_len);
+ return -1;
+ }
+ if (!(host_prot & PROT_WRITE)) {
+ mprotect(p, host_len, host_prot);
+ }
+ }
+
+ return mmap_end(start, last, -1, 0, flags, page_flags);
+}
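+
+/*
+ * Illustrative numbers for the misaligned-offset fallback above:
+ * with 64K host pages and 4K target pages, mapping file offset
+ * 0x1000 at guest address 0x2000 gives (start ^ offset) & 0xffff != 0,
+ * so no host mmap can produce that view directly.  A MAP_PRIVATE
+ * request falls back to an anonymous mapping filled by pread(),
+ * while a writable MAP_SHARED request is rejected with EINVAL.
+ */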
+
+static abi_long target_mmap__locked(abi_ulong start, abi_ulong len,
+ int target_prot, int flags, int page_flags,
+ int fd, off_t offset)
+{
+ int host_page_size = qemu_real_host_page_size();
+ int host_prot;
+
+ /*
+ * For reserved_va, we are in full control of the allocation.
+ * Find a suitable hole and convert to MAP_FIXED.
+ */
+ if (reserved_va) {
+ if (flags & MAP_FIXED_NOREPLACE) {
+ /* Validate that the chosen range is empty. */
+ if (!page_check_range_empty(start, start + len - 1)) {
+ errno = EEXIST;
+ return -1;
+ }
+ flags = (flags & ~MAP_FIXED_NOREPLACE) | MAP_FIXED;
+ } else if (!(flags & MAP_FIXED)) {
+ abi_ulong real_start = start & -host_page_size;
+ off_t host_offset = offset & -host_page_size;
+ size_t real_len = len + offset - host_offset;
+ abi_ulong align = MAX(host_page_size, TARGET_PAGE_SIZE);
+
+ start = mmap_find_vma(real_start, real_len, align);
+ if (start == (abi_ulong)-1) {
+ errno = ENOMEM;
+ return -1;
+ }
+ start += offset - host_offset;
+ flags |= MAP_FIXED;
+ }
+ }
+
+ host_prot = target_to_host_prot(target_prot);
+
+ if (host_page_size == TARGET_PAGE_SIZE) {
+ return mmap_h_eq_g(start, len, host_prot, flags,
+ page_flags, fd, offset);
+ } else if (host_page_size < TARGET_PAGE_SIZE) {
+ return mmap_h_lt_g(start, len, host_prot, flags,
+ page_flags, fd, offset, host_page_size);
+ } else {
+ return mmap_h_gt_g(start, len, target_prot, host_prot, flags,
+ page_flags, fd, offset, host_page_size);
+ }
+}
+
+/* NOTE: all the constants are the HOST ones */
+abi_long target_mmap(abi_ulong start, abi_ulong len, int target_prot,
+ int flags, int fd, off_t offset)
+{
+ abi_long ret;
+ int page_flags;
+
+ trace_target_mmap(start, len, target_prot, flags, fd, offset);
+
+ if (!len) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ page_flags = validate_prot_to_pageflags(target_prot);
+ if (!page_flags) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ /* Also check for overflows... */
+ len = TARGET_PAGE_ALIGN(len);
+ if (!len || len != (size_t)len) {
+ errno = ENOMEM;
+ return -1;
+ }
+
+ if (offset & ~TARGET_PAGE_MASK) {
+ errno = EINVAL;
+ return -1;
+ }
+ if (flags & (MAP_FIXED | MAP_FIXED_NOREPLACE)) {
+ if (start & ~TARGET_PAGE_MASK) {
+ errno = EINVAL;
+ return -1;
+ }
+ if (!guest_range_valid_untagged(start, len)) {
+ errno = ENOMEM;
+ return -1;
+ }
+ }
+
+ mmap_lock();
+
+ ret = target_mmap__locked(start, len, target_prot, flags,
+ page_flags, fd, offset);
+
mmap_unlock();
- return -1;
+
+ /*
+ * If we're mapping shared memory, ensure we generate code for parallel
+ * execution and flush old translations. This will work up to the level
+ * supported by the host -- anything that requires EXCP_ATOMIC will not
+ * be atomic with respect to an external process.
+ */
+ if (ret != -1 && (flags & MAP_TYPE) != MAP_PRIVATE) {
+ CPUState *cpu = thread_cpu;
+ if (!(cpu->tcg_cflags & CF_PARALLEL)) {
+ cpu->tcg_cflags |= CF_PARALLEL;
+ tb_flush(cpu);
+ }
+ }
+
+ return ret;
}
static int mmap_reserve_or_unmap(abi_ulong start, abi_ulong len)
{
+ int host_page_size = qemu_real_host_page_size();
abi_ulong real_start;
abi_ulong real_last;
abi_ulong real_len;
@@ -793,8 +980,8 @@
int prot;
last = start + len - 1;
- real_start = start & qemu_host_page_mask;
- real_last = HOST_PAGE_ALIGN(last) - 1;
+ real_start = start & -host_page_size;
+ real_last = ROUND_UP(last, host_page_size) - 1;
/*
* If guest pages remain on the first or last host pages,
@@ -802,7 +989,7 @@
* The single page special case is required for the last page,
* lest real_start overflow to zero.
*/
- if (real_last - real_start < qemu_host_page_size) {
+ if (real_last - real_start < host_page_size) {
prot = 0;
for (a = real_start; a < start; a += TARGET_PAGE_SIZE) {
prot |= page_get_flags(a);
@@ -818,14 +1005,14 @@
prot |= page_get_flags(a);
}
if (prot != 0) {
- real_start += qemu_host_page_size;
+ real_start += host_page_size;
}
for (prot = 0, a = last; a < real_last; a += TARGET_PAGE_SIZE) {
prot |= page_get_flags(a + 1);
}
if (prot != 0) {
- real_last -= qemu_host_page_size;
+ real_last -= host_page_size;
}
if (real_last < real_start) {
@@ -836,13 +1023,7 @@
real_len = real_last - real_start + 1;
host_start = g2h_untagged(real_start);
- if (reserved_va) {
- void *ptr = mmap(host_start, real_len, PROT_NONE,
- MAP_FIXED | MAP_ANONYMOUS
- | MAP_PRIVATE | MAP_NORESERVE, -1, 0);
- return ptr == host_start ? 0 : -1;
- }
- return munmap(host_start, real_len);
+ return do_munmap(host_start, real_len);
}
int target_munmap(abi_ulong start, abi_ulong len)
@@ -1055,69 +1236,161 @@
}
#endif
+#if defined(__arm__) || defined(__mips__) || defined(__sparc__)
+#define HOST_FORCE_SHMLBA 1
+#else
+#define HOST_FORCE_SHMLBA 0
+#endif
+
abi_ulong target_shmat(CPUArchState *cpu_env, int shmid,
abi_ulong shmaddr, int shmflg)
{
CPUState *cpu = env_cpu(cpu_env);
- abi_ulong raddr;
struct shmid_ds shm_info;
int ret;
- abi_ulong shmlba;
+ int h_pagesize;
+ int t_shmlba, h_shmlba, m_shmlba;
+ size_t t_len, h_len, m_len;
/* shmat pointers are always untagged */
- /* find out the length of the shared memory segment */
+ /*
+ * Because we can't use host shmat() unless the address is sufficiently
+ * aligned for the host, we'll need to check both alignments.
+ * TODO: Could be fixed with softmmu.
+ */
+ t_shmlba = target_shmlba(cpu_env);
+ h_pagesize = qemu_real_host_page_size();
+ h_shmlba = (HOST_FORCE_SHMLBA ? SHMLBA : h_pagesize);
+ m_shmlba = MAX(t_shmlba, h_shmlba);
+
+ if (shmaddr) {
+ if (shmaddr & (m_shmlba - 1)) {
+ if (shmflg & SHM_RND) {
+ /*
+ * The guest is allowing the kernel to round the address.
+ * Assume that the guest is ok with us rounding to the
+ * host-required alignment too; if we don't, we'll simply
+ * get an error from the kernel.
+ */
+ shmaddr &= ~(m_shmlba - 1);
+ if (shmaddr == 0 && (shmflg & SHM_REMAP)) {
+ return -TARGET_EINVAL;
+ }
+ } else {
+ int require = TARGET_PAGE_SIZE;
+#ifdef TARGET_FORCE_SHMLBA
+ require = t_shmlba;
+#endif
+ /*
+ * Include host required alignment, as otherwise we cannot
+ * use host shmat at all.
+ */
+ require = MAX(require, h_shmlba);
+ if (shmaddr & (require - 1)) {
+ return -TARGET_EINVAL;
+ }
+ }
+ }
+ } else {
+ if (shmflg & SHM_REMAP) {
+ return -TARGET_EINVAL;
+ }
+ }
+ /* All rounding has now been done manually. */
+ shmflg &= ~SHM_RND;
+
+ /* Find out the length of the shared memory segment. */
ret = get_errno(shmctl(shmid, IPC_STAT, &shm_info));
if (is_error(ret)) {
/* can't get length, bail out */
return ret;
}
+ t_len = TARGET_PAGE_ALIGN(shm_info.shm_segsz);
+ h_len = ROUND_UP(shm_info.shm_segsz, h_pagesize);
+ m_len = MAX(t_len, h_len);
- shmlba = target_shmlba(cpu_env);
-
- if (shmaddr & (shmlba - 1)) {
- if (shmflg & SHM_RND) {
- shmaddr &= ~(shmlba - 1);
- } else {
- return -TARGET_EINVAL;
- }
- }
- if (!guest_range_valid_untagged(shmaddr, shm_info.shm_segsz)) {
+ if (!guest_range_valid_untagged(shmaddr, m_len)) {
return -TARGET_EINVAL;
}
WITH_MMAP_LOCK_GUARD() {
- void *host_raddr;
+ bool mapped = false;
+ void *want, *test;
abi_ulong last;
- if (shmaddr) {
- host_raddr = shmat(shmid, (void *)g2h_untagged(shmaddr), shmflg);
- } else {
- abi_ulong mmap_start;
-
- /* In order to use the host shmat, we need to honor host SHMLBA. */
- mmap_start = mmap_find_vma(0, shm_info.shm_segsz,
- MAX(SHMLBA, shmlba));
-
- if (mmap_start == -1) {
+ if (!shmaddr) {
+ shmaddr = mmap_find_vma(0, m_len, m_shmlba);
+ if (shmaddr == -1) {
return -TARGET_ENOMEM;
}
- host_raddr = shmat(shmid, g2h_untagged(mmap_start),
- shmflg | SHM_REMAP);
+ mapped = !reserved_va;
+ } else if (shmflg & SHM_REMAP) {
+ /*
+ * If host page size > target page size, the host shmat may map
+ * more memory than the guest expects. Reject a mapping that
+ * would replace memory in the unexpected gap.
+ * TODO: Could be fixed with softmmu.
+ */
+ if (t_len < h_len &&
+ !page_check_range_empty(shmaddr + t_len,
+ shmaddr + h_len - 1)) {
+ return -TARGET_EINVAL;
+ }
+ } else {
+ if (!page_check_range_empty(shmaddr, shmaddr + m_len - 1)) {
+ return -TARGET_EINVAL;
+ }
}
- if (host_raddr == (void *)-1) {
- return get_errno(-1);
- }
- raddr = h2g(host_raddr);
- last = raddr + shm_info.shm_segsz - 1;
+ /* All placement is now complete. */
+ want = (void *)g2h_untagged(shmaddr);
- page_set_flags(raddr, last,
+ /*
+ * Map anonymous pages across the entire range, then remap with
+ * the shared memory. This is required for a number of corner
+ * cases for which host and guest page sizes differ.
+ */
+ if (h_len != t_len) {
+ int mmap_p = PROT_READ | (shmflg & SHM_RDONLY ? 0 : PROT_WRITE);
+ int mmap_f = MAP_PRIVATE | MAP_ANONYMOUS
+ | (reserved_va || (shmflg & SHM_REMAP)
+ ? MAP_FIXED : MAP_FIXED_NOREPLACE);
+
+ test = mmap(want, m_len, mmap_p, mmap_f, -1, 0);
+ if (unlikely(test != want)) {
+ /* shmat returns EINVAL, not EEXIST as mmap does. */
+ ret = (test == MAP_FAILED && errno != EEXIST
+ ? get_errno(-1) : -TARGET_EINVAL);
+ if (mapped) {
+ do_munmap(want, m_len);
+ }
+ return ret;
+ }
+ mapped = true;
+ }
+
+ if (reserved_va || mapped) {
+ shmflg |= SHM_REMAP;
+ }
+ test = shmat(shmid, want, shmflg);
+ if (test == MAP_FAILED) {
+ ret = get_errno(-1);
+ if (mapped) {
+ do_munmap(want, m_len);
+ }
+ return ret;
+ }
+ assert(test == want);
+
+ last = shmaddr + m_len - 1;
+ page_set_flags(shmaddr, last,
PAGE_VALID | PAGE_RESET | PAGE_READ |
- (shmflg & SHM_RDONLY ? 0 : PAGE_WRITE));
+ (shmflg & SHM_RDONLY ? 0 : PAGE_WRITE) |
+ (shmflg & SHM_EXEC ? PAGE_EXEC : 0));
- shm_region_rm_complete(raddr, last);
- shm_region_add(raddr, last);
+ shm_region_rm_complete(shmaddr, last);
+ shm_region_add(shmaddr, last);
}
/*
@@ -1131,7 +1404,15 @@
tb_flush(cpu);
}
- return raddr;
+ if (qemu_loglevel_mask(CPU_LOG_PAGE)) {
+ FILE *f = qemu_log_trylock();
+ if (f) {
+ fprintf(f, "page layout changed following shmat\n");
+ page_dump(f);
+ qemu_log_unlock(f);
+ }
+ }
+ return shmaddr;
}
abi_long target_shmdt(abi_ulong shmaddr)
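
To make the shmat alignment bookkeeping concrete: for an Arm guest (where
target_shmlba() is 16 KiB on 4 KiB pages -- illustrative values) on an
x86-64 host (HOST_FORCE_SHMLBA is 0, so h_shmlba == h_pagesize == 4 KiB),
m_shmlba works out to 16 KiB:

    t_shmlba = target_shmlba(cpu_env);                  /* 0x4000 */
    h_shmlba = HOST_FORCE_SHMLBA ? SHMLBA : h_pagesize; /* 0x1000 */
    m_shmlba = MAX(t_shmlba, h_shmlba);                 /* 0x4000 */

so an explicit shmaddr must be 16 KiB aligned unless SHM_RND allows it to
be rounded down.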
diff --git a/linux-user/strace.c b/linux-user/strace.c
index cf26e55..8d13e55 100644
--- a/linux-user/strace.c
+++ b/linux-user/strace.c
@@ -670,6 +670,26 @@
}
#endif
+static void
+print_shmat(CPUArchState *cpu_env, const struct syscallname *name,
+ abi_long arg0, abi_long arg1, abi_long arg2,
+ abi_long arg3, abi_long arg4, abi_long arg5)
+{
+ static const struct flags shmat_flags[] = {
+ FLAG_GENERIC(SHM_RND),
+ FLAG_GENERIC(SHM_REMAP),
+ FLAG_GENERIC(SHM_RDONLY),
+ FLAG_GENERIC(SHM_EXEC),
+ FLAG_END
+ };
+
+ print_syscall_prologue(name);
+ print_raw_param(TARGET_ABI_FMT_ld, arg0, 0);
+ print_pointer(arg1, 0);
+ print_flags(shmat_flags, arg2, 1);
+ print_syscall_epilogue(name);
+}
+
#ifdef TARGET_NR_ipc
static void
print_ipc(CPUArchState *cpu_env, const struct syscallname *name,
@@ -683,6 +703,10 @@
print_ipc_cmd(arg3);
qemu_log(",0x" TARGET_ABI_FMT_lx ")", arg4);
break;
+ case IPCOP_shmat:
+ print_shmat(cpu_env, &(const struct syscallname){ .name = "shmat" },
+ arg1, arg4, arg2, 0, 0, 0);
+ break;
default:
qemu_log(("%s("
TARGET_ABI_FMT_ld ","
diff --git a/linux-user/strace.list b/linux-user/strace.list
index 6655d4f..dfd4237 100644
--- a/linux-user/strace.list
+++ b/linux-user/strace.list
@@ -1398,7 +1398,7 @@
{ TARGET_NR_sgetmask, "sgetmask" , NULL, NULL, NULL },
#endif
#ifdef TARGET_NR_shmat
-{ TARGET_NR_shmat, "shmat" , NULL, NULL, print_syscall_ret_addr },
+{ TARGET_NR_shmat, "shmat" , NULL, print_shmat, print_syscall_ret_addr },
#endif
#ifdef TARGET_NR_shmctl
{ TARGET_NR_shmctl, "shmctl" , NULL, NULL, NULL },
diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index e384e14..bc8c065 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -7994,6 +7994,10 @@
path = "[heap]";
} else if (start == info->vdso) {
path = "[vdso]";
+#ifdef TARGET_X86_64
+ } else if (start == TARGET_VSYSCALL_PAGE) {
+ path = "[vsyscall]";
+#endif
}
/* Except null device (MAP_ANON), adjust offset for this fragment. */
@@ -8082,6 +8086,18 @@
uintptr_t host_start = (uintptr_t)g2h_untagged(guest_start);
uintptr_t host_last = (uintptr_t)g2h_untagged(guest_end - 1);
+#ifdef TARGET_X86_64
+ /*
+ * Because of the extremely high position of the page within the guest
+ * virtual address space, this is not backed by host memory at all.
+ * Therefore the loop below would fail. This is the only instance
+ * of not having host backing memory.
+ */
+ if (guest_start == TARGET_VSYSCALL_PAGE) {
+ return open_self_maps_3(opaque, guest_start, guest_end, flags);
+ }
+#endif
+
while (1) {
IntervalTreeNode *n =
interval_tree_iter_first(d->host_maps, host_start, host_start);
diff --git a/meson.build b/meson.build
index 0ef1654..c59ca49 100644
--- a/meson.build
+++ b/meson.build
@@ -555,17 +555,24 @@
# Check further flags that make QEMU more robust against malicious parties
hardening_flags = [
- # Zero out registers used during a function call
- # upon its return. This makes it harder to assemble
- # ROP gadgets into something usable
- '-fzero-call-used-regs=used-gpr',
-
# Initialize all stack variables to zero. This makes
# it harder to take advantage of uninitialized stack
# data to drive exploits
'-ftrivial-auto-var-init=zero',
]
+# Zero out registers used during a function call
+# upon its return. This makes it harder to assemble
+# ROP gadgets into something usable
+#
+# NB: Clang 17 is broken and SEGVs
+# https://github.com/llvm/llvm-project/issues/75168
+if cc.compiles('extern struct { void (*cb)(void); } s; void f(void) { s.cb(); }',
+ name: '-fzero-call-used-regs=used-gpr',
+ args: ['-O2', '-fzero-call-used-regs=used-gpr'])
+ hardening_flags += '-fzero-call-used-regs=used-gpr'
+endif
+
qemu_common_flags += cc.get_supported_arguments(hardening_flags)
add_global_arguments(qemu_common_flags, native: false, language: all_languages)
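
The probe added above compiles a real translation unit with the flag, so a
compiler that crashes on it (the Clang 17 bug referenced in the NB) simply
fails the check and the flag is left out of hardening_flags. Shown
standalone for reference -- the indirect call through a struct-member
function pointer is the construct being exercised:

    /* cc -O2 -fzero-call-used-regs=used-gpr -c probe.c */
    extern struct { void (*cb)(void); } s;
    void f(void) { s.cb(); }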
diff --git a/migration/fd.c b/migration/fd.c
index 0eb677d..d4ae72d 100644
--- a/migration/fd.c
+++ b/migration/fd.c
@@ -15,18 +15,41 @@
*/
#include "qemu/osdep.h"
+#include "qapi/error.h"
#include "channel.h"
#include "fd.h"
#include "migration.h"
#include "monitor/monitor.h"
+#include "io/channel-file.h"
#include "io/channel-util.h"
+#include "options.h"
#include "trace.h"
+static struct FdOutgoingArgs {
+ int fd;
+} outgoing_args;
+
+int fd_args_get_fd(void)
+{
+ return outgoing_args.fd;
+}
+
+void fd_cleanup_outgoing_migration(void)
+{
+ if (outgoing_args.fd > 0) {
+ close(outgoing_args.fd);
+ outgoing_args.fd = -1;
+ }
+}
+
void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error **errp)
{
QIOChannel *ioc;
int fd = monitor_get_fd(monitor_cur(), fdname, errp);
+
+ outgoing_args.fd = -1;
+
if (fd == -1) {
return;
}
@@ -38,6 +61,8 @@
return;
}
+ outgoing_args.fd = fd;
+
qio_channel_set_name(ioc, "migration-fd-outgoing");
migration_channel_connect(s, ioc, NULL, NULL);
object_unref(OBJECT(ioc));
@@ -73,4 +98,23 @@
fd_accept_incoming_migration,
NULL, NULL,
g_main_context_get_thread_default());
+
+ if (migrate_multifd()) {
+ int channels = migrate_multifd_channels();
+
+ while (channels--) {
+ ioc = QIO_CHANNEL(qio_channel_file_new_fd(dup(fd)));
+
+ if (QIO_CHANNEL_FILE(ioc)->fd == -1) {
+ error_setg(errp, "Failed to duplicate fd %d", fd);
+ return;
+ }
+
+ qio_channel_set_name(ioc, "migration-fd-incoming");
+ qio_channel_add_watch_full(ioc, G_IO_IN,
+ fd_accept_incoming_migration,
+ NULL, NULL,
+ g_main_context_get_thread_default());
+ }
+ }
}
diff --git a/migration/fd.h b/migration/fd.h
index b901bc0..0c0a18d 100644
--- a/migration/fd.h
+++ b/migration/fd.h
@@ -20,4 +20,6 @@
void fd_start_outgoing_migration(MigrationState *s, const char *fdname,
Error **errp);
+void fd_cleanup_outgoing_migration(void);
+int fd_args_get_fd(void);
#endif
diff --git a/migration/file.c b/migration/file.c
index 5d4975f..164b079 100644
--- a/migration/file.c
+++ b/migration/file.c
@@ -6,17 +6,25 @@
*/
#include "qemu/osdep.h"
+#include "exec/ramblock.h"
#include "qemu/cutils.h"
+#include "qemu/error-report.h"
#include "qapi/error.h"
#include "channel.h"
+#include "fd.h"
#include "file.h"
#include "migration.h"
#include "io/channel-file.h"
#include "io/channel-util.h"
+#include "options.h"
#include "trace.h"
#define OFFSET_OPTION ",offset="
+static struct FileOutgoingArgs {
+ char *fname;
+} outgoing_args;
+
/* Remove the offset option from @filespec and return it in @offsetp. */
int file_parse_offset(char *filespec, uint64_t *offsetp, Error **errp)
@@ -36,6 +44,41 @@
return 0;
}
+void file_cleanup_outgoing_migration(void)
+{
+ g_free(outgoing_args.fname);
+ outgoing_args.fname = NULL;
+}
+
+bool file_send_channel_create(gpointer opaque, Error **errp)
+{
+ QIOChannelFile *ioc;
+ int flags = O_WRONLY;
+ bool ret = false;
+ int fd = fd_args_get_fd();
+
+ if (fd && fd != -1) {
+ ioc = qio_channel_file_new_fd(dup(fd));
+ } else {
+ ioc = qio_channel_file_new_path(outgoing_args.fname, flags, 0, errp);
+ if (!ioc) {
+ goto out;
+ }
+ }
+
+ multifd_channel_connect(opaque, QIO_CHANNEL(ioc));
+ ret = true;
+
+out:
+ /*
+ * File channel creation is synchronous. However posting this
+ * semaphore here is simpler than adding a special case.
+ */
+ multifd_send_channel_created();
+
+ return ret;
+}
+
void file_start_outgoing_migration(MigrationState *s,
FileMigrationArgs *file_args, Error **errp)
{
@@ -52,6 +95,8 @@
return;
}
+ outgoing_args.fname = g_strdup(filename);
+
ioc = QIO_CHANNEL(fioc);
if (offset && qio_channel_io_seek(ioc, offset, SEEK_SET, errp) < 0) {
return;
@@ -74,7 +119,8 @@
g_autofree char *filename = g_strdup(file_args->filename);
QIOChannelFile *fioc = NULL;
uint64_t offset = file_args->offset;
- QIOChannel *ioc;
+ int channels = 1;
+ int i = 0;
trace_migration_file_incoming(filename);
@@ -83,13 +129,100 @@
return;
}
- ioc = QIO_CHANNEL(fioc);
- if (offset && qio_channel_io_seek(ioc, offset, SEEK_SET, errp) < 0) {
+ if (offset &&
+ qio_channel_io_seek(QIO_CHANNEL(fioc), offset, SEEK_SET, errp) < 0) {
return;
}
- qio_channel_set_name(QIO_CHANNEL(ioc), "migration-file-incoming");
- qio_channel_add_watch_full(ioc, G_IO_IN,
- file_accept_incoming_migration,
- NULL, NULL,
- g_main_context_get_thread_default());
+
+ if (migrate_multifd()) {
+ channels += migrate_multifd_channels();
+ }
+
+ do {
+ QIOChannel *ioc = QIO_CHANNEL(fioc);
+
+ qio_channel_set_name(ioc, "migration-file-incoming");
+ qio_channel_add_watch_full(ioc, G_IO_IN,
+ file_accept_incoming_migration,
+ NULL, NULL,
+ g_main_context_get_thread_default());
+
+ fioc = qio_channel_file_new_fd(dup(fioc->fd));
+
+ if (!fioc || fioc->fd == -1) {
+ error_setg(errp, "Error creating migration incoming channel");
+ break;
+ }
+ } while (++i < channels);
+}
+
+int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov,
+ int niov, RAMBlock *block, Error **errp)
+{
+ ssize_t ret = -1;
+ int i, slice_idx, slice_num;
+ uintptr_t base, next, offset;
+ size_t len;
+
+ slice_idx = 0;
+ slice_num = 1;
+
+ /*
+ * If the iov array doesn't have contiguous elements, we need to
+ * split it into slices because we only have one file offset for the
+ * whole iov. Do this here so callers don't need to break the iov
+ * array themselves.
+ */
+ for (i = 0; i < niov; i++, slice_num++) {
+ base = (uintptr_t) iov[i].iov_base;
+
+ if (i != niov - 1) {
+ len = iov[i].iov_len;
+ next = (uintptr_t) iov[i + 1].iov_base;
+
+ if (base + len == next) {
+ continue;
+ }
+ }
+
+ /*
+ * Use the offset of the first element of the segment that
+ * we're sending.
+ */
+ offset = (uintptr_t) iov[slice_idx].iov_base - (uintptr_t) block->host;
+ if (offset >= block->used_length) {
+ error_setg(errp, "offset " RAM_ADDR_FMT
+ "outside of ramblock %s range", offset, block->idstr);
+ ret = -1;
+ break;
+ }
+
+ ret = qio_channel_pwritev(ioc, &iov[slice_idx], slice_num,
+ block->pages_offset + offset, errp);
+ if (ret < 0) {
+ break;
+ }
+
+ slice_idx += slice_num;
+ slice_num = 0;
+ }
+
+ return (ret < 0) ? ret : 0;
+}
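+
+/*
+ * Slicing example (illustrative): for three iovs at host addresses
+ * 0x1000, 0x2000 and 0x5000, each 0x1000 bytes long, the first two
+ * are contiguous and are flushed as one two-element pwritev at the
+ * file offset derived from 0x1000; the gap before 0x5000 then opens
+ * a new slice with its own offset.
+ */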
+
+int multifd_file_recv_data(MultiFDRecvParams *p, Error **errp)
+{
+ MultiFDRecvData *data = p->data;
+ size_t ret;
+
+ ret = qio_channel_pread(p->c, (char *) data->opaque,
+ data->size, data->file_offset, errp);
+ if (ret != data->size) {
+ error_prepend(errp,
+ "multifd recv (%u): read 0x%zx, expected 0x%zx",
+ p->id, ret, data->size);
+ return -1;
+ }
+
+ return 0;
}
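
A subtlety on the multifd file channels: dup()'ed descriptors share one
file offset across channels, so plain read()/write() would race. The code
therefore sticks to positioned I/O with explicit offsets, as in the
receive path above:

    /* Each channel reads its own region; the offset is passed in. */
    ret = qio_channel_pread(p->c, (char *)data->opaque,
                            data->size, data->file_offset, errp);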
diff --git a/migration/file.h b/migration/file.h
index 37d6a08..9f71e87 100644
--- a/migration/file.h
+++ b/migration/file.h
@@ -9,10 +9,18 @@
#define QEMU_MIGRATION_FILE_H
#include "qapi/qapi-types-migration.h"
+#include "io/task.h"
+#include "channel.h"
+#include "multifd.h"
void file_start_incoming_migration(FileMigrationArgs *file_args, Error **errp);
void file_start_outgoing_migration(MigrationState *s,
FileMigrationArgs *file_args, Error **errp);
int file_parse_offset(char *filespec, uint64_t *offsetp, Error **errp);
+void file_cleanup_outgoing_migration(void);
+bool file_send_channel_create(gpointer opaque, Error **errp);
+int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov,
+ int niov, RAMBlock *block, Error **errp);
+int multifd_file_recv_data(MultiFDRecvParams *p, Error **errp);
#endif
diff --git a/migration/migration.c b/migration/migration.c
index bab68bc..a49fcd5 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -140,9 +140,38 @@
if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) {
SocketAddress *saddr = &addr->u.socket;
- return saddr->type == SOCKET_ADDRESS_TYPE_INET ||
- saddr->type == SOCKET_ADDRESS_TYPE_UNIX ||
- saddr->type == SOCKET_ADDRESS_TYPE_VSOCK;
+ if (saddr->type == SOCKET_ADDRESS_TYPE_FD) {
+ return migrate_mapped_ram();
+ }
+
+ return (saddr->type == SOCKET_ADDRESS_TYPE_INET ||
+ saddr->type == SOCKET_ADDRESS_TYPE_UNIX ||
+ saddr->type == SOCKET_ADDRESS_TYPE_VSOCK);
+ } else if (addr->transport == MIGRATION_ADDRESS_TYPE_FILE) {
+ return migrate_mapped_ram();
+ } else {
+ return false;
+ }
+}
+
+static bool migration_needs_seekable_channel(void)
+{
+ return migrate_mapped_ram();
+}
+
+static bool transport_supports_seeking(MigrationAddress *addr)
+{
+ if (addr->transport == MIGRATION_ADDRESS_TYPE_FILE) {
+ return true;
+ }
+
+ /*
+ * At this point, the user might not yet have passed the file
+ * descriptor to QEMU, so we cannot know for sure whether it
+ * refers to a plain file or a socket. Let it through anyway.
+ */
+ if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) {
+ return addr->u.socket.type == SOCKET_ADDRESS_TYPE_FD;
}
return false;
@@ -152,6 +181,12 @@
migration_channels_and_transport_compatible(MigrationAddress *addr,
Error **errp)
{
+ if (migration_needs_seekable_channel() &&
+ !transport_supports_seeking(addr)) {
+ error_setg(errp, "Migration requires seekable transport (e.g. file)");
+ return false;
+ }
+
if (migration_needs_multiple_sockets() &&
!transport_supports_multi_channels(addr)) {
error_setg(errp, "Migration requires multi-channel URIs (e.g. tcp)");
@@ -881,7 +916,8 @@
uint32_t channel_magic = 0;
int ret = 0;
- if (migrate_multifd() && !migrate_postcopy_ram() &&
+ if (migrate_multifd() && !migrate_mapped_ram() &&
+ !migrate_postcopy_ram() &&
qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
/*
* With multiple channels, it is possible that we receive channels
@@ -1950,6 +1986,18 @@
return false;
}
+ if (migrate_mapped_ram()) {
+ if (migrate_tls()) {
+ error_setg(errp, "Cannot use TLS with mapped-ram");
+ return false;
+ }
+
+ if (migrate_multifd_compression()) {
+ error_setg(errp, "Cannot use compression with mapped-ram");
+ return false;
+ }
+ }
+
if (migrate_mode_is_cpr(s)) {
const char *conflict = NULL;
diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
index 012e3bd..6120faa 100644
--- a/migration/multifd-zlib.c
+++ b/migration/multifd-zlib.c
@@ -69,7 +69,7 @@
err_msg = "out of memory for buf";
goto err_free_zbuff;
}
- p->data = z;
+ p->compress_data = z;
return 0;
err_free_zbuff:
@@ -92,15 +92,15 @@
*/
static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp)
{
- struct zlib_data *z = p->data;
+ struct zlib_data *z = p->compress_data;
deflateEnd(&z->zs);
g_free(z->zbuff);
z->zbuff = NULL;
g_free(z->buf);
z->buf = NULL;
- g_free(p->data);
- p->data = NULL;
+ g_free(p->compress_data);
+ p->compress_data = NULL;
}
/**
@@ -117,7 +117,7 @@
static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
{
MultiFDPages_t *pages = p->pages;
- struct zlib_data *z = p->data;
+ struct zlib_data *z = p->compress_data;
z_stream *zs = &z->zs;
uint32_t out_size = 0;
int ret;
@@ -194,7 +194,7 @@
struct zlib_data *z = g_new0(struct zlib_data, 1);
z_stream *zs = &z->zs;
- p->data = z;
+ p->compress_data = z;
zs->zalloc = Z_NULL;
zs->zfree = Z_NULL;
zs->opaque = Z_NULL;
@@ -224,17 +224,17 @@
*/
static void zlib_recv_cleanup(MultiFDRecvParams *p)
{
- struct zlib_data *z = p->data;
+ struct zlib_data *z = p->compress_data;
inflateEnd(&z->zs);
g_free(z->zbuff);
z->zbuff = NULL;
- g_free(p->data);
- p->data = NULL;
+ g_free(p->compress_data);
+ p->compress_data = NULL;
}
/**
- * zlib_recv_pages: read the data from the channel into actual pages
+ * zlib_recv: read the data from the channel into actual pages
*
* Read the compressed buffer, and uncompress it into the actual
* pages.
@@ -244,9 +244,9 @@
* @p: Params for the channel that we are using
* @errp: pointer to an error
*/
-static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp)
+static int zlib_recv(MultiFDRecvParams *p, Error **errp)
{
- struct zlib_data *z = p->data;
+ struct zlib_data *z = p->compress_data;
z_stream *zs = &z->zs;
uint32_t in_size = p->next_packet_size;
/* we measure the change of total_out */
@@ -319,7 +319,7 @@
.send_prepare = zlib_send_prepare,
.recv_setup = zlib_recv_setup,
.recv_cleanup = zlib_recv_cleanup,
- .recv_pages = zlib_recv_pages
+ .recv = zlib_recv
};
static void multifd_zlib_register(void)
diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c
index dc8fe43..cac2368 100644
--- a/migration/multifd-zstd.c
+++ b/migration/multifd-zstd.c
@@ -52,7 +52,7 @@
struct zstd_data *z = g_new0(struct zstd_data, 1);
int res;
- p->data = z;
+ p->compress_data = z;
z->zcs = ZSTD_createCStream();
if (!z->zcs) {
g_free(z);
@@ -90,14 +90,14 @@
*/
static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp)
{
- struct zstd_data *z = p->data;
+ struct zstd_data *z = p->compress_data;
ZSTD_freeCStream(z->zcs);
z->zcs = NULL;
g_free(z->zbuff);
z->zbuff = NULL;
- g_free(p->data);
- p->data = NULL;
+ g_free(p->compress_data);
+ p->compress_data = NULL;
}
/**
@@ -114,7 +114,7 @@
static int zstd_send_prepare(MultiFDSendParams *p, Error **errp)
{
MultiFDPages_t *pages = p->pages;
- struct zstd_data *z = p->data;
+ struct zstd_data *z = p->compress_data;
int ret;
uint32_t i;
@@ -183,7 +183,7 @@
struct zstd_data *z = g_new0(struct zstd_data, 1);
int ret;
- p->data = z;
+ p->compress_data = z;
z->zds = ZSTD_createDStream();
if (!z->zds) {
g_free(z);
@@ -221,18 +221,18 @@
*/
static void zstd_recv_cleanup(MultiFDRecvParams *p)
{
- struct zstd_data *z = p->data;
+ struct zstd_data *z = p->compress_data;
ZSTD_freeDStream(z->zds);
z->zds = NULL;
g_free(z->zbuff);
z->zbuff = NULL;
- g_free(p->data);
- p->data = NULL;
+ g_free(p->compress_data);
+ p->compress_data = NULL;
}
/**
- * zstd_recv_pages: read the data from the channel into actual pages
+ * zstd_recv: read the data from the channel into actual pages
*
* Read the compressed buffer, and uncompress it into the actual
* pages.
@@ -242,13 +242,13 @@
* @p: Params for the channel that we are using
* @errp: pointer to an error
*/
-static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp)
+static int zstd_recv(MultiFDRecvParams *p, Error **errp)
{
uint32_t in_size = p->next_packet_size;
uint32_t out_size = 0;
uint32_t expected_size = p->normal_num * p->page_size;
uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;
- struct zstd_data *z = p->data;
+ struct zstd_data *z = p->compress_data;
int ret;
int i;
@@ -310,7 +310,7 @@
.send_prepare = zstd_send_prepare,
.recv_setup = zstd_recv_setup,
.recv_cleanup = zstd_recv_cleanup,
- .recv_pages = zstd_recv_pages
+ .recv = zstd_recv
};
static void multifd_zstd_register(void)
diff --git a/migration/multifd.c b/migration/multifd.c
index 6c07f19..d4a44da 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -17,7 +17,8 @@
#include "exec/ramblock.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
-#include "ram.h"
+#include "fd.h"
+#include "file.h"
#include "migration.h"
#include "migration-stats.h"
#include "socket.h"
@@ -28,6 +29,7 @@
#include "threadinfo.h"
#include "options.h"
#include "qemu/yank.h"
+#include "io/channel-file.h"
#include "io/channel-socket.h"
#include "yank_functions.h"
@@ -81,9 +83,13 @@
struct {
MultiFDRecvParams *params;
+ MultiFDRecvData *data;
/* number of created threads */
int count;
- /* syncs main thread and channels */
+ /*
+ * This is always posted by the recv threads; the migration thread
+ * uses it to wait for the recv threads to finish assigned tasks.
+ */
QemuSemaphore sem_sync;
/* global number of generated multifd packets */
uint64_t packet_num;
@@ -92,6 +98,27 @@
MultiFDMethods *ops;
} *multifd_recv_state;
+static bool multifd_use_packets(void)
+{
+ return !migrate_mapped_ram();
+}
+
+void multifd_send_channel_created(void)
+{
+ qemu_sem_post(&multifd_send_state->channels_created);
+}
+
+static void multifd_set_file_bitmap(MultiFDSendParams *p)
+{
+ MultiFDPages_t *pages = p->pages;
+
+ assert(pages->block);
+
+ for (int i = 0; i < p->pages->num; i++) {
+ ramblock_set_file_bmap_atomic(pages->block, pages->offset[i]);
+ }
+}
+
/* Multifd without compression */
/**
@@ -122,6 +149,19 @@
return;
}
+static void multifd_send_prepare_iovs(MultiFDSendParams *p)
+{
+ MultiFDPages_t *pages = p->pages;
+
+ for (int i = 0; i < pages->num; i++) {
+ p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i];
+ p->iov[p->iovs_num].iov_len = p->page_size;
+ p->iovs_num++;
+ }
+
+ p->next_packet_size = pages->num * p->page_size;
+}
+
/**
 * nocomp_send_prepare: prepare data to be able to send
*
@@ -136,9 +176,15 @@
static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
{
bool use_zero_copy_send = migrate_zero_copy_send();
- MultiFDPages_t *pages = p->pages;
int ret;
+ if (!multifd_use_packets()) {
+ multifd_send_prepare_iovs(p);
+ multifd_set_file_bitmap(p);
+
+ return 0;
+ }
+
if (!use_zero_copy_send) {
/*
* Only !zerocopy needs the header in IOV; zerocopy will
@@ -147,13 +193,7 @@
multifd_send_prepare_header(p);
}
- for (int i = 0; i < pages->num; i++) {
- p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i];
- p->iov[p->iovs_num].iov_len = p->page_size;
- p->iovs_num++;
- }
-
- p->next_packet_size = pages->num * p->page_size;
+ multifd_send_prepare_iovs(p);
p->flags |= MULTIFD_FLAG_NOCOMP;
multifd_send_fill_packet(p);
@@ -197,7 +237,7 @@
}
/**
- * nocomp_recv_pages: read the data from the channel into actual pages
+ * nocomp_recv: read the data from the channel
*
* For no compression we just need to read things into the correct place.
*
@@ -206,9 +246,15 @@
* @p: Params for the channel that we are using
* @errp: pointer to an error
*/
-static int nocomp_recv_pages(MultiFDRecvParams *p, Error **errp)
+static int nocomp_recv(MultiFDRecvParams *p, Error **errp)
{
- uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;
+ uint32_t flags;
+
+ if (!multifd_use_packets()) {
+ return multifd_file_recv_data(p, errp);
+ }
+
+ flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;
if (flags != MULTIFD_FLAG_NOCOMP) {
error_setg(errp, "multifd %u: flags received %x flags expected %x",
@@ -228,7 +274,7 @@
.send_prepare = nocomp_send_prepare,
.recv_setup = nocomp_recv_setup,
.recv_cleanup = nocomp_recv_cleanup,
- .recv_pages = nocomp_recv_pages
+ .recv = nocomp_recv
};
static MultiFDMethods *multifd_ops[MULTIFD_COMPRESSION__MAX] = {
@@ -663,6 +709,19 @@
{
if (p->c) {
migration_ioc_unregister_yank(p->c);
+ /*
+ * An explicit close() on the channel here is normally not
+ * required, but can be helpful for "file:" iochannels, where it
+ * will include fdatasync() to make sure the data is flushed to the
+ * disk backend.
+ *
+ * The object_unref() cannot guarantee that because: (1) finalize()
+ * of the iochannel is only triggered on the last reference, and
+ * it's not guaranteed that we always hold the last refcount when
+ * reaching here, and, (2) even if finalize() is invoked, it only
+ * does a close(fd) without data flush.
+ */
+ qio_channel_close(p->c, &error_abort);
object_unref(OBJECT(p->c));
p->c = NULL;
}
@@ -684,6 +743,8 @@
static void multifd_send_cleanup_state(void)
{
+ file_cleanup_outgoing_migration();
+ fd_cleanup_outgoing_migration();
socket_cleanup_outgoing_migration();
qemu_sem_destroy(&multifd_send_state->channels_created);
qemu_sem_destroy(&multifd_send_state->channels_ready);
@@ -795,15 +856,18 @@
MigrationThread *thread = NULL;
Error *local_err = NULL;
int ret = 0;
+ bool use_packets = multifd_use_packets();
thread = migration_threads_add(p->name, qemu_get_thread_id());
trace_multifd_send_thread_start(p->id);
rcu_register_thread();
- if (multifd_send_initial_packet(p, &local_err) < 0) {
- ret = -1;
- goto out;
+ if (use_packets) {
+ if (multifd_send_initial_packet(p, &local_err) < 0) {
+ ret = -1;
+ goto out;
+ }
}
while (true) {
@@ -829,8 +893,15 @@
break;
}
- ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL,
- 0, p->write_flags, &local_err);
+ if (migrate_mapped_ram()) {
+ ret = file_write_ramblock_iov(p->c, p->iov, p->iovs_num,
+ p->pages->block, &local_err);
+ } else {
+ ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num,
+ NULL, 0, p->write_flags,
+ &local_err);
+ }
+
if (ret != 0) {
break;
}
@@ -854,16 +925,20 @@
* it doesn't require explicit memory barriers.
*/
assert(qatomic_read(&p->pending_sync));
- p->flags = MULTIFD_FLAG_SYNC;
- multifd_send_fill_packet(p);
- ret = qio_channel_write_all(p->c, (void *)p->packet,
- p->packet_len, &local_err);
- if (ret != 0) {
- break;
+
+ if (use_packets) {
+ p->flags = MULTIFD_FLAG_SYNC;
+ multifd_send_fill_packet(p);
+ ret = qio_channel_write_all(p->c, (void *)p->packet,
+ p->packet_len, &local_err);
+ if (ret != 0) {
+ break;
+ }
+ /* p->next_packet_size will always be zero for a SYNC packet */
+ stat64_add(&mig_stats.multifd_bytes, p->packet_len);
+ p->flags = 0;
}
- /* p->next_packet_size will always be zero for a SYNC packet */
- stat64_add(&mig_stats.multifd_bytes, p->packet_len);
- p->flags = 0;
+
qatomic_set(&p->pending_sync, false);
qemu_sem_post(&p->sem_sync);
}
@@ -939,7 +1014,7 @@
return true;
}
-static void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc)
+void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc)
{
qio_channel_set_delay(ioc, false);
@@ -990,7 +1065,7 @@
* Here we're not interested in whether creation succeeded, only that
* it happened at all.
*/
- qemu_sem_post(&multifd_send_state->channels_created);
+ multifd_send_channel_created();
if (ret) {
return;
@@ -1007,9 +1082,14 @@
error_free(local_err);
}
-static void multifd_new_send_channel_create(gpointer opaque)
+static bool multifd_new_send_channel_create(gpointer opaque, Error **errp)
{
+ if (!multifd_use_packets()) {
+ return file_send_channel_create(opaque, errp);
+ }
+
socket_send_channel_create(multifd_new_send_channel_async, opaque);
+ return true;
}
bool multifd_send_setup(void)
@@ -1018,6 +1098,7 @@
Error *local_err = NULL;
int thread_count, ret = 0;
uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
+ bool use_packets = multifd_use_packets();
uint8_t i;
if (!migrate_multifd()) {
@@ -1040,18 +1121,27 @@
qemu_sem_init(&p->sem_sync, 0);
p->id = i;
p->pages = multifd_pages_init(page_count);
- p->packet_len = sizeof(MultiFDPacket_t)
- + sizeof(uint64_t) * page_count;
- p->packet = g_malloc0(p->packet_len);
- p->packet->magic = cpu_to_be32(MULTIFD_MAGIC);
- p->packet->version = cpu_to_be32(MULTIFD_VERSION);
+
+ if (use_packets) {
+ p->packet_len = sizeof(MultiFDPacket_t)
+ + sizeof(uint64_t) * page_count;
+ p->packet = g_malloc0(p->packet_len);
+ p->packet->magic = cpu_to_be32(MULTIFD_MAGIC);
+ p->packet->version = cpu_to_be32(MULTIFD_VERSION);
+
+ /* We need one extra place for the packet header */
+ p->iov = g_new0(struct iovec, page_count + 1);
+ } else {
+ p->iov = g_new0(struct iovec, page_count);
+ }
p->name = g_strdup_printf("multifdsend_%d", i);
- /* We need one extra place for the packet header */
- p->iov = g_new0(struct iovec, page_count + 1);
p->page_size = qemu_target_page_size();
p->page_count = page_count;
p->write_flags = 0;
- multifd_new_send_channel_create(p);
+
+ if (!multifd_new_send_channel_create(p, &local_err)) {
+ return false;
+ }
}
/*
@@ -1083,6 +1173,57 @@
return true;
}
+bool multifd_recv(void)
+{
+ int i;
+ static int next_recv_channel;
+ MultiFDRecvParams *p = NULL;
+ MultiFDRecvData *data = multifd_recv_state->data;
+
+ /*
+ * next_recv_channel can remain from a previous migration that was
+ * using more channels, so ensure it doesn't overflow if the
+ * limit is lower now.
+ */
+ next_recv_channel %= migrate_multifd_channels();
+ for (i = next_recv_channel;; i = (i + 1) % migrate_multifd_channels()) {
+ if (multifd_recv_should_exit()) {
+ return false;
+ }
+
+ p = &multifd_recv_state->params[i];
+
+ if (qatomic_read(&p->pending_job) == false) {
+ next_recv_channel = (i + 1) % migrate_multifd_channels();
+ break;
+ }
+ }
+
+ /*
+ * Order pending_job read before manipulating p->data below. Pairs
+ * with qatomic_store_release() at multifd_recv_thread().
+ */
+ smp_mb_acquire();
+
+ assert(!p->data->size);
+ multifd_recv_state->data = p->data;
+ p->data = data;
+
+ /*
+ * Order p->data update before setting pending_job. Pairs with
+ * qatomic_load_acquire() at multifd_recv_thread().
+ */
+ qatomic_store_release(&p->pending_job, true);
+ qemu_sem_post(&p->sem);
+
+ return true;
+}
+
+MultiFDRecvData *multifd_get_recv_data(void)
+{
+ return multifd_recv_state->data;
+}
+
static void multifd_recv_terminate_threads(Error *err)
{
int i;
@@ -1107,10 +1248,27 @@
MultiFDRecvParams *p = &multifd_recv_state->params[i];
/*
- * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code,
- * however try to wakeup it without harm in cleanup phase.
+ * The migration thread and channels interact differently
+ * depending on the presence of packets.
*/
- qemu_sem_post(&p->sem_sync);
+ if (multifd_use_packets()) {
+ /*
+ * The channel receives as long as there are packets. When
+ * packets end (i.e. MULTIFD_FLAG_SYNC is reached), the
+ * channel waits for the migration thread to sync. If the
+ * sync never happens, do it here.
+ */
+ qemu_sem_post(&p->sem_sync);
+ } else {
+ /*
+ * The channel waits for the migration thread to give it
+ * work. When the migration thread runs out of work, it
+ * releases the channel and waits for any pending work to
+ * finish. If we reach here (e.g. due to error) before the
+ * work runs out, release the channel.
+ */
+ qemu_sem_post(&p->sem);
+ }
/*
* We could arrive here for two reasons:
@@ -1138,6 +1296,7 @@
p->c = NULL;
qemu_mutex_destroy(&p->mutex);
qemu_sem_destroy(&p->sem_sync);
+ qemu_sem_destroy(&p->sem);
g_free(p->name);
p->name = NULL;
p->packet_len = 0;
@@ -1155,6 +1314,8 @@
qemu_sem_destroy(&multifd_recv_state->sem_sync);
g_free(multifd_recv_state->params);
multifd_recv_state->params = NULL;
+ g_free(multifd_recv_state->data);
+ multifd_recv_state->data = NULL;
g_free(multifd_recv_state);
multifd_recv_state = NULL;
}
@@ -1182,18 +1343,53 @@
void multifd_recv_sync_main(void)
{
+ int thread_count = migrate_multifd_channels();
+ bool file_based = !multifd_use_packets();
int i;
if (!migrate_multifd()) {
return;
}
- for (i = 0; i < migrate_multifd_channels(); i++) {
- MultiFDRecvParams *p = &multifd_recv_state->params[i];
- trace_multifd_recv_sync_main_wait(p->id);
+ /*
+ * File-based channels don't use packets and therefore need to
+ * wait for more work. Release them to start the sync.
+ */
+ if (file_based) {
+ for (i = 0; i < thread_count; i++) {
+ MultiFDRecvParams *p = &multifd_recv_state->params[i];
+
+ trace_multifd_recv_sync_main_signal(p->id);
+ qemu_sem_post(&p->sem);
+ }
+ }
+
+ /*
+ * Initiate the synchronization by waiting for all channels.
+ *
+ * For socket-based migration this means each channel has received
+ * the SYNC packet on the stream.
+ *
+ * For file-based migration this means each channel is done with
+ * the work (pending_job=false).
+ */
+ for (i = 0; i < thread_count; i++) {
+ trace_multifd_recv_sync_main_wait(i);
qemu_sem_wait(&multifd_recv_state->sem_sync);
}
- for (i = 0; i < migrate_multifd_channels(); i++) {
+
+ if (file_based) {
+ /*
+ * For file-based migration, loading is done in one iteration.
+ * We're done.
+ */
+ return;
+ }
+
+ /*
+ * Sync done. Release the channels for the next iteration.
+ */
+ for (i = 0; i < thread_count; i++) {
MultiFDRecvParams *p = &multifd_recv_state->params[i];
WITH_QEMU_LOCK_GUARD(&p->mutex) {
@@ -1211,46 +1407,87 @@
{
MultiFDRecvParams *p = opaque;
Error *local_err = NULL;
+ bool use_packets = multifd_use_packets();
int ret;
trace_multifd_recv_thread_start(p->id);
rcu_register_thread();
while (true) {
- uint32_t flags;
+ uint32_t flags = 0;
+ bool has_data = false;
+ p->normal_num = 0;
- if (multifd_recv_should_exit()) {
- break;
- }
+ if (use_packets) {
+ if (multifd_recv_should_exit()) {
+ break;
+ }
- ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
- p->packet_len, &local_err);
- if (ret == 0 || ret == -1) { /* 0: EOF -1: Error */
- break;
- }
+ ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
+ p->packet_len, &local_err);
+ if (ret == 0 || ret == -1) { /* 0: EOF -1: Error */
+ break;
+ }
- qemu_mutex_lock(&p->mutex);
- ret = multifd_recv_unfill_packet(p, &local_err);
- if (ret) {
+ qemu_mutex_lock(&p->mutex);
+ ret = multifd_recv_unfill_packet(p, &local_err);
+ if (ret) {
+ qemu_mutex_unlock(&p->mutex);
+ break;
+ }
+
+ flags = p->flags;
+ /* recv methods don't know how to handle the SYNC flag */
+ p->flags &= ~MULTIFD_FLAG_SYNC;
+ has_data = !!p->normal_num;
qemu_mutex_unlock(&p->mutex);
- break;
+ } else {
+ /*
+ * No packets, so we need to wait for the vmstate code to
+ * give us work.
+ */
+ qemu_sem_wait(&p->sem);
+
+ if (multifd_recv_should_exit()) {
+ break;
+ }
+
+ /* pairs with qatomic_store_release() at multifd_recv() */
+ if (!qatomic_load_acquire(&p->pending_job)) {
+ /*
+ * The migration thread did not send work; this is
+ * equivalent to pending_sync on the sending
+ * side. Post sem_sync to notify we reached this
+ * point.
+ */
+ qemu_sem_post(&multifd_recv_state->sem_sync);
+ continue;
+ }
+
+ has_data = !!p->data->size;
}
- flags = p->flags;
- /* recv methods don't know how to handle the SYNC flag */
- p->flags &= ~MULTIFD_FLAG_SYNC;
- qemu_mutex_unlock(&p->mutex);
-
- if (p->normal_num) {
- ret = multifd_recv_state->ops->recv_pages(p, &local_err);
+ if (has_data) {
+ ret = multifd_recv_state->ops->recv(p, &local_err);
if (ret != 0) {
break;
}
}
- if (flags & MULTIFD_FLAG_SYNC) {
- qemu_sem_post(&multifd_recv_state->sem_sync);
- qemu_sem_wait(&p->sem_sync);
+ if (use_packets) {
+ if (flags & MULTIFD_FLAG_SYNC) {
+ qemu_sem_post(&multifd_recv_state->sem_sync);
+ qemu_sem_wait(&p->sem_sync);
+ }
+ } else {
+ p->total_normal_pages += p->data->size / qemu_target_page_size();
+ p->data->size = 0;
+ /*
+ * Order data->size update before clearing
+ * pending_job. Pairs with smp_mb_acquire() at
+ * multifd_recv().
+ */
+ qatomic_store_release(&p->pending_job, false);
}
}
@@ -1269,6 +1506,7 @@
{
int thread_count;
uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
+ bool use_packets = multifd_use_packets();
uint8_t i;
/*
@@ -1282,6 +1520,10 @@
thread_count = migrate_multifd_channels();
multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
+
+ multifd_recv_state->data = g_new0(MultiFDRecvData, 1);
+ multifd_recv_state->data->size = 0;
+
qatomic_set(&multifd_recv_state->count, 0);
qatomic_set(&multifd_recv_state->exiting, 0);
qemu_sem_init(&multifd_recv_state->sem_sync, 0);
@@ -1292,10 +1534,18 @@
qemu_mutex_init(&p->mutex);
qemu_sem_init(&p->sem_sync, 0);
+ qemu_sem_init(&p->sem, 0);
+ p->pending_job = false;
p->id = i;
- p->packet_len = sizeof(MultiFDPacket_t)
- + sizeof(uint64_t) * page_count;
- p->packet = g_malloc0(p->packet_len);
+
+ p->data = g_new0(MultiFDRecvData, 1);
+ p->data->size = 0;
+
+ if (use_packets) {
+ p->packet_len = sizeof(MultiFDPacket_t)
+ + sizeof(uint64_t) * page_count;
+ p->packet = g_malloc0(p->packet_len);
+ }
p->name = g_strdup_printf("multifdrecv_%d", i);
p->iov = g_new0(struct iovec, page_count);
p->normal = g_new0(ram_addr_t, page_count);
@@ -1339,18 +1589,23 @@
{
MultiFDRecvParams *p;
Error *local_err = NULL;
+ bool use_packets = multifd_use_packets();
int id;
- id = multifd_recv_initial_packet(ioc, &local_err);
- if (id < 0) {
- multifd_recv_terminate_threads(local_err);
- error_propagate_prepend(errp, local_err,
- "failed to receive packet"
- " via multifd channel %d: ",
- qatomic_read(&multifd_recv_state->count));
- return;
+ if (use_packets) {
+ id = multifd_recv_initial_packet(ioc, &local_err);
+ if (id < 0) {
+ multifd_recv_terminate_threads(local_err);
+ error_propagate_prepend(errp, local_err,
+ "failed to receive packet"
+ " via multifd channel %d: ",
+ qatomic_read(&multifd_recv_state->count));
+ return;
+ }
+ trace_multifd_recv_new_channel(id);
+ } else {
+ id = qatomic_read(&multifd_recv_state->count);
}
- trace_multifd_recv_new_channel(id);
p = &multifd_recv_state->params[id];
if (p->c != NULL) {
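
The pending_job handoff above is a release/acquire pairing: multifd_recv()
publishes p->data and then does qatomic_store_release(&p->pending_job, true),
while the recv thread observes the flag with qatomic_load_acquire() before
touching p->data (and the reverse direction pairs a release store of false
with the requester's smp_mb_acquire()). As a rough illustration only, here is
the same pattern in plain C11 atomics, outside QEMU's qatomic wrappers (all
names below are made up):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stddef.h>

    struct handoff {
        void *buf;           /* payload; written before publishing */
        size_t size;
        atomic_bool pending; /* plays the role of p->pending_job */
    };

    /* requester side, like multifd_recv() */
    static void post_work(struct handoff *h, void *buf, size_t size)
    {
        h->buf = buf;        /* plain stores, made visible by ...   */
        h->size = size;
        atomic_store_explicit(&h->pending, true,
                              memory_order_release);  /* ... this   */
    }

    /* worker side, like multifd_recv_thread() */
    static bool take_work(struct handoff *h)
    {
        if (!atomic_load_explicit(&h->pending, memory_order_acquire)) {
            return false;    /* nothing published yet */
        }
        /* h->buf and h->size are now safe to read */
        h->size = 0;
        atomic_store_explicit(&h->pending, false, memory_order_release);
        return true;
    }
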
diff --git a/migration/multifd.h b/migration/multifd.h
index b3fe27a..7447c2b 100644
--- a/migration/multifd.h
+++ b/migration/multifd.h
@@ -13,8 +13,13 @@
#ifndef QEMU_MIGRATION_MULTIFD_H
#define QEMU_MIGRATION_MULTIFD_H
+#include "ram.h"
+
+typedef struct MultiFDRecvData MultiFDRecvData;
+
bool multifd_send_setup(void);
void multifd_send_shutdown(void);
+void multifd_send_channel_created(void);
int multifd_recv_setup(Error **errp);
void multifd_recv_cleanup(void);
void multifd_recv_shutdown(void);
@@ -23,6 +28,8 @@
void multifd_recv_sync_main(void);
int multifd_send_sync_main(void);
bool multifd_queue_page(RAMBlock *block, ram_addr_t offset);
+bool multifd_recv(void);
+MultiFDRecvData *multifd_get_recv_data(void);
/* Multifd Compression flags */
#define MULTIFD_FLAG_SYNC (1 << 0)
@@ -63,6 +70,13 @@
RAMBlock *block;
} MultiFDPages_t;
+struct MultiFDRecvData {
+ void *opaque;
+ size_t size;
+ /* for preadv */
+ off_t file_offset;
+};
+
typedef struct {
/* Fields are only written at creating/deletion time */
/* No lock required for them, they are read only */
@@ -127,7 +141,7 @@
/* number of iovs used */
uint32_t iovs_num;
/* used for compression methods */
- void *data;
+ void *compress_data;
} MultiFDSendParams;
typedef struct {
@@ -152,6 +166,8 @@
/* syncs main thread and channels */
QemuSemaphore sem_sync;
+ /* sem where to wait for more work */
+ QemuSemaphore sem;
/* this mutex protects the following parameters */
QemuMutex mutex;
@@ -161,6 +177,8 @@
uint32_t flags;
/* global number of generated multifd packets */
uint64_t packet_num;
+ int pending_job;
+ MultiFDRecvData *data;
/* thread local variables. No locking required */
@@ -183,7 +201,7 @@
/* num of non zero pages */
uint32_t normal_num;
/* used for de-compression methods */
- void *data;
+ void *compress_data;
} MultiFDRecvParams;
typedef struct {
@@ -197,8 +215,8 @@
int (*recv_setup)(MultiFDRecvParams *p, Error **errp);
/* Cleanup for receiving side */
void (*recv_cleanup)(MultiFDRecvParams *p);
- /* Read all pages */
- int (*recv_pages)(MultiFDRecvParams *p, Error **errp);
+ /* Read all data */
+ int (*recv)(MultiFDRecvParams *p, Error **errp);
} MultiFDMethods;
void multifd_register_ops(int method, MultiFDMethods *ops);
@@ -211,5 +229,6 @@
p->iovs_num++;
}
+void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc);
#endif
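
With the recv_pages hook generalized to recv, every backend fills the same
ops table and registers it; note that options.c rejects mapped-ram together
with compression, so compressed backends only ever see the packet path. A
hypothetical skeleton (all mycomp_* names are invented; the flag check
mirrors nocomp_recv above):

    static int mycomp_recv(MultiFDRecvParams *p, Error **errp)
    {
        uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;

        if (flags != MULTIFD_FLAG_NOCOMP /* backend's own flag */) {
            error_setg(errp, "multifd %u: flags received %x", p->id, flags);
            return -1;
        }
        /* ... decompress the channel data into the target pages ... */
        return 0;
    }

    static MultiFDMethods mycomp_ops = {
        .send_setup   = mycomp_send_setup,
        .send_cleanup = mycomp_send_cleanup,
        .send_prepare = mycomp_send_prepare,
        .recv_setup   = mycomp_recv_setup,
        .recv_cleanup = mycomp_recv_cleanup,
        .recv         = mycomp_recv,
    };

    static void mycomp_register(void)
    {
        multifd_register_ops(MULTIFD_COMPRESSION_NONE /* illustrative slot */,
                             &mycomp_ops);
    }
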
diff --git a/migration/options.c b/migration/options.c
index 3e3e0b9..40eb930 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -204,6 +204,7 @@
DEFINE_PROP_MIG_CAP("x-switchover-ack",
MIGRATION_CAPABILITY_SWITCHOVER_ACK),
DEFINE_PROP_MIG_CAP("x-dirty-limit", MIGRATION_CAPABILITY_DIRTY_LIMIT),
+ DEFINE_PROP_MIG_CAP("mapped-ram", MIGRATION_CAPABILITY_MAPPED_RAM),
DEFINE_PROP_END_OF_LIST(),
};
@@ -263,6 +264,13 @@
return s->capabilities[MIGRATION_CAPABILITY_EVENTS];
}
+bool migrate_mapped_ram(void)
+{
+ MigrationState *s = migrate_get_current();
+
+ return s->capabilities[MIGRATION_CAPABILITY_MAPPED_RAM];
+}
+
bool migrate_ignore_shared(void)
{
MigrationState *s = migrate_get_current();
@@ -645,6 +653,26 @@
}
}
+ if (new_caps[MIGRATION_CAPABILITY_MAPPED_RAM]) {
+ if (new_caps[MIGRATION_CAPABILITY_XBZRLE]) {
+ error_setg(errp,
+ "Mapped-ram migration is incompatible with xbzrle");
+ return false;
+ }
+
+ if (new_caps[MIGRATION_CAPABILITY_COMPRESS]) {
+ error_setg(errp,
+ "Mapped-ram migration is incompatible with compression");
+ return false;
+ }
+
+ if (new_caps[MIGRATION_CAPABILITY_POSTCOPY_RAM]) {
+ error_setg(errp,
+ "Mapped-ram migration is incompatible with postcopy");
+ return false;
+ }
+ }
+
return true;
}
@@ -1218,6 +1246,13 @@
}
#endif
+ if (migrate_mapped_ram() &&
+ (migrate_multifd_compression() || migrate_tls())) {
+ error_setg(errp,
+ "Mapped-ram only available for non-compressed non-TLS multifd migration");
+ return false;
+ }
+
if (params->has_x_vcpu_dirty_limit_period &&
(params->x_vcpu_dirty_limit_period < 1 ||
params->x_vcpu_dirty_limit_period > 1000)) {
@@ -1312,6 +1347,12 @@
if (params->has_multifd_compression) {
dest->multifd_compression = params->multifd_compression;
}
+ if (params->has_multifd_zlib_level) {
+ dest->multifd_zlib_level = params->multifd_zlib_level;
+ }
+ if (params->has_multifd_zstd_level) {
+ dest->multifd_zstd_level = params->multifd_zstd_level;
+ }
if (params->has_xbzrle_cache_size) {
dest->xbzrle_cache_size = params->xbzrle_cache_size;
}
@@ -1447,6 +1488,12 @@
if (params->has_multifd_compression) {
s->parameters.multifd_compression = params->multifd_compression;
}
+ if (params->has_multifd_zlib_level) {
+ s->parameters.multifd_zlib_level = params->multifd_zlib_level;
+ }
+ if (params->has_multifd_zstd_level) {
+ s->parameters.multifd_zstd_level = params->multifd_zstd_level;
+ }
if (params->has_xbzrle_cache_size) {
s->parameters.xbzrle_cache_size = params->xbzrle_cache_size;
xbzrle_cache_resize(params->xbzrle_cache_size, errp);
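
Since mapped-ram is exposed as a regular migration capability, it is enabled
like any other, and the checks above reject the incompatible combinations
(xbzrle, compress, postcopy-ram) at set time. An illustrative QMP exchange:

    -> { "execute": "migrate-set-capabilities",
         "arguments": { "capabilities": [
             { "capability": "mapped-ram", "state": true } ] } }
    <- { "return": {} }
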
diff --git a/migration/options.h b/migration/options.h
index 246c160..6ddd8da 100644
--- a/migration/options.h
+++ b/migration/options.h
@@ -31,6 +31,7 @@
bool migrate_dirty_bitmaps(void);
bool migrate_dirty_limit(void);
bool migrate_events(void);
+bool migrate_mapped_ram(void);
bool migrate_ignore_shared(void);
bool migrate_late_block_activate(void);
bool migrate_multifd(void);
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 94231ff..b10c882 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -33,6 +33,7 @@
#include "options.h"
#include "qapi/error.h"
#include "rdma.h"
+#include "io/channel-file.h"
#define IO_BUF_SIZE 32768
#define MAX_IOV_SIZE MIN_CONST(IOV_MAX, 64)
@@ -255,6 +256,10 @@
memset(f->may_free, 0, sizeof(f->may_free));
}
+bool qemu_file_is_seekable(QEMUFile *f)
+{
+ return qio_channel_has_feature(f->ioc, QIO_CHANNEL_FEATURE_SEEKABLE);
+}
/**
* Flushes QEMUFile buffer
@@ -447,6 +452,107 @@
}
}
+void qemu_put_buffer_at(QEMUFile *f, const uint8_t *buf, size_t buflen,
+ off_t pos)
+{
+ Error *err = NULL;
+ size_t ret;
+
+ if (f->last_error) {
+ return;
+ }
+
+ qemu_fflush(f);
+ ret = qio_channel_pwrite(f->ioc, (char *)buf, buflen, pos, &err);
+
+ if (err) {
+ qemu_file_set_error_obj(f, -EIO, err);
+ return;
+ }
+
+ if ((ssize_t)ret == QIO_CHANNEL_ERR_BLOCK) {
+ qemu_file_set_error_obj(f, -EAGAIN, NULL);
+ return;
+ }
+
+ if (ret != buflen) {
+ error_setg(&err, "Partial write of size %zu, expected %zu", ret,
+ buflen);
+ qemu_file_set_error_obj(f, -EIO, err);
+ return;
+ }
+
+ stat64_add(&mig_stats.qemu_file_transferred, buflen);
+
+ return;
+}
+
+
+size_t qemu_get_buffer_at(QEMUFile *f, const uint8_t *buf, size_t buflen,
+ off_t pos)
+{
+ Error *err = NULL;
+ size_t ret;
+
+ if (f->last_error) {
+ return 0;
+ }
+
+ ret = qio_channel_pread(f->ioc, (char *)buf, buflen, pos, &err);
+
+ if ((ssize_t)ret == -1 || err) {
+ qemu_file_set_error_obj(f, -EIO, err);
+ return 0;
+ }
+
+ if ((ssize_t)ret == QIO_CHANNEL_ERR_BLOCK) {
+ qemu_file_set_error_obj(f, -EAGAIN, NULL);
+ return 0;
+ }
+
+ if (ret != buflen) {
+ error_setg(&err, "Partial read of size %zu, expected %zu", ret, buflen);
+ qemu_file_set_error_obj(f, -EIO, err);
+ return 0;
+ }
+
+ return ret;
+}
+
+void qemu_set_offset(QEMUFile *f, off_t off, int whence)
+{
+ Error *err = NULL;
+ off_t ret;
+
+ if (qemu_file_is_writable(f)) {
+ qemu_fflush(f);
+ } else {
+ /* Drop any cached buffers; this will trigger a re-fill later */
+ f->buf_index = 0;
+ f->buf_size = 0;
+ }
+
+ ret = qio_channel_io_seek(f->ioc, off, whence, &err);
+ if (ret == (off_t)-1) {
+ qemu_file_set_error_obj(f, -EIO, err);
+ }
+}
+
+off_t qemu_get_offset(QEMUFile *f)
+{
+ Error *err = NULL;
+ off_t ret;
+
+ qemu_fflush(f);
+
+ ret = qio_channel_io_seek(f->ioc, 0, SEEK_CUR, &err);
+ if (ret == (off_t)-1) {
+ qemu_file_set_error_obj(f, -EIO, err);
+ }
+ return ret;
+}
+
+
void qemu_put_byte(QEMUFile *f, int v)
{
if (f->last_error) {
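
The new *_at helpers are positioned I/O: they route through
qio_channel_pwrite()/pread() at an absolute file offset, so multiple writers
can target disjoint regions of the file without seeking the shared stream
(the write side only flushes any buffered sequential data first). A rough
caller-side sketch mirroring what ram.c does with them; page and page_offset
are placeholders:

    /* store one page image at its fixed slot in the migration file */
    qemu_put_buffer_at(f, page, TARGET_PAGE_SIZE,
                       block->pages_offset + page_offset);

    /* later, read it back from the same slot on the destination */
    if (qemu_get_buffer_at(f, page, TARGET_PAGE_SIZE,
                           block->pages_offset + page_offset)
        != TARGET_PAGE_SIZE) {
        /* short read: the error was latched into the QEMUFile as well */
    }
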
diff --git a/migration/qemu-file.h b/migration/qemu-file.h
index 8aec9fa..32fd4a3 100644
--- a/migration/qemu-file.h
+++ b/migration/qemu-file.h
@@ -75,6 +75,12 @@
int qemu_fflush(QEMUFile *f);
void qemu_file_set_blocking(QEMUFile *f, bool block);
int qemu_file_get_to_fd(QEMUFile *f, int fd, size_t size);
+void qemu_set_offset(QEMUFile *f, off_t off, int whence);
+off_t qemu_get_offset(QEMUFile *f);
+void qemu_put_buffer_at(QEMUFile *f, const uint8_t *buf, size_t buflen,
+ off_t pos);
+size_t qemu_get_buffer_at(QEMUFile *f, const uint8_t *buf, size_t buflen,
+ off_t pos);
QIOChannel *qemu_file_get_ioc(QEMUFile *file);
diff --git a/migration/ram.c b/migration/ram.c
index 45a00b4..003c28e 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -94,6 +94,24 @@
#define RAM_SAVE_FLAG_MULTIFD_FLUSH 0x200
/* We can't use any flag that is bigger than 0x200 */
+/*
+ * mapped-ram migration supports O_DIRECT, so we need to make sure the
+ * userspace buffer, the IO operation size and the file offset are
+ * aligned according to the underlying device's block size. The first
+ * two are already aligned to page size, but we need to add padding to
+ * the file to align the offset. We cannot read the block size
+ * dynamically because the migration file can be moved between
+ * different systems, so use 1M to cover most block sizes and to keep
+ * the file offset aligned at page size as well.
+ */
+#define MAPPED_RAM_FILE_OFFSET_ALIGNMENT 0x100000
+
+/*
+ * When doing mapped-ram migration, this is the amount we read from
+ * the pages region in the migration file at a time.
+ */
+#define MAPPED_RAM_LOAD_BUF_SIZE 0x100000
+
XBZRLECacheStats xbzrle_counters;
/* used by the search for pages to send */
@@ -1126,12 +1144,18 @@
return 0;
}
+ stat64_add(&mig_stats.zero_pages, 1);
+
+ if (migrate_mapped_ram()) {
+ /* zero pages are not transferred with mapped-ram */
+ clear_bit_atomic(offset >> TARGET_PAGE_BITS, pss->block->file_bmap);
+ return 1;
+ }
+
len += save_page_header(pss, file, pss->block, offset | RAM_SAVE_FLAG_ZERO);
qemu_put_byte(file, 0);
len += 1;
ram_release_page(pss->block->idstr, offset);
-
- stat64_add(&mig_stats.zero_pages, 1);
ram_transferred_add(len);
/*
@@ -1189,14 +1213,20 @@
{
QEMUFile *file = pss->pss_channel;
- ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
- offset | RAM_SAVE_FLAG_PAGE));
- if (async) {
- qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
- migrate_release_ram() &&
- migration_in_postcopy());
+ if (migrate_mapped_ram()) {
+ qemu_put_buffer_at(file, buf, TARGET_PAGE_SIZE,
+ block->pages_offset + offset);
+ set_bit(offset >> TARGET_PAGE_BITS, block->file_bmap);
} else {
- qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
+ ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
+ offset | RAM_SAVE_FLAG_PAGE));
+ if (async) {
+ qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
+ migrate_release_ram() &&
+ migration_in_postcopy());
+ } else {
+ qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
+ }
}
ram_transferred_add(TARGET_PAGE_SIZE);
stat64_add(&mig_stats.normal_pages, 1);
@@ -1332,14 +1362,18 @@
pss->block = QLIST_NEXT_RCU(pss->block, next);
if (!pss->block) {
if (migrate_multifd() &&
- !migrate_multifd_flush_after_each_section()) {
+ (!migrate_multifd_flush_after_each_section() ||
+ migrate_mapped_ram())) {
QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
int ret = multifd_send_sync_main();
if (ret < 0) {
return ret;
}
- qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
- qemu_fflush(f);
+
+ if (!migrate_mapped_ram()) {
+ qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
+ qemu_fflush(f);
+ }
}
/*
* If memory migration starts over, we will meet a dirtied page
@@ -2778,6 +2812,9 @@
*/
block->bmap = bitmap_new(pages);
bitmap_set(block->bmap, 0, pages);
+ if (migrate_mapped_ram()) {
+ block->file_bmap = bitmap_new(pages);
+ }
block->clear_bmap_shift = shift;
block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
}
@@ -2915,6 +2952,89 @@
}
}
+#define MAPPED_RAM_HDR_VERSION 1
+struct MappedRamHeader {
+ uint32_t version;
+ /*
+ * The target's page size, so we know how many pages are in the
+ * bitmap.
+ */
+ uint64_t page_size;
+ /*
+ * The offset in the migration file where the pages bitmap is
+ * stored.
+ */
+ uint64_t bitmap_offset;
+ /*
+ * The offset in the migration file where the actual pages (data)
+ * are stored.
+ */
+ uint64_t pages_offset;
+} QEMU_PACKED;
+typedef struct MappedRamHeader MappedRamHeader;
+
+static void mapped_ram_setup_ramblock(QEMUFile *file, RAMBlock *block)
+{
+ g_autofree MappedRamHeader *header = NULL;
+ size_t header_size, bitmap_size;
+ long num_pages;
+
+ header = g_new0(MappedRamHeader, 1);
+ header_size = sizeof(MappedRamHeader);
+
+ num_pages = block->used_length >> TARGET_PAGE_BITS;
+ bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);
+
+ /*
+ * Save the file offsets of where the bitmap and the pages should
+ * go as they are written at the end of migration and during the
+ * iterative phase, respectively.
+ */
+ block->bitmap_offset = qemu_get_offset(file) + header_size;
+ block->pages_offset = ROUND_UP(block->bitmap_offset +
+ bitmap_size,
+ MAPPED_RAM_FILE_OFFSET_ALIGNMENT);
+
+ header->version = cpu_to_be32(MAPPED_RAM_HDR_VERSION);
+ header->page_size = cpu_to_be64(TARGET_PAGE_SIZE);
+ header->bitmap_offset = cpu_to_be64(block->bitmap_offset);
+ header->pages_offset = cpu_to_be64(block->pages_offset);
+
+ qemu_put_buffer(file, (uint8_t *) header, header_size);
+
+ /* prepare offset for next ramblock */
+ qemu_set_offset(file, block->pages_offset + block->used_length, SEEK_SET);
+}
+
+static bool mapped_ram_read_header(QEMUFile *file, MappedRamHeader *header,
+ Error **errp)
+{
+ size_t ret, header_size = sizeof(MappedRamHeader);
+
+ ret = qemu_get_buffer(file, (uint8_t *)header, header_size);
+ if (ret != header_size) {
+ error_setg(errp, "Could not read whole mapped-ram migration header "
+ "(expected %zd, got %zd bytes)", header_size, ret);
+ return false;
+ }
+
+ /* migration stream is big-endian */
+ header->version = be32_to_cpu(header->version);
+
+ if (header->version > MAPPED_RAM_HDR_VERSION) {
+ error_setg(errp, "Migration mapped-ram capability version not "
+ "supported (expected <= %d, got %d)", MAPPED_RAM_HDR_VERSION,
+ header->version);
+ return false;
+ }
+
+ header->page_size = be64_to_cpu(header->page_size);
+ header->bitmap_offset = be64_to_cpu(header->bitmap_offset);
+ header->pages_offset = be64_to_cpu(header->pages_offset);
+
+ return true;
+}
+
/*
* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
* long-running RCU critical section. When rcu-reclaims in the code
@@ -2934,7 +3054,7 @@
{
RAMState **rsp = opaque;
RAMBlock *block;
- int ret;
+ int ret, max_hg_page_size;
if (compress_threads_save_setup()) {
return -1;
@@ -2949,6 +3069,12 @@
}
(*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
+ /*
+ * ??? Mirrors the previous value of qemu_host_page_size,
+ * but is this really what was intended for the migration?
+ */
+ max_hg_page_size = MAX(qemu_real_host_page_size(), TARGET_PAGE_SIZE);
+
WITH_RCU_READ_LOCK_GUARD() {
qemu_put_be64(f, ram_bytes_total_with_ignored()
| RAM_SAVE_FLAG_MEM_SIZE);
@@ -2957,13 +3083,17 @@
qemu_put_byte(f, strlen(block->idstr));
qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
qemu_put_be64(f, block->used_length);
- if (migrate_postcopy_ram() && block->page_size !=
- qemu_host_page_size) {
+ if (migrate_postcopy_ram() &&
+ block->page_size != max_hg_page_size) {
qemu_put_be64(f, block->page_size);
}
if (migrate_ignore_shared()) {
qemu_put_be64(f, block->mr->addr);
}
+
+ if (migrate_mapped_ram()) {
+ mapped_ram_setup_ramblock(f, block);
+ }
}
}
@@ -2989,7 +3119,8 @@
return ret;
}
- if (migrate_multifd() && !migrate_multifd_flush_after_each_section()) {
+ if (migrate_multifd() && !migrate_multifd_flush_after_each_section()
+ && !migrate_mapped_ram()) {
qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
}
@@ -2997,6 +3128,33 @@
return qemu_fflush(f);
}
+static void ram_save_file_bmap(QEMUFile *f)
+{
+ RAMBlock *block;
+
+ RAMBLOCK_FOREACH_MIGRATABLE(block) {
+ long num_pages = block->used_length >> TARGET_PAGE_BITS;
+ long bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);
+
+ qemu_put_buffer_at(f, (uint8_t *)block->file_bmap, bitmap_size,
+ block->bitmap_offset);
+ ram_transferred_add(bitmap_size);
+
+ /*
+ * Free the bitmap here to catch any synchronization issues
+ * with multifd channels. No channels should be sending pages
+ * after we've written the bitmap to file.
+ */
+ g_free(block->file_bmap);
+ block->file_bmap = NULL;
+ }
+}
+
+void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset)
+{
+ set_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap);
+}
+
/**
* ram_save_iterate: iterative stage for migration
*
@@ -3106,7 +3264,8 @@
out:
if (ret >= 0
&& migration_is_setup_or_active(migrate_get_current()->state)) {
- if (migrate_multifd() && migrate_multifd_flush_after_each_section()) {
+ if (migrate_multifd() && migrate_multifd_flush_after_each_section() &&
+ !migrate_mapped_ram()) {
ret = multifd_send_sync_main();
if (ret < 0) {
return ret;
@@ -3186,7 +3345,20 @@
return ret;
}
- if (migrate_multifd() && !migrate_multifd_flush_after_each_section()) {
+ if (migrate_mapped_ram()) {
+ ram_save_file_bmap(f);
+
+ if (qemu_file_get_error(f)) {
+ Error *local_err = NULL;
+ int err = qemu_file_get_error_obj(f, &local_err);
+
+ error_reportf_err(local_err, "Failed to write bitmap to file: ");
+ return -err;
+ }
+ }
+
+ if (migrate_multifd() && !migrate_multifd_flush_after_each_section() &&
+ !migrate_mapped_ram()) {
qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
}
qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
@@ -3786,31 +3958,165 @@
trace_colo_flush_ram_cache_end();
}
+static size_t ram_load_multifd_pages(void *host_addr, size_t size,
+ uint64_t offset)
+{
+ MultiFDRecvData *data = multifd_get_recv_data();
+
+ data->opaque = host_addr;
+ data->file_offset = offset;
+ data->size = size;
+
+ if (!multifd_recv()) {
+ return 0;
+ }
+
+ return size;
+}
+
+static bool read_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block,
+ long num_pages, unsigned long *bitmap,
+ Error **errp)
+{
+ ERRP_GUARD();
+ unsigned long set_bit_idx, clear_bit_idx;
+ ram_addr_t offset;
+ void *host;
+ size_t read, unread, size;
+
+ for (set_bit_idx = find_first_bit(bitmap, num_pages);
+ set_bit_idx < num_pages;
+ set_bit_idx = find_next_bit(bitmap, num_pages, clear_bit_idx + 1)) {
+
+ clear_bit_idx = find_next_zero_bit(bitmap, num_pages, set_bit_idx + 1);
+
+ unread = TARGET_PAGE_SIZE * (clear_bit_idx - set_bit_idx);
+ offset = set_bit_idx << TARGET_PAGE_BITS;
+
+ while (unread > 0) {
+ host = host_from_ram_block_offset(block, offset);
+ if (!host) {
+ error_setg(errp, "page outside of ramblock %s range",
+ block->idstr);
+ return false;
+ }
+
+ size = MIN(unread, MAPPED_RAM_LOAD_BUF_SIZE);
+
+ if (migrate_multifd()) {
+ read = ram_load_multifd_pages(host, size,
+ block->pages_offset + offset);
+ } else {
+ read = qemu_get_buffer_at(f, host, size,
+ block->pages_offset + offset);
+ }
+
+ if (!read) {
+ goto err;
+ }
+ offset += read;
+ unread -= read;
+ }
+ }
+
+ return true;
+
+err:
+ qemu_file_get_error_obj(f, errp);
+ error_prepend(errp, "(%s) failed to read page " RAM_ADDR_FMT
+ "from file offset %" PRIx64 ": ", block->idstr, offset,
+ block->pages_offset + offset);
+ return false;
+}
+
+static void parse_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block,
+ ram_addr_t length, Error **errp)
+{
+ g_autofree unsigned long *bitmap = NULL;
+ MappedRamHeader header;
+ size_t bitmap_size;
+ long num_pages;
+
+ if (!mapped_ram_read_header(f, &header, errp)) {
+ return;
+ }
+
+ block->pages_offset = header.pages_offset;
+
+ /*
+ * Check the alignment of the file region that contains pages. We
+ * don't enforce MAPPED_RAM_FILE_OFFSET_ALIGNMENT to allow that
+ * value to change in the future. Do only a sanity check with page
+ * size alignment.
+ */
+ if (!QEMU_IS_ALIGNED(block->pages_offset, TARGET_PAGE_SIZE)) {
+ error_setg(errp,
+ "Error reading ramblock %s pages, region has bad alignment",
+ block->idstr);
+ return;
+ }
+
+ num_pages = length / header.page_size;
+ bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);
+
+ bitmap = g_malloc0(bitmap_size);
+ if (qemu_get_buffer_at(f, (uint8_t *)bitmap, bitmap_size,
+ header.bitmap_offset) != bitmap_size) {
+ error_setg(errp, "Error reading dirty bitmap");
+ return;
+ }
+
+ if (!read_ramblock_mapped_ram(f, block, num_pages, bitmap, errp)) {
+ return;
+ }
+
+ /* Skip pages array */
+ qemu_set_offset(f, block->pages_offset + length, SEEK_SET);
+
+ return;
+}
+
static int parse_ramblock(QEMUFile *f, RAMBlock *block, ram_addr_t length)
{
int ret = 0;
/* ADVISE is earlier, it shows the source has the postcopy capability on */
bool postcopy_advised = migration_incoming_postcopy_advised();
+ int max_hg_page_size;
+ Error *local_err = NULL;
assert(block);
+ if (migrate_mapped_ram()) {
+ parse_ramblock_mapped_ram(f, block, length, &local_err);
+ if (local_err) {
+ error_report_err(local_err);
+ return -EINVAL;
+ }
+ return 0;
+ }
+
if (!qemu_ram_is_migratable(block)) {
error_report("block %s should not be migrated !", block->idstr);
return -EINVAL;
}
if (length != block->used_length) {
- Error *local_err = NULL;
-
ret = qemu_ram_resize(block, length, &local_err);
if (local_err) {
error_report_err(local_err);
return ret;
}
}
+
+ /*
+ * ??? Mirrors the previous value of qemu_host_page_size,
+ * but is this really what was intended for the migration?
+ */
+ max_hg_page_size = MAX(qemu_real_host_page_size(), TARGET_PAGE_SIZE);
+
/* For postcopy we need to check hugepage sizes match */
if (postcopy_advised && migrate_postcopy_ram() &&
- block->page_size != qemu_host_page_size) {
+ block->page_size != max_hg_page_size) {
uint64_t remote_page_size = qemu_get_be64(f);
if (remote_page_size != block->page_size) {
error_report("Mismatched RAM page size %s "
@@ -3885,6 +4191,12 @@
invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
}
+ if (migrate_mapped_ram()) {
+ invalid_flags |= (RAM_SAVE_FLAG_HOOK | RAM_SAVE_FLAG_MULTIFD_FLUSH |
+ RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_XBZRLE |
+ RAM_SAVE_FLAG_ZERO);
+ }
+
while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
ram_addr_t addr;
void *host = NULL, *host_bak = NULL;
@@ -3906,6 +4218,8 @@
addr &= TARGET_PAGE_MASK;
if (flags & invalid_flags) {
+ error_report("Unexpected RAM flags: %d", flags & invalid_flags);
+
if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
error_report("Received an unexpected compressed page");
}
@@ -3958,6 +4272,16 @@
switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
case RAM_SAVE_FLAG_MEM_SIZE:
ret = parse_ramblocks(f, addr);
+ /*
+ * For mapped-ram migration (to a file) using multifd, we sync
+ * once and for all here to make sure all tasks we queued to
+ * multifd threads are completed, so that all the ramblocks
+ * (including all the guest memory pages within) are fully
+ * loaded after this sync returns.
+ */
+ if (migrate_mapped_ram()) {
+ multifd_recv_sync_main();
+ }
break;
case RAM_SAVE_FLAG_ZERO:
@@ -3998,7 +4322,12 @@
case RAM_SAVE_FLAG_EOS:
/* normal exit */
if (migrate_multifd() &&
- migrate_multifd_flush_after_each_section()) {
+ migrate_multifd_flush_after_each_section() &&
+ /*
+ * Mapped-ram migration flushes once and for all after
+ * parsing ramblocks. Always ignore EOS for it.
+ */
+ !migrate_mapped_ram()) {
multifd_recv_sync_main();
}
break;
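
Taken together, mapped_ram_setup_ramblock() gives each ramblock a
self-describing region of the file: packed header, then the dirty bitmap,
then the page data rounded up to the 1M alignment. A worked example,
assuming a 512 MiB block with 4 KiB target pages on a 64-bit host, starting
at file offset 0:

    header_size   = sizeof(MappedRamHeader)        /* 28 bytes, QEMU_PACKED */
    num_pages     = 512 MiB / 4 KiB                = 131072
    bitmap_size   = BITS_TO_LONGS(131072) * 8      = 16384 bytes
    bitmap_offset = 0 + 28                         = 28
    pages_offset  = ROUND_UP(28 + 16384, 0x100000) = 0x100000
    next ramblock = pages_offset + used_length     = 0x20100000

so the header and bitmap fit inside the first 1 MiB of padding, the pages
region stays page-aligned for O_DIRECT, and the file remains sparse wherever
pages were never dirtied.
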
diff --git a/migration/ram.h b/migration/ram.h
index 9b937a4..b9ac0da 100644
--- a/migration/ram.h
+++ b/migration/ram.h
@@ -75,6 +75,7 @@
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start);
void postcopy_preempt_shutdown_file(MigrationState *s);
void *postcopy_preempt_thread(void *opaque);
+void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset);
/* ram cache */
int colo_init_ram_cache(void);
diff --git a/migration/savevm.c b/migration/savevm.c
index d612c8a..dc1fb9c 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -245,6 +245,7 @@
/* Validate only new capabilities to keep compatibility. */
switch (capability) {
case MIGRATION_CAPABILITY_X_IGNORE_SHARED:
+ case MIGRATION_CAPABILITY_MAPPED_RAM:
return true;
default:
return false;
diff --git a/migration/trace-events b/migration/trace-events
index 298ad2b..bf1a069 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -132,7 +132,7 @@
multifd_recv_new_channel(uint8_t id) "channel %u"
multifd_recv_sync_main(long packet_num) "packet num %ld"
multifd_recv_sync_main_signal(uint8_t id) "channel %u"
-multifd_recv_sync_main_wait(uint8_t id) "channel %u"
+multifd_recv_sync_main_wait(uint8_t id) "iter %u"
multifd_recv_terminate_threads(bool error) "error %d"
multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t pages) "channel %u packets %" PRIu64 " pages %" PRIu64
multifd_recv_thread_start(uint8_t id) "%u"
diff --git a/pc-bios/README b/pc-bios/README
index b8a0210..7ffb2f4 100644
--- a/pc-bios/README
+++ b/pc-bios/README
@@ -75,3 +75,9 @@
initialize and run boot images stored in SPI flash, but may grow more
features over time as needed. The source code is available at:
https://github.com/google/vbootrom
+
+- hppa-firmware.img (32-bit) and hppa-firmware64.img (64-bit) are firmware
+ files for the HP-PARISC (hppa) architecture.
+ They are built from the SeaBIOS-hppa sources, which is a fork of SeaBIOS
+ adapted for hppa.
+ SeaBIOS-hppa is available at https://github.com/hdeller/seabios-hppa
diff --git a/pc-bios/meson.build b/pc-bios/meson.build
index e67fa43..0760612 100644
--- a/pc-bios/meson.build
+++ b/pc-bios/meson.build
@@ -73,6 +73,7 @@
'qemu_vga.ndrv',
'edk2-licenses.txt',
'hppa-firmware.img',
+ 'hppa-firmware64.img',
'opensbi-riscv32-generic-fw_dynamic.bin',
'opensbi-riscv64-generic-fw_dynamic.bin',
'npcm7xx_bootrom.bin',
diff --git a/qapi/migration.json b/qapi/migration.json
index 27a67e5..51d188b 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -531,6 +531,10 @@
# and can result in more stable read performance. Requires KVM
# with accelerator property "dirty-ring-size" set. (Since 8.1)
#
+# @mapped-ram: Migrate using fixed offsets in the migration file for
+# each RAM page. Requires a migration URI that supports seeking,
+# such as a file. (since 9.0)
+#
# Features:
#
# @deprecated: Member @block is deprecated. Use blockdev-mirror with
@@ -555,7 +559,7 @@
{ 'name': 'x-ignore-shared', 'features': [ 'unstable' ] },
'validate-uuid', 'background-snapshot',
'zero-copy-send', 'postcopy-preempt', 'switchover-ack',
- 'dirty-limit'] }
+ 'dirty-limit', 'mapped-ram'] }
##
# @MigrationCapabilityStatus:
@@ -636,28 +640,30 @@
#
# @normal: the original form of migration. (since 8.2)
#
-# @cpr-reboot: The migrate command stops the VM and saves state to the URI.
-# After quitting qemu, the user resumes by running qemu -incoming.
+# @cpr-reboot: The migrate command stops the VM and saves state to
+# the URI. After quitting QEMU, the user resumes by running
+# QEMU -incoming.
#
-# This mode allows the user to quit qemu, and restart an updated version
-# of qemu. The user may even update and reboot the OS before restarting,
-# as long as the URI persists across a reboot.
+# This mode allows the user to quit QEMU, optionally update and
+# reboot the OS, and restart QEMU. If the user reboots, the URI
+# must persist across the reboot, such as by using a file.
#
-# Unlike normal mode, the use of certain local storage options does not
-# block the migration, but the user must not modify guest block devices
-# between the quit and restart.
+# Unlike normal mode, the use of certain local storage options
+# does not block the migration, but the user must not modify the
+# contents of guest block devices between the quit and restart.
#
-# This mode supports vfio devices provided the user first puts the guest
-# in the suspended runstate, such as by issuing guest-suspend-ram to the
-# qemu guest agent.
+# This mode supports VFIO devices provided the user first puts
+# the guest in the suspended runstate, such as by issuing
+# guest-suspend-ram to the QEMU guest agent.
#
-# Best performance is achieved when the memory backend is shared and the
-# @x-ignore-shared migration capability is set, but this is not required.
-# Further, if the user reboots before restarting such a configuration, the
-# shared backend must be be non-volatile across reboot, such as by backing
-# it with a dax device.
+# Best performance is achieved when the memory backend is shared
+# and the @x-ignore-shared migration capability is set, but this
+# is not required. Further, if the user reboots before restarting
+# such a configuration, the shared memory must persist across the
+# reboot, such as by backing it with a dax device.
#
-# cpr-reboot may not be used with postcopy, colo, or background-snapshot.
+# @cpr-reboot may not be used with postcopy, background-snapshot,
+# or COLO.
#
# (since 8.2)
##
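
End to end, a mapped-ram save is then just a file migration with the
capability set on both sides (illustrative QMP command; the path is made
up):

    -> { "execute": "migrate",
         "arguments": { "uri": "file:/tmp/guest.mig" } }
    <- { "return": {} }

and the destination restores from the same file, e.g. starting with
-incoming defer and then issuing migrate-incoming.
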
diff --git a/roms/Makefile b/roms/Makefile
index 67f709b..8e5d8d2 100644
--- a/roms/Makefile
+++ b/roms/Makefile
@@ -68,6 +68,7 @@
@echo " opensbi32-generic -- update OpenSBI for 32-bit generic machine"
@echo " opensbi64-generic -- update OpenSBI for 64-bit generic machine"
@echo " qboot -- update qboot"
+ @echo " hppa-firmware -- update 32- and 64-bit hppa firmware"
@echo " clean -- delete the files generated by the previous" \
"build targets"
@@ -177,6 +178,11 @@
$(MAKE) -C vbootrom CROSS_COMPILE=$(arm_cross_prefix)
cp vbootrom/npcm7xx_bootrom.bin ../pc-bios/npcm7xx_bootrom.bin
+hppa-firmware:
+ $(MAKE) -C seabios-hppa parisc
+ cp seabios-hppa/out/hppa-firmware.img ../pc-bios/
+ cp seabios-hppa/out-64/hppa-firmware64.img ../pc-bios/
+
clean:
rm -rf seabios/.config seabios/out seabios/builds
$(MAKE) -C ipxe/src veryclean
@@ -189,3 +195,4 @@
$(MAKE) -C opensbi clean
$(MAKE) -C qboot clean
$(MAKE) -C vbootrom clean
+ $(MAKE) -C seabios-hppa clean
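
With the new target in place, refreshing both firmware images is a single
invocation from the top of the tree (assuming the seabios-hppa submodule is
checked out):

    make -C roms hppa-firmware
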
diff --git a/system/physmem.c b/system/physmem.c
index e3ebc19..3adda08 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -1680,7 +1680,8 @@
assert(block);
- newsize = HOST_PAGE_ALIGN(newsize);
+ newsize = TARGET_PAGE_ALIGN(newsize);
+ newsize = REAL_HOST_PAGE_ALIGN(newsize);
if (block->used_length == newsize) {
/*
@@ -1916,7 +1917,9 @@
return NULL;
}
- size = HOST_PAGE_ALIGN(size);
+ size = TARGET_PAGE_ALIGN(size);
+ size = REAL_HOST_PAGE_ALIGN(size);
+
file_size = get_file_size(fd);
if (file_size > offset && file_size < (offset + size)) {
error_setg(errp, "backing store size 0x%" PRIx64
@@ -2014,13 +2017,17 @@
{
RAMBlock *new_block;
Error *local_err = NULL;
+ int align;
assert((ram_flags & ~(RAM_SHARED | RAM_RESIZEABLE | RAM_PREALLOC |
RAM_NORESERVE)) == 0);
assert(!host ^ (ram_flags & RAM_PREALLOC));
- size = HOST_PAGE_ALIGN(size);
- max_size = HOST_PAGE_ALIGN(max_size);
+ align = qemu_real_host_page_size();
+ align = MAX(align, TARGET_PAGE_SIZE);
+ size = ROUND_UP(size, align);
+ max_size = ROUND_UP(max_size, align);
+
new_block = g_malloc0(sizeof(*new_block));
new_block->mr = mr;
new_block->resized = resized;
@@ -3511,7 +3518,7 @@
* fallocate works on hugepages and shmem
* shared anonymous memory requires madvise REMOVE
*/
- need_madvise = (rb->page_size == qemu_host_page_size);
+ need_madvise = (rb->page_size == qemu_real_host_page_size());
need_fallocate = rb->fd != -1;
if (need_fallocate) {
/* For a file, this causes the area of the file to be zero'd
diff --git a/system/vl.c b/system/vl.c
index e480afd..48aae6e 100644
--- a/system/vl.c
+++ b/system/vl.c
@@ -2118,7 +2118,6 @@
}
cpu_exec_init_all();
- page_size_init();
if (machine_class->hw_version) {
qemu_set_hw_version(machine_class->hw_version);
diff --git a/target/alpha/cpu-param.h b/target/alpha/cpu-param.h
index 68c46f7..c969cb0 100644
--- a/target/alpha/cpu-param.h
+++ b/target/alpha/cpu-param.h
@@ -9,10 +9,22 @@
#define ALPHA_CPU_PARAM_H
#define TARGET_LONG_BITS 64
-#define TARGET_PAGE_BITS 13
/* ??? EV4 has 34 phys addr bits, EV5 has 40, EV6 has 44. */
#define TARGET_PHYS_ADDR_SPACE_BITS 44
-#define TARGET_VIRT_ADDR_SPACE_BITS (30 + TARGET_PAGE_BITS)
+
+#ifdef CONFIG_USER_ONLY
+/*
+ * Allow user-only to vary page size. Real hardware allows only 8k and 64k,
+ * but since any variance means guests cannot assume a fixed value, allow
+ * a 4k minimum to match x86 host, which can minimize emulation issues.
+ */
+# define TARGET_PAGE_BITS_VARY
+# define TARGET_PAGE_BITS_MIN 12
+# define TARGET_VIRT_ADDR_SPACE_BITS 63
+#else
+# define TARGET_PAGE_BITS 13
+# define TARGET_VIRT_ADDR_SPACE_BITS (30 + TARGET_PAGE_BITS)
+#endif
#endif
diff --git a/target/arm/cpu-param.h b/target/arm/cpu-param.h
index f9b462a..da3243a 100644
--- a/target/arm/cpu-param.h
+++ b/target/arm/cpu-param.h
@@ -19,9 +19,13 @@
#endif
#ifdef CONFIG_USER_ONLY
-#define TARGET_PAGE_BITS 12
# ifdef TARGET_AARCH64
# define TARGET_TAGGED_ADDRESSES
+/* Allow user-only to vary page size from 4k */
+# define TARGET_PAGE_BITS_VARY
+# define TARGET_PAGE_BITS_MIN 12
+# else
+# define TARGET_PAGE_BITS 12
# endif
#else
/*
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index b2ea5d6..f3ed79c 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -1809,7 +1809,6 @@
ARMCPU *cpu = ARM_CPU(dev);
ARMCPUClass *acc = ARM_CPU_GET_CLASS(dev);
CPUARMState *env = &cpu->env;
- int pagebits;
Error *local_err = NULL;
#if defined(CONFIG_TCG) && !defined(CONFIG_USER_ONLY)
@@ -2100,28 +2099,36 @@
!cpu_isar_feature(aa32_vfp_simd, cpu) ||
!arm_feature(env, ARM_FEATURE_XSCALE));
- if (arm_feature(env, ARM_FEATURE_V7) &&
- !arm_feature(env, ARM_FEATURE_M) &&
- !arm_feature(env, ARM_FEATURE_PMSA)) {
- /* v7VMSA drops support for the old ARMv5 tiny pages, so we
- * can use 4K pages.
- */
- pagebits = 12;
- } else {
- /* For CPUs which might have tiny 1K pages, or which have an
- * MPU and might have small region sizes, stick with 1K pages.
- */
- pagebits = 10;
+#ifndef CONFIG_USER_ONLY
+ {
+ int pagebits;
+ if (arm_feature(env, ARM_FEATURE_V7) &&
+ !arm_feature(env, ARM_FEATURE_M) &&
+ !arm_feature(env, ARM_FEATURE_PMSA)) {
+ /*
+ * v7VMSA drops support for the old ARMv5 tiny pages,
+ * so we can use 4K pages.
+ */
+ pagebits = 12;
+ } else {
+ /*
+ * For CPUs which might have tiny 1K pages, or which have an
+ * MPU and might have small region sizes, stick with 1K pages.
+ */
+ pagebits = 10;
+ }
+ if (!set_preferred_target_page_bits(pagebits)) {
+ /*
+ * This can only ever happen for hotplugging a CPU, or if
+ * the board code incorrectly creates a CPU which it has
+ * promised via minimum_page_size that it will not.
+ */
+ error_setg(errp, "This CPU requires a smaller page size "
+ "than the system is using");
+ return;
+ }
}
- if (!set_preferred_target_page_bits(pagebits)) {
- /* This can only ever happen for hotplugging a CPU, or if
- * the board code incorrectly creates a CPU which it has
- * promised via minimum_page_size that it will not.
- */
- error_setg(errp, "This CPU requires a smaller page size than the "
- "system is using");
- return;
- }
+#endif
/* This cpu-id-to-MPIDR affinity is used only for TCG; KVM will override it.
* We don't support setting cluster ID ([16..23]) (known as Aff2
diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
index afe73d4..3831cb6 100644
--- a/target/hppa/cpu.c
+++ b/target/hppa/cpu.c
@@ -121,9 +121,10 @@
CPUHPPAState *env = &cpu->env;
cs->exception_index = EXCP_UNALIGN;
+ cpu_restore_state(cs, retaddr);
hppa_set_ior_and_isr(env, addr, MMU_IDX_MMU_DISABLED(mmu_idx));
- cpu_loop_exit_restore(cs, retaddr);
+ cpu_loop_exit(cs);
}
#endif /* CONFIG_USER_ONLY */
diff --git a/target/hppa/helper.c b/target/hppa/helper.c
index 859644c..9d217d0 100644
--- a/target/hppa/helper.c
+++ b/target/hppa/helper.c
@@ -76,7 +76,8 @@
}
psw &= ~reserved;
- env->psw = psw & ~(PSW_N | PSW_V | PSW_CB);
+ env->psw = psw & (uint32_t)~(PSW_N | PSW_V | PSW_CB);
+
env->psw_n = (psw / PSW_N) & 1;
env->psw_v = -((psw / PSW_V) & 1);
diff --git a/target/hppa/mem_helper.c b/target/hppa/mem_helper.c
index 66b8fa7..3fc895c 100644
--- a/target/hppa/mem_helper.c
+++ b/target/hppa/mem_helper.c
@@ -348,9 +348,10 @@
CPUState *cs = env_cpu(env);
cs->exception_index = excp;
+ cpu_restore_state(cs, retaddr);
hppa_set_ior_and_isr(env, addr, mmu_disabled);
- cpu_loop_exit_restore(cs, retaddr);
+ cpu_loop_exit(cs);
}
void hppa_cpu_do_transaction_failed(CPUState *cs, hwaddr physaddr,
diff --git a/target/hppa/op_helper.c b/target/hppa/op_helper.c
index b1f24a5..480fe80 100644
--- a/target/hppa/op_helper.c
+++ b/target/hppa/op_helper.c
@@ -351,11 +351,12 @@
excp = hppa_get_physical_address(env, addr, mmu_idx, 0, &phys,
&prot, NULL);
if (excp >= 0) {
+ cpu_restore_state(env_cpu(env), GETPC());
hppa_set_ior_and_isr(env, addr, MMU_IDX_MMU_DISABLED(mmu_idx));
if (excp == EXCP_DTLB_MISS) {
excp = EXCP_NA_DTLB_MISS;
}
- hppa_dynamic_excp(env, excp, GETPC());
+ helper_excp(env, excp);
}
return (want & prot) != 0;
#endif
diff --git a/target/ppc/cpu-param.h b/target/ppc/cpu-param.h
index 0a0416e..b7ad52d 100644
--- a/target/ppc/cpu-param.h
+++ b/target/ppc/cpu-param.h
@@ -31,6 +31,13 @@
# define TARGET_PHYS_ADDR_SPACE_BITS 36
# define TARGET_VIRT_ADDR_SPACE_BITS 32
#endif
-#define TARGET_PAGE_BITS 12
+
+#ifdef CONFIG_USER_ONLY
+/* Allow user-only to vary page size from 4k */
+# define TARGET_PAGE_BITS_VARY
+# define TARGET_PAGE_BITS_MIN 12
+#else
+# define TARGET_PAGE_BITS 12
+#endif
#endif
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index ef5ebe9..85d5746 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -55,7 +55,11 @@
#define TCG_TARGET_CALL_STACK_OFFSET 0
#define TCG_TARGET_CALL_ARG_I32 TCG_CALL_ARG_NORMAL
#define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL
-#define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_EVEN
+#ifdef CONFIG_DARWIN
+# define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_NORMAL
+#else
+# define TCG_TARGET_CALL_ARG_I128 TCG_CALL_ARG_EVEN
+#endif
#define TCG_TARGET_CALL_RET_I128 TCG_CALL_RET_NORMAL
#define have_lse (cpuinfo & CPUINFO_LSE)
diff --git a/tcg/optimize.c b/tcg/optimize.c
index 79e7016..752cc5c 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -2102,7 +2102,8 @@
static void fold_setcond_tst_pow2(OptContext *ctx, TCGOp *op, bool neg)
{
- TCGOpcode and_opc, sub_opc, xor_opc, neg_opc, shr_opc, uext_opc, sext_opc;
+ TCGOpcode and_opc, sub_opc, xor_opc, neg_opc, shr_opc;
+ TCGOpcode uext_opc = 0, sext_opc = 0;
TCGCond cond = op->args[3];
TCGArg ret, src1, src2;
TCGOp *op2;
diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index 83512bc..4023d80 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -2200,6 +2200,14 @@
return NULL;
}
+static void *migrate_mapped_ram_start(QTestState *from, QTestState *to)
+{
+ migrate_set_capability(from, "mapped-ram", true);
+ migrate_set_capability(to, "mapped-ram", true);
+
+ return NULL;
+}
+
static void test_mode_reboot(void)
{
g_autofree char *uri = g_strdup_printf("file:%s/%s", tmpfs,
@@ -2214,6 +2222,72 @@
test_file_common(&args, true);
}
+static void test_precopy_file_mapped_ram_live(void)
+{
+ g_autofree char *uri = g_strdup_printf("file:%s/%s", tmpfs,
+ FILE_TEST_FILENAME);
+ MigrateCommon args = {
+ .connect_uri = uri,
+ .listen_uri = "defer",
+ .start_hook = migrate_mapped_ram_start,
+ };
+
+ test_file_common(&args, false);
+}
+
+static void test_precopy_file_mapped_ram(void)
+{
+ g_autofree char *uri = g_strdup_printf("file:%s/%s", tmpfs,
+ FILE_TEST_FILENAME);
+ MigrateCommon args = {
+ .connect_uri = uri,
+ .listen_uri = "defer",
+ .start_hook = migrate_mapped_ram_start,
+ };
+
+ test_file_common(&args, true);
+}
+
+static void *migrate_multifd_mapped_ram_start(QTestState *from, QTestState *to)
+{
+ migrate_mapped_ram_start(from, to);
+
+ migrate_set_parameter_int(from, "multifd-channels", 4);
+ migrate_set_parameter_int(to, "multifd-channels", 4);
+
+ migrate_set_capability(from, "multifd", true);
+ migrate_set_capability(to, "multifd", true);
+
+ return NULL;
+}
+
+static void test_multifd_file_mapped_ram_live(void)
+{
+ g_autofree char *uri = g_strdup_printf("file:%s/%s", tmpfs,
+ FILE_TEST_FILENAME);
+ MigrateCommon args = {
+ .connect_uri = uri,
+ .listen_uri = "defer",
+ .start_hook = migrate_multifd_mapped_ram_start,
+ };
+
+ test_file_common(&args, false);
+}
+
+static void test_multifd_file_mapped_ram(void)
+{
+ g_autofree char *uri = g_strdup_printf("file:%s/%s", tmpfs,
+ FILE_TEST_FILENAME);
+ MigrateCommon args = {
+ .connect_uri = uri,
+ .listen_uri = "defer",
+ .start_hook = migrate_multifd_mapped_ram_start,
+ };
+
+ test_file_common(&args, true);
+}
+
+
static void test_precopy_tcp_plain(void)
{
MigrateCommon args = {
@@ -2462,6 +2536,13 @@
return NULL;
}
+static void *migrate_fd_file_mapped_ram_start(QTestState *from, QTestState *to)
+{
+ migrate_mapped_ram_start(from, to);
+
+ return migrate_precopy_fd_file_start(from, to);
+}
+
static void test_migrate_precopy_fd_file(void)
{
MigrateCommon args = {
@@ -2472,6 +2553,36 @@
};
test_file_common(&args, true);
}
+
+static void test_migrate_precopy_fd_file_mapped_ram(void)
+{
+ MigrateCommon args = {
+ .listen_uri = "defer",
+ .connect_uri = "fd:fd-mig",
+ .start_hook = migrate_fd_file_mapped_ram_start,
+ .finish_hook = test_migrate_fd_finish_hook
+ };
+ test_file_common(&args, true);
+}
+
+static void *migrate_multifd_fd_mapped_ram_start(QTestState *from,
+ QTestState *to)
+{
+ migrate_multifd_mapped_ram_start(from, to);
+ return migrate_precopy_fd_file_start(from, to);
+}
+
+static void test_multifd_fd_mapped_ram(void)
+{
+ MigrateCommon args = {
+ .connect_uri = "fd:fd-mig",
+ .listen_uri = "defer",
+ .start_hook = migrate_multifd_fd_mapped_ram_start,
+ .finish_hook = test_migrate_fd_finish_hook
+ };
+
+ test_file_common(&args, true);
+}
#endif /* _WIN32 */
static void do_test_validate_uuid(MigrateStart *args, bool should_fail)
@@ -2664,6 +2775,13 @@
test_migrate_precopy_tcp_multifd_zlib_start(QTestState *from,
QTestState *to)
{
+ /*
+ * Overloading this test to also check that set_parameter does not error.
+ * This is also done in the tests for the other compression methods.
+ */
+ migrate_set_parameter_int(from, "multifd-zlib-level", 2);
+ migrate_set_parameter_int(to, "multifd-zlib-level", 2);
+
return test_migrate_precopy_tcp_multifd_start_common(from, to, "zlib");
}
@@ -2672,6 +2790,9 @@
test_migrate_precopy_tcp_multifd_zstd_start(QTestState *from,
QTestState *to)
{
+ migrate_set_parameter_int(from, "multifd-zstd-level", 2);
+ migrate_set_parameter_int(to, "multifd-zstd-level", 2);
+
return test_migrate_precopy_tcp_multifd_start_common(from, to, "zstd");
}
#endif /* CONFIG_ZSTD */
@@ -3509,6 +3630,20 @@
migration_test_add("/migration/mode/reboot", test_mode_reboot);
}
+ migration_test_add("/migration/precopy/file/mapped-ram",
+ test_precopy_file_mapped_ram);
+ migration_test_add("/migration/precopy/file/mapped-ram/live",
+ test_precopy_file_mapped_ram_live);
+
+ migration_test_add("/migration/multifd/file/mapped-ram",
+ test_multifd_file_mapped_ram);
+ migration_test_add("/migration/multifd/file/mapped-ram/live",
+ test_multifd_file_mapped_ram_live);
+#ifndef _WIN32
+ migration_test_add("/migration/multifd/fd/mapped-ram",
+ test_multifd_fd_mapped_ram);
+#endif
+
#ifdef CONFIG_GNUTLS
migration_test_add("/migration/precopy/unix/tls/psk",
test_precopy_unix_tls_psk);
@@ -3570,6 +3705,8 @@
test_migrate_precopy_fd_socket);
migration_test_add("/migration/precopy/fd/file",
test_migrate_precopy_fd_file);
+ migration_test_add("/migration/precopy/fd/file/mapped-ram",
+ test_migrate_precopy_fd_file_mapped_ram);
#endif
migration_test_add("/migration/validate_uuid", test_validate_uuid);
migration_test_add("/migration/validate_uuid_error",
diff --git a/tests/tcg/alpha/Makefile.target b/tests/tcg/alpha/Makefile.target
index b94500a..fdd7ddf 100644
--- a/tests/tcg/alpha/Makefile.target
+++ b/tests/tcg/alpha/Makefile.target
@@ -13,6 +13,3 @@
$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $< -o $@ $(LDFLAGS)
run-test-cmov: test-cmov
-
-# On Alpha Linux only supports 8k pages
-EXTRA_RUNS+=run-test-mmap-8192
diff --git a/tests/tcg/arm/Makefile.target b/tests/tcg/arm/Makefile.target
index 3473f46..0a1965f 100644
--- a/tests/tcg/arm/Makefile.target
+++ b/tests/tcg/arm/Makefile.target
@@ -79,6 +79,3 @@
ARM_TESTS += sha512-vector
TESTS += $(ARM_TESTS)
-
-# On ARM Linux only supports 4k pages
-EXTRA_RUNS+=run-test-mmap-4096
diff --git a/tests/tcg/hppa/Makefile.target b/tests/tcg/hppa/Makefile.target
index cdd0d57..ea5ae21 100644
--- a/tests/tcg/hppa/Makefile.target
+++ b/tests/tcg/hppa/Makefile.target
@@ -2,9 +2,6 @@
#
# HPPA specific tweaks - specifically masking out broken tests
-# On parisc Linux supports 4K/16K/64K (but currently only 4k works)
-EXTRA_RUNS+=run-test-mmap-4096 # run-test-mmap-16384 run-test-mmap-65536
-
# This triggers failures for hppa-linux about 1% of the time
# HPPA is the odd target that can't use the sigtramp page;
# it requires the full vdso with dwarf2 unwind info.
diff --git a/tests/tcg/i386/Makefile.target b/tests/tcg/i386/Makefile.target
index 9906f9e..bbe2c44 100644
--- a/tests/tcg/i386/Makefile.target
+++ b/tests/tcg/i386/Makefile.target
@@ -71,9 +71,6 @@
I386_TESTS:=$(filter-out $(SKIP_I386_TESTS), $(ALL_X86_TESTS))
TESTS=$(MULTIARCH_TESTS) $(I386_TESTS)
-# On i386 and x86_64 Linux only supports 4k pages (large pages are a different hack)
-EXTRA_RUNS+=run-test-mmap-4096
-
sha512-sse: CFLAGS=-msse4.1 -O3
sha512-sse: sha512.c
$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $< -o $@ $(LDFLAGS)
diff --git a/tests/tcg/m68k/Makefile.target b/tests/tcg/m68k/Makefile.target
index 6ff214e..33f7b1b 100644
--- a/tests/tcg/m68k/Makefile.target
+++ b/tests/tcg/m68k/Makefile.target
@@ -5,6 +5,3 @@
VPATH += $(SRC_PATH)/tests/tcg/m68k
TESTS += trap denormal
-
-# On m68k Linux supports 4k and 8k pages (but 8k is currently broken)
-EXTRA_RUNS+=run-test-mmap-4096 # run-test-mmap-8192
diff --git a/tests/tcg/multiarch/Makefile.target b/tests/tcg/multiarch/Makefile.target
index e10951a..f11f3b0 100644
--- a/tests/tcg/multiarch/Makefile.target
+++ b/tests/tcg/multiarch/Makefile.target
@@ -51,18 +51,9 @@
$(call skip-test, $<, "flaky on CI?")
endif
-# We define the runner for test-mmap after the individual
-# architectures have defined their supported pages sizes. If no
-# additional page sizes are defined we only run the default test.
-
-# default case (host page size)
run-test-mmap: test-mmap
$(call run-test, test-mmap, $(QEMU) $<, $< (default))
-# additional page sizes (defined by each architecture adding to EXTRA_RUNS)
-run-test-mmap-%: test-mmap
- $(call run-test, test-mmap-$*, $(QEMU) -p $* $<, $< ($* byte pages))
-
ifneq ($(GDB),)
GDB_SCRIPT=$(SRC_PATH)/tests/guest-debug/run-test.py
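All of the per-architecture EXTRA_RUNS hunks in this series fed the run-test-mmap-% rule deleted here; with the -p page-size override gone (presumably retired along with this rule), only the default run survives, and test-mmap exercises whatever page size the guest reports at runtime, e.g.:

#include <stdio.h>
#include <unistd.h>

/* Illustrative only: the page size under test is now discovered, not forced. */
int main(void)
{
    printf("page size: %ld\n", sysconf(_SC_PAGESIZE));
    return 0;
}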
diff --git a/tests/tcg/multiarch/linux/linux-madvise.c b/tests/tcg/multiarch/linux/linux-madvise.c
index 29d0997..539fb3b 100644
--- a/tests/tcg/multiarch/linux/linux-madvise.c
+++ b/tests/tcg/multiarch/linux/linux-madvise.c
@@ -42,6 +42,8 @@
assert(ret == 0);
written = write(fd, &c, sizeof(c));
assert(written == sizeof(c));
+ ret = ftruncate(fd, pagesize);
+ assert(ret == 0);
page = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE, fd, 0);
assert(page != MAP_FAILED);
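The new ftruncate() matters because write() only extends the file to sizeof(c) bytes; a file shorter than a page leaves part of the mapping unbacked, which, as I read the change, can surface as SIGBUS once the guest page size exceeds the host's. A standalone sketch of the corrected pattern (the file path is illustrative):

#include <assert.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    long pagesize = sysconf(_SC_PAGESIZE);
    ssize_t written;
    char c = 0;
    void *page;
    int fd, ret;

    fd = open("/tmp/madvise-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
    assert(fd != -1);
    written = write(fd, &c, sizeof(c));
    assert(written == sizeof(c));

    /* Extend the file so the whole mapped page is file-backed. */
    ret = ftruncate(fd, pagesize);
    assert(ret == 0);

    page = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE, fd, 0);
    assert(page != MAP_FAILED);

    munmap(page, pagesize);
    close(fd);
    unlink("/tmp/madvise-demo");
    return 0;
}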
diff --git a/tests/tcg/multiarch/linux/linux-shmat-maps.c b/tests/tcg/multiarch/linux/linux-shmat-maps.c
new file mode 100644
index 0000000..0ccf7a9
--- /dev/null
+++ b/tests/tcg/multiarch/linux/linux-shmat-maps.c
@@ -0,0 +1,55 @@
+/*
+ * Test that shmat() does not break /proc/self/maps.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+#include <assert.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <unistd.h>
+
+int main(void)
+{
+ char buf[128];
+ int err, fd;
+ int shmid;
+ ssize_t n;
+ void *p;
+
+ shmid = shmget(IPC_PRIVATE, 1, IPC_CREAT | 0600);
+ assert(shmid != -1);
+
+ /*
+ * The original bug required a non-NULL address, which skipped the
+ * mmap_find_vma step, which could result in a host mapping smaller
+ * than the target mapping. Use an arbitrary fixed address.
+ */
+ p = shmat(shmid, (void *)0x800000, SHM_RND);
+ if (p == (void *)-1) {
+ /*
+ * Because we now run the test case for all guests for which we have
+ * a cross-compiler, the above arbitrary address might conflict with
+ * the guest executable in some way. Rather than stopping, continue
+ * with a system-supplied address, which should never fail.
+ */
+ p = shmat(shmid, NULL, 0);
+ assert(p != (void *)-1);
+ }
+
+ fd = open("/proc/self/maps", O_RDONLY);
+ assert(fd != -1);
+ do {
+ n = read(fd, buf, sizeof(buf));
+ assert(n >= 0);
+ } while (n != 0);
+ close(fd);
+
+ err = shmdt(p);
+ assert(err == 0);
+ err = shmctl(shmid, IPC_RMID, NULL);
+ assert(err == 0);
+
+ return EXIT_SUCCESS;
+}
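One note on the SHM_RND flag used above: per shmat(2), it rounds a non-NULL attach hint down to a multiple of SHMLBA, so the fixed 0x800000 hint need not be SHMLBA-aligned on every target. The rounding is simply:

#include <stdio.h>
#include <sys/shm.h>

/* What SHM_RND does to a non-NULL hint, per shmat(2). */
void *shm_rnd_hint(void *hint)
{
    return (void *)((unsigned long)hint & ~(unsigned long)(SHMLBA - 1));
}

int main(void)
{
    printf("0x800000 rounds to %p\n", shm_rnd_hint((void *)0x800000));
    return 0;
}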
diff --git a/tests/tcg/ppc/Makefile.target b/tests/tcg/ppc/Makefile.target
deleted file mode 100644
index f5e08c7..0000000
--- a/tests/tcg/ppc/Makefile.target
+++ /dev/null
@@ -1,12 +0,0 @@
-# -*- Mode: makefile -*-
-#
-# PPC - included from tests/tcg/Makefile
-#
-
-ifneq (,$(findstring 64,$(TARGET_NAME)))
-# On PPC64 Linux can be configured with 4k (default) or 64k pages (currently broken)
-EXTRA_RUNS+=run-test-mmap-4096 #run-test-mmap-65536
-else
-# On PPC32 Linux supports 4K/16K/64K/256K (but currently only 4k works)
-EXTRA_RUNS+=run-test-mmap-4096 #run-test-mmap-16384 run-test-mmap-65536 run-test-mmap-262144
-endif
diff --git a/tests/tcg/sh4/Makefile.target b/tests/tcg/sh4/Makefile.target
index 47c39a4..16eaa85 100644
--- a/tests/tcg/sh4/Makefile.target
+++ b/tests/tcg/sh4/Makefile.target
@@ -3,9 +3,6 @@
# SuperH specific tweaks
#
-# On sh Linux supports 4k, 8k, 16k and 64k pages (but only 4k currently works)
-EXTRA_RUNS+=run-test-mmap-4096 # run-test-mmap-8192 run-test-mmap-16384 run-test-mmap-65536
-
# This triggers failures for sh4-linux about 10% of the time.
# Random SIGSEGV at unpredictable guest address, cause unknown.
run-signals: signals
diff --git a/tests/tcg/sparc64/Makefile.target b/tests/tcg/sparc64/Makefile.target
deleted file mode 100644
index 408dace..0000000
--- a/tests/tcg/sparc64/Makefile.target
+++ /dev/null
@@ -1,6 +0,0 @@
-# -*- Mode: makefile -*-
-#
-# sparc specific tweaks
-
-# On Sparc64 Linux support 8k pages
-EXTRA_RUNS+=run-test-mmap-8192