migration/multifd: Support outgoing mapped-ram stream format
The new mapped-ram stream format uses a file transport and puts ram
pages in the migration file at their respective offsets. Writing the
pages can be done in parallel by using the pwritev system call, which
takes iovecs and an offset.
Add support for enabling the new format along with multifd to make use
of the threading and page handling already in place.
This requires multifd to stop sending headers and leave the stream
format to the mapped-ram code. When it comes time to write the data, we
need to call a version of qio_channel_write that can take an offset.
Usage on HMP is:
(qemu) stop
(qemu) migrate_set_capability multifd on
(qemu) migrate_set_capability mapped-ram on
(qemu) migrate_set_parameter max-bandwidth 0
(qemu) migrate_set_parameter multifd-channels 8
(qemu) migrate file:migfile
Reviewed-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Fabiano Rosas <farosas@suse.de>
Link: https://lore.kernel.org/r/20240229153017.2221-21-farosas@suse.de
Signed-off-by: Peter Xu <peterx@redhat.com>
diff --git a/include/qemu/bitops.h b/include/qemu/bitops.h
index cb3526d..2c0a2fe 100644
--- a/include/qemu/bitops.h
+++ b/include/qemu/bitops.h
@@ -68,6 +68,19 @@
}
/**
+ * clear_bit_atomic - Clears a bit in memory atomically
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ */
+static inline void clear_bit_atomic(long nr, unsigned long *addr)
+{
+    unsigned long mask = BIT_MASK(nr);
+    unsigned long *p = addr + BIT_WORD(nr);
+
+    /* AND the word holding bit @nr with ~mask so only that bit is cleared */
+    qatomic_and(p, ~mask);
+}
+
+/**
* change_bit - Toggle a bit in memory
* @nr: Bit to change
* @addr: Address to start counting from
diff --git a/migration/file.c b/migration/file.c
index 2f8b626..d949a94 100644
--- a/migration/file.c
+++ b/migration/file.c
@@ -150,3 +150,57 @@
}
} while (++i < channels);
}
+
+/*
+ * Write @iov to @ioc, placing each run of host-contiguous elements at the
+ * file offset (block->pages_offset + offset into block->host) of its first
+ * element.  Returns 0 on success, including an empty @iov; negative on error.
+ */
+int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov,
+                            int niov, RAMBlock *block, Error **errp)
+{
+    ssize_t ret = 0; /* niov == 0 must not fail without setting @errp */
+    int i, slice_idx = 0, slice_num = 1;
+    uintptr_t offset;
+
+    /*
+     * If the iov array doesn't have contiguous elements, we need to
+     * split it in slices because we only have one file offset for the
+     * whole iov. Do this here so callers don't need to break the iov
+     * array themselves.
+     *
+     * slice_num counts the elements of the current slice; it is reset to 0
+     * after a write so that the loop increment brings it back to 1.
+     */
+    for (i = 0; i < niov; i++, slice_num++) {
+        /* Keep extending the slice while the next element is adjacent */
+        if (i != niov - 1 &&
+            (uintptr_t) iov[i].iov_base + iov[i].iov_len ==
+            (uintptr_t) iov[i + 1].iov_base) {
+            continue;
+        }
+
+        /*
+         * Use the offset of the first element of the segment that
+         * we're sending.
+         */
+        offset = (uintptr_t) iov[slice_idx].iov_base - (uintptr_t) block->host;
+        if (offset >= block->used_length) {
+            error_setg(errp, "offset %" PRIxPTR
+                       " outside of ramblock %s range", offset, block->idstr);
+            ret = -1;
+            break;
+        }
+
+        ret = qio_channel_pwritev(ioc, &iov[slice_idx], slice_num,
+                                  block->pages_offset + offset, errp);
+        if (ret < 0) {
+            break;
+        }
+
+        slice_idx += slice_num;
+        slice_num = 0;
+    }
+
+    return (ret < 0) ? ret : 0;
+}
diff --git a/migration/file.h b/migration/file.h
index 4577f9e..01a338c 100644
--- a/migration/file.h
+++ b/migration/file.h
@@ -19,4 +19,6 @@
int file_parse_offset(char *filespec, uint64_t *offsetp, Error **errp);
void file_cleanup_outgoing_migration(void);
bool file_send_channel_create(gpointer opaque, Error **errp);
+int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov,
+ int niov, RAMBlock *block, Error **errp);
#endif
diff --git a/migration/migration.c b/migration/migration.c
index faeb75a..b9baab5 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -140,12 +140,14 @@
if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) {
SocketAddress *saddr = &addr->u.socket;
- return saddr->type == SOCKET_ADDRESS_TYPE_INET ||
- saddr->type == SOCKET_ADDRESS_TYPE_UNIX ||
- saddr->type == SOCKET_ADDRESS_TYPE_VSOCK;
+ return (saddr->type == SOCKET_ADDRESS_TYPE_INET ||
+ saddr->type == SOCKET_ADDRESS_TYPE_UNIX ||
+ saddr->type == SOCKET_ADDRESS_TYPE_VSOCK);
+ } else if (addr->transport == MIGRATION_ADDRESS_TYPE_FILE) {
+ return migrate_mapped_ram();
+ } else {
+ return false;
}
-
- return false;
}
static bool migration_needs_seekable_channel(void)
@@ -1985,6 +1987,11 @@
error_setg(errp, "Cannot use TLS with mapped-ram");
return false;
}
+
+ if (migrate_multifd_compression()) {
+ error_setg(errp, "Cannot use compression with mapped-ram");
+ return false;
+ }
}
if (migrate_mode_is_cpr(s)) {
diff --git a/migration/multifd.c b/migration/multifd.c
index ea08f1a..8118145 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -108,6 +108,17 @@
qemu_sem_post(&multifd_send_state->channels_created);
}
+/* Flag each page queued in p->pages in its RAMBlock's file bitmap. */
+static void multifd_set_file_bitmap(MultiFDSendParams *p)
+{
+    MultiFDPages_t *batch = p->pages;
+
+    assert(batch->block);
+    for (int idx = 0; idx < batch->num; idx++) {
+        ramblock_set_file_bmap_atomic(batch->block, batch->offset[idx]);
+    }
+}
+
/* Multifd without compression */
/**
@@ -169,6 +180,8 @@
if (!multifd_use_packets()) {
multifd_send_prepare_iovs(p);
+ multifd_set_file_bitmap(p);
+
return 0;
}
@@ -867,8 +880,15 @@
break;
}
- ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL,
- 0, p->write_flags, &local_err);
+ if (migrate_mapped_ram()) {
+ ret = file_write_ramblock_iov(p->c, p->iov, p->iovs_num,
+ p->pages->block, &local_err);
+ } else {
+ ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num,
+ NULL, 0, p->write_flags,
+ &local_err);
+ }
+
if (ret != 0) {
break;
}
diff --git a/migration/options.c b/migration/options.c
index 5df8982..40eb930 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -654,12 +654,6 @@
}
if (new_caps[MIGRATION_CAPABILITY_MAPPED_RAM]) {
- if (new_caps[MIGRATION_CAPABILITY_MULTIFD]) {
- error_setg(errp,
- "Mapped-ram migration is incompatible with multifd");
- return false;
- }
-
if (new_caps[MIGRATION_CAPABILITY_XBZRLE]) {
error_setg(errp,
"Mapped-ram migration is incompatible with xbzrle");
@@ -1252,6 +1246,13 @@
}
#endif
+ if (migrate_mapped_ram() &&
+ (migrate_multifd_compression() || migrate_tls())) {
+ error_setg(errp,
+ "Mapped-ram only available for non-compressed non-TLS multifd migration");
+ return false;
+ }
+
if (params->has_x_vcpu_dirty_limit_period &&
(params->x_vcpu_dirty_limit_period < 1 ||
params->x_vcpu_dirty_limit_period > 1000)) {
diff --git a/migration/ram.c b/migration/ram.c
index 329153d..87cb73f 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1148,7 +1148,7 @@
if (migrate_mapped_ram()) {
/* zero pages are not transferred with mapped-ram */
- clear_bit(offset >> TARGET_PAGE_BITS, pss->block->file_bmap);
+ clear_bit_atomic(offset >> TARGET_PAGE_BITS, pss->block->file_bmap);
return 1;
}
@@ -2445,8 +2445,6 @@
block->clear_bmap = NULL;
g_free(block->bmap);
block->bmap = NULL;
- g_free(block->file_bmap);
- block->file_bmap = NULL;
}
xbzrle_cleanup();
@@ -3135,9 +3133,22 @@
qemu_put_buffer_at(f, (uint8_t *)block->file_bmap, bitmap_size,
block->bitmap_offset);
ram_transferred_add(bitmap_size);
+
+ /*
+ * Free the bitmap here to catch any synchronization issues
+ * with multifd channels. No channels should be sending pages
+ * after we've written the bitmap to file.
+ */
+ g_free(block->file_bmap);
+ block->file_bmap = NULL;
}
}
+/*
+ * Set, via set_bit_atomic(), the bit in @block's file_bmap that corresponds
+ * to the target page containing @offset.
+ */
+void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset)
+{
+    unsigned long *bmap = block->file_bmap;
+
+    set_bit_atomic(offset >> TARGET_PAGE_BITS, bmap);
+}
+
/**
* ram_save_iterate: iterative stage for migration
*
diff --git a/migration/ram.h b/migration/ram.h
index 9b937a4..b9ac0da 100644
--- a/migration/ram.h
+++ b/migration/ram.h
@@ -75,6 +75,7 @@
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start);
void postcopy_preempt_shutdown_file(MigrationState *s);
void *postcopy_preempt_thread(void *opaque);
+void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset);
/* ram cache */
int colo_init_ram_cache(void);