| /* |
| * Postcopy migration for RAM |
| * |
| * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates |
| * |
| * Authors: |
| * Dave Gilbert <dgilbert@redhat.com> |
| * |
| * This work is licensed under the terms of the GNU GPL, version 2 or later. |
| * See the COPYING file in the top-level directory. |
| * |
| */ |
| |
| /* |
| * Postcopy is a migration technique where the execution flips from the |
| * source to the destination before all the data has been copied. |
| */ |
| |
| #include <glib.h> |
| #include <stdio.h> |
| #include <unistd.h> |
| |
| #include "qemu-common.h" |
| #include "migration/migration.h" |
| #include "migration/postcopy-ram.h" |
| #include "sysemu/sysemu.h" |
| #include "qemu/error-report.h" |
| #include "trace.h" |
| |
/* Arbitrary limit on the number of ranges in each discard command;
 * keeps each command around ~200 bytes.
 */
| #define MAX_DISCARDS_PER_COMMAND 12 |
| |
| struct PostcopyDiscardState { |
| const char *ramblock_name; |
| uint64_t offset; /* Bitmap entry for the 1st bit of this RAMBlock */ |
| uint16_t cur_entry; |
| /* |
| * Start and length of a discard range (bytes) |
| */ |
| uint64_t start_list[MAX_DISCARDS_PER_COMMAND]; |
| uint64_t length_list[MAX_DISCARDS_PER_COMMAND]; |
| unsigned int nsentwords; |
| unsigned int nsentcmds; |
| }; |
| |
/* Postcopy needs to detect accesses to pages that haven't yet been copied
 * across, and to map new pages in efficiently. The techniques for doing
 * this are target OS specific.
 */
| #if defined(__linux__) |
| |
| #include <sys/mman.h> |
| #include <sys/ioctl.h> |
| #include <sys/syscall.h> |
| #include <sys/types.h> |
| #include <asm/types.h> /* for __u64 */ |
| #endif |
| |
| #if defined(__linux__) && defined(__NR_userfaultfd) |
| #include <linux/userfaultfd.h> |
| |
/*
 * Perform the UFFDIO_API handshake on a userfault fd and check that the
 * register and unregister ioctls are offered.
 *
 * @ufd: an open userfaultfd file descriptor.
 *
 * Returns: true if the handshake succeeded and both ioctls are present;
 *          otherwise an error is reported and false is returned.
 */
static bool ufd_version_check(int ufd)
{
    const uint64_t required = (__u64)1 << _UFFDIO_REGISTER |
                              (__u64)1 << _UFFDIO_UNREGISTER;
    struct uffdio_api api_struct;
    uint64_t missing;

    api_struct.api = UFFD_API;
    api_struct.features = 0;
    if (ioctl(ufd, UFFDIO_API, &api_struct) != 0) {
        error_report("postcopy_ram_supported_by_host: UFFDIO_API failed: %s",
                     strerror(errno));
        return false;
    }

    /* Any required bit that the kernel did not set is a missing ioctl */
    missing = ~api_struct.ioctls & required;
    if (missing != 0) {
        error_report("Missing userfault features: %" PRIx64, missing);
        return false;
    }

    return true;
}
| |
| bool postcopy_ram_supported_by_host(void) |
| { |
| long pagesize = getpagesize(); |
| int ufd = -1; |
| bool ret = false; /* Error unless we change it */ |
| void *testarea = NULL; |
| struct uffdio_register reg_struct; |
| struct uffdio_range range_struct; |
| uint64_t feature_mask; |
| |
| if ((1ul << qemu_target_page_bits()) > pagesize) { |
| error_report("Target page size bigger than host page size"); |
| goto out; |
| } |
| |
| ufd = syscall(__NR_userfaultfd, O_CLOEXEC); |
| if (ufd == -1) { |
| error_report("%s: userfaultfd not available: %s", __func__, |
| strerror(errno)); |
| goto out; |
| } |
| |
| /* Version and features check */ |
| if (!ufd_version_check(ufd)) { |
| goto out; |
| } |
| |
| /* |
| * We need to check that the ops we need are supported on anon memory |
| * To do that we need to register a chunk and see the flags that |
| * are returned. |
| */ |
| testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE | |
| MAP_ANONYMOUS, -1, 0); |
| if (testarea == MAP_FAILED) { |
| error_report("%s: Failed to map test area: %s", __func__, |
| strerror(errno)); |
| goto out; |
| } |
| g_assert(((size_t)testarea & (pagesize-1)) == 0); |
| |
| reg_struct.range.start = (uintptr_t)testarea; |
| reg_struct.range.len = pagesize; |
| reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING; |
| |
| if (ioctl(ufd, UFFDIO_REGISTER, ®_struct)) { |
| error_report("%s userfault register: %s", __func__, strerror(errno)); |
| goto out; |
| } |
| |
| range_struct.start = (uintptr_t)testarea; |
| range_struct.len = pagesize; |
| if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) { |
| error_report("%s userfault unregister: %s", __func__, strerror(errno)); |
| goto out; |
| } |
| |
| feature_mask = (__u64)1 << _UFFDIO_WAKE | |
| (__u64)1 << _UFFDIO_COPY | |
| (__u64)1 << _UFFDIO_ZEROPAGE; |
| if ((reg_struct.ioctls & feature_mask) != feature_mask) { |
| error_report("Missing userfault map features: %" PRIx64, |
| (uint64_t)(~reg_struct.ioctls & feature_mask)); |
| goto out; |
| } |
| |
| /* Success! */ |
| ret = true; |
| out: |
| if (testarea) { |
| munmap(testarea, pagesize); |
| } |
| if (ufd != -1) { |
| close(ufd); |
| } |
| return ret; |
| } |
| |
| /** |
| * postcopy_ram_discard_range: Discard a range of memory. |
| * We can assume that if we've been called postcopy_ram_hosttest returned true. |
| * |
| * @mis: Current incoming migration state. |
| * @start, @length: range of memory to discard. |
| * |
| * returns: 0 on success. |
| */ |
| int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start, |
| size_t length) |
| { |
| trace_postcopy_ram_discard_range(start, length); |
| if (madvise(start, length, MADV_DONTNEED)) { |
| error_report("%s MADV_DONTNEED: %s", __func__, strerror(errno)); |
| return -1; |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * Setup an area of RAM so that it *can* be used for postcopy later; this |
| * must be done right at the start prior to pre-copy. |
| * opaque should be the MIS. |
| */ |
| static int init_range(const char *block_name, void *host_addr, |
| ram_addr_t offset, ram_addr_t length, void *opaque) |
| { |
| MigrationIncomingState *mis = opaque; |
| |
| trace_postcopy_init_range(block_name, host_addr, offset, length); |
| |
| /* |
| * We need the whole of RAM to be truly empty for postcopy, so things |
| * like ROMs and any data tables built during init must be zero'd |
| * - we're going to get the copy from the source anyway. |
| * (Precopy will just overwrite this data, so doesn't need the discard) |
| */ |
| if (postcopy_ram_discard_range(mis, host_addr, length)) { |
| return -1; |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * At the end of migration, undo the effects of init_range |
| * opaque should be the MIS. |
| */ |
| static int cleanup_range(const char *block_name, void *host_addr, |
| ram_addr_t offset, ram_addr_t length, void *opaque) |
| { |
| MigrationIncomingState *mis = opaque; |
| struct uffdio_range range_struct; |
| trace_postcopy_cleanup_range(block_name, host_addr, offset, length); |
| |
| /* |
| * We turned off hugepage for the precopy stage with postcopy enabled |
| * we can turn it back on now. |
| */ |
| #ifdef MADV_HUGEPAGE |
| if (madvise(host_addr, length, MADV_HUGEPAGE)) { |
| error_report("%s HUGEPAGE: %s", __func__, strerror(errno)); |
| return -1; |
| } |
| #endif |
| |
| /* |
| * We can also turn off userfault now since we should have all the |
| * pages. It can be useful to leave it on to debug postcopy |
| * if you're not sure it's always getting every page. |
| */ |
| range_struct.start = (uintptr_t)host_addr; |
| range_struct.len = length; |
| |
| if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) { |
| error_report("%s: userfault unregister %s", __func__, strerror(errno)); |
| |
| return -1; |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * Initialise postcopy-ram, setting the RAM to a state where we can go into |
| * postcopy later; must be called prior to any precopy. |
| * called from arch_init's similarly named ram_postcopy_incoming_init |
| */ |
| int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages) |
| { |
| if (qemu_ram_foreach_block(init_range, mis)) { |
| return -1; |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * At the end of a migration where postcopy_ram_incoming_init was called. |
| */ |
| int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis) |
| { |
| /* TODO: Join the fault thread once we're sure it will exit */ |
| if (qemu_ram_foreach_block(cleanup_range, mis)) { |
| return -1; |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * Mark the given area of RAM as requiring notification to unwritten areas |
| * Used as a callback on qemu_ram_foreach_block. |
| * host_addr: Base of area to mark |
| * offset: Offset in the whole ram arena |
| * length: Length of the section |
| * opaque: MigrationIncomingState pointer |
| * Returns 0 on success |
| */ |
| static int ram_block_enable_notify(const char *block_name, void *host_addr, |
| ram_addr_t offset, ram_addr_t length, |
| void *opaque) |
| { |
| MigrationIncomingState *mis = opaque; |
| struct uffdio_register reg_struct; |
| |
| reg_struct.range.start = (uintptr_t)host_addr; |
| reg_struct.range.len = length; |
| reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING; |
| |
| /* Now tell our userfault_fd that it's responsible for this area */ |
| if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, ®_struct)) { |
| error_report("%s userfault register: %s", __func__, strerror(errno)); |
| return -1; |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * Handle faults detected by the USERFAULT markings |
| */ |
| static void *postcopy_ram_fault_thread(void *opaque) |
| { |
| MigrationIncomingState *mis = opaque; |
| |
| fprintf(stderr, "postcopy_ram_fault_thread\n"); |
| /* TODO: In later patch */ |
| qemu_sem_post(&mis->fault_thread_sem); |
| while (1) { |
| /* TODO: In later patch */ |
| } |
| |
| return NULL; |
| } |
| |
| int postcopy_ram_enable_notify(MigrationIncomingState *mis) |
| { |
| /* Create the fault handler thread and wait for it to be ready */ |
| qemu_sem_init(&mis->fault_thread_sem, 0); |
| qemu_thread_create(&mis->fault_thread, "postcopy/fault", |
| postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE); |
| qemu_sem_wait(&mis->fault_thread_sem); |
| qemu_sem_destroy(&mis->fault_thread_sem); |
| |
| /* Mark so that we get notified of accesses to unwritten areas */ |
| if (qemu_ram_foreach_block(ram_block_enable_notify, mis)) { |
| return -1; |
| } |
| |
| return 0; |
| } |
| |
| #else |
| /* No target OS support, stubs just fail */ |
/*
 * Stub for hosts without userfault support: postcopy is never available.
 * Reports an error and returns false.
 */
bool postcopy_ram_supported_by_host(void)
{
    error_report("%s: No OS support", __func__);
    return false;
}
| |
| int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages) |
| { |
| error_report("postcopy_ram_incoming_init: No OS support"); |
| return -1; |
| } |
| |
| int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis) |
| { |
| assert(0); |
| return -1; |
| } |
| |
| int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start, |
| size_t length) |
| { |
| assert(0); |
| return -1; |
| } |
| |
| int postcopy_ram_enable_notify(MigrationIncomingState *mis) |
| { |
| assert(0); |
| return -1; |
| } |
| #endif |
| |
| /* ------------------------------------------------------------------------- */ |
| |
| /** |
| * postcopy_discard_send_init: Called at the start of each RAMBlock before |
| * asking to discard individual ranges. |
| * |
| * @ms: The current migration state. |
| * @offset: the bitmap offset of the named RAMBlock in the migration |
| * bitmap. |
| * @name: RAMBlock that discards will operate on. |
| * |
| * returns: a new PDS. |
| */ |
| PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms, |
| unsigned long offset, |
| const char *name) |
| { |
| PostcopyDiscardState *res = g_malloc0(sizeof(PostcopyDiscardState)); |
| |
| if (res) { |
| res->ramblock_name = name; |
| res->offset = offset; |
| } |
| |
| return res; |
| } |
| |
| /** |
| * postcopy_discard_send_range: Called by the bitmap code for each chunk to |
| * discard. May send a discard message, may just leave it queued to |
| * be sent later. |
| * |
| * @ms: Current migration state. |
| * @pds: Structure initialised by postcopy_discard_send_init(). |
| * @start,@length: a range of pages in the migration bitmap in the |
| * RAM block passed to postcopy_discard_send_init() (length=1 is one page) |
| */ |
| void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds, |
| unsigned long start, unsigned long length) |
| { |
| size_t tp_bits = qemu_target_page_bits(); |
| /* Convert to byte offsets within the RAM block */ |
| pds->start_list[pds->cur_entry] = (start - pds->offset) << tp_bits; |
| pds->length_list[pds->cur_entry] = length << tp_bits; |
| trace_postcopy_discard_send_range(pds->ramblock_name, start, length); |
| pds->cur_entry++; |
| pds->nsentwords++; |
| |
| if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) { |
| /* Full set, ship it! */ |
| qemu_savevm_send_postcopy_ram_discard(ms->file, pds->ramblock_name, |
| pds->cur_entry, |
| pds->start_list, |
| pds->length_list); |
| pds->nsentcmds++; |
| pds->cur_entry = 0; |
| } |
| } |
| |
| /** |
| * postcopy_discard_send_finish: Called at the end of each RAMBlock by the |
| * bitmap code. Sends any outstanding discard messages, frees the PDS |
| * |
| * @ms: Current migration state. |
| * @pds: Structure initialised by postcopy_discard_send_init(). |
| */ |
| void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds) |
| { |
| /* Anything unsent? */ |
| if (pds->cur_entry) { |
| qemu_savevm_send_postcopy_ram_discard(ms->file, pds->ramblock_name, |
| pds->cur_entry, |
| pds->start_list, |
| pds->length_list); |
| pds->nsentcmds++; |
| } |
| |
| trace_postcopy_discard_send_finish(pds->ramblock_name, pds->nsentwords, |
| pds->nsentcmds); |
| |
| g_free(pds); |
| } |