/* blob: e8b737f449c6a7dd0cc3d3d196d3b6e943a4ae32 [file] [log] [blame] */
/*
* Copyright (c) 2020 Nutanix Inc. All rights reserved.
*
* Authors: Thanos Makatos <thanos@nutanix.com>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Nutanix nor the names of its contributors may be
* used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*
*/
#include <stdio.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <stdlib.h>
#include <stdarg.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <time.h>
#include <err.h>
#include <assert.h>
#include <sys/stat.h>
#include <libgen.h>
#include <pthread.h>
#include <linux/limits.h>
#include "common.h"
#include "libvfio-user.h"
#include "rte_hash_crc.h"
#include "tran_sock.h"
/*
 * Advertised to the server as "max_msg_fds" in send_version(); also sizes the
 * file descriptor array in get_device_region_info().
 */
#define CLIENT_MAX_FDS (32)
/*
 * Advertised to the server as "max_data_xfer_size" in send_version().
 * This is low, so we get testing of vfu_sgl_read/write() chunking.
 */
#define CLIENT_MAX_DATA_XFER_SIZE (1024)
/* Printable names for the IRQ types, used in configure_irqs() messages. */
static char const *irq_to_str[] = {
[VFU_DEV_INTX_IRQ] = "INTx",
[VFU_DEV_MSI_IRQ] = "MSI",
[VFU_DEV_MSIX_IRQ] = "MSI-X",
[VFU_DEV_ERR_IRQ] = "ERR",
[VFU_DEV_REQ_IRQ] = "REQ"
};
/*
 * Held around the tran_sock_msg_iovec() exchanges in access_region(),
 * set_migration_state(), read_migr_data() and write_migr_data(). During
 * migration access_region() is also called from the fake_guest() thread.
 */
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
/* Client-side bookkeeping for one DMA region offered to the server. */
struct client_dma_region {
/*
 * Our DMA regions are one page in size so we only need one bit to mark them as
 * dirty.
 */
/* Set by main() while dirty page logging is active. */
#define CLIENT_DIRTY_PAGE_TRACKING_ENABLED (1 << 0)
/* Set by handle_dma_write() when the region is written while tracking is on. */
#define CLIENT_DIRTY_DMA_REGION (1 << 1)
/* Bitmask of the CLIENT_* flags above. */
uint32_t flags;
/* Payload sent with VFIO_USER_DMA_MAP in map_dma_regions(). */
struct vfio_user_dma_map map;
/* Backing file: passed with the map message, and read/written by the DMA handlers. */
int fd;
};
/*
 * Logging hook: writes the formatted message to stdout, prefixed with
 * "client: ". The context and level arguments are ignored.
 */
void
vfu_log(UNUSED vfu_ctx_t *vfu_ctx, UNUSED int level,
        const char *fmt, ...)
{
    va_list args;

    fputs("client: ", stdout);

    va_start(args, fmt);
    vfprintf(stdout, fmt, args);
    va_end(args);
}
/*
 * Connects to the UNIX stream socket at @path.
 *
 * Exits the process if @path does not fit in sun_path, or if socket(2) or
 * connect(2) fails.
 *
 * @returns the connected socket file descriptor
 */
static int
init_sock(const char *path)
{
    struct sockaddr_un addr = {.sun_family = AF_UNIX};
    int sock;
    int len;

    /* TODO path should be defined elsewhere */
    len = snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", path);
    /*
     * Refuse a truncated path: connecting to a prefix of what the caller
     * asked for could reach a completely different socket.
     */
    if (len < 0 || (size_t)len >= sizeof(addr.sun_path)) {
        errx(EXIT_FAILURE, "socket path %s is too long", path);
    }

    if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) {
        err(EXIT_FAILURE, "failed to open socket %s", path);
    }

    if (connect(sock, (struct sockaddr*)&addr, sizeof(addr)) == -1) {
        err(EXIT_FAILURE, "failed to connect server");
    }

    return sock;
}
static void
send_version(int sock)
{
struct vfio_user_version cversion;
struct iovec iovecs[3] = { { 0 } };
char client_caps[1024];
int msg_id = 0xbada55;
int slen;
int ret;
slen = snprintf(client_caps, sizeof(client_caps),
"{"
"\"capabilities\":{"
"\"max_msg_fds\":%u,"
"\"max_data_xfer_size\":%u"
"}"
"}", CLIENT_MAX_FDS, CLIENT_MAX_DATA_XFER_SIZE);
cversion.major = LIB_VFIO_USER_MAJOR;
cversion.minor = LIB_VFIO_USER_MINOR;
/* [0] is for the header. */
iovecs[1].iov_base = &cversion;
iovecs[1].iov_len = sizeof(cversion);
iovecs[2].iov_base = client_caps;
/* Include the NUL. */
iovecs[2].iov_len = slen + 1;
ret = tran_sock_send_iovec(sock, msg_id, false, VFIO_USER_VERSION,
iovecs, ARRAY_SIZE(iovecs), NULL, 0, 0);
if (ret < 0) {
err(EXIT_FAILURE, "failed to send client version message");
}
}
/*
 * Receives and validates the server's VFIO_USER_VERSION reply.
 *
 * The outputs are first set to defaults (one file descriptor,
 * VFIO_USER_DEFAULT_MAX_DATA_XFER_SIZE, the system page size). If the reply
 * carries a JSON payload it is then handed to tran_parse_version_json()
 * together with the three output pointers.
 *
 * Exits the process on a receive error, an unexpected command, a short
 * reply, an unsupported version or unparsable JSON.
 */
static void
recv_version(int sock, int *server_max_fds, size_t *server_max_data_xfer_size,
             size_t *pgsize)
{
    struct vfio_user_version *reply = NULL;
    struct vfio_user_header hdr;
    size_t reply_len;
    int rc;

    rc = tran_sock_recv_alloc(sock, &hdr, true, NULL,
                              (void **)&reply, &reply_len);
    if (rc < 0) {
        err(EXIT_FAILURE, "failed to receive version");
    }

    if (hdr.cmd != VFIO_USER_VERSION) {
        errx(EXIT_FAILURE, "msg%hx: invalid cmd %hu (expected %u)",
             hdr.msg_id, hdr.cmd, VFIO_USER_VERSION);
    }

    if (reply_len < sizeof(*reply)) {
        errx(EXIT_FAILURE, "VFIO_USER_VERSION: invalid size %zu", reply_len);
    }

    if (reply->major != LIB_VFIO_USER_MAJOR) {
        errx(EXIT_FAILURE, "unsupported server major %hu (must be %u)",
             reply->major, LIB_VFIO_USER_MAJOR);
    }

    /* The server is supposed to tell us the minimum agreed version. */
    if (reply->minor > LIB_VFIO_USER_MINOR) {
        errx(EXIT_FAILURE, "unsupported server minor %hu (must be <= %u)",
             reply->minor, LIB_VFIO_USER_MINOR);
    }

    /* Defaults, used when the reply carries no JSON payload. */
    *server_max_fds = 1;
    *server_max_data_xfer_size = VFIO_USER_DEFAULT_MAX_DATA_XFER_SIZE;
    *pgsize = sysconf(_SC_PAGESIZE);

    if (reply_len > sizeof(*reply)) {
        const char *json = (const char *)reply->data;
        size_t json_len = reply_len - sizeof(*reply);

        /* The payload must be a NUL-terminated string. */
        if (json[json_len - 1] != '\0') {
            errx(EXIT_FAILURE, "ignoring invalid JSON from server");
        }

        rc = tran_parse_version_json(json, server_max_fds,
                                     server_max_data_xfer_size, pgsize, NULL);
        if (rc < 0) {
            err(EXIT_FAILURE, "failed to parse server JSON \"%s\"", json);
        }
    }

    free(reply);
}
/*
 * Performs the version handshake: sends our version and capabilities with
 * send_version(), then reads the server's reply with recv_version(), which
 * fills in the three output parameters.
 */
static void
negotiate(int sock, int *server_max_fds, size_t *server_max_data_xfer_size,
          size_t *pgsize)
{
    /* Our side first ... */
    send_version(sock);

    /* ... then the server's answer. */
    recv_version(sock, server_max_fds, server_max_data_xfer_size, pgsize);
}
/*
 * Sends a VFIO_USER_DEVICE_RESET message with no payload and waits for the
 * reply. Exits the process on failure.
 */
static void
send_device_reset(int sock)
{
    if (tran_sock_msg(sock, 1, VFIO_USER_DEVICE_RESET,
                      NULL, 0, NULL, NULL, 0) < 0) {
        err(EXIT_FAILURE, "failed to reset device");
    }
}
/*
 * Walks the chain of VFIO capabilities that follows a struct vfio_region_info.
 *
 * @header: the first capability. It must sit immediately after the
 *          struct vfio_region_info it belongs to (the caller passes
 *          region_info + 1): each header's `next` field is treated as an
 *          offset from the start of that structure.
 * @sparse: set to the sparse mmap capability if one is found; if several are
 *          chained the last one wins.
 *
 * Every sparse mmap capability found is printed. Exits the process if a
 * capability with any other ID is encountered.
 */
static void
get_region_vfio_caps(struct vfio_info_cap_header *header,
                     struct vfio_region_info_cap_sparse_mmap **sparse)
{
    /*
     * Start of the enclosing struct vfio_region_info. Every `next` offset
     * is resolved against this fixed base; resolving it against the current
     * header is only correct for the first hop of the chain.
     */
    char *base = (char *)header - sizeof(struct vfio_region_info);
    unsigned int i;

    while (true) {
        switch (header->id) {
        case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
            *sparse = (struct vfio_region_info_cap_sparse_mmap *)header;
            printf("client: %s: Sparse cap nr_mmap_areas %d\n", __func__,
                   (*sparse)->nr_areas);
            for (i = 0; i < (*sparse)->nr_areas; i++) {
                printf("client: %s: area %d offset %#llx size %llu\n",
                       __func__, i,
                       (unsigned long long)(*sparse)->areas[i].offset,
                       (unsigned long long)(*sparse)->areas[i].size);
            }
            break;
        default:
            errx(EXIT_FAILURE, "bad VFIO cap ID %#x", header->id);
        }

        /* A zero offset terminates the chain. */
        if (header->next == 0) {
            break;
        }
        header = (struct vfio_info_cap_header *)(base + header->next);
    }
}
/*
 * Issues one VFIO_USER_DEVICE_GET_REGION_INFO request. @region_info is both
 * the request and the reply buffer, using region_info->argsz as the length of
 * each. @fds and @nr_fds are passed through to tran_sock_msg_fds() for any
 * file descriptors in the reply. Exits the process on failure.
 */
static void
do_get_device_region_info(int sock, struct vfio_region_info *region_info,
                          int *fds, size_t *nr_fds)
{
    if (tran_sock_msg_fds(sock, 0xabcd, VFIO_USER_DEVICE_GET_REGION_INFO,
                          region_info, region_info->argsz, NULL,
                          region_info, region_info->argsz, fds, nr_fds) < 0) {
        err(EXIT_FAILURE, "failed to get device region info");
    }
}
static void
mmap_sparse_areas(int fd, struct vfio_region_info *region_info,
struct vfio_region_info_cap_sparse_mmap *sparse)
{
size_t i;
for (i = 0; i < sparse->nr_areas; i++) {
ssize_t ret;
void *addr;
char pathname[PATH_MAX];
char buf[PATH_MAX] = "";
ret = snprintf(pathname, sizeof(pathname), "/proc/self/fd/%d", fd);
assert(ret != -1 && (size_t)ret < sizeof(pathname));
ret = readlink(pathname, buf, sizeof(buf) - 1);
if (ret == -1) {
err(EXIT_FAILURE, "failed to resolve file descriptor %d", fd);
}
addr = mmap(NULL, sparse->areas[i].size, PROT_READ | PROT_WRITE,
MAP_SHARED, fd, region_info->offset +
sparse->areas[i].offset);
if (addr == MAP_FAILED) {
err(EXIT_FAILURE,
"failed to mmap sparse region %zu in %s (%#llx-%#llx)",
i, buf, (ull_t)sparse->areas[i].offset,
(ull_t)sparse->areas[i].offset + sparse->areas[i].size - 1);
}
ret = munmap(addr, sparse->areas[i].size);
assert(ret == 0);
}
}
/*
 * Queries and prints the region info for region @index.
 *
 * A first request is made with a bare struct vfio_region_info. If the server
 * reports a larger argsz (capabilities follow the structure), the request is
 * repeated with a buffer of that size, this time also collecting file
 * descriptors. When capabilities are present they are walked with
 * get_region_vfio_caps(); a sparse mmap capability is expected exactly on
 * BAR1, with a single file descriptor, and its areas are test-mapped.
 *
 * Exits the process on allocation or protocol failure.
 */
static void
get_device_region_info(int sock, uint32_t index)
{
    struct vfio_region_info *region_info;
    size_t cap_sz;
    size_t size = sizeof(struct vfio_region_info);
    int fds[CLIENT_MAX_FDS] = { 0 };
    size_t nr_fds = ARRAY_SIZE(fds);

    region_info = calloc(1, size);
    if (region_info == NULL) {
        err(EXIT_FAILURE, "%m\n");
    }

    region_info->argsz = size;
    region_info->index = index;

    do_get_device_region_info(sock, region_info, NULL, NULL);
    if (region_info->argsz > size) {
        size = region_info->argsz;

        /* Release the probe buffer before replacing it with the full one. */
        free(region_info);
        region_info = calloc(1, size);
        if (region_info == NULL) {
            err(EXIT_FAILURE, "%m\n");
        }

        region_info->argsz = size;
        region_info->index = index;
        do_get_device_region_info(sock, region_info, fds, &nr_fds);
        assert(region_info->argsz == size);
    } else {
        /* No second request was made, so no descriptors were received. */
        nr_fds = 0;
    }

    cap_sz = region_info->argsz - sizeof(struct vfio_region_info);
    printf("client: %s: region_info[%d] offset %#llx flags %#x "
           "size %llu cap_sz %zu #FDs %zu\n", __func__, index,
           (ull_t)region_info->offset, region_info->flags,
           (ull_t)region_info->size, cap_sz,
           nr_fds);
    if (cap_sz) {
        struct vfio_region_info_cap_sparse_mmap *sparse = NULL;

        /* The capability chain starts right after the fixed structure. */
        get_region_vfio_caps((struct vfio_info_cap_header*)(region_info + 1),
                             &sparse);

        if (sparse != NULL) {
            assert(index == VFU_PCI_DEV_BAR1_REGION_IDX && nr_fds == 1);
            mmap_sparse_areas(fds[0], region_info, sparse);
        } else {
            assert(index != VFU_PCI_DEV_BAR1_REGION_IDX);
        }
    }
    free(region_info);
}
/*
 * Calls get_device_region_info() for every region index below
 * client_dev_info->num_regions.
 */
static void
get_device_regions_info(int sock, struct vfio_user_device_info *client_dev_info)
{
    unsigned int region = 0;

    while (region < client_dev_info->num_regions) {
        get_device_region_info(sock, region);
        region++;
    }
}
/*
 * Fetches the device info with VFIO_USER_DEVICE_GET_INFO into @dev_info and
 * prints it. Exits the process if the request fails or if the device does not
 * report exactly 9 regions.
 */
static void
get_device_info(int sock, struct vfio_user_device_info *dev_info)
{
    uint16_t id = 0xb10c;

    dev_info->argsz = sizeof(*dev_info);

    /* The same structure serves as request and reply buffer. */
    if (tran_sock_msg(sock, id,
                      VFIO_USER_DEVICE_GET_INFO,
                      dev_info, sizeof(*dev_info),
                      NULL,
                      dev_info, sizeof(*dev_info)) < 0) {
        err(EXIT_FAILURE, "failed to get device info");
    }

    if (dev_info->num_regions != 9) {
        errx(EXIT_FAILURE, "bad number of device regions %d",
             dev_info->num_regions);
    }

    printf("client: devinfo: flags %#x, num_regions %d, num_irqs %d\n",
           dev_info->flags, dev_info->num_regions, dev_info->num_irqs);
}
static int
configure_irqs(int sock)
{
struct iovec iovecs[2] = { { 0, } };
struct vfio_irq_set irq_set;
uint16_t msg_id = 0x1bad;
int irq_fd;
int i, ret;
for (i = 0; i < VFU_DEV_NUM_IRQS; i++) { /* TODO move body of loop into function */
struct vfio_irq_info vfio_irq_info = {
.argsz = sizeof(vfio_irq_info),
.index = i
};
ret = tran_sock_msg(sock, msg_id,
VFIO_USER_DEVICE_GET_IRQ_INFO,
&vfio_irq_info, sizeof(vfio_irq_info),
NULL,
&vfio_irq_info, sizeof(vfio_irq_info));
if (ret < 0) {
err(EXIT_FAILURE, "failed to get %s info", irq_to_str[i]);
}
if (vfio_irq_info.count > 0) {
printf("client: IRQ %s: count=%d flags=%#x\n",
irq_to_str[i], vfio_irq_info.count, vfio_irq_info.flags);
}
}
msg_id++;
irq_set.argsz = sizeof(irq_set);
irq_set.flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
irq_set.index = 0;
irq_set.start = 0;
irq_set.count = 1;
irq_fd = eventfd(0, 0);
if (irq_fd == -1) {
err(EXIT_FAILURE, "failed to create eventfd");
}
/* [0] is for the header. */
iovecs[1].iov_base = &irq_set;
iovecs[1].iov_len = sizeof(irq_set);
ret = tran_sock_msg_iovec(sock, msg_id, VFIO_USER_DEVICE_SET_IRQS,
iovecs, ARRAY_SIZE(iovecs),
&irq_fd, 1,
NULL, NULL, 0, NULL, 0);
if (ret < 0) {
err(EXIT_FAILURE, "failed to send configure IRQs message");
}
return irq_fd;
}
static int
access_region(int sock, int region, bool is_write, uint64_t offset,
void *data, size_t data_len)
{
static int msg_id = 0xf00f;
struct vfio_user_region_access send_region_access = {
.offset = offset,
.region = region,
.count = data_len
};
struct iovec send_iovecs[3] = {
[1] = {
.iov_base = &send_region_access,
.iov_len = sizeof(send_region_access)
},
[2] = {
.iov_base = data,
.iov_len = data_len
}
};
struct vfio_user_region_access *recv_data;
size_t nr_send_iovecs, recv_data_len;
int op, ret;
if (is_write) {
op = VFIO_USER_REGION_WRITE;
nr_send_iovecs = 3;
recv_data_len = sizeof(*recv_data);
} else {
op = VFIO_USER_REGION_READ;
nr_send_iovecs = 2;
recv_data_len = sizeof(*recv_data) + data_len;
}
recv_data = calloc(1, recv_data_len);
if (recv_data == NULL) {
err(EXIT_FAILURE, "failed to alloc recv_data");
}
pthread_mutex_lock(&mutex);
ret = tran_sock_msg_iovec(sock, msg_id--, op,
send_iovecs, nr_send_iovecs,
NULL, 0, NULL,
recv_data, recv_data_len, NULL, 0);
pthread_mutex_unlock(&mutex);
if (ret != 0) {
warn("failed to %s region %d %#llx-%#llx",
is_write ? "write to" : "read from", region,
(ull_t)offset,
(ull_t)(offset + data_len - 1));
free(recv_data);
return ret;
}
if (recv_data->count != data_len) {
warnx("bad %s data count, expected=%zu, actual=%d",
is_write ? "write" : "read", data_len,
recv_data->count);
free(recv_data);
errno = EINVAL;
return -1;
}
/*
* TODO we could avoid the memcpy if tran_sock_msg_iovec() received the
* response into an iovec, but it's some work to implement it.
*/
if (!is_write) {
memcpy(data, ((char *)recv_data) + sizeof(*recv_data), data_len);
}
free(recv_data);
return 0;
}
/*
 * Asks the server to move the device to migration state @state using a
 * VFIO_USER_DEVICE_FEATURE message that sets
 * VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE.
 *
 * The exchange is made while holding the global mutex. The reply is expected
 * to echo the request byte for byte. Exits the process if the exchange fails
 * or the reply differs from the request.
 *
 * @returns the (non-negative) value returned by tran_sock_msg_iovec()
 */
static int
set_migration_state(int sock, uint32_t state)
{
    static int msg_id = 0xfab1;
    struct vfio_user_device_feature req = {
        .argsz = sizeof(struct vfio_user_device_feature)
                 + sizeof(struct vfio_user_device_feature_mig_state),
        .flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE
    };
    struct vfio_user_device_feature_mig_state change_state = {
        .device_state = state,
        .data_fd = -1
    };
    /* send_iovecs[0] is left empty for the header. */
    struct iovec send_iovecs[3] = {
        [1] = {
            .iov_base = &req,
            .iov_len = sizeof(req)
        },
        [2] = {
            .iov_base = &change_state,
            .iov_len = sizeof(change_state)
        }
    };
    /* Fixed-size reply buffer: feature header followed by the state payload. */
    unsigned char response[sizeof(req) + sizeof(change_state)];
    int ret;

    pthread_mutex_lock(&mutex);
    ret = tran_sock_msg_iovec(sock, msg_id--, VFIO_USER_DEVICE_FEATURE,
                              send_iovecs, 3, NULL, 0, NULL,
                              response, sizeof(response),
                              NULL, 0);
    pthread_mutex_unlock(&mutex);

    if (ret < 0) {
        err(EXIT_FAILURE, "failed to set state: %d", ret);
    }

    /*
     * A mismatching reply is a protocol error, not a failed system call, so
     * report it with errx() rather than appending an unrelated errno string.
     */
    if (memcmp(&req, response, sizeof(req)) != 0) {
        errx(EXIT_FAILURE, "invalid response to set_migration_state (header)");
    }

    if (memcmp(&change_state, response + sizeof(req),
               sizeof(change_state)) != 0) {
        errx(EXIT_FAILURE, "invalid response to set_migration_state (payload)");
    }

    return ret;
}
/*
 * Reads up to @len bytes of migration data from the device into @buf with
 * VFIO_USER_MIG_DATA_READ.
 *
 * The exchange is made while holding the global mutex. Exits the process if
 * the reply buffer cannot be allocated, if the exchange fails, or if the
 * server claims to have returned more than @len bytes.
 *
 * @returns the number of bytes the server reported and that were copied to
 * @buf
 */
static ssize_t
read_migr_data(int sock, void *buf, size_t len)
{
    static int msg_id = 0x6904;
    struct vfio_user_mig_data req = {
        .argsz = sizeof(struct vfio_user_mig_data) + len,
        .size = len
    };
    /* send_iovecs[0] is left empty for the header. */
    struct iovec send_iovecs[2] = {
        [1] = {
            .iov_base = &req,
            .iov_len = sizeof(req)
        }
    };
    struct vfio_user_mig_data *res = calloc(1, sizeof(req) + len);
    ssize_t ret;
    ssize_t size;

    if (res == NULL) {
        err(EXIT_FAILURE, "failed to allocate migration data response");
    }

    pthread_mutex_lock(&mutex);
    ret = tran_sock_msg_iovec(sock, msg_id--, VFIO_USER_MIG_DATA_READ,
                              send_iovecs, 2, NULL, 0, NULL,
                              res, sizeof(req) + len, NULL, 0);
    pthread_mutex_unlock(&mutex);

    if (ret < 0) {
        err(EXIT_FAILURE, "failed to read migration data: %zd", ret);
    }

    /*
     * res->size comes from the server: never copy more than was requested,
     * which is all that both @buf and the reply buffer can hold.
     */
    if (res->size > len) {
        errx(EXIT_FAILURE,
             "server returned %zu bytes of migration data, only %zu requested",
             (size_t)res->size, len);
    }

    memcpy(buf, res->data, res->size);

    size = res->size;

    free(res);

    return size;
}
static ssize_t
write_migr_data(int sock, void *buf, size_t len)
{
static int msg_id = 0x2023;
struct vfio_user_mig_data req = {
.argsz = sizeof(struct vfio_user_mig_data) + len,
.size = len
};
struct iovec send_iovecs[3] = {
[1] = {
.iov_base = &req,
.iov_len = sizeof(req)
},
[2] = {
.iov_base = buf,
.iov_len = len
}
};
pthread_mutex_lock(&mutex);
ssize_t ret = tran_sock_msg_iovec(sock, msg_id--, VFIO_USER_MIG_DATA_WRITE,
send_iovecs, 3, NULL, 0, NULL,
&req, sizeof(req), NULL, 0);
pthread_mutex_unlock(&mutex);
return ret;
}
/*
 * Writes the time value *@t to offset 0 of BAR0, then reads the same
 * location back into *@t, printing the value after each step. Exits the
 * process if either access fails.
 */
static void
access_bar0(int sock, time_t *t)
{
    int rc;

    assert(t != NULL);

    rc = access_region(sock, VFU_PCI_DEV_BAR0_REGION_IDX, true, 0, t, sizeof(*t));
    if (rc < 0) {
        err(EXIT_FAILURE, "failed to write to BAR0");
    }
    printf("client: wrote to BAR0: %ld\n", *t);

    rc = access_region(sock, VFU_PCI_DEV_BAR0_REGION_IDX, false, 0, t, sizeof(*t));
    if (rc < 0) {
        err(EXIT_FAILURE, "failed to read from BAR0");
    }
    printf("client: read from BAR0: %ld\n", *t);
}
/*
 * Blocks in read(2) on @irq_fd until it returns, then reports that INTx was
 * triggered. Exits the process if the read fails.
 */
static void
wait_for_irq(int irq_fd)
{
    uint64_t counter;
    ssize_t nread = read(irq_fd, &counter, sizeof(counter));

    if (nread == -1) {
        err(EXIT_FAILURE, "failed to read from irqfd");
    }

    printf("client: INTx triggered!\n");
}
static void
handle_dma_write(int sock, struct client_dma_region *dma_regions,
int nr_dma_regions)
{
struct vfio_user_dma_region_access dma_access;
struct vfio_user_header hdr;
int ret, i;
size_t size = sizeof(dma_access);
uint16_t msg_id = 0xcafe;
void *data;
ret = tran_sock_recv(sock, &hdr, false, &msg_id, &dma_access, &size);
if (ret < 0) {
err(EXIT_FAILURE, "failed to receive DMA read");
}
data = calloc(dma_access.count, 1);
if (data == NULL) {
err(EXIT_FAILURE, NULL);
}
if (recv(sock, data, dma_access.count, 0) == -1) {
err(EXIT_FAILURE, "failed to receive DMA read data");
}
for (i = 0; i < nr_dma_regions; i++) {
off_t offset;
ssize_t c;
if (dma_access.addr < dma_regions[i].map.addr ||
dma_access.addr >= dma_regions[i].map.addr + dma_regions[i].map.size) {
continue;
}
offset = dma_regions[i].map.offset + dma_access.addr;
c = pwrite(dma_regions[i].fd, data, dma_access.count, offset);
if (c != (ssize_t)dma_access.count) {
err(EXIT_FAILURE, "failed to write to fd=%d at [%#llx-%#llx)",
dma_regions[i].fd, (ull_t)offset,
(ull_t)(offset + dma_access.count));
}
/*
* DMA regions in this example are one page in size so we use one bit
* to mark the newly-dirtied page as dirty.
*/
if (dma_regions[i].flags & CLIENT_DIRTY_PAGE_TRACKING_ENABLED) {
assert(dma_regions[i].map.size == PAGE_SIZE);
dma_regions[i].flags |= CLIENT_DIRTY_DMA_REGION;
}
break;
}
assert(i != nr_dma_regions);
ret = tran_sock_send(sock, msg_id, true, VFIO_USER_DMA_WRITE,
&dma_access, sizeof(dma_access));
if (ret < 0) {
err(EXIT_FAILURE, "failed to send reply of DMA write");
}
free(data);
}
static void
handle_dma_read(int sock, struct client_dma_region *dma_regions,
int nr_dma_regions)
{
struct vfio_user_dma_region_access dma_access, *response;
struct vfio_user_header hdr;
int ret, i, response_sz;
size_t size = sizeof(dma_access);
uint16_t msg_id = 0xcafe;
void *data;
ret = tran_sock_recv(sock, &hdr, false, &msg_id, &dma_access, &size);
if (ret < 0) {
err(EXIT_FAILURE, "failed to receive DMA read");
}
response_sz = sizeof(dma_access) + dma_access.count;
response = calloc(response_sz, 1);
if (response == NULL) {
err(EXIT_FAILURE, NULL);
}
response->addr = dma_access.addr;
response->count = dma_access.count;
data = (char *)response->data;
for (i = 0; i < nr_dma_regions; i++) {
off_t offset;
ssize_t c;
if (dma_access.addr < dma_regions[i].map.addr ||
dma_access.addr >= dma_regions[i].map.addr + dma_regions[i].map.size) {
continue;
}
offset = dma_regions[i].map.offset + dma_access.addr;
c = pread(dma_regions[i].fd, data, dma_access.count, offset);
if (c != (ssize_t)dma_access.count) {
err(EXIT_FAILURE, "failed to read from fd=%d at [%#llx-%#llx)",
dma_regions[i].fd, (ull_t)offset,
(ull_t)offset + dma_access.count);
}
break;
}
assert(i != nr_dma_regions);
ret = tran_sock_send(sock, msg_id, true, VFIO_USER_DMA_READ,
response, response_sz);
if (ret < 0) {
err(EXIT_FAILURE, "failed to send reply of DMA read");
}
free(response);
}
/*
 * Services the DMA traffic for one 4096-byte transfer in each direction: the
 * transfer arrives in chunks of CLIENT_MAX_DATA_XFER_SIZE bytes, so that many
 * write requests are handled first, followed by the same number of read
 * requests.
 */
static void
handle_dma_io(int sock, struct client_dma_region *dma_regions,
              int nr_dma_regions)
{
    size_t remaining;

    for (remaining = 4096 / CLIENT_MAX_DATA_XFER_SIZE; remaining > 0;
         remaining--) {
        handle_dma_write(sock, dma_regions, nr_dma_regions);
    }

    for (remaining = 4096 / CLIENT_MAX_DATA_XFER_SIZE; remaining > 0;
         remaining--) {
        handle_dma_read(sock, dma_regions, nr_dma_regions);
    }
}
/*
 * Fetches the server's dirty page bitmap for @dma_region with a
 * VFIO_USER_DEVICE_FEATURE request (DMA logging report, GET) and prints
 * whether the region is dirty.
 *
 * The region counts as dirty if the first byte of the returned bitmap is
 * non-zero or if the client itself set CLIENT_DIRTY_DMA_REGION. When
 * @expect_dirty is true the region is asserted to be dirty. Exits the process
 * if the request fails.
 */
static void
get_dirty_bitmap(int sock, struct client_dma_region *dma_region,
                 bool expect_dirty)
{
    struct vfio_user_device_feature *feature;
    struct vfio_user_device_feature_dma_logging_report *report;
    long page_size = sysconf(_SC_PAGESIZE);
    uint64_t bitmap_size = get_bitmap_size(dma_region->map.size, page_size);
    size_t request_len = sizeof(*feature) + sizeof(*report);
    size_t total_len = request_len + bitmap_size;
    char *buf = calloc(1, total_len);
    char *bitmap;
    char dirtied_by_server, dirtied_by_client, dirtied;
    int rc;

    assert(buf != NULL);

    /* Layout: feature header, logging report, then the bitmap itself. */
    feature = (struct vfio_user_device_feature *)buf;
    report = (struct vfio_user_device_feature_dma_logging_report *)(feature + 1);
    bitmap = buf + request_len;

    feature->flags = VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT
                     | VFIO_DEVICE_FEATURE_GET;
    feature->argsz = total_len;

    report->iova = dma_region->map.addr;
    report->length = dma_region->map.size;
    report->page_size = page_size;

    /* Only the headers are sent; the reply fills the whole buffer. */
    rc = tran_sock_msg(sock, 0x99, VFIO_USER_DEVICE_FEATURE,
                       buf, request_len,
                       NULL, buf, total_len);
    if (rc != 0) {
        err(EXIT_FAILURE, "failed to get dirty page bitmap");
    }

    dirtied_by_server = bitmap[0];
    dirtied_by_client = (dma_region->flags & CLIENT_DIRTY_DMA_REGION) != 0;
    dirtied = dirtied_by_server | dirtied_by_client;

    if (expect_dirty) {
        assert(dirtied);
    }

    printf("client: %s: %#llx-%#llx\t%#x\n", __func__,
           (ull_t)report->iova,
           (ull_t)(report->iova + report->length - 1), dirtied);

    free(buf);
}
/* Prints the command-line synopsis to stderr, using the base name of @argv0. */
static void
usage(char *argv0)
{
    char *prog = basename(argv0);

    fprintf(stderr, "Usage: %s [-h] [-m src|dst] /path/to/socket\n", prog);
}
/*
 * Normally each time the source client (QEMU) would read migration data from
 * the device it would send them to the destination client. However, since in
 * our sample both the source and the destination client are the same process,
 * we simply accumulate the migration data of each iteration and apply it to
 * the destination server at the end.
 *
 * Performs as many migration reads as @nr_iters, or until the device has no
 * more migration data (a read returns zero bytes), whichever comes first.
 * Each read asks for up to @max_iter_size bytes. The result of each iteration
 * is stored in @migr_iter, which must have room for at least @nr_iters
 * entries; the buffers in the returned entries are heap-allocated and owned
 * by the caller.
 *
 * Exits the process on allocation or read failure.
 *
 * @returns the number of iterations that produced data
 */
static size_t
do_migrate(int sock, size_t nr_iters, size_t max_iter_size,
           struct iovec *migr_iter)
{
    ssize_t ret;
    size_t i = 0;

    for (i = 0; i < nr_iters; i++) {
        void *buf = malloc(max_iter_size);

        if (buf == NULL) {
            err(EXIT_FAILURE, "failed to allocate migration buffer");
        }

        /* XXX read migration data */
        ret = read_migr_data(sock, buf, max_iter_size);
        if (ret < 0) {
            err(EXIT_FAILURE, "failed to read migration data");
        }

        // We know we've finished transferring data when we read 0 bytes.
        if (ret == 0) {
            /*
             * This entry is not counted in the return value, so the caller
             * will never free it: release the unused buffer here.
             */
            free(buf);
            migr_iter[i].iov_base = NULL;
            migr_iter[i].iov_len = 0;
            break;
        }

        migr_iter[i].iov_base = buf;
        migr_iter[i].iov_len = ret;
    }
    return i;
}
/* Arguments for, and result of, the fake_guest() thread. */
struct fake_guest_data {
/* Socket the thread uses for its region writes. */
int sock;
/* Number of bytes written to BAR1 on every pass. */
size_t bar1_size;
/* Set by migrate_from() to make the thread stop after its current pass. */
bool done;
/* Where the thread stores the CRC of the last buffer it wrote. */
uint32_t *crcp;
};
/*
 * Thread body that plays the part of a running guest during migration.
 *
 * Repeatedly fills a bar1_size buffer from /dev/urandom and writes it to
 * offset 0 of region 1 (BAR1), until the `done` flag in @arg (a
 * struct fake_guest_data) is seen set after a write. The CRC of the last
 * buffer written is then stored through `crcp`.
 *
 * Exits the process on any failure.
 *
 * @returns NULL
 */
static void *
fake_guest(void *arg)
{
    struct fake_guest_data *fake_guest_data = arg;
    int ret;
    char buf[fake_guest_data->bar1_size];
    FILE *fp = fopen("/dev/urandom", "r");

    if (fp == NULL) {
        err(EXIT_FAILURE, "failed to open /dev/urandom");
    }

    do {
        size_t nr_items = fread(buf, fake_guest_data->bar1_size, 1, fp);

        if (nr_items != 1) {
            errx(EXIT_FAILURE, "short read %zu", nr_items);
        }
        ret = access_region(fake_guest_data->sock, 1, true, 0, buf,
                            fake_guest_data->bar1_size);
        if (ret != 0) {
            err(EXIT_FAILURE, "fake guest failed to write garbage to BAR1");
        }
    } while (!fake_guest_data->done);

    /* The random source is no longer needed; do not leak the stream. */
    fclose(fp);

    *fake_guest_data->crcp = rte_hash_crc(buf, fake_guest_data->bar1_size, 0);

    return NULL;
}
/*
 * Migrates the device state out of the source server.
 *
 * A fake guest thread keeps writing random data to BAR1 while the device is
 * put in the pre-copy state and a first pass of migration data is read. The
 * thread is then stopped (it stores the CRC of its last write through
 * @crcp), the device is moved to stop-and-copy, and a second pass is read;
 * only the data of this second pass is handed back. Finally the device is
 * moved to the stop state.
 *
 * @nr_iters:   set to the number of entries allocated in *@migr_iters
 * @migr_iters: set to a heap-allocated array of iovecs holding the migration
 *              data; the array and the buffers are owned by the caller
 * @bar1_size, @max_iter_size: size of BAR1 and the largest read to issue; the
 *              asserts below expect these to yield 12 and then 13 iterations
 *
 * Exits the process on any failure.
 *
 * @returns the number of iterations performed in the stop-and-copy pass
 */
static size_t
migrate_from(int sock, size_t *nr_iters, struct iovec **migr_iters,
             uint32_t *crcp, size_t bar1_size, size_t max_iter_size)
{
    size_t expected_data;
    uint32_t device_state;
    size_t iters;
    size_t i;
    int ret;
    pthread_t thread;
    struct fake_guest_data fake_guest_data = {
        .sock = sock,
        .bar1_size = bar1_size,
        .done = false,
        .crcp = crcp
    };

    ret = pthread_create(&thread, NULL, fake_guest, &fake_guest_data);
    if (ret != 0) {
        errno = ret;
        err(EXIT_FAILURE, "failed to create pthread");
    }

    expected_data = bar1_size;
    *nr_iters = (expected_data + max_iter_size - 1) / max_iter_size;
    assert(*nr_iters == 12);
    *migr_iters = malloc(sizeof(struct iovec) * *nr_iters);
    if (*migr_iters == NULL) {
        err(EXIT_FAILURE, NULL);
    }

    /*
     * XXX set device state to pre-copy. This is technically optional but any
     * VMM that cares about performance needs this.
     */
    device_state = VFIO_USER_DEVICE_STATE_PRE_COPY;
    ret = set_migration_state(sock, device_state);
    if (ret < 0) {
        err(EXIT_FAILURE, "failed to write to device state");
    }

    iters = do_migrate(sock, *nr_iters, max_iter_size, *migr_iters);
    assert(iters == *nr_iters);

    printf("client: stopping fake guest thread\n");
    fake_guest_data.done = true;
    __sync_synchronize();
    ret = pthread_join(thread, NULL);
    if (ret != 0) {
        errno = ret;
        err(EXIT_FAILURE, "failed to join fake guest pthread");
    }

    printf("client: setting device state to stop-and-copy\n");

    device_state = VFIO_USER_DEVICE_STATE_STOP_COPY;
    ret = set_migration_state(sock, device_state);
    if (ret < 0) {
        err(EXIT_FAILURE, "failed to write to device state");
    }

    expected_data = bar1_size + sizeof(time_t);
    *nr_iters = (expected_data + max_iter_size - 1) / max_iter_size;
    assert(*nr_iters == 13);

    /*
     * The pre-copy data is discarded: free the per-iteration buffers along
     * with the array, otherwise they are leaked.
     */
    for (i = 0; i < iters; i++) {
        free((*migr_iters)[i].iov_base);
    }
    free(*migr_iters);

    *migr_iters = malloc(sizeof(struct iovec) * *nr_iters);
    if (*migr_iters == NULL) {
        err(EXIT_FAILURE, NULL);
    }

    iters = do_migrate(sock, *nr_iters, max_iter_size, *migr_iters);
    assert(iters == *nr_iters);

    /* XXX read device state, migration must have finished now */
    device_state = VFIO_USER_DEVICE_STATE_STOP;
    ret = set_migration_state(sock, device_state);
    if (ret < 0) {
        err(EXIT_FAILURE, "failed to write to device state");
    }

    return iters;
}
/*
 * Starts a destination server and restores previously read migration data
 * into it.
 *
 * @old_sock_path: socket path of the source server; the destination socket
 *                 path is derived from it by appending "_migrated"
 * @server_max_fds, @server_max_data_xfer_size, @pgsize: filled in by
 *                 negotiate() for the destination server
 * @nr_iters, @migr_iters: the migration data, sent one entry at a time with
 *                 write_migr_data()
 * @path_to_server: program executed as the destination server, with the
 *                 arguments "-v" and the destination socket path
 * @src_crc:       CRC that the first @bar1_size bytes of region 1 must match
 *                 after the data has been applied (abort() otherwise)
 * @bar1_size:     number of bytes of region 1 to read back and verify
 *
 * Exits the process on any failure.
 *
 * @returns the socket connected to the destination server
 */
static int
migrate_to(char *old_sock_path, int *server_max_fds,
size_t *server_max_data_xfer_size, size_t *pgsize, size_t nr_iters,
struct iovec *migr_iters, char *path_to_server,
uint32_t src_crc, size_t bar1_size)
{
ssize_t ret;
int sock;
char *sock_path;
struct stat sb;
uint32_t device_state = VFIO_USER_DEVICE_STATE_RESUMING;
size_t i;
uint32_t dst_crc;
/* Receives the contents of BAR1 read back from the destination server. */
char buf[bar1_size];
assert(old_sock_path != NULL);
printf("client: starting destination server\n");
ret = asprintf(&sock_path, "%s_migrated", old_sock_path);
if (ret == -1) {
err(EXIT_FAILURE, "failed to asprintf");
}
ret = fork();
if (ret == -1) {
err(EXIT_FAILURE, "failed to fork");
}
/*
 * NOTE(review): fork() returns a positive PID in the parent, so it is the
 * original process that execs the destination server below, while the newly
 * forked process carries on as the client. This is the opposite of what the
 * original child/parent comments said - confirm that this is intended.
 */
if (ret > 0) { /* this process becomes the destination server */
char *_argv[] = {
path_to_server,
(char *)"-v",
sock_path,
NULL
};
ret = execvp(_argv[0] , _argv);
/* execvp() only returns on failure. */
if (ret != 0) {
err(EXIT_FAILURE, "failed to start destination server (%s)",
path_to_server);
}
}
/* from here on: the process that continues as the client */
/* wait for the server to come up (busy-polls until the socket path exists) */
while (stat(sock_path, &sb) == -1) {
if (errno != ENOENT) {
err(EXIT_FAILURE, "failed to stat %s", sock_path);
}
}
if ((sb.st_mode & S_IFMT) != S_IFSOCK) {
errx(EXIT_FAILURE, "%s: not a socket", sock_path);
}
/* connect to the destination server */
sock = init_sock(sock_path);
free(sock_path);
negotiate(sock, server_max_fds, server_max_data_xfer_size, pgsize);
/* put the device in the resuming state before feeding it migration data */
device_state = VFIO_USER_DEVICE_STATE_RESUMING;
ret = set_migration_state(sock, device_state);
if (ret < 0) {
err(EXIT_FAILURE, "failed to set device state to resuming");
}
for (i = 0; i < nr_iters; i++) {
/* XXX write migration data */
ret = write_migr_data(sock, migr_iters[i].iov_base,
migr_iters[i].iov_len);
if (ret < 0) {
err(EXIT_FAILURE, "failed to write device migration data");
}
}
/* XXX set device state to stop to finish the transfer */
device_state = VFIO_USER_DEVICE_STATE_STOP;
ret = set_migration_state(sock, device_state);
if (ret < 0) {
err(EXIT_FAILURE, "failed to set device state to stop");
}
/* validate contents of BAR1 */
if (access_region(sock, 1, false, 0, buf, bar1_size) != 0) {
err(EXIT_FAILURE, "failed to read BAR1");
}
dst_crc = rte_hash_crc(buf, bar1_size, 0);
if (dst_crc != src_crc) {
fprintf(stderr, "client: CRC mismatch: %u != %u\n", src_crc, dst_crc);
abort();
}
/* XXX set device state to running */
device_state = VFIO_USER_DEVICE_STATE_RUNNING;
ret = set_migration_state(sock, device_state);
if (ret < 0) {
err(EXIT_FAILURE, "failed to set device state to running");
}
return sock;
}
/*
 * Offers each of the @nr_dma_regions regions in @dma_regions to the server
 * with a VFIO_USER_DMA_MAP message, passing the region's file descriptor
 * along. Exits the process if any request fails.
 */
static void
map_dma_regions(int sock, struct client_dma_region *dma_regions,
                int nr_dma_regions)
{
    int idx;

    for (idx = 0; idx < nr_dma_regions; idx++) {
        struct client_dma_region *region = &dma_regions[idx];
        /* iov[0] is left empty for the header. */
        struct iovec iov[2] = {
            [1] = {
                .iov_base = &region->map,
                .iov_len = sizeof(struct vfio_user_dma_map)
            }
        };
        int rc;

        /* Every region goes out under its own message ID. */
        rc = tran_sock_msg_iovec(sock, 0x1234 + idx, VFIO_USER_DMA_MAP,
                                 iov, ARRAY_SIZE(iov),
                                 &region->fd, 1,
                                 NULL, NULL, 0, NULL, 0);
        if (rc < 0) {
            err(EXIT_FAILURE, "failed to map DMA regions");
        }
    }
}
/*
 * Sample vfio-user client.
 *
 * Usage: client [-h] /path/to/socket
 *
 * Connects to the server at the given socket and walks through the protocol:
 * version negotiation, device and region info, a configuration space read, a
 * device reset, DMA mapping, IRQ configuration, dirty page logging, BAR0
 * accesses and server-initiated DMA. It then migrates the device to a second
 * server, which it starts itself from the "server" binary located next to
 * this executable, and repeats the IRQ and DMA steps against it.
 *
 * Any failure terminates the process with a diagnostic.
 */
int main(int argc, char *argv[])
{
    char template[] = "/tmp/libvfio-user.XXXXXX";
    int ret, sock, irq_fd;
    struct client_dma_region *dma_regions;
    struct vfio_user_device_info client_dev_info = {0};
    int i;
    int tmpfd;
    int server_max_fds;
    size_t server_max_data_xfer_size;
    size_t pgsize;
    int nr_dma_regions;
    int opt;
    time_t t;
    char *path_to_server = NULL;
    vfu_pci_hdr_t config_space;
    struct iovec *migr_iters;
    size_t nr_iters;
    uint32_t crc;
    size_t bar1_size = 0x3000; /* FIXME get this value from region info */

    struct vfio_user_device_feature *dirty_pages_feature;
    struct vfio_user_device_feature_dma_logging_control *dirty_pages_control;
    size_t dirty_pages_size = sizeof(*dirty_pages_feature) +
                              sizeof(*dirty_pages_control);
    /*
     * Zero-initialised so that no uninitialised bytes are sent to the server
     * in the fields that are not explicitly set below.
     */
    void *dirty_pages = calloc(1, dirty_pages_size);

    if (dirty_pages == NULL) {
        err(EXIT_FAILURE, "failed to allocate dirty page logging request");
    }
    dirty_pages_feature = dirty_pages;
    dirty_pages_control = (void *)(dirty_pages_feature + 1);

    while ((opt = getopt(argc, argv, "h")) != -1) {
        switch (opt) {
            case 'h':
                usage(argv[0]);
                exit(EXIT_SUCCESS);
            default:
                usage(argv[0]);
                exit(EXIT_FAILURE);
        }
    }

    /* Exactly one positional argument is expected: the socket path. */
    if (argc != optind + 1) {
        usage(argv[0]);
        exit(EXIT_FAILURE);
    }

    sock = init_sock(argv[optind]);

    /*
     * VFIO_USER_VERSION
     *
     * Do initial negotiation with the server, and discover parameters.
     */
    negotiate(sock, &server_max_fds, &server_max_data_xfer_size, &pgsize);

    /* try to access a bogus region, we should get an error */
    ret = access_region(sock, 0xdeadbeef, false, 0, &ret, sizeof(ret));
    if (ret != -1 || errno != EINVAL) {
        errx(EXIT_FAILURE,
             "expected EINVAL accessing bogus region, got %d instead", errno);
    }

    /* XXX VFIO_USER_DEVICE_GET_INFO */
    get_device_info(sock, &client_dev_info);

    /* VFIO_USER_DEVICE_GET_REGION_INFO */
    get_device_regions_info(sock, &client_dev_info);

    ret = access_region(sock, VFU_PCI_DEV_CFG_REGION_IDX, false, 0, &config_space,
                        sizeof(config_space));
    if (ret < 0) {
        err(EXIT_FAILURE, "failed to read PCI configuration space");
    }

    /* The IDs the sample server is expected to expose. */
    assert(config_space.id.vid == 0xdead);
    assert(config_space.id.did == 0xbeef);
    assert(config_space.ss.vid == 0xcafe);
    assert(config_space.ss.sid == 0xbabe);

    /* XXX VFIO_USER_DEVICE_RESET */
    send_device_reset(sock);

    /*
     * XXX VFIO_USER_DMA_MAP
     *
     * Tell the server we have some DMA regions it can access.
     */
    nr_dma_regions = server_max_fds << 1;

    umask(0022);

    /* One backing file, one page per DMA region. */
    if ((tmpfd = mkstemp(template)) == -1) {
        err(EXIT_FAILURE, "failed to create backing file");
    }

    if ((ret = ftruncate(tmpfd, nr_dma_regions * sysconf(_SC_PAGESIZE))) == -1) {
        err(EXIT_FAILURE, "failed to truncate file");
    }

    /* The file stays reachable through tmpfd after its name is removed. */
    unlink(template);

    dma_regions = calloc(nr_dma_regions, sizeof(*dma_regions));
    if (dma_regions == NULL) {
        err(EXIT_FAILURE, "%m\n");
    }

    for (i = 0; i < nr_dma_regions; i++) {
        dma_regions[i].map.argsz = sizeof(struct vfio_user_dma_map);
        dma_regions[i].map.addr = i * sysconf(_SC_PAGESIZE);
        dma_regions[i].map.size = sysconf(_SC_PAGESIZE);
        dma_regions[i].map.offset = dma_regions[i].map.addr;
        dma_regions[i].map.flags = VFIO_USER_F_DMA_REGION_READ |
                                   VFIO_USER_F_DMA_REGION_WRITE;
        dma_regions[i].fd = tmpfd;
    }

    map_dma_regions(sock, dma_regions, nr_dma_regions);

    /*
     * XXX VFIO_USER_DEVICE_GET_IRQ_INFO and VFIO_IRQ_SET_ACTION_TRIGGER
     * Query interrupts and configure an eventfd to be associated with INTx.
     */
    irq_fd = configure_irqs(sock);

    /* start dirty pages logging */
    dirty_pages_feature->argsz = sizeof(*dirty_pages_feature) +
                                 sizeof(*dirty_pages_control);
    dirty_pages_feature->flags = VFIO_DEVICE_FEATURE_DMA_LOGGING_START |
                                 VFIO_DEVICE_FEATURE_SET;
    dirty_pages_control->num_ranges = 0;
    dirty_pages_control->page_size = sysconf(_SC_PAGESIZE);

    ret = tran_sock_msg(sock, 0, VFIO_USER_DEVICE_FEATURE, dirty_pages,
                        dirty_pages_size, NULL, dirty_pages, dirty_pages_size);
    if (ret != 0) {
        err(EXIT_FAILURE, "failed to start dirty page logging");
    }

    /*
     * Start client-side dirty page tracking (which happens in
     * `handle_dma_write` when writes are successful).
     */
    for (i = 0; i < nr_dma_regions; i++) {
        dma_regions[i].flags |= CLIENT_DIRTY_PAGE_TRACKING_ENABLED;
    }

    /*
     * XXX VFIO_USER_REGION_READ and VFIO_USER_REGION_WRITE
     *
     * BAR0 in the server does not support memory mapping so it must be accessed
     * via explicit messages.
     */
    t = time(NULL) + 1;
    access_bar0(sock, &t);

    wait_for_irq(irq_fd);

    /* FIXME check that above took at least 1s */

    handle_dma_io(sock, dma_regions, nr_dma_regions);

    for (i = 0; i < nr_dma_regions; i++) {
        /*
         * We expect regions 0 and 1 to be dirtied: 0 through messages (so
         * marked by the client) and 1 directly (so marked by the server). See
         * the bottom of the main function of server.c.
         */
        get_dirty_bitmap(sock, &dma_regions[i], i < 2);
    }

    /* stop logging dirty pages */
    dirty_pages_feature->argsz = sizeof(*dirty_pages_feature) +
                                 sizeof(*dirty_pages_control);
    dirty_pages_feature->flags = VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP |
                                 VFIO_DEVICE_FEATURE_SET;
    dirty_pages_control->num_ranges = 0;
    dirty_pages_control->page_size = sysconf(_SC_PAGESIZE);

    ret = tran_sock_msg(sock, 0, VFIO_USER_DEVICE_FEATURE, dirty_pages,
                        dirty_pages_size, NULL, dirty_pages, dirty_pages_size);
    if (ret != 0) {
        err(EXIT_FAILURE, "failed to stop dirty page logging");
    }

    /* Stop client-side dirty page tracking */
    for (i = 0; i < nr_dma_regions; i++) {
        dma_regions[i].flags &= ~CLIENT_DIRTY_PAGE_TRACKING_ENABLED;
    }

    /* BAR1 can be memory mapped and read directly */

    /*
     * XXX VFIO_USER_DMA_UNMAP
     *
     * unmap the first group of the DMA regions
     */
    for (i = 0; i < server_max_fds; i++) {
        struct vfio_user_dma_unmap r = {
            .argsz = sizeof(r),
            .addr = dma_regions[i].map.addr,
            .size = dma_regions[i].map.size
        };
        ret = tran_sock_msg(sock, 7, VFIO_USER_DMA_UNMAP, &r, sizeof(r),
                            NULL, &r, sizeof(r));
        if (ret < 0) {
            err(EXIT_FAILURE, "failed to unmap DMA region");
        }
    }

    /*
     * Schedule an interrupt in 10 seconds from now in the old server and then
     * immediately migrate the device. The new server should deliver the
     * interrupt. Hopefully 10 seconds should be enough for migration to finish.
     * TODO make this value a command line option.
     */
    t = time(NULL) + 10;
    ret = access_region(sock, VFU_PCI_DEV_BAR0_REGION_IDX, true, 0, &t, sizeof(t));
    if (ret < 0) {
        err(EXIT_FAILURE, "failed to write to BAR0");
    }

    nr_iters = migrate_from(sock, &nr_iters, &migr_iters, &crc, bar1_size,
                   MIN(server_max_data_xfer_size, CLIENT_MAX_DATA_XFER_SIZE));

    /*
     * Normally the client would now send the device state to the destination
     * client and then exit. We don't demonstrate how this works as this is a
     * client implementation detail. Instead, the client starts the destination
     * server and then applies the migration data.
     */
    if (asprintf(&path_to_server, "%s/server", dirname(argv[0])) == -1) {
        err(EXIT_FAILURE, "failed to asprintf");
    }

    sock = migrate_to(argv[optind], &server_max_fds, &server_max_data_xfer_size,
                      &pgsize, nr_iters, migr_iters, path_to_server,
                      crc, bar1_size);
    free(path_to_server);
    for (i = 0; i < (int)nr_iters; i++) {
        free(migr_iters[i].iov_base);
    }
    free(migr_iters);

    /*
     * Now we must reconfigure the destination server.
     */

    /*
     * XXX reconfigure DMA regions, note that the first half of them has been
     * unmapped.
     */
    map_dma_regions(sock, dma_regions + server_max_fds,
                    nr_dma_regions - server_max_fds);

    /*
     * XXX reconfigure IRQs.
     * FIXME is this something the client needs to do? I would expect so since
     * it's the client that creates and provides the FD. Do we need to save some
     * state in the migration data?
     */
    irq_fd = configure_irqs(sock);

    wait_for_irq(irq_fd);

    handle_dma_io(sock, dma_regions + server_max_fds,
                  nr_dma_regions - server_max_fds);

    /* Finally drop every remaining DMA mapping in one request. */
    struct vfio_user_dma_unmap r = {
        .argsz = sizeof(r),
        .addr = 0,
        .size = 0,
        .flags = VFIO_DMA_UNMAP_FLAG_ALL
    };
    ret = tran_sock_msg(sock, 8, VFIO_USER_DMA_UNMAP, &r, sizeof(r),
                        NULL, &r, sizeof(r));
    if (ret < 0) {
        err(EXIT_FAILURE, "failed to unmap all DMA regions");
    }

    free(dma_regions);
    free(dirty_pages);

    return 0;
}
/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */