/*
* Sample server to be tested with samples/client.c
*
* Copyright (c) 2020, Nutanix Inc. All rights reserved.
* Author: Thanos Makatos <thanos@nutanix.com>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Nutanix nor the names of its contributors may be
* used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*
*/
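/*
 * Typical usage (binary and socket paths are illustrative):
 *
 *   ./server [-v] /tmp/vfio-user.sock
 *
 * then run the client built from samples/client.c against the same socket.
 */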
#include <stdio.h>
#include <err.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <signal.h>
#include <errno.h>
#include <time.h>
#include <assert.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/time.h>
#include "common.h"
#include "libvfio-user.h"
#include "rte_hash_crc.h"
struct dma_regions {
struct iovec iova;
uint32_t prot;
};
#define NR_DMA_REGIONS 96
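/*
 * Per-device state: BAR0 stores the timer deadline, BAR1 is a file-backed
 * memory region, plus the client's registered DMA regions and migration
 * progress.
 */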
struct server_data {
time_t bar0;
void *bar1;
size_t bar1_size;
struct dma_regions regions[NR_DMA_REGIONS];
struct {
uint64_t bytes_transferred;
vfu_migr_state_t state;
} migration;
};
static void
_log(vfu_ctx_t *vfu_ctx UNUSED, int level UNUSED, char const *msg)
{
fprintf(stderr, "server[%d]: %s\n", getpid(), msg);
}
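/* Arm a one-shot SIGALRM to fire when the wall clock reaches time t. */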
static int
arm_timer(vfu_ctx_t *vfu_ctx, time_t t)
{
struct itimerval new = {.it_value.tv_sec = t - time(NULL) };
vfu_log(vfu_ctx, LOG_DEBUG, "arming timer to trigger in %ld seconds",
new.it_value.tv_sec);
if (setitimer(ITIMER_REAL, &new, NULL) != 0) {
vfu_log(vfu_ctx, LOG_ERR, "failed to arm timer: %m");
return -1;
}
return 0;
}
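/*
 * BAR0 models a single time_t register: a write arms the timer (while the
 * device is running), a read returns the seconds elapsed since the value
 * last written.
 */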
static ssize_t
bar0_access(vfu_ctx_t *vfu_ctx, char * const buf, size_t count, loff_t offset,
const bool is_write)
{
struct server_data *server_data = vfu_get_private(vfu_ctx);
if (count != sizeof(time_t) || offset != 0) {
vfu_log(vfu_ctx, LOG_ERR, "bad BAR0 access %#llx-%#llx",
(unsigned long long)offset,
(unsigned long long)offset + count - 1);
errno = EINVAL;
return -1;
}
if (is_write) {
if (server_data->migration.state == VFU_MIGR_STATE_RUNNING) {
int ret = arm_timer(vfu_ctx, *(time_t*)buf);
if (ret < 0) {
return ret;
}
}
memcpy(&server_data->bar0, buf, count);
} else {
time_t delta = time(NULL) - server_data->bar0;
memcpy(buf, &delta, count);
}
return count;
}
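/* BAR1 is plain memory; accesses are bounds-checked against the region size. */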
static ssize_t
bar1_access(vfu_ctx_t *vfu_ctx, char * const buf,
size_t count, loff_t offset,
const bool is_write)
{
struct server_data *server_data = vfu_get_private(vfu_ctx);
if (offset + count > server_data->bar1_size) {
vfu_log(vfu_ctx, LOG_ERR, "bad BAR1 access %#llx-%#llx",
(unsigned long long)offset,
(unsigned long long)offset + count - 1);
errno = EINVAL;
return -1;
}
if (is_write) {
memcpy(server_data->bar1 + offset, buf, count);
} else {
        memcpy(buf, server_data->bar1 + offset, count);
}
return count;
}
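/*
 * SIGALRM handler: just raises a flag and preserves errno, as an
 * async-signal handler should; the main loop does the actual IRQ trigger.
 */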
static volatile sig_atomic_t irq_triggered = false;
static void
_sa_handler(int signum)
{
int _errno = errno;
if (signum == SIGALRM) {
irq_triggered = true;
}
errno = _errno;
}
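/* Record a newly mapped DMA region in the first free slot. */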
static void
dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
{
struct server_data *server_data = vfu_get_private(vfu_ctx);
int idx;
for (idx = 0; idx < NR_DMA_REGIONS; idx++) {
if (server_data->regions[idx].iova.iov_base == NULL &&
server_data->regions[idx].iova.iov_len == 0)
break;
}
if (idx >= NR_DMA_REGIONS) {
errx(EXIT_FAILURE, "Failed to add dma region, slots full");
}
server_data->regions[idx].iova = info->iova;
server_data->regions[idx].prot = info->prot;
}
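/* Drop the matching DMA region when the client unmaps it. */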
static void
dma_unregister(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
{
struct server_data *server_data = vfu_get_private(vfu_ctx);
int idx;
for (idx = 0; idx < NR_DMA_REGIONS; idx++) {
if (server_data->regions[idx].iova.iov_len == info->iova.iov_len &&
server_data->regions[idx].iova.iov_base == info->iova.iov_base) {
server_data->regions[idx].iova.iov_base = NULL;
server_data->regions[idx].iova.iov_len = 0;
}
}
}
/*
 * Does DMA writes and then reads on the given region, either via explicit
 * vfio-user messages or by directly accessing mapped client memory,
 * depending on use_messages.
 *
 * FIXME: message-based I/O should be exercised on a region that is not
 * memory mappable, or on an area of a region that is not sparsely memory
 * mappable.
 */
static void
do_dma_io(vfu_ctx_t *vfu_ctx, struct server_data *server_data,
          int region, bool use_messages)
{
const int size = 1024;
const int count = 4;
unsigned char buf[size * count];
uint32_t crc1, crc2;
dma_sg_t *sg;
void *addr;
int ret;
sg = alloca(dma_sg_size());
assert(vfu_ctx != NULL);
struct iovec iov = {0};
/* Write some data, chunked into multiple calls to exercise offsets. */
for (int i = 0; i < count; ++i) {
addr = server_data->regions[region].iova.iov_base + i * size;
ret = vfu_addr_to_sgl(vfu_ctx, (vfu_dma_addr_t)addr, size, sg, 1,
PROT_WRITE);
if (ret < 0) {
err(EXIT_FAILURE, "failed to map %p-%p", addr, addr + size - 1);
}
memset(&buf[i * size], 'A' + i, size);
if (use_messages) {
vfu_log(vfu_ctx, LOG_DEBUG, "%s: MESSAGE WRITE addr %p size %d",
__func__, addr, size);
ret = vfu_sgl_write(vfu_ctx, sg, 1, &buf[i * size]);
if (ret < 0) {
err(EXIT_FAILURE, "vfu_sgl_write failed");
}
} else {
vfu_log(vfu_ctx, LOG_DEBUG, "%s: DIRECT WRITE addr %p size %d",
__func__, addr, size);
ret = vfu_sgl_get(vfu_ctx, sg, &iov, 1, 0);
if (ret < 0) {
err(EXIT_FAILURE, "vfu_sgl_get failed");
}
assert(iov.iov_len == (size_t)size);
memcpy(iov.iov_base, &buf[i * size], size);
/*
* When directly writing to client memory the server is responsible
* for tracking dirty pages. We assert that all dirty writes are
* within the first page of region 1. In fact, all regions are only
* one page in size.
*
* Note: this is not strictly necessary in this example, since we
* later call `vfu_sgl_put`, which marks pages dirty if the SGL was
* acquired with `PROT_WRITE`. However, `vfu_sgl_mark_dirty` is
* useful in cases where the server needs to mark guest memory dirty
* without releasing the memory with `vfu_sgl_put`.
*/
vfu_sgl_mark_dirty(vfu_ctx, sg, 1);
assert(region == 1);
assert(i * size < (int)PAGE_SIZE);
vfu_sgl_put(vfu_ctx, sg, &iov, 1);
}
}
crc1 = rte_hash_crc(buf, sizeof(buf), 0);
/* Read the data back at double the chunk size. */
memset(buf, 0, sizeof(buf));
for (int i = 0; i < count; i += 2) {
addr = server_data->regions[region].iova.iov_base + i * size;
ret = vfu_addr_to_sgl(vfu_ctx, (vfu_dma_addr_t)addr, size * 2, sg, 1,
PROT_READ);
if (ret < 0) {
err(EXIT_FAILURE, "failed to map %p-%p", addr, addr + 2 * size - 1);
}
if (use_messages) {
vfu_log(vfu_ctx, LOG_DEBUG, "%s: MESSAGE READ addr %p size %d",
__func__, addr, 2 * size);
ret = vfu_sgl_read(vfu_ctx, sg, 1, &buf[i * size]);
if (ret < 0) {
err(EXIT_FAILURE, "vfu_sgl_read failed");
}
} else {
vfu_log(vfu_ctx, LOG_DEBUG, "%s: DIRECT READ addr %p size %d",
__func__, addr, 2 * size);
ret = vfu_sgl_get(vfu_ctx, sg, &iov, 1, 0);
if (ret < 0) {
err(EXIT_FAILURE, "vfu_sgl_get failed");
}
assert(iov.iov_len == 2 * (size_t)size);
memcpy(&buf[i * size], iov.iov_base, 2 * size);
vfu_sgl_put(vfu_ctx, sg, &iov, 1);
}
}
crc2 = rte_hash_crc(buf, sizeof(buf), 0);
if (crc1 != crc2) {
errx(EXIT_FAILURE, "DMA write and DMA read mismatch");
} else {
vfu_log(vfu_ctx, LOG_DEBUG, "%s: %s success", __func__,
use_messages ? "MESSAGE" : "DIRECT");
}
}
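/* Device reset callback: this sample only logs the event. */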
static int
device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type UNUSED)
{
vfu_log(vfu_ctx, LOG_DEBUG, "device reset callback");
return 0;
}
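/*
 * Migration state transitions: the timer is paused on entering stop-and-copy
 * and re-armed on the destination once all device state has been received.
 */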
static int
migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state)
{
struct server_data *server_data = vfu_get_private(vfu_ctx);
int ret;
struct itimerval new = { { 0 }, };
vfu_log(vfu_ctx, LOG_DEBUG, "migration: transition to device state %d",
state);
switch (state) {
case VFU_MIGR_STATE_STOP_AND_COPY:
vfu_log(vfu_ctx, LOG_DEBUG, "disable timer");
if (setitimer(ITIMER_REAL, &new, NULL) != 0) {
err(EXIT_FAILURE, "failed to disable timer");
}
server_data->migration.bytes_transferred = 0;
break;
case VFU_MIGR_STATE_PRE_COPY:
server_data->migration.bytes_transferred = 0;
break;
case VFU_MIGR_STATE_STOP:
/* FIXME should gracefully fail */
if (server_data->migration.state == VFU_MIGR_STATE_STOP_AND_COPY) {
assert(server_data->migration.bytes_transferred ==
server_data->bar1_size + sizeof(time_t));
}
break;
case VFU_MIGR_STATE_RESUME:
server_data->migration.bytes_transferred = 0;
break;
case VFU_MIGR_STATE_RUNNING:
assert(server_data->migration.bytes_transferred ==
server_data->bar1_size + sizeof(time_t));
ret = arm_timer(vfu_ctx, server_data->bar0);
if (ret < 0) {
return ret;
}
break;
default:
assert(false); /* FIXME */
}
server_data->migration.state = state;
return 0;
}
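/*
 * Serialize device state for the migration stream: BAR1 first, then BAR0
 * (stop-and-copy only). Returns the number of bytes produced, or 0 when
 * there is nothing left to read.
 */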
static ssize_t
migration_read_data(vfu_ctx_t *vfu_ctx, void *buf, uint64_t size)
{
struct server_data *server_data = vfu_get_private(vfu_ctx);
    /*
     * In the pre-copy state we copy only BAR1; in the stop-and-copy state we
     * copy both BAR1 and BAR0. Since BAR1 is copied again during
     * stop-and-copy, copying it during pre-copy is redundant; avoiding that
     * would require dirty-state tracking beyond the scope of this sample.
     */
uint32_t total_to_read = server_data->bar1_size;
if (server_data->migration.state == VFU_MIGR_STATE_STOP_AND_COPY) {
total_to_read += sizeof(server_data->bar0);
}
if (server_data->migration.bytes_transferred == total_to_read || size == 0) {
vfu_log(vfu_ctx, LOG_DEBUG, "no data left to read");
return 0;
}
uint32_t read_start = server_data->migration.bytes_transferred;
uint32_t read_end = MIN(read_start + size, total_to_read);
assert(read_end > read_start);
uint32_t bytes_read = read_end - read_start;
uint32_t length_in_bar1 = 0;
uint32_t length_in_bar0 = 0;
/* read bar1, if any */
if (read_start < server_data->bar1_size) {
length_in_bar1 = MIN(bytes_read, server_data->bar1_size - read_start);
memcpy(buf, server_data->bar1 + read_start, length_in_bar1);
read_start += length_in_bar1;
}
/* read bar0, if any */
if (read_end > server_data->bar1_size) {
length_in_bar0 = read_end - read_start;
read_start -= server_data->bar1_size;
memcpy(buf + length_in_bar1, (char *)&server_data->bar0 + read_start,
length_in_bar0);
}
server_data->migration.bytes_transferred += bytes_read;
return bytes_read;
}
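/*
 * Deserialize migrated device state on the destination, using the same
 * layout as migration_read_data: BAR1 first, then BAR0.
 */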
static ssize_t
migration_write_data(vfu_ctx_t *vfu_ctx, void *data, uint64_t size)
{
struct server_data *server_data = vfu_get_private(vfu_ctx);
char *buf = data;
assert(server_data != NULL);
assert(data != NULL);
uint32_t total_to_write = server_data->bar1_size + sizeof(server_data->bar0);
if (server_data->migration.bytes_transferred == total_to_write || size == 0) {
return 0;
}
uint32_t write_start = server_data->migration.bytes_transferred;
    uint32_t write_end = MIN(write_start + size, total_to_write); /* exclusive */
assert(write_end > write_start);
uint32_t bytes_written = write_end - write_start;
uint32_t length_in_bar1 = 0;
uint32_t length_in_bar0 = 0;
/* write to bar1, if any */
if (write_start < server_data->bar1_size) {
length_in_bar1 = MIN(bytes_written, server_data->bar1_size - write_start);
memcpy(server_data->bar1 + write_start, buf, length_in_bar1);
write_start += length_in_bar1;
}
/* write to bar0, if any */
if (write_end > server_data->bar1_size) {
length_in_bar0 = write_end - write_start;
write_start -= server_data->bar1_size;
memcpy((char *)&server_data->bar0 + write_start, buf + length_in_bar1,
length_in_bar0);
}
server_data->migration.bytes_transferred += bytes_written;
return bytes_written;
}
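/*
 * Build up the device (BARs, migration, reset, DMA and IRQ callbacks), then
 * serve vfio-user requests until the client disconnects. A SIGALRM from the
 * armed timer interrupts vfu_run_ctx() and triggers INTx plus some sample
 * DMA.
 */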
int
main(int argc, char *argv[])
{
char template[] = "/tmp/libvfio-user.XXXXXX";
int ret;
bool verbose = false;
int opt;
struct sigaction act = {.sa_handler = _sa_handler};
const size_t bar1_size = 0x3000;
struct server_data server_data = {
.migration = {
.state = VFU_MIGR_STATE_RUNNING
}
};
vfu_ctx_t *vfu_ctx;
vfu_trans_t trans = VFU_TRANS_SOCK;
int tmpfd;
const vfu_migration_callbacks_t migr_callbacks = {
.version = VFU_MIGR_CALLBACKS_VERS,
.transition = &migration_device_state_transition,
.read_data = &migration_read_data,
.write_data = &migration_write_data
};
while ((opt = getopt(argc, argv, "v")) != -1) {
switch (opt) {
case 'v':
verbose = true;
break;
default: /* '?' */
errx(EXIT_FAILURE, "Usage: %s [-v] <socketpath>", argv[0]);
}
}
if (optind >= argc) {
errx(EXIT_FAILURE, "missing vfio-user socket path");
}
sigemptyset(&act.sa_mask);
if (sigaction(SIGALRM, &act, NULL) == -1) {
err(EXIT_FAILURE, "failed to register signal handler");
}
if (strcmp(argv[optind], "pipe") == 0) {
trans = VFU_TRANS_PIPE;
}
vfu_ctx = vfu_create_ctx(trans, argv[optind], 0, &server_data,
VFU_DEV_TYPE_PCI);
if (vfu_ctx == NULL) {
err(EXIT_FAILURE, "failed to initialize device emulation");
}
ret = vfu_setup_log(vfu_ctx, _log, verbose ? LOG_DEBUG : LOG_ERR);
if (ret < 0) {
err(EXIT_FAILURE, "failed to setup log");
}
ret = vfu_pci_init(vfu_ctx, VFU_PCI_TYPE_CONVENTIONAL,
PCI_HEADER_TYPE_NORMAL, 0);
if (ret < 0) {
err(EXIT_FAILURE, "vfu_pci_init() failed") ;
}
vfu_pci_set_id(vfu_ctx, 0xdead, 0xbeef, 0xcafe, 0xbabe);
ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX, sizeof(time_t),
&bar0_access, VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
if (ret < 0) {
err(EXIT_FAILURE, "failed to setup BAR0 region");
}
umask(0022);
    /*
     * Set up BAR1 as three pages, of which only the first and the last are
     * mappable. The client can still mmap the second page; Linux gives us no
     * way to prohibit that short of backing the region with separate files.
     */
if ((tmpfd = mkstemp(template)) == -1) {
err(EXIT_FAILURE, "failed to create backing file");
}
unlink(template);
server_data.bar1_size = bar1_size;
if (ftruncate(tmpfd, server_data.bar1_size) == -1) {
err(EXIT_FAILURE, "failed to truncate backing file");
}
server_data.bar1 = mmap(NULL, server_data.bar1_size, PROT_READ | PROT_WRITE,
MAP_SHARED, tmpfd, 0);
if (server_data.bar1 == MAP_FAILED) {
err(EXIT_FAILURE, "failed to mmap BAR1");
}
struct iovec bar1_mmap_areas[] = {
{ .iov_base = (void*)0, .iov_len = 0x1000 },
{ .iov_base = (void*)0x2000, .iov_len = 0x1000 }
};
ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR1_REGION_IDX,
server_data.bar1_size, &bar1_access,
VFU_REGION_FLAG_RW, bar1_mmap_areas, 2,
tmpfd, 0);
if (ret < 0) {
err(EXIT_FAILURE, "failed to setup BAR1 region");
}
ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks);
if (ret < 0) {
err(EXIT_FAILURE, "failed to setup device migration");
}
ret = vfu_setup_device_reset_cb(vfu_ctx, &device_reset);
if (ret < 0) {
err(EXIT_FAILURE, "failed to setup device reset callbacks");
}
ret = vfu_setup_device_dma(vfu_ctx, &dma_register, &dma_unregister);
if (ret < 0) {
err(EXIT_FAILURE, "failed to setup device DMA callbacks");
}
ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
if (ret < 0) {
err(EXIT_FAILURE, "failed to setup irq counts");
}
ret = vfu_realize_ctx(vfu_ctx);
if (ret < 0) {
err(EXIT_FAILURE, "failed to realize device");
}
ret = vfu_attach_ctx(vfu_ctx);
if (ret < 0) {
err(EXIT_FAILURE, "failed to attach device");
}
do {
ret = vfu_run_ctx(vfu_ctx);
if (ret == -1 && errno == EINTR) {
if (irq_triggered) {
irq_triggered = false;
ret = vfu_irq_trigger(vfu_ctx, 0);
if (ret < 0) {
err(EXIT_FAILURE, "vfu_irq_trigger() failed");
}
printf("doing dma io\n");
            /*
             * We initiate some dummy DMA by directly accessing the client's
             * memory. In this case we track dirty pages ourselves, since the
             * client has no knowledge of what we wrote to its memory, or
             * when.
             */
do_dma_io(vfu_ctx, &server_data, 1, false);
/*
* We also do some dummy DMA via explicit messages to show how
* DMA is done if the client's RAM isn't mappable or the server
* implementation prefers it this way. In this case, the client
* is responsible for tracking pages that are dirtied, as it is
* the one actually performing the writes.
*/
do_dma_io(vfu_ctx, &server_data, 0, true);
ret = 0;
}
}
} while (ret == 0);
if (ret == -1 &&
errno != ENOTCONN && errno != EINTR && errno != ESHUTDOWN) {
        err(EXIT_FAILURE, "failed to run device emulation");
}
vfu_destroy_ctx(vfu_ctx);
return EXIT_SUCCESS;
}
/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */