| /* |
| * Copyright (c) 2019 Nutanix Inc. All rights reserved. |
| * |
| * Authors: Mike Cui <cui@nutanix.com> |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * * Neither the name of Nutanix nor the names of its contributors may be |
| * used to endorse or promote products derived from this software without |
| * specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY |
| * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
| * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
| * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH |
| * DAMAGE. |
| * |
| */ |
| |
| #ifndef LIB_VFIO_USER_DMA_H |
| #define LIB_VFIO_USER_DMA_H |
| |
| /* |
| * FIXME check whether DMA regions must be page aligned. If so then the |
 * implementation can be greatly simplified.
| */ |
| |
| /* |
| * This library emulates a DMA controller for a device emulation application to |
| * perform DMA operations on a foreign memory space. |
| * |
| * Concepts: |
| * - A DMA controller has its own 64-bit DMA address space. |
| * - Foreign memory is made available to the DMA controller in linear chunks |
| * called memory regions. |
| * - Each memory region is backed by a file descriptor and |
| * is registered with the DMA controllers at a unique, non-overlapping |
| * linear span of the DMA address space. |
| * - To perform DMA, the application should first build a scatter-gather |
| * list (sgl) of dma_sg_t from DMA addresses. Then the sgl |
| * can be mapped using dma_sgl_get() into the process's virtual address space |
| * as an iovec for direct access, and unmapped using dma_sgl_put() when done. |
| * Every region is mapped into the application's virtual address space |
| * at registration time with R/W permissions. |
| * dma_sgl_get() ignores all protection bits and only does lookups and |
| * returns pointers to the previously mapped regions. dma_sgl_put() is |
| * effectively a no-op. |
| */ |
| |
| #include <stdio.h> |
| #ifdef DMA_MAP_PROTECTED |
| #undef DMA_MAP_FAST |
| #define DMA_MAP_FAST_IMPL 0 |
| #else |
| #define DMA_MAP_FAST_IMPL 1 |
| #endif |
| |
| #include <assert.h> |
| #include <sys/types.h> |
| #include <sys/uio.h> |
| #include <sys/mman.h> |
| #include <stdint.h> |
| #include <stdlib.h> |
| #include <limits.h> |
| #include <errno.h> |
| #include <sys/queue.h> |
| |
| #include "libvfio-user.h" |
| #include "common.h" |
| #include "private.h" |
| |
| #define iov_end(iov) ((iov)->iov_base + (iov)->iov_len) |
| |
| struct vfu_ctx; |
| |
| struct dma_sg { |
| vfu_dma_addr_t dma_addr; |
| int region; |
| uint64_t length; |
| uint64_t offset; |
| bool writeable; |
| }; |
| |
| typedef struct { |
| vfu_dma_info_t info; |
| int fd; // File descriptor to mmap |
| off_t offset; // File offset |
| uint8_t *dirty_bitmap; // Dirty page bitmap |
| } dma_memory_region_t; |
| |
| typedef struct dma_controller { |
| int max_regions; |
| size_t max_size; |
| int nregions; |
| struct vfu_ctx *vfu_ctx; |
| size_t dirty_pgsize; // Dirty page granularity |
    dma_memory_region_t regions[];
| } dma_controller_t; |
| |
| dma_controller_t * |
| dma_controller_create(vfu_ctx_t *vfu_ctx, size_t max_regions, size_t max_size); |
| |
| void |
| dma_controller_remove_all_regions(dma_controller_t *dma, |
| vfu_dma_unregister_cb_t *dma_unregister, |
| void *data); |
| |
| void |
| dma_controller_destroy(dma_controller_t *dma); |
| |
| /* Registers a new memory region. |
| * Returns: |
| * - On success, a non-negative region number |
| * - On failure, -1 with errno set. |
| */ |
| MOCK_DECLARE(int, dma_controller_add_region, dma_controller_t *dma, |
| vfu_dma_addr_t dma_addr, uint64_t size, int fd, off_t offset, |
| uint32_t prot); |
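
/*
 * Illustrative sketch of controller setup; the region size, DMA address, fd
 * and limits below are arbitrary example values, not requirements of this
 * API:
 *
 *     dma_controller_t *dma = dma_controller_create(vfu_ctx, 16, 0x4000000);
 *     if (dma == NULL) {
 *         return -1;
 *     }
 *     // Register 2 MiB of memory backed by `fd` at DMA address 0x10000000.
 *     int region = dma_controller_add_region(dma, (vfu_dma_addr_t)0x10000000,
 *                                            0x200000, fd, 0,
 *                                            PROT_READ | PROT_WRITE);
 *     if (region < 0) {
 *         return -1;
 *     }
 */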
| |
| MOCK_DECLARE(int, dma_controller_remove_region, dma_controller_t *dma, |
| vfu_dma_addr_t dma_addr, size_t size, |
| vfu_dma_unregister_cb_t *dma_unregister, void *data); |
| |
| MOCK_DECLARE(void, dma_controller_unmap_region, dma_controller_t *dma, |
| dma_memory_region_t *region); |
| |
| // Helper for dma_addr_to_sgl() slow path. |
| int |
| _dma_addr_sg_split(const dma_controller_t *dma, |
| vfu_dma_addr_t dma_addr, uint64_t len, |
| dma_sg_t *sg, int max_nr_sgs, int prot); |
| |
/* Convert a start address and length to the range of containing pages:
 * *pgstart is the first page of the range, *pgend is one past the last page
 * (exclusive upper bound).
 */
| static inline void |
| range_to_pages(size_t start, size_t len, size_t pgsize, |
| size_t *pgstart, size_t *pgend) |
| { |
| *pgstart = start / pgsize; |
| *pgend = ROUND_UP(start + len, pgsize) / pgsize; |
| } |
| |
/* Given a bit position, return the index of the byte containing it. */
| static inline size_t |
| bit_to_u8(size_t val) |
| { |
| return val / (CHAR_BIT); |
| } |
| |
/* Given a bit position, return its offset within the containing byte
 * (i.e. the value modulo CHAR_BIT).
 */
| static inline size_t |
| bit_to_u8off(size_t val) |
| { |
| return val % (CHAR_BIT); |
| } |
| |
| static inline void |
| _dma_mark_dirty(const dma_controller_t *dma, const dma_memory_region_t *region, |
| dma_sg_t *sg) |
| { |
| size_t index; |
| size_t end; |
| size_t pgstart; |
| size_t pgend; |
| size_t i; |
| |
| assert(dma != NULL); |
| assert(region != NULL); |
| assert(sg != NULL); |
| assert(region->dirty_bitmap != NULL); |
| |
| range_to_pages(sg->offset, sg->length, dma->dirty_pgsize, |
| &pgstart, &pgend); |
| |
| index = bit_to_u8(pgstart); |
| end = bit_to_u8(pgend) + !!(bit_to_u8off(pgend)); |
| |
| for (i = index; i < end; i++) { |
| uint8_t bm = ~0; |
| |
| /* Mask off any pages in the first u8 that aren't in the range. */ |
| if (i == index && bit_to_u8off(pgstart) != 0) { |
| bm &= ~((1 << bit_to_u8off(pgstart)) - 1); |
| } |
| |
| /* Mask off any pages in the last u8 that aren't in the range. */ |
| if (i == end - 1 && bit_to_u8off(pgend) != 0) { |
| bm &= ((1 << bit_to_u8off(pgend)) - 1); |
| } |
| |
| __atomic_or_fetch(®ion->dirty_bitmap[i], bm, __ATOMIC_SEQ_CST); |
| } |
| } |
| |
| static inline int |
| dma_init_sg(const dma_controller_t *dma, dma_sg_t *sg, vfu_dma_addr_t dma_addr, |
| uint64_t len, int prot, int region_index) |
| { |
| const dma_memory_region_t *const region = &dma->regions[region_index]; |
| |
| if ((prot & PROT_WRITE) && !(region->info.prot & PROT_WRITE)) { |
| vfu_log(dma->vfu_ctx, LOG_DEBUG, "read-only region"); |
| return ERROR_INT(EACCES); |
| } |
| |
| sg->dma_addr = region->info.iova.iov_base; |
| sg->region = region_index; |
| sg->offset = dma_addr - region->info.iova.iov_base; |
| sg->length = len; |
| sg->writeable = prot & PROT_WRITE; |
| |
| return 0; |
| } |
| |
/* Takes a linear DMA address span and returns a scatter-gather list suitable
 * for DMA. A single linear DMA address span may need to be split into
 * multiple scatter-gather entries due to limitations of how memory can be
 * mapped.
 *
 * Returns:
 * - On success, the number of scatter-gather entries created.
 * - On failure:
 *   -1 if the DMA address span is invalid, or on a protection violation
 *      (errno is set to EACCES);
 *   (-x - 1) if @max_nr_sgs is too small, where x is the number of sg entries
 *   necessary to complete this request.
 */
| static inline int |
| dma_addr_to_sgl(const dma_controller_t *dma, |
| vfu_dma_addr_t dma_addr, size_t len, |
| dma_sg_t *sgl, size_t max_nr_sgs, int prot) |
| { |
| static __thread int region_hint; |
| int cnt, ret; |
| |
| const dma_memory_region_t *const region = &dma->regions[region_hint]; |
| const void *region_end = iov_end(®ion->info.iova); |
| |
| // Fast path: single region. |
| if (likely(max_nr_sgs > 0 && len > 0 && |
| dma_addr >= region->info.iova.iov_base && |
| dma_addr + len <= region_end && |
| region_hint < dma->nregions)) { |
| ret = dma_init_sg(dma, sgl, dma_addr, len, prot, region_hint); |
| if (ret < 0) { |
| return ret; |
| } |
| |
| return 1; |
| } |
| // Slow path: search through regions. |
| cnt = _dma_addr_sg_split(dma, dma_addr, len, sgl, max_nr_sgs, prot); |
| if (likely(cnt > 0)) { |
| region_hint = sgl[0].region; |
| } |
| return cnt; |
| } |
| |
| static inline int |
| dma_sgl_get(dma_controller_t *dma, dma_sg_t *sgl, struct iovec *iov, size_t cnt) |
| { |
| dma_memory_region_t *region; |
| dma_sg_t *sg; |
| |
| assert(dma != NULL); |
| assert(sgl != NULL); |
| assert(iov != NULL); |
| assert(cnt > 0); |
| |
| sg = sgl; |
| |
| do { |
| if (sg->region >= dma->nregions) { |
| return ERROR_INT(EINVAL); |
| } |
| region = &dma->regions[sg->region]; |
| |
| if (region->info.vaddr == NULL) { |
| return ERROR_INT(EFAULT); |
| } |
| |
| #ifdef DEBUG_SGL |
| vfu_log(dma->vfu_ctx, LOG_DEBUG, "map %p-%p", |
| sg->dma_addr + sg->offset, |
| sg->dma_addr + sg->offset + sg->length); |
| |
| #endif |
| |
| iov->iov_base = region->info.vaddr + sg->offset; |
| iov->iov_len = sg->length; |
| |
| sg++; |
| iov++; |
| } while (--cnt > 0); |
| |
| return 0; |
| } |
| |
| static inline void |
| dma_sgl_mark_dirty(dma_controller_t *dma, dma_sg_t *sgl, size_t cnt) |
| { |
| dma_memory_region_t *region; |
| dma_sg_t *sg; |
| |
| assert(dma != NULL); |
| assert(sgl != NULL); |
| assert(cnt > 0); |
| |
| sg = sgl; |
| |
| do { |
| if (sg->region >= dma->nregions) { |
| return; |
| } |
| |
| region = &dma->regions[sg->region]; |
| |
| if (sg->writeable) { |
| if (dma->dirty_pgsize > 0) { |
| _dma_mark_dirty(dma, region, sg); |
| } |
| } |
| |
| #ifdef DEBUG_SGL |
| vfu_log(dma->vfu_ctx, LOG_DEBUG, "mark dirty %p-%p", |
| sg->dma_addr + sg->offset, |
| sg->dma_addr + sg->offset + sg->length); |
| #endif |
| |
| sg++; |
| } while (--cnt > 0); |
| } |
| |
| static inline void |
| dma_sgl_put(dma_controller_t *dma, dma_sg_t *sgl, size_t cnt) |
| { |
| dma_memory_region_t *region; |
| dma_sg_t *sg; |
| |
| assert(dma != NULL); |
| assert(sgl != NULL); |
| assert(cnt > 0); |
| |
| sg = sgl; |
| |
| do { |
| if (sg->region >= dma->nregions) { |
| return; |
| } |
| |
| region = &dma->regions[sg->region]; |
| |
| if (sg->writeable) { |
| if (dma->dirty_pgsize > 0) { |
| _dma_mark_dirty(dma, region, sg); |
| } |
| } |
| |
| #ifdef DEBUG_SGL |
| vfu_log(dma->vfu_ctx, LOG_DEBUG, "unmap %p-%p", |
| sg->dma_addr + sg->offset, |
| sg->dma_addr + sg->offset + sg->length); |
| #endif |
| |
| sg++; |
| } while (--cnt > 0); |
| } |
| |
| int |
| dma_controller_dirty_page_logging_start(dma_controller_t *dma, size_t pgsize); |
| |
| void |
| dma_controller_dirty_page_logging_stop(dma_controller_t *dma); |
| |
| int |
| dma_controller_dirty_page_get(dma_controller_t *dma, vfu_dma_addr_t addr, |
| uint64_t len, size_t pgsize, size_t size, |
| char *bitmap); |
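
/*
 * Illustrative sketch of the dirty page tracking flow; the 4 KiB page size
 * and a page-aligned `addr`/`len` are assumptions of the example, not
 * requirements stated here:
 *
 *     if (dma_controller_dirty_page_logging_start(dma, 4096) < 0) {
 *         return -1;
 *     }
 *     // ... device performs writes via dma_sgl_get()/dma_sgl_put() ...
 *     size_t bitmap_size = (len / 4096 + CHAR_BIT - 1) / CHAR_BIT;
 *     char *bitmap = calloc(1, bitmap_size);
 *     if (bitmap != NULL &&
 *         dma_controller_dirty_page_get(dma, addr, len, 4096,
 *                                       bitmap_size, bitmap) == 0) {
 *         // each set bit marks a dirtied 4 KiB page starting at `addr`
 *     }
 *     dma_controller_dirty_page_logging_stop(dma);
 */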
| |
| bool |
| dma_sg_is_mappable(const dma_controller_t *dma, const dma_sg_t *sg); |
| |
| |
| #endif /* LIB_VFIO_USER_DMA_H */ |
| |
| /* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ |