blob: 343f2c46d14a5814e12bdb44411238c82692f939 [file]
/*
* Copyright (c) Meta Platforms, Inc. and affiliates
*
* Authors: Mattias Nissler <mnissler@meta.com>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Nutanix nor the names of its contributors may be
* used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*
*/
#include <assert.h>
#include <byteswap.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <unistd.h>
#ifdef HAVE_LINUX_KCMP_H
#include <linux/kcmp.h>
#endif
#include "btree.h"
#include "common.h"
#include "fd_cache.h"
/*
* The file descriptor cache is a global B-tree, protected by a mutex.
*
* Note that libvfio-user's general approach to thread safety is to assume
* a given vfu_ctx_t is operated on a single thread only or protected by
* appropriate locking by the caller (with a noteworthy exception for DMA via
* some vfu_sgl_* APIs). However, since file descriptors are a per-process
* resource, we maintain a process-global file descriptor cache and protect it
* with a mutex in order to keep keep multiple vfu_ctx_t independent and avoid
* imposing locking requirements across contexts.
*/
static btree_t cache_tree;
static pthread_mutex_t cache_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Not static so the unit test can access these for fallback testing. */
bool kcmp_checked;
bool kcmp_available;
struct fd_cache_entry {
int fd;
dev_t dev;
ino_t ino;
int flags;
int refcount;
};
#ifdef HAVE_LINUX_KCMP_H
static int
kcmp(pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2)
{
return syscall(SYS_kcmp, pid1, pid2, type, idx1, idx2);
}
/* Note that this is called under cache_mutex, thus guaranteeing once-only init. */
static bool
is_kcmp_available(int fd)
{
if (!kcmp_checked) {
pid_t pid = getpid();
kcmp_available = kcmp(pid, pid, KCMP_FILE, fd, fd) == 0;
kcmp_checked = true;
}
return kcmp_available;
}
#else
#define is_kcmp_available(fd) (false)
#endif
static int
is_same_file(const struct fd_cache_entry *entry1,
const struct fd_cache_entry *entry2)
{
#ifdef HAVE_LINUX_KCMP_H
if (is_kcmp_available(entry1->fd)) {
pid_t pid = getpid();
return kcmp(pid, pid, KCMP_FILE, entry1->fd, entry2->fd);
}
#endif
/*
* We prefer using kcmp for detecting file descriptor duplicates.
*
* Unfortunately, it's not available on all kernel versions/configurations,
* and Docker's default seccomp policy blocks it. Thus we fall back to
* comparing device and inode numbers as well as flags bits. This considers
* file descriptors equivalent if they are referring to the same file on
* disk (as per device/inode numbers) and were opened in identical mode
* (such as read/write, as determined by flags).
*
* Note that this fallback isn't a perfect replacement for kcmp, as it will
* consider file descriptors identical even though they originate from
* separate open() calls as long as their parameters match. That's OK
* though, as we really only care about the targets. Specifically, the file
* positions aren't of relevance due to using pread/pwrite with explicit
* offsets.
*/
return !(entry1->dev == entry2->dev && entry1->ino == entry2->ino &&
entry1->flags == entry2->flags);
}
static int
fd_cache_entry_init(int fd, struct fd_cache_entry *entry, bool *accurate)
{
struct stat st;
int flags;
if (fstat(fd, &st) != 0 || (flags = fcntl(fd, F_GETFL)) == -1) {
return -1;
}
entry->fd = fd;
entry->dev = st.st_dev;
entry->ino = st.st_ino;
entry->flags = flags;
entry->refcount = 0;
/*
* Indicate whether fd equivalence testing will work accurately for the fd.
* This is trivially true when kcmp is available. When operating in
* fallback mode, declare that non-regular files can't be accurately
* compared. This is to exclude most "exotic" file descriptors such as
* eventfd, etc. The reason is that some of these don't have unique inode
* numbers, which would cause false positives in (dev, inode)-based
* duplicate detection.
*/
*accurate = is_kcmp_available(fd) || S_ISREG(st.st_mode);
return 0;
}
static uintptr_t
fd_cache_entry_compute_key(const struct fd_cache_entry *entry)
{
/*
* Compute a key from device and inode numbers. Note that the key value
* isn't unique per file descriptor, for example in case a file gets
* open()-ed multiple times. The value is expected to be unique per file
* file though, as long as the computed keys don't happen to collide.
* Either way, identical keys are tolerable thanks to the cache B-Tree
* being able to handle multiple entries with the same key, at least as
* long as the number of identical keys doesn't grow too large.
*
* Given that device and inode numbers are usually only using a lower
* portion of a 64 bit integer's range, we byte-swap one of the numbers in
* an effort to reduce the likelihood of key collisions.
*/
return entry->ino ^ (bswap_64(entry->dev) >>
((sizeof(uint64_t) - sizeof(uintptr_t)) * CHAR_BIT));
}
static int
fd_cache_lookup(const struct fd_cache_entry *query, btree_iter_t *iter,
struct fd_cache_entry **entry)
{
uintptr_t query_key = fd_cache_entry_compute_key(query);
uintptr_t key;
int ret;
/*
* Note that the key is not guaranteed to be unique: Collisions can happen
* if we have multiple independent file descriptors (not just duplicates)
* for a given file (e.g. from multiple open() calls with different
* read/write access modes), or when the cache keys are colliding
* accidentally (due to unfortunate device/inode number values).
*
* Thus, we iterate over all entries matching the target key and rely on
* `is_same_file` to identify actually matching entries.
*/
for (btree_iter_init(&cache_tree, query_key, iter);
(*entry = btree_iter_get(iter, &key)) != NULL && query_key == key;
btree_iter_next(iter)) {
ret = is_same_file(query, *entry);
if (ret <= 0) {
return ret;
}
}
*entry = NULL;
return 0;
}
static int
fd_cache_get_locked(int fd)
{
struct fd_cache_entry *entry;
struct fd_cache_entry query;
btree_iter_t iter;
uintptr_t key;
bool accurate;
if (fd_cache_entry_init(fd, &query, &accurate) != 0) {
return -1;
}
/*
* When we can't make accurate comparisons, bypass the cache and
* pretend to the caller that everything is fine.
*/
if (!accurate) {
return fd;
}
if (fd_cache_lookup(&query, &iter, &entry) != 0) {
return -1;
}
if (entry != NULL) {
close_safely(&fd);
++entry->refcount;
return entry->fd;
}
entry = calloc(1, sizeof(*entry));
if (entry == NULL) {
errno = ENOMEM;
return -1;
}
*entry = query;
entry->refcount = 1;
key = fd_cache_entry_compute_key(entry);
if (btree_iter_insert(&iter, key, entry) != 0) {
free(entry);
return -1;
}
return entry->fd;
}
static int
fd_cache_put_locked(int *fd)
{
struct fd_cache_entry *entry;
struct fd_cache_entry query;
btree_iter_t iter;
bool accurate;
if (*fd == -1) {
return 0;
}
if (fd_cache_entry_init(*fd, &query, &accurate) != 0) {
return -1;
}
if (!accurate) {
/*
* This file descriptor isn't eligible for de-duplication and thus
* fd_cache_get hasn't created a cache entry. Just close the
* descriptor to provide consistent semantics to the caller.
*/
close_safely(fd);
return 0;
}
if (fd_cache_lookup(&query, &iter, &entry) != 0) {
return -1;
}
if (entry == NULL || entry->fd != *fd) {
errno = ENOENT;
return -1;
}
assert(entry->refcount > 0);
--entry->refcount;
if (entry->refcount == 0) {
btree_iter_remove(&iter);
close_safely(&entry->fd);
free(entry);
}
*fd = -1;
return 0;
}
int
fd_cache_get(int fd)
{
int ret;
pthread_mutex_lock(&cache_mutex);
ret = fd_cache_get_locked(fd);
pthread_mutex_unlock(&cache_mutex);
return ret;
}
int
fd_cache_put(int *fd)
{
int ret;
pthread_mutex_lock(&cache_mutex);
ret = fd_cache_put_locked(fd);
pthread_mutex_unlock(&cache_mutex);
return ret;
}
int
fd_cache_is_same_file(int fd1, int fd2)
{
struct fd_cache_entry entry1;
struct fd_cache_entry entry2;
bool accurate;
int ret = -1;
if (fd1 == fd2) {
return 0;
} else if (fd1 == -1 || fd2 == -1) {
return 1;
}
pthread_mutex_lock(&cache_mutex);
if (fd_cache_entry_init(fd1, &entry1, &accurate) == 0 &&
fd_cache_entry_init(fd2, &entry2, &accurate) == 0) {
ret = is_same_file(&entry1, &entry2);
}
pthread_mutex_unlock(&cache_mutex);
return ret;
}
/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */