/*
 * Linux UFFD-WP support
 *
 * Copyright Virtuozzo GmbH, 2020
 *
 * Authors:
 *  Andrey Gruzdev <andrey.gruzdev@virtuozzo.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later. See the COPYING file in the top-level directory.
 */
| 12 | |
| 13 | #include "qemu/osdep.h" |
| 14 | #include "qemu/bitops.h" |
| 15 | #include "qemu/error-report.h" |
| 16 | #include "qemu/userfaultfd.h" |
| 17 | #include "trace.h" |
| 18 | #include <poll.h> |
| 19 | #include <sys/syscall.h> |
| 20 | #include <sys/ioctl.h> |
| 21 | |
| 22 | /** |
| 23 | * uffd_query_features: query UFFD features |
| 24 | * |
| 25 | * Returns: 0 on success, negative value in case of an error |
| 26 | * |
| 27 | * @features: parameter to receive 'uffdio_api.features' |
| 28 | */ |
| 29 | int uffd_query_features(uint64_t *features) |
| 30 | { |
| 31 | int uffd_fd; |
| 32 | struct uffdio_api api_struct = { 0 }; |
| 33 | int ret = -1; |
| 34 | |
| 35 | uffd_fd = syscall(__NR_userfaultfd, O_CLOEXEC); |
| 36 | if (uffd_fd < 0) { |
| 37 | trace_uffd_query_features_nosys(errno); |
| 38 | return -1; |
| 39 | } |
| 40 | |
| 41 | api_struct.api = UFFD_API; |
| 42 | api_struct.features = 0; |
| 43 | |
| 44 | if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) { |
| 45 | trace_uffd_query_features_api_failed(errno); |
| 46 | goto out; |
| 47 | } |
| 48 | *features = api_struct.features; |
| 49 | ret = 0; |
| 50 | |
| 51 | out: |
| 52 | close(uffd_fd); |
| 53 | return ret; |
| 54 | } |
| 55 | |
| 56 | /** |
| 57 | * uffd_create_fd: create UFFD file descriptor |
| 58 | * |
| 59 | * Returns non-negative file descriptor or negative value in case of an error |
| 60 | * |
| 61 | * @features: UFFD features to request |
| 62 | * @non_blocking: create UFFD file descriptor for non-blocking operation |
| 63 | */ |
| 64 | int uffd_create_fd(uint64_t features, bool non_blocking) |
| 65 | { |
| 66 | int uffd_fd; |
| 67 | int flags; |
| 68 | struct uffdio_api api_struct = { 0 }; |
| 69 | uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER); |
| 70 | |
| 71 | flags = O_CLOEXEC | (non_blocking ? O_NONBLOCK : 0); |
| 72 | uffd_fd = syscall(__NR_userfaultfd, flags); |
| 73 | if (uffd_fd < 0) { |
| 74 | trace_uffd_create_fd_nosys(errno); |
| 75 | return -1; |
| 76 | } |
| 77 | |
| 78 | api_struct.api = UFFD_API; |
| 79 | api_struct.features = features; |
| 80 | if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) { |
| 81 | trace_uffd_create_fd_api_failed(errno); |
| 82 | goto fail; |
| 83 | } |
| 84 | if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) { |
| 85 | trace_uffd_create_fd_api_noioctl(ioctl_mask, api_struct.ioctls); |
| 86 | goto fail; |
| 87 | } |
| 88 | |
| 89 | return uffd_fd; |
| 90 | |
| 91 | fail: |
| 92 | close(uffd_fd); |
| 93 | return -1; |
| 94 | } |
| 95 | |
/**
 * uffd_close_fd: release a UFFD file descriptor
 *
 * The descriptor must be valid (non-negative); this is enforced with
 * an assertion. The result of close() is not reported to the caller.
 *
 * @uffd_fd: UFFD file descriptor
 */
void uffd_close_fd(int uffd_fd)
{
    assert(uffd_fd >= 0);

    close(uffd_fd);
}
| 106 | |
/**
 * uffd_register_memory: register memory range via UFFD-IO
 *
 * Issues UFFDIO_REGISTER for [addr, addr + length) with the given mode.
 * On success, the ioctl mask reported by the kernel for the range is
 * optionally handed back to the caller.
 *
 * Returns 0 in case of success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 * @mode: UFFD register mode (UFFDIO_REGISTER_MODE_MISSING, ...)
 * @ioctls: optional pointer to receive supported IOCTL mask
 *          (may be NULL; written only on success)
 */
int uffd_register_memory(int uffd_fd, void *addr, uint64_t length,
                         uint64_t mode, uint64_t *ioctls)
{
    struct uffdio_register reg = {
        .range = { .start = (uintptr_t) addr, .len = length },
        .mode = mode,
    };

    if (ioctl(uffd_fd, UFFDIO_REGISTER, &reg) != 0) {
        trace_uffd_register_memory_failed(addr, length, mode, errno);
        return -1;
    }

    if (ioctls != NULL) {
        *ioctls = reg.ioctls;
    }
    return 0;
}
| 137 | |
/**
 * uffd_unregister_memory: un-register memory range with UFFD-IO
 *
 * Issues UFFDIO_UNREGISTER for [addr, addr + length).
 *
 * Returns 0 in case of success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 */
int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length)
{
    struct uffdio_range range = {
        .start = (uintptr_t) addr,
        .len = length,
    };

    if (ioctl(uffd_fd, UFFDIO_UNREGISTER, &range) != 0) {
        trace_uffd_unregister_memory_failed(addr, length, errno);
        return -1;
    }
    return 0;
}
| 161 | |
/**
 * uffd_change_protection: protect/un-protect memory range for writes via UFFD-IO
 *
 * Issues UFFDIO_WRITEPROTECT for [addr, addr + length). A failure is
 * reported through error_report() together with the errno value.
 *
 * Returns 0 on success, negative value in case of error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 * @wp: write-protect/unprotect
 * @dont_wake: do not wake threads waiting on wr-protected page
 *             (ignored when @wp is true)
 */
int uffd_change_protection(int uffd_fd, void *addr, uint64_t length,
                           bool wp, bool dont_wake)
{
    struct uffdio_writeprotect req = {
        .range = { .start = (uintptr_t) addr, .len = length },
        .mode = 0,
    };

    if (wp) {
        req.mode = UFFDIO_WRITEPROTECT_MODE_WP;
    } else if (dont_wake) {
        /* DONTWAKE is meaningful only on protection release */
        req.mode = UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
    }

    if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &req) != 0) {
        error_report("uffd_change_protection() failed: addr=%p len=%" PRIu64
                     " mode=%" PRIx64 " errno=%i", addr, length,
                     (uint64_t) req.mode, errno);
        return -1;
    }
    return 0;
}
| 196 | |
/**
 * uffd_copy_page: copy range of pages to destination via UFFD-IO
 *
 * Issues UFFDIO_COPY so that the source pages are copied into the
 * destination range, resolving a missing page fault somewhere in that
 * range. A failure is reported through error_report().
 *
 * Returns 0 on success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @dst_addr: destination base address
 * @src_addr: source base address
 * @length: length of the range to copy
 * @dont_wake: do not wake threads waiting on missing page
 */
int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr,
                   uint64_t length, bool dont_wake)
{
    struct uffdio_copy req = {
        .dst = (uintptr_t) dst_addr,
        .src = (uintptr_t) src_addr,
        .len = length,
        .mode = 0,
    };

    if (dont_wake) {
        req.mode = UFFDIO_COPY_MODE_DONTWAKE;
    }

    if (ioctl(uffd_fd, UFFDIO_COPY, &req) != 0) {
        error_report("uffd_copy_page() failed: dst_addr=%p src_addr=%p length=%" PRIu64
                     " mode=%" PRIx64 " errno=%i", dst_addr, src_addr,
                     length, (uint64_t) req.mode, errno);
        return -1;
    }
    return 0;
}
| 230 | |
/**
 * uffd_zero_page: fill range of pages with zeroes via UFFD-IO
 *
 * Issues UFFDIO_ZEROPAGE for [addr, addr + length) to resolve a missing
 * page fault within the range. A failure is reported through
 * error_report().
 *
 * Returns 0 on success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address
 * @length: length of the range to fill with zeroes
 * @dont_wake: do not wake threads waiting on missing page
 */
int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake)
{
    struct uffdio_zeropage req = {
        .range = { .start = (uintptr_t) addr, .len = length },
        .mode = 0,
    };

    if (dont_wake) {
        req.mode = UFFDIO_ZEROPAGE_MODE_DONTWAKE;
    }

    if (ioctl(uffd_fd, UFFDIO_ZEROPAGE, &req) != 0) {
        error_report("uffd_zero_page() failed: addr=%p length=%" PRIu64
                     " mode=%" PRIx64 " errno=%i", addr, length,
                     (uint64_t) req.mode, errno);
        return -1;
    }
    return 0;
}
| 260 | |
/**
 * uffd_wakeup: wake up threads waiting on UFFD-managed page fault resolution
 *
 * Wake up threads waiting on any page/pages from the designated range.
 * The typical use is to resolve a batch of page faults via UFFD-IO
 * IOCTLs with the MODE_DONTWAKE flag set, and afterwards satisfy all
 * waits for the whole memory range with a single uffd_wakeup() call.
 * A failure is reported through error_report().
 *
 * Returns 0 on success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address
 * @length: length of the range
 */
int uffd_wakeup(int uffd_fd, void *addr, uint64_t length)
{
    struct uffdio_range range = {
        .start = (uintptr_t) addr,
        .len = length,
    };

    if (ioctl(uffd_fd, UFFDIO_WAKE, &range) != 0) {
        error_report("uffd_wakeup() failed: addr=%p length=%" PRIu64 " errno=%i",
                     addr, length, errno);
        return -1;
    }
    return 0;
}
| 290 | |
/**
 * uffd_read_events: read pending UFFD events
 *
 * Reads as many messages as fit in the buffer with a single read(),
 * transparently retrying when the call is interrupted (EINTR). EAGAIN
 * is treated as "nothing pending" rather than as an error; any other
 * failure is reported through error_report().
 *
 * Returns number of fetched messages, 0 if no message is available or
 * negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @msgs: pointer to message buffer
 * @count: number of messages that can fit in the buffer
 */
int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count)
{
    const size_t buf_bytes = count * sizeof(*msgs);
    ssize_t nread;

    for (;;) {
        nread = read(uffd_fd, msgs, buf_bytes);
        if (nread >= 0) {
            /* Convert byte count into a whole number of messages */
            return (int) (nread / sizeof(*msgs));
        }
        if (errno != EINTR) {
            break;
        }
    }

    if (errno == EAGAIN) {
        return 0;
    }

    error_report("uffd_read_events() failed: errno=%i", errno);
    return -1;
}
| 318 | |
/**
 * uffd_poll_events: poll UFFD file descriptor for read
 *
 * Waits with poll() for POLLIN on the descriptor, restarting the wait
 * when it is interrupted (EINTR). A poll() failure is reported through
 * error_report() and yields false, the same as a timeout.
 *
 * Returns true if events are available for read, false otherwise
 *
 * @uffd_fd: UFFD file descriptor
 * @tmo: timeout value, passed unchanged to poll()
 */
bool uffd_poll_events(int uffd_fd, int tmo)
{
    struct pollfd pfd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 };
    int nready;

    do {
        nready = poll(&pfd, 1, tmo);
    } while (nready < 0 && errno == EINTR);

    if (nready < 0) {
        error_report("uffd_poll_events() failed: errno=%i", errno);
        return false;
    }

    /* nready == 0 means the timeout expired with nothing to read */
    return nready != 0 && (pfd.revents & POLLIN) != 0;
}