blob: f1cd6af2b19e12af94eea6f72eb3b7558a5abd50 [file] [log] [blame]
/*
 * Linux UFFD-WP support
 *
 * Copyright Virtuozzo GmbH, 2020
 *
 * Authors:
 *  Andrey Gruzdev   <andrey.gruzdev@virtuozzo.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later.  See the COPYING file in the top-level directory.
 */
12
13#include "qemu/osdep.h"
14#include "qemu/bitops.h"
15#include "qemu/error-report.h"
16#include "qemu/userfaultfd.h"
17#include "trace.h"
18#include <poll.h>
19#include <sys/syscall.h>
20#include <sys/ioctl.h>
21
22/**
23 * uffd_query_features: query UFFD features
24 *
25 * Returns: 0 on success, negative value in case of an error
26 *
27 * @features: parameter to receive 'uffdio_api.features'
28 */
29int uffd_query_features(uint64_t *features)
30{
31 int uffd_fd;
32 struct uffdio_api api_struct = { 0 };
33 int ret = -1;
34
35 uffd_fd = syscall(__NR_userfaultfd, O_CLOEXEC);
36 if (uffd_fd < 0) {
37 trace_uffd_query_features_nosys(errno);
38 return -1;
39 }
40
41 api_struct.api = UFFD_API;
42 api_struct.features = 0;
43
44 if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
45 trace_uffd_query_features_api_failed(errno);
46 goto out;
47 }
48 *features = api_struct.features;
49 ret = 0;
50
51out:
52 close(uffd_fd);
53 return ret;
54}
55
56/**
57 * uffd_create_fd: create UFFD file descriptor
58 *
59 * Returns non-negative file descriptor or negative value in case of an error
60 *
61 * @features: UFFD features to request
62 * @non_blocking: create UFFD file descriptor for non-blocking operation
63 */
64int uffd_create_fd(uint64_t features, bool non_blocking)
65{
66 int uffd_fd;
67 int flags;
68 struct uffdio_api api_struct = { 0 };
69 uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER);
70
71 flags = O_CLOEXEC | (non_blocking ? O_NONBLOCK : 0);
72 uffd_fd = syscall(__NR_userfaultfd, flags);
73 if (uffd_fd < 0) {
74 trace_uffd_create_fd_nosys(errno);
75 return -1;
76 }
77
78 api_struct.api = UFFD_API;
79 api_struct.features = features;
80 if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
81 trace_uffd_create_fd_api_failed(errno);
82 goto fail;
83 }
84 if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
85 trace_uffd_create_fd_api_noioctl(ioctl_mask, api_struct.ioctls);
86 goto fail;
87 }
88
89 return uffd_fd;
90
91fail:
92 close(uffd_fd);
93 return -1;
94}
95
/**
 * uffd_close_fd: close UFFD file descriptor
 *
 * Asserts that the descriptor is non-negative and then closes it.
 * The result of close() is not reported to the caller.
 *
 * @uffd_fd: UFFD file descriptor; must be >= 0
 */
void uffd_close_fd(int uffd_fd)
{
    assert(!(uffd_fd < 0));
    (void) close(uffd_fd);
}
106
/**
 * uffd_register_memory: register memory range via UFFD-IO
 *
 * Issues UFFDIO_REGISTER for [addr, addr + length) with the given mode.
 *
 * Returns 0 in case of success, -1 in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 * @mode: UFFD register mode (UFFDIO_REGISTER_MODE_MISSING, ...)
 * @ioctls: optional pointer to receive supported IOCTL mask; may be NULL,
 *          written only on success
 */
int uffd_register_memory(int uffd_fd, void *addr, uint64_t length,
                         uint64_t mode, uint64_t *ioctls)
{
    struct uffdio_register reg;

    reg.mode = mode;
    reg.range.len = length;
    reg.range.start = (uintptr_t) addr;

    if (ioctl(uffd_fd, UFFDIO_REGISTER, &reg) != 0) {
        trace_uffd_register_memory_failed(addr, length, mode, errno);
        return -1;
    }

    if (ioctls != NULL) {
        *ioctls = reg.ioctls;
    }
    return 0;
}
137
/**
 * uffd_unregister_memory: un-register memory range with UFFD-IO
 *
 * Issues UFFDIO_UNREGISTER for [addr, addr + length).
 *
 * Returns 0 in case of success, -1 in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 */
int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length)
{
    struct uffdio_range range = {
        .start = (uintptr_t) addr,
        .len = length,
    };

    if (ioctl(uffd_fd, UFFDIO_UNREGISTER, &range) == 0) {
        return 0;
    }

    trace_uffd_unregister_memory_failed(addr, length, errno);
    return -1;
}
161
/**
 * uffd_change_protection: protect/un-protect memory range for writes via UFFD-IO
 *
 * Issues UFFDIO_WRITEPROTECT for [addr, addr + length). The mode is
 * UFFDIO_WRITEPROTECT_MODE_WP when @wp is set; otherwise it is
 * UFFDIO_WRITEPROTECT_MODE_DONTWAKE if @dont_wake is set, or 0.
 * A failure is reported through error_report().
 *
 * Returns 0 on success, -1 in case of error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 * @wp: write-protect/unprotect
 * @dont_wake: do not wake threads waiting on wr-protected page;
 *             ignored when @wp is true
 */
int uffd_change_protection(int uffd_fd, void *addr, uint64_t length,
                           bool wp, bool dont_wake)
{
    struct uffdio_writeprotect req;
    uint64_t mode = 0;

    if (wp) {
        mode = UFFDIO_WRITEPROTECT_MODE_WP;
    } else if (dont_wake) {
        /* DONTWAKE is meaningful only on protection release */
        mode = UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
    }

    req.range.start = (uintptr_t) addr;
    req.range.len = length;
    req.mode = mode;

    if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &req) == 0) {
        return 0;
    }

    error_report("uffd_change_protection() failed: addr=%p len=%" PRIu64
                 " mode=%" PRIx64 " errno=%i", addr, length,
                 (uint64_t) req.mode, errno);
    return -1;
}
196
/**
 * uffd_copy_page: copy range of pages to destination via UFFD-IO
 *
 * Issues UFFDIO_COPY to copy @length bytes from @src_addr to @dst_addr,
 * which is used to resolve a missing page fault somewhere in the
 * destination range. A failure is reported through error_report().
 *
 * Returns 0 on success, -1 in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @dst_addr: destination base address
 * @src_addr: source base address
 * @length: length of the range to copy
 * @dont_wake: do not wake threads waiting on missing page
 */
int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr,
                   uint64_t length, bool dont_wake)
{
    struct uffdio_copy req;

    req.mode = 0;
    if (dont_wake) {
        req.mode = UFFDIO_COPY_MODE_DONTWAKE;
    }
    req.len = length;
    req.src = (uintptr_t) src_addr;
    req.dst = (uintptr_t) dst_addr;

    if (ioctl(uffd_fd, UFFDIO_COPY, &req) == 0) {
        return 0;
    }

    error_report("uffd_copy_page() failed: dst_addr=%p src_addr=%p length=%" PRIu64
                 " mode=%" PRIx64 " errno=%i", dst_addr, src_addr,
                 length, (uint64_t) req.mode, errno);
    return -1;
}
230
/**
 * uffd_zero_page: fill range of pages with zeroes via UFFD-IO
 *
 * Issues UFFDIO_ZEROPAGE for [addr, addr + length), which is used to
 * resolve a missing page fault within the range. A failure is reported
 * through error_report().
 *
 * Returns 0 on success, -1 in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address
 * @length: length of the range to fill with zeroes
 * @dont_wake: do not wake threads waiting on missing page
 */
int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake)
{
    struct uffdio_zeropage req;

    req.mode = 0;
    if (dont_wake) {
        req.mode = UFFDIO_ZEROPAGE_MODE_DONTWAKE;
    }
    req.range.len = length;
    req.range.start = (uintptr_t) addr;

    if (ioctl(uffd_fd, UFFDIO_ZEROPAGE, &req) == 0) {
        return 0;
    }

    error_report("uffd_zero_page() failed: addr=%p length=%" PRIu64
                 " mode=%" PRIx64 " errno=%i", addr, length,
                 (uint64_t) req.mode, errno);
    return -1;
}
260
/**
 * uffd_wakeup: wake up threads waiting on UFFD-managed page fault resolution
 *
 * Issues UFFDIO_WAKE for [addr, addr + length) to wake up threads waiting
 * on any page from that range. The main use case is a period during which
 * page faults are resolved via UFFD-IO IOCTLs with the MODE_DONTWAKE flag
 * set; afterwards all waits for the whole memory range are satisfied by a
 * single call to uffd_wakeup(). A failure is reported through
 * error_report().
 *
 * Returns 0 on success, -1 in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address
 * @length: length of the range
 */
int uffd_wakeup(int uffd_fd, void *addr, uint64_t length)
{
    struct uffdio_range range = {
        .start = (uintptr_t) addr,
        .len = length,
    };

    if (ioctl(uffd_fd, UFFDIO_WAKE, &range) == 0) {
        return 0;
    }

    error_report("uffd_wakeup() failed: addr=%p length=%" PRIu64 " errno=%i",
                 addr, length, errno);
    return -1;
}
290
/**
 * uffd_read_events: read pending UFFD events
 *
 * Reads up to @count messages from the descriptor into @msgs, retrying
 * the read() when it is interrupted (EINTR). Errors other than EAGAIN
 * are reported through error_report().
 *
 * Returns number of fetched messages, 0 if none is available (read()
 * failed with EAGAIN) or -1 in case of an error, including a
 * non-positive @count
 *
 * @uffd_fd: UFFD file descriptor
 * @msgs: pointer to message buffer
 * @count: number of messages that can fit in the buffer; must be > 0
 */
int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count)
{
    ssize_t res;

    /*
     * A negative count would be converted to a huge unsigned length by the
     * multiplication below, and a zero-sized buffer cannot hold a message,
     * so reject both before touching the descriptor.
     */
    if (count <= 0) {
        error_report("uffd_read_events() failed: invalid count=%i", count);
        return -1;
    }

    do {
        res = read(uffd_fd, msgs, (size_t) count * sizeof(struct uffd_msg));
    } while (res < 0 && errno == EINTR);

    if (res < 0) {
        if (errno == EAGAIN) {
            /* Nothing pending on a non-blocking descriptor */
            return 0;
        }
        error_report("uffd_read_events() failed: errno=%i", errno);
        return -1;
    }

    /* Only whole messages are counted */
    return (int) (res / sizeof(struct uffd_msg));
}
318
/**
 * uffd_poll_events: poll UFFD file descriptor for read
 *
 * Calls poll() for POLLIN on the descriptor, retrying when the call is
 * interrupted (EINTR). Any other poll() failure is reported through
 * error_report() and treated as "no events".
 *
 * Returns true if events are available for read (POLLIN is set in the
 * returned events), false on timeout, on error, or otherwise
 *
 * @uffd_fd: UFFD file descriptor
 * @tmo: timeout value, passed unchanged to poll()
 */
bool uffd_poll_events(int uffd_fd, int tmo)
{
    struct pollfd pfd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 };
    int nready;

    for (;;) {
        nready = poll(&pfd, 1, tmo);
        if (nready >= 0 || errno != EINTR) {
            break;
        }
    }

    if (nready < 0) {
        error_report("uffd_poll_events() failed: errno=%i", errno);
        return false;
    }

    return nready > 0 && (pfd.revents & POLLIN) != 0;
}
345}