// SPDX-License-Identifier: GPL-2.0
/*
 * memfd GUP test-case
 * This tests memfd interactions with get_user_pages(). We require the
 * fuse_mnt.c program to provide a fake direct-IO FUSE mount-point for us. This
 * file-system delays _all_ reads by 1s and forces direct-IO. This means any
 * read() on files in that file-system will pin the receive-buffer pages for at
 * least 1s via get_user_pages().
 *
 * We use this trick to race ADD_SEALS against a write on a memfd object. The
 * ADD_SEALS must fail if the memfd pages are still pinned. Note that we use
 * the read() syscall with our memory-mapped memfd object as receive buffer to
 * force the kernel to write into our memfd object.
 */
| 15 | |
| 16 | #define _GNU_SOURCE |
| 17 | #define __EXPORTED_HEADERS__ |
| 18 | |
| 19 | #include <errno.h> |
| 20 | #include <inttypes.h> |
| 21 | #include <limits.h> |
| 22 | #include <linux/falloc.h> |
| 23 | #include <linux/fcntl.h> |
| 24 | #include <linux/memfd.h> |
| 25 | #include <linux/types.h> |
| 26 | #include <sched.h> |
| 27 | #include <stdio.h> |
| 28 | #include <stdlib.h> |
| 29 | #include <signal.h> |
| 30 | #include <string.h> |
| 31 | #include <sys/mman.h> |
| 32 | #include <sys/stat.h> |
| 33 | #include <sys/syscall.h> |
| 34 | #include <sys/wait.h> |
| 35 | #include <unistd.h> |
| 36 | |
| 37 | #include "common.h" |
| 38 | |
/* Default memfd size in bytes; main() overrides it in "hugetlbfs" mode. */
#define MFD_DEF_SIZE 8192
/* Size in bytes of the malloc()ed stack handed to clone(). */
#define STACK_SIZE 65536

/* Size used for ftruncate(), mmap(), munmap(), read() and memcmp() on the memfd. */
static size_t mfd_def_size = MFD_DEF_SIZE;
| 43 | |
| 44 | static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags) |
| 45 | { |
| 46 | int r, fd; |
| 47 | |
| 48 | fd = sys_memfd_create(name, flags); |
| 49 | if (fd < 0) { |
| 50 | printf("memfd_create(\"%s\", %u) failed: %m\n", |
| 51 | name, flags); |
| 52 | abort(); |
| 53 | } |
| 54 | |
| 55 | r = ftruncate(fd, sz); |
| 56 | if (r < 0) { |
| 57 | printf("ftruncate(%llu) failed: %m\n", (unsigned long long)sz); |
| 58 | abort(); |
| 59 | } |
| 60 | |
| 61 | return fd; |
| 62 | } |
| 63 | |
/*
 * Query the seals currently set on @fd via F_GET_SEALS.
 *
 * Returns the seal bitmask; prints a diagnostic and aborts if the
 * fcntl() call fails.
 */
static __u64 mfd_assert_get_seals(int fd)
{
	long seals = fcntl(fd, F_GET_SEALS);

	if (seals >= 0)
		return seals;

	printf("GET_SEALS(%d) failed: %m\n", fd);
	abort();
}
| 76 | |
/*
 * Assert that the seals set on @fd are exactly @seals.
 *
 * Prints both values and aborts on any mismatch.
 */
static void mfd_assert_has_seals(int fd, __u64 seals)
{
	__u64 actual = mfd_assert_get_seals(fd);

	if (actual == seals)
		return;

	printf("%llu != %llu = GET_SEALS(%d)\n",
	       (unsigned long long)seals, (unsigned long long)actual, fd);
	abort();
}
| 88 | |
/*
 * Add @seals to @fd via F_ADD_SEALS and abort if that fails.
 *
 * The seals present before the call are fetched first so that they can
 * be included in the failure diagnostic.
 */
static void mfd_assert_add_seals(int fd, __u64 seals)
{
	__u64 before = mfd_assert_get_seals(fd);

	if (fcntl(fd, F_ADD_SEALS, seals) >= 0)
		return;

	printf("ADD_SEALS(%d, %llu -> %llu) failed: %m\n",
	       fd, (unsigned long long)before, (unsigned long long)seals);
	abort();
}
| 102 | |
/*
 * Try to add @seals to @fd, tolerating only success or EBUSY.
 *
 * Returns the result of the F_ADD_SEALS fcntl(): non-negative on
 * success, negative when it failed with EBUSY. Any other error prints a
 * diagnostic and aborts. The previous seals are read for the diagnostic
 * only; if that read fails they are reported as 0.
 */
static int mfd_busy_add_seals(int fd, __u64 seals)
{
	long ret;
	__u64 before;

	ret = fcntl(fd, F_GET_SEALS);
	before = ret < 0 ? 0 : ret;

	ret = fcntl(fd, F_ADD_SEALS, seals);
	if (ret >= 0 || errno == EBUSY)
		return ret;

	printf("ADD_SEALS(%d, %llu -> %llu) didn't fail as expected with EBUSY: %m\n",
	       fd, (unsigned long long)before, (unsigned long long)seals);
	abort();
}
| 123 | |
| 124 | static void *mfd_assert_mmap_shared(int fd) |
| 125 | { |
| 126 | void *p; |
| 127 | |
| 128 | p = mmap(NULL, |
| 129 | mfd_def_size, |
| 130 | PROT_READ | PROT_WRITE, |
| 131 | MAP_SHARED, |
| 132 | fd, |
| 133 | 0); |
| 134 | if (p == MAP_FAILED) { |
| 135 | printf("mmap() failed: %m\n"); |
| 136 | abort(); |
| 137 | } |
| 138 | |
| 139 | return p; |
| 140 | } |
| 141 | |
| 142 | static void *mfd_assert_mmap_private(int fd) |
| 143 | { |
| 144 | void *p; |
| 145 | |
| 146 | p = mmap(NULL, |
| 147 | mfd_def_size, |
| 148 | PROT_READ | PROT_WRITE, |
| 149 | MAP_PRIVATE, |
| 150 | fd, |
| 151 | 0); |
| 152 | if (p == MAP_FAILED) { |
| 153 | printf("mmap() failed: %m\n"); |
| 154 | abort(); |
| 155 | } |
| 156 | |
| 157 | return p; |
| 158 | } |
| 159 | |
/* memfd descriptor that main() hands over to the sealing thread. */
static int global_mfd = -1;
/* Shared mapping of that memfd; the sealing thread munmap()s it before sealing. */
static void *global_p = NULL;
| 162 | |
| 163 | static int sealing_thread_fn(void *arg) |
| 164 | { |
| 165 | int sig, r; |
| 166 | |
| 167 | /* |
| 168 | * This thread first waits 200ms so any pending operation in the parent |
| 169 | * is correctly started. After that, it tries to seal @global_mfd as |
| 170 | * SEAL_WRITE. This _must_ fail as the parent thread has a read() into |
| 171 | * that memory mapped object still ongoing. |
| 172 | * We then wait one more second and try sealing again. This time it |
| 173 | * must succeed as there shouldn't be anyone else pinning the pages. |
| 174 | */ |
| 175 | |
| 176 | /* wait 200ms for FUSE-request to be active */ |
| 177 | usleep(200000); |
| 178 | |
| 179 | /* unmount mapping before sealing to avoid i_mmap_writable failures */ |
| 180 | munmap(global_p, mfd_def_size); |
| 181 | |
| 182 | /* Try sealing the global file; expect EBUSY or success. Current |
| 183 | * kernels will never succeed, but in the future, kernels might |
| 184 | * implement page-replacements or other fancy ways to avoid racing |
| 185 | * writes. */ |
| 186 | r = mfd_busy_add_seals(global_mfd, F_SEAL_WRITE); |
| 187 | if (r >= 0) { |
| 188 | printf("HURRAY! This kernel fixed GUP races!\n"); |
| 189 | } else { |
| 190 | /* wait 1s more so the FUSE-request is done */ |
| 191 | sleep(1); |
| 192 | |
| 193 | /* try sealing the global file again */ |
| 194 | mfd_assert_add_seals(global_mfd, F_SEAL_WRITE); |
| 195 | } |
| 196 | |
| 197 | return 0; |
| 198 | } |
| 199 | |
| 200 | static pid_t spawn_sealing_thread(void) |
| 201 | { |
| 202 | uint8_t *stack; |
| 203 | pid_t pid; |
| 204 | |
| 205 | stack = malloc(STACK_SIZE); |
| 206 | if (!stack) { |
| 207 | printf("malloc(STACK_SIZE) failed: %m\n"); |
| 208 | abort(); |
| 209 | } |
| 210 | |
| 211 | pid = clone(sealing_thread_fn, |
| 212 | stack + STACK_SIZE, |
| 213 | SIGCHLD | CLONE_FILES | CLONE_FS | CLONE_VM, |
| 214 | NULL); |
| 215 | if (pid < 0) { |
| 216 | printf("clone() failed: %m\n"); |
| 217 | abort(); |
| 218 | } |
| 219 | |
| 220 | return pid; |
| 221 | } |
| 222 | |
/*
 * Block until the sealing thread @pid has terminated.
 *
 * The wait is restarted if it is interrupted by a signal, so the caller
 * never continues while the child is still running. Any other waitpid()
 * failure prints a diagnostic and aborts, like the other helpers in this
 * file. The child's exit status is not inspected.
 */
static void join_sealing_thread(pid_t pid)
{
	pid_t r;

	do {
		r = waitpid(pid, NULL, 0);
	} while (r < 0 && errno == EINTR);

	if (r < 0) {
		printf("waitpid(%d) failed: %m\n", (int)pid);
		abort();
	}
}
| 227 | |
| 228 | int main(int argc, char **argv) |
| 229 | { |
| 230 | char *zero; |
| 231 | int fd, mfd, r; |
| 232 | void *p; |
| 233 | int was_sealed; |
| 234 | pid_t pid; |
| 235 | |
| 236 | if (argc < 2) { |
| 237 | printf("error: please pass path to file in fuse_mnt mount-point\n"); |
| 238 | abort(); |
| 239 | } |
| 240 | |
| 241 | if (argc >= 3) { |
| 242 | if (!strcmp(argv[2], "hugetlbfs")) { |
| 243 | unsigned long hpage_size = default_huge_page_size(); |
| 244 | |
| 245 | if (!hpage_size) { |
| 246 | printf("Unable to determine huge page size\n"); |
| 247 | abort(); |
| 248 | } |
| 249 | |
| 250 | hugetlbfs_test = 1; |
| 251 | mfd_def_size = hpage_size * 2; |
| 252 | } else { |
| 253 | printf("Unknown option: %s\n", argv[2]); |
| 254 | abort(); |
| 255 | } |
| 256 | } |
| 257 | |
| 258 | zero = calloc(sizeof(*zero), mfd_def_size); |
| 259 | |
| 260 | /* open FUSE memfd file for GUP testing */ |
| 261 | printf("opening: %s\n", argv[1]); |
| 262 | fd = open(argv[1], O_RDONLY | O_CLOEXEC); |
| 263 | if (fd < 0) { |
| 264 | printf("cannot open(\"%s\"): %m\n", argv[1]); |
| 265 | abort(); |
| 266 | } |
| 267 | |
| 268 | /* create new memfd-object */ |
| 269 | mfd = mfd_assert_new("kern_memfd_fuse", |
| 270 | mfd_def_size, |
| 271 | MFD_CLOEXEC | MFD_ALLOW_SEALING); |
| 272 | |
| 273 | /* mmap memfd-object for writing */ |
| 274 | p = mfd_assert_mmap_shared(mfd); |
| 275 | |
| 276 | /* pass mfd+mapping to a separate sealing-thread which tries to seal |
| 277 | * the memfd objects with SEAL_WRITE while we write into it */ |
| 278 | global_mfd = mfd; |
| 279 | global_p = p; |
| 280 | pid = spawn_sealing_thread(); |
| 281 | |
| 282 | /* Use read() on the FUSE file to read into our memory-mapped memfd |
| 283 | * object. This races the other thread which tries to seal the |
| 284 | * memfd-object. |
| 285 | * If @fd is on the memfd-fake-FUSE-FS, the read() is delayed by 1s. |
| 286 | * This guarantees that the receive-buffer is pinned for 1s until the |
| 287 | * data is written into it. The racing ADD_SEALS should thus fail as |
| 288 | * the pages are still pinned. */ |
| 289 | r = read(fd, p, mfd_def_size); |
| 290 | if (r < 0) { |
| 291 | printf("read() failed: %m\n"); |
| 292 | abort(); |
| 293 | } else if (!r) { |
| 294 | printf("unexpected EOF on read()\n"); |
| 295 | abort(); |
| 296 | } |
| 297 | |
| 298 | was_sealed = mfd_assert_get_seals(mfd) & F_SEAL_WRITE; |
| 299 | |
| 300 | /* Wait for sealing-thread to finish and verify that it |
| 301 | * successfully sealed the file after the second try. */ |
| 302 | join_sealing_thread(pid); |
| 303 | mfd_assert_has_seals(mfd, F_SEAL_WRITE); |
| 304 | |
| 305 | /* *IF* the memfd-object was sealed at the time our read() returned, |
| 306 | * then the kernel did a page-replacement or canceled the read() (or |
| 307 | * whatever magic it did..). In that case, the memfd object is still |
| 308 | * all zero. |
| 309 | * In case the memfd-object was *not* sealed, the read() was successfull |
| 310 | * and the memfd object must *not* be all zero. |
| 311 | * Note that in real scenarios, there might be a mixture of both, but |
| 312 | * in this test-cases, we have explicit 200ms delays which should be |
| 313 | * enough to avoid any in-flight writes. */ |
| 314 | |
| 315 | p = mfd_assert_mmap_private(mfd); |
| 316 | if (was_sealed && memcmp(p, zero, mfd_def_size)) { |
| 317 | printf("memfd sealed during read() but data not discarded\n"); |
| 318 | abort(); |
| 319 | } else if (!was_sealed && !memcmp(p, zero, mfd_def_size)) { |
| 320 | printf("memfd sealed after read() but data discarded\n"); |
| 321 | abort(); |
| 322 | } |
| 323 | |
| 324 | close(mfd); |
| 325 | close(fd); |
| 326 | |
| 327 | printf("fuse: DONE\n"); |
| 328 | free(zero); |
| 329 | |
| 330 | return 0; |
| 331 | } |