b.liu | e958203 | 2025-04-17 19:18:16 +0800 | [diff] [blame^] | 1 | // SPDX-License-Identifier: GPL-2.0-only |
| 2 | /* |
| 3 | * Copyright 2018 Google Inc. |
| 4 | * Author: Eric Dumazet (edumazet@google.com) |
| 5 | * |
| 6 | * Reference program demonstrating tcp mmap() usage, |
| 7 | * and SO_RCVLOWAT hints for receiver. |
| 8 | * |
| 9 | * Note : NIC with header split is needed to use mmap() on TCP : |
| 10 | * Each incoming frame must be a multiple of PAGE_SIZE bytes of TCP payload. |
| 11 | * |
| 12 | * How to use on loopback interface : |
| 13 | * |
| 14 | * ifconfig lo mtu 61512 # 15*4096 + 40 (ipv6 header) + 32 (TCP with TS option header) |
| 15 | * tcp_mmap -s -z & |
| 16 | * tcp_mmap -H ::1 -z |
| 17 | * |
| 18 | * Or leave default lo mtu, but use -M option to set TCP_MAXSEG option to (4096 + 12) |
| 19 | * (4096 : page size on x86, 12: TCP TS option length) |
| 20 | * tcp_mmap -s -z -M $((4096+12)) & |
| 21 | * tcp_mmap -H ::1 -z -M $((4096+12)) |
| 22 | * |
| 23 | * Note: -z option on sender uses MSG_ZEROCOPY, which forces a copy when packets go through loopback interface. |
| 24 | * We might use sendfile() instead, but really this test program is about mmap(), for receivers ;) |
| 25 | * |
| 26 | * $ ./tcp_mmap -s & # Without mmap() |
| 27 | * $ for i in {1..4}; do ./tcp_mmap -H ::1 -z ; done |
| 28 | * received 32768 MB (0 % mmap'ed) in 14.1157 s, 19.4732 Gbit |
| 29 | * cpu usage user:0.057 sys:7.815, 240.234 usec per MB, 65531 c-switches |
| 30 | * received 32768 MB (0 % mmap'ed) in 14.6833 s, 18.7204 Gbit |
| 31 | * cpu usage user:0.043 sys:8.103, 248.596 usec per MB, 65524 c-switches |
| 32 | * received 32768 MB (0 % mmap'ed) in 11.143 s, 24.6682 Gbit |
| 33 | * cpu usage user:0.044 sys:6.576, 202.026 usec per MB, 65519 c-switches |
| 34 | * received 32768 MB (0 % mmap'ed) in 14.9056 s, 18.4413 Gbit |
| 35 | * cpu usage user:0.036 sys:8.193, 251.129 usec per MB, 65530 c-switches |
| 36 | * $ kill %1 # kill tcp_mmap server |
| 37 | * |
| 38 | * $ ./tcp_mmap -s -z & # With mmap() |
| 39 | * $ for i in {1..4}; do ./tcp_mmap -H ::1 -z ; done |
| 40 | * received 32768 MB (99.9939 % mmap'ed) in 6.73792 s, 40.7956 Gbit |
| 41 | * cpu usage user:0.045 sys:2.827, 87.6465 usec per MB, 65532 c-switches |
| 42 | * received 32768 MB (99.9939 % mmap'ed) in 7.26732 s, 37.8238 Gbit |
| 43 | * cpu usage user:0.037 sys:3.087, 95.3369 usec per MB, 65532 c-switches |
| 44 | * received 32768 MB (99.9939 % mmap'ed) in 7.61661 s, 36.0893 Gbit |
| 45 | * cpu usage user:0.046 sys:3.559, 110.016 usec per MB, 65529 c-switches |
| 46 | * received 32768 MB (99.9939 % mmap'ed) in 7.43764 s, 36.9577 Gbit |
| 47 | * cpu usage user:0.035 sys:3.467, 106.873 usec per MB, 65530 c-switches |
| 48 | */ |
#define _GNU_SOURCE
#include <pthread.h>
#include <sys/types.h>
#include <fcntl.h>
#include <error.h>
#include <sys/socket.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <unistd.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <limits.h>
#include <time.h>
#include <sys/time.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <poll.h>
#include <linux/tcp.h>
#include <assert.h>
| 69 | |
/* Fallback for libc headers that do not define MSG_ZEROCOPY. */
#ifndef MSG_ZEROCOPY
#define MSG_ZEROCOPY    0x4000000
#endif

/*
 * Number of bytes the sender pushes before closing the connection.
 * Spelled with 1ULL so the shift stays defined where long is 32 bits.
 */
#define FILE_SZ (1ULL << 35)
static int cfg_family		= AF_INET6;
static socklen_t cfg_alen	= sizeof(struct sockaddr_in6);
static int cfg_port		= 8787;

static int rcvbuf; /* Default: autotuning.  Can be set with -r <integer> option */
static int sndbuf; /* Default: autotuning.  Can be set with -w <integer> option */
static int zflg; /* zero copy option (-z): MSG_ZEROCOPY for sender, mmap() for receiver */
static int xflg; /* hash received data (simple xor) (-x option) */
static int keepflag; /* -k option: receiver shall keep all received file in memory (no munmap() calls) */

/* Size in bytes of one send()/receive unit; also used as SO_RCVLOWAT. */
static int chunk_size  = 512*1024;

/* Running xor hash of received data, updated by hash_zone(). */
unsigned long htotal;
| 88 | |
/*
 * Issue a prefetcht0 instruction for the byte at @x.
 * Only emitted when building for x86-64; a no-op on other targets.
 */
static inline void prefetch(const void *x)
{
#ifdef __x86_64__
	const char *target = x;

	__asm__ __volatile__("prefetcht0 %P0" : : "m" (*target));
#else
	(void)x;
#endif
}
| 95 | |
| 96 | void hash_zone(void *zone, unsigned int length) |
| 97 | { |
| 98 | unsigned long temp = htotal; |
| 99 | |
| 100 | while (length >= 8*sizeof(long)) { |
| 101 | prefetch(zone + 384); |
| 102 | temp ^= *(unsigned long *)zone; |
| 103 | temp ^= *(unsigned long *)(zone + sizeof(long)); |
| 104 | temp ^= *(unsigned long *)(zone + 2*sizeof(long)); |
| 105 | temp ^= *(unsigned long *)(zone + 3*sizeof(long)); |
| 106 | temp ^= *(unsigned long *)(zone + 4*sizeof(long)); |
| 107 | temp ^= *(unsigned long *)(zone + 5*sizeof(long)); |
| 108 | temp ^= *(unsigned long *)(zone + 6*sizeof(long)); |
| 109 | temp ^= *(unsigned long *)(zone + 7*sizeof(long)); |
| 110 | zone += 8*sizeof(long); |
| 111 | length -= 8*sizeof(long); |
| 112 | } |
| 113 | while (length >= 1) { |
| 114 | temp ^= *(unsigned char *)zone; |
| 115 | zone += 1; |
| 116 | length--; |
| 117 | } |
| 118 | htotal = temp; |
| 119 | } |
| 120 | |
| 121 | void *child_thread(void *arg) |
| 122 | { |
| 123 | unsigned long total_mmap = 0, total = 0; |
| 124 | struct tcp_zerocopy_receive zc; |
| 125 | unsigned long delta_usec; |
| 126 | int flags = MAP_SHARED; |
| 127 | struct timeval t0, t1; |
| 128 | char *buffer = NULL; |
| 129 | void *addr = NULL; |
| 130 | double throughput; |
| 131 | struct rusage ru; |
| 132 | int lu, fd; |
| 133 | |
| 134 | fd = (int)(unsigned long)arg; |
| 135 | |
| 136 | gettimeofday(&t0, NULL); |
| 137 | |
| 138 | fcntl(fd, F_SETFL, O_NDELAY); |
| 139 | buffer = malloc(chunk_size); |
| 140 | if (!buffer) { |
| 141 | perror("malloc"); |
| 142 | goto error; |
| 143 | } |
| 144 | if (zflg) { |
| 145 | addr = mmap(NULL, chunk_size, PROT_READ, flags, fd, 0); |
| 146 | if (addr == (void *)-1) |
| 147 | zflg = 0; |
| 148 | } |
| 149 | while (1) { |
| 150 | struct pollfd pfd = { .fd = fd, .events = POLLIN, }; |
| 151 | int sub; |
| 152 | |
| 153 | poll(&pfd, 1, 10000); |
| 154 | if (zflg) { |
| 155 | socklen_t zc_len = sizeof(zc); |
| 156 | int res; |
| 157 | |
| 158 | zc.address = (__u64)addr; |
| 159 | zc.length = chunk_size; |
| 160 | zc.recv_skip_hint = 0; |
| 161 | res = getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, |
| 162 | &zc, &zc_len); |
| 163 | if (res == -1) |
| 164 | break; |
| 165 | |
| 166 | if (zc.length) { |
| 167 | assert(zc.length <= chunk_size); |
| 168 | total_mmap += zc.length; |
| 169 | if (xflg) |
| 170 | hash_zone(addr, zc.length); |
| 171 | total += zc.length; |
| 172 | } |
| 173 | if (zc.recv_skip_hint) { |
| 174 | assert(zc.recv_skip_hint <= chunk_size); |
| 175 | lu = read(fd, buffer, zc.recv_skip_hint); |
| 176 | if (lu > 0) { |
| 177 | if (xflg) |
| 178 | hash_zone(buffer, lu); |
| 179 | total += lu; |
| 180 | } |
| 181 | } |
| 182 | continue; |
| 183 | } |
| 184 | sub = 0; |
| 185 | while (sub < chunk_size) { |
| 186 | lu = read(fd, buffer + sub, chunk_size - sub); |
| 187 | if (lu == 0) |
| 188 | goto end; |
| 189 | if (lu < 0) |
| 190 | break; |
| 191 | if (xflg) |
| 192 | hash_zone(buffer + sub, lu); |
| 193 | total += lu; |
| 194 | sub += lu; |
| 195 | } |
| 196 | } |
| 197 | end: |
| 198 | gettimeofday(&t1, NULL); |
| 199 | delta_usec = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; |
| 200 | |
| 201 | throughput = 0; |
| 202 | if (delta_usec) |
| 203 | throughput = total * 8.0 / (double)delta_usec / 1000.0; |
| 204 | getrusage(RUSAGE_THREAD, &ru); |
| 205 | if (total > 1024*1024) { |
| 206 | unsigned long total_usec; |
| 207 | unsigned long mb = total >> 20; |
| 208 | total_usec = 1000000*ru.ru_utime.tv_sec + ru.ru_utime.tv_usec + |
| 209 | 1000000*ru.ru_stime.tv_sec + ru.ru_stime.tv_usec; |
| 210 | printf("received %lg MB (%lg %% mmap'ed) in %lg s, %lg Gbit\n" |
| 211 | " cpu usage user:%lg sys:%lg, %lg usec per MB, %lu c-switches\n", |
| 212 | total / (1024.0 * 1024.0), |
| 213 | 100.0*total_mmap/total, |
| 214 | (double)delta_usec / 1000000.0, |
| 215 | throughput, |
| 216 | (double)ru.ru_utime.tv_sec + (double)ru.ru_utime.tv_usec / 1000000.0, |
| 217 | (double)ru.ru_stime.tv_sec + (double)ru.ru_stime.tv_usec / 1000000.0, |
| 218 | (double)total_usec/mb, |
| 219 | ru.ru_nvcsw); |
| 220 | } |
| 221 | error: |
| 222 | free(buffer); |
| 223 | close(fd); |
| 224 | if (zflg) |
| 225 | munmap(addr, chunk_size); |
| 226 | pthread_exit(0); |
| 227 | } |
| 228 | |
| 229 | static void apply_rcvsnd_buf(int fd) |
| 230 | { |
| 231 | if (rcvbuf && setsockopt(fd, SOL_SOCKET, |
| 232 | SO_RCVBUF, &rcvbuf, sizeof(rcvbuf)) == -1) { |
| 233 | perror("setsockopt SO_RCVBUF"); |
| 234 | } |
| 235 | |
| 236 | if (sndbuf && setsockopt(fd, SOL_SOCKET, |
| 237 | SO_SNDBUF, &sndbuf, sizeof(sndbuf)) == -1) { |
| 238 | perror("setsockopt SO_SNDBUF"); |
| 239 | } |
| 240 | } |
| 241 | |
| 242 | |
| 243 | static void setup_sockaddr(int domain, const char *str_addr, |
| 244 | struct sockaddr_storage *sockaddr) |
| 245 | { |
| 246 | struct sockaddr_in6 *addr6 = (void *) sockaddr; |
| 247 | struct sockaddr_in *addr4 = (void *) sockaddr; |
| 248 | |
| 249 | switch (domain) { |
| 250 | case PF_INET: |
| 251 | memset(addr4, 0, sizeof(*addr4)); |
| 252 | addr4->sin_family = AF_INET; |
| 253 | addr4->sin_port = htons(cfg_port); |
| 254 | if (str_addr && |
| 255 | inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1) |
| 256 | error(1, 0, "ipv4 parse error: %s", str_addr); |
| 257 | break; |
| 258 | case PF_INET6: |
| 259 | memset(addr6, 0, sizeof(*addr6)); |
| 260 | addr6->sin6_family = AF_INET6; |
| 261 | addr6->sin6_port = htons(cfg_port); |
| 262 | if (str_addr && |
| 263 | inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1) |
| 264 | error(1, 0, "ipv6 parse error: %s", str_addr); |
| 265 | break; |
| 266 | default: |
| 267 | error(1, 0, "illegal domain"); |
| 268 | } |
| 269 | } |
| 270 | |
| 271 | static void do_accept(int fdlisten) |
| 272 | { |
| 273 | if (setsockopt(fdlisten, SOL_SOCKET, SO_RCVLOWAT, |
| 274 | &chunk_size, sizeof(chunk_size)) == -1) { |
| 275 | perror("setsockopt SO_RCVLOWAT"); |
| 276 | } |
| 277 | |
| 278 | apply_rcvsnd_buf(fdlisten); |
| 279 | |
| 280 | while (1) { |
| 281 | struct sockaddr_in addr; |
| 282 | socklen_t addrlen = sizeof(addr); |
| 283 | pthread_t th; |
| 284 | int fd, res; |
| 285 | |
| 286 | fd = accept(fdlisten, (struct sockaddr *)&addr, &addrlen); |
| 287 | if (fd == -1) { |
| 288 | perror("accept"); |
| 289 | continue; |
| 290 | } |
| 291 | res = pthread_create(&th, NULL, child_thread, |
| 292 | (void *)(unsigned long)fd); |
| 293 | if (res) { |
| 294 | errno = res; |
| 295 | perror("pthread_create"); |
| 296 | close(fd); |
| 297 | } |
| 298 | } |
| 299 | } |
| 300 | |
| 301 | int main(int argc, char *argv[]) |
| 302 | { |
| 303 | struct sockaddr_storage listenaddr, addr; |
| 304 | unsigned int max_pacing_rate = 0; |
| 305 | unsigned long total = 0; |
| 306 | char *host = NULL; |
| 307 | int fd, c, on = 1; |
| 308 | char *buffer; |
| 309 | int sflg = 0; |
| 310 | int mss = 0; |
| 311 | |
| 312 | while ((c = getopt(argc, argv, "46p:svr:w:H:zxkP:M:")) != -1) { |
| 313 | switch (c) { |
| 314 | case '4': |
| 315 | cfg_family = PF_INET; |
| 316 | cfg_alen = sizeof(struct sockaddr_in); |
| 317 | break; |
| 318 | case '6': |
| 319 | cfg_family = PF_INET6; |
| 320 | cfg_alen = sizeof(struct sockaddr_in6); |
| 321 | break; |
| 322 | case 'p': |
| 323 | cfg_port = atoi(optarg); |
| 324 | break; |
| 325 | case 'H': |
| 326 | host = optarg; |
| 327 | break; |
| 328 | case 's': /* server : listen for incoming connections */ |
| 329 | sflg++; |
| 330 | break; |
| 331 | case 'r': |
| 332 | rcvbuf = atoi(optarg); |
| 333 | break; |
| 334 | case 'w': |
| 335 | sndbuf = atoi(optarg); |
| 336 | break; |
| 337 | case 'z': |
| 338 | zflg = 1; |
| 339 | break; |
| 340 | case 'M': |
| 341 | mss = atoi(optarg); |
| 342 | break; |
| 343 | case 'x': |
| 344 | xflg = 1; |
| 345 | break; |
| 346 | case 'k': |
| 347 | keepflag = 1; |
| 348 | break; |
| 349 | case 'P': |
| 350 | max_pacing_rate = atoi(optarg) ; |
| 351 | break; |
| 352 | default: |
| 353 | exit(1); |
| 354 | } |
| 355 | } |
| 356 | if (sflg) { |
| 357 | int fdlisten = socket(cfg_family, SOCK_STREAM, 0); |
| 358 | |
| 359 | if (fdlisten == -1) { |
| 360 | perror("socket"); |
| 361 | exit(1); |
| 362 | } |
| 363 | apply_rcvsnd_buf(fdlisten); |
| 364 | setsockopt(fdlisten, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); |
| 365 | |
| 366 | setup_sockaddr(cfg_family, host, &listenaddr); |
| 367 | |
| 368 | if (mss && |
| 369 | setsockopt(fdlisten, IPPROTO_TCP, TCP_MAXSEG, |
| 370 | &mss, sizeof(mss)) == -1) { |
| 371 | perror("setsockopt TCP_MAXSEG"); |
| 372 | exit(1); |
| 373 | } |
| 374 | if (bind(fdlisten, (const struct sockaddr *)&listenaddr, cfg_alen) == -1) { |
| 375 | perror("bind"); |
| 376 | exit(1); |
| 377 | } |
| 378 | if (listen(fdlisten, 128) == -1) { |
| 379 | perror("listen"); |
| 380 | exit(1); |
| 381 | } |
| 382 | do_accept(fdlisten); |
| 383 | } |
| 384 | buffer = mmap(NULL, chunk_size, PROT_READ | PROT_WRITE, |
| 385 | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); |
| 386 | if (buffer == (char *)-1) { |
| 387 | perror("mmap"); |
| 388 | exit(1); |
| 389 | } |
| 390 | |
| 391 | fd = socket(cfg_family, SOCK_STREAM, 0); |
| 392 | if (fd == -1) { |
| 393 | perror("socket"); |
| 394 | exit(1); |
| 395 | } |
| 396 | apply_rcvsnd_buf(fd); |
| 397 | |
| 398 | setup_sockaddr(cfg_family, host, &addr); |
| 399 | |
| 400 | if (mss && |
| 401 | setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) { |
| 402 | perror("setsockopt TCP_MAXSEG"); |
| 403 | exit(1); |
| 404 | } |
| 405 | if (connect(fd, (const struct sockaddr *)&addr, cfg_alen) == -1) { |
| 406 | perror("connect"); |
| 407 | exit(1); |
| 408 | } |
| 409 | if (max_pacing_rate && |
| 410 | setsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE, |
| 411 | &max_pacing_rate, sizeof(max_pacing_rate)) == -1) |
| 412 | perror("setsockopt SO_MAX_PACING_RATE"); |
| 413 | |
| 414 | if (zflg && setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, |
| 415 | &on, sizeof(on)) == -1) { |
| 416 | perror("setsockopt SO_ZEROCOPY, (-z option disabled)"); |
| 417 | zflg = 0; |
| 418 | } |
| 419 | while (total < FILE_SZ) { |
| 420 | long wr = FILE_SZ - total; |
| 421 | |
| 422 | if (wr > chunk_size) |
| 423 | wr = chunk_size; |
| 424 | /* Note : we just want to fill the pipe with 0 bytes */ |
| 425 | wr = send(fd, buffer, wr, zflg ? MSG_ZEROCOPY : 0); |
| 426 | if (wr <= 0) |
| 427 | break; |
| 428 | total += wr; |
| 429 | } |
| 430 | close(fd); |
| 431 | munmap(buffer, chunk_size); |
| 432 | return 0; |
| 433 | } |