Merge branch 'locking-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
[cris-mirror.git] / tools / testing / selftests / net / msg_zerocopy.c
blobe11fe84de0fd92f9123b9bb75c0745123ad43546
/* Evaluate MSG_ZEROCOPY
 *
 * Send traffic between two processes over one of the supported
 * protocols and modes:
 *
 * PF_INET/PF_INET6
 * - SOCK_STREAM
 * - SOCK_DGRAM
 * - SOCK_DGRAM with UDP_CORK
 * - SOCK_RAW
 * - SOCK_RAW with IP_HDRINCL
 *
 * PF_PACKET
 * - SOCK_DGRAM
 * - SOCK_RAW
 *
 * Start this program on two connected hosts, one in send mode and
 * the other with option '-r' to put it in receiver mode.
 *
 * If zerocopy mode ('-z') is enabled, the sender will verify that
 * the kernel queues completions on the error queue for all zerocopy
 * transfers.
 */
25 #define _GNU_SOURCE
27 #include <arpa/inet.h>
28 #include <error.h>
29 #include <errno.h>
30 #include <limits.h>
31 #include <linux/errqueue.h>
32 #include <linux/if_packet.h>
33 #include <linux/ipv6.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <net/ethernet.h>
37 #include <net/if.h>
38 #include <netinet/ip.h>
39 #include <netinet/ip6.h>
40 #include <netinet/tcp.h>
41 #include <netinet/udp.h>
42 #include <poll.h>
43 #include <sched.h>
44 #include <stdbool.h>
45 #include <stdio.h>
46 #include <stdint.h>
47 #include <stdlib.h>
48 #include <string.h>
49 #include <sys/ioctl.h>
50 #include <sys/socket.h>
51 #include <sys/stat.h>
52 #include <sys/time.h>
53 #include <sys/types.h>
54 #include <sys/wait.h>
55 #include <unistd.h>
57 #ifndef SO_EE_ORIGIN_ZEROCOPY
58 #define SO_EE_ORIGIN_ZEROCOPY 5
59 #endif
61 #ifndef SO_ZEROCOPY
62 #define SO_ZEROCOPY 60
63 #endif
65 #ifndef SO_EE_CODE_ZEROCOPY_COPIED
66 #define SO_EE_CODE_ZEROCOPY_COPIED 1
67 #endif
69 #ifndef MSG_ZEROCOPY
70 #define MSG_ZEROCOPY 0x4000000
71 #endif
/* Run-time configuration, filled in by parse_opts() */
static int  cfg_cork;			/* -c: fragments per corked datagram */
static bool cfg_cork_mixed;		/* -m: alternate copy/zerocopy frags */
static int  cfg_cpu		= -1;	/* default: pin to last cpu */
static int  cfg_family		= PF_UNSPEC;
static int  cfg_ifindex		= 1;
static int  cfg_payload_len;
static int  cfg_port		= 8000;
static bool cfg_rx;			/* -r: run as receiver */
static int  cfg_runtime_ms	= 4200;
static int  cfg_verbose;
static int  cfg_waittime_ms	= 500;
static bool cfg_zerocopy;		/* -z: send with MSG_ZEROCOPY */

/* Peer addresses; cfg_alen is the length of either one */
static socklen_t cfg_alen;
static struct sockaddr_storage cfg_dst_addr;
static struct sockaddr_storage cfg_src_addr;

/* Payload pattern buffer and running counters */
static char payload[IP_MAXPACKET];
static long packets, bytes, completions, expected_completions;
static int zerocopied = -1;		/* -1 until the first completion */
static uint32_t next_completion;
/* Return the current wall-clock time from gettimeofday() in milliseconds.
 *
 * The seconds are converted to unsigned long before being scaled, so the
 * multiplication is carried out in unsigned arithmetic: where long is
 * 32 bits wide the result wraps instead of overflowing a signed type.
 */
static unsigned long gettimeofday_ms(void)
{
	struct timeval tv;

	gettimeofday(&tv, NULL);
	return ((unsigned long) tv.tv_sec * 1000UL) +
	       ((unsigned long) tv.tv_usec / 1000UL);
}
/* Compute a 16-bit checksum over num_words 16-bit words at start.
 *
 * All words are added up, any carry above bit 15 is folded back into the
 * low 16 bits until none remains, and the complement of that is returned.
 */
static uint16_t get_ip_csum(const uint16_t *start, int num_words)
{
	const uint16_t *word = start;
	unsigned long acc = 0;
	int remaining;

	for (remaining = num_words; remaining > 0; remaining--)
		acc += *word++;

	/* fold carries back in until the sum fits in 16 bits */
	while (acc > 0xFFFF)
		acc = (acc >> 16) + (acc & 0xFFFF);

	return (uint16_t) ~acc;
}
117 static int do_setcpu(int cpu)
119 cpu_set_t mask;
121 CPU_ZERO(&mask);
122 CPU_SET(cpu, &mask);
123 if (sched_setaffinity(0, sizeof(mask), &mask))
124 error(1, 0, "setaffinity %d", cpu);
126 if (cfg_verbose)
127 fprintf(stderr, "cpu: %u\n", cpu);
129 return 0;
/* Set an int-valued socket option on fd. A failing setsockopt() is fatal:
 * the level, option name and value are reported together with errno.
 */
static void do_setsockopt(int fd, int level, int optname, int val)
{
	int rc = setsockopt(fd, level, optname, &val, sizeof(val));

	if (rc != 0)
		error(1, errno, "setsockopt %d.%d: %d", level, optname, val);
}
138 static int do_poll(int fd, int events)
140 struct pollfd pfd;
141 int ret;
143 pfd.events = events;
144 pfd.revents = 0;
145 pfd.fd = fd;
147 ret = poll(&pfd, 1, cfg_waittime_ms);
148 if (ret == -1)
149 error(1, errno, "poll");
151 return ret && (pfd.revents & events);
/* Accept one connection on the listening socket fd, then close the
 * listening socket. Returns the connected socket; a failure of either
 * accept() or close() is fatal.
 */
static int do_accept(int fd)
{
	int conn_fd;

	conn_fd = accept(fd, NULL, NULL);
	if (conn_fd == -1)
		error(1, errno, "accept");

	/* only a single connection is served: drop the listener */
	if (close(fd) != 0)
		error(1, errno, "close listen sock");

	return conn_fd;
}
167 static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy)
169 int ret, len, i, flags;
171 len = 0;
172 for (i = 0; i < msg->msg_iovlen; i++)
173 len += msg->msg_iov[i].iov_len;
175 flags = MSG_DONTWAIT;
176 if (do_zerocopy)
177 flags |= MSG_ZEROCOPY;
179 ret = sendmsg(fd, msg, flags);
180 if (ret == -1 && errno == EAGAIN)
181 return false;
182 if (ret == -1)
183 error(1, errno, "send");
184 if (cfg_verbose && ret != len)
185 fprintf(stderr, "send: ret=%u != %u\n", ret, len);
187 if (len) {
188 packets++;
189 bytes += ret;
190 if (do_zerocopy && ret)
191 expected_completions++;
194 return true;
197 static void do_sendmsg_corked(int fd, struct msghdr *msg)
199 bool do_zerocopy = cfg_zerocopy;
200 int i, payload_len, extra_len;
202 /* split up the packet. for non-multiple, make first buffer longer */
203 payload_len = cfg_payload_len / cfg_cork;
204 extra_len = cfg_payload_len - (cfg_cork * payload_len);
206 do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 1);
208 for (i = 0; i < cfg_cork; i++) {
210 /* in mixed-frags mode, alternate zerocopy and copy frags
211 * start with non-zerocopy, to ensure attach later works
213 if (cfg_cork_mixed)
214 do_zerocopy = (i & 1);
216 msg->msg_iov[0].iov_len = payload_len + extra_len;
217 extra_len = 0;
219 do_sendmsg(fd, msg, do_zerocopy);
222 do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0);
225 static int setup_iph(struct iphdr *iph, uint16_t payload_len)
227 struct sockaddr_in *daddr = (void *) &cfg_dst_addr;
228 struct sockaddr_in *saddr = (void *) &cfg_src_addr;
230 memset(iph, 0, sizeof(*iph));
232 iph->version = 4;
233 iph->tos = 0;
234 iph->ihl = 5;
235 iph->ttl = 2;
236 iph->saddr = saddr->sin_addr.s_addr;
237 iph->daddr = daddr->sin_addr.s_addr;
238 iph->protocol = IPPROTO_EGP;
239 iph->tot_len = htons(sizeof(*iph) + payload_len);
240 iph->check = get_ip_csum((void *) iph, iph->ihl << 1);
242 return sizeof(*iph);
245 static int setup_ip6h(struct ipv6hdr *ip6h, uint16_t payload_len)
247 struct sockaddr_in6 *daddr = (void *) &cfg_dst_addr;
248 struct sockaddr_in6 *saddr = (void *) &cfg_src_addr;
250 memset(ip6h, 0, sizeof(*ip6h));
252 ip6h->version = 6;
253 ip6h->payload_len = htons(payload_len);
254 ip6h->nexthdr = IPPROTO_EGP;
255 ip6h->hop_limit = 2;
256 ip6h->saddr = saddr->sin6_addr;
257 ip6h->daddr = daddr->sin6_addr;
259 return sizeof(*ip6h);
263 static void setup_sockaddr(int domain, const char *str_addr,
264 struct sockaddr_storage *sockaddr)
266 struct sockaddr_in6 *addr6 = (void *) sockaddr;
267 struct sockaddr_in *addr4 = (void *) sockaddr;
269 switch (domain) {
270 case PF_INET:
271 memset(addr4, 0, sizeof(*addr4));
272 addr4->sin_family = AF_INET;
273 addr4->sin_port = htons(cfg_port);
274 if (str_addr &&
275 inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1)
276 error(1, 0, "ipv4 parse error: %s", str_addr);
277 break;
278 case PF_INET6:
279 memset(addr6, 0, sizeof(*addr6));
280 addr6->sin6_family = AF_INET6;
281 addr6->sin6_port = htons(cfg_port);
282 if (str_addr &&
283 inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1)
284 error(1, 0, "ipv6 parse error: %s", str_addr);
285 break;
286 default:
287 error(1, 0, "illegal domain");
291 static int do_setup_tx(int domain, int type, int protocol)
293 int fd;
295 fd = socket(domain, type, protocol);
296 if (fd == -1)
297 error(1, errno, "socket t");
299 do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21);
300 if (cfg_zerocopy)
301 do_setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, 1);
303 if (domain != PF_PACKET)
304 if (connect(fd, (void *) &cfg_dst_addr, cfg_alen))
305 error(1, errno, "connect");
307 return fd;
310 static bool do_recv_completion(int fd)
312 struct sock_extended_err *serr;
313 struct msghdr msg = {};
314 struct cmsghdr *cm;
315 uint32_t hi, lo, range;
316 int ret, zerocopy;
317 char control[100];
319 msg.msg_control = control;
320 msg.msg_controllen = sizeof(control);
322 ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
323 if (ret == -1 && errno == EAGAIN)
324 return false;
325 if (ret == -1)
326 error(1, errno, "recvmsg notification");
327 if (msg.msg_flags & MSG_CTRUNC)
328 error(1, errno, "recvmsg notification: truncated");
330 cm = CMSG_FIRSTHDR(&msg);
331 if (!cm)
332 error(1, 0, "cmsg: no cmsg");
333 if (!((cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) ||
334 (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR) ||
335 (cm->cmsg_level == SOL_PACKET && cm->cmsg_type == PACKET_TX_TIMESTAMP)))
336 error(1, 0, "serr: wrong type: %d.%d",
337 cm->cmsg_level, cm->cmsg_type);
339 serr = (void *) CMSG_DATA(cm);
340 if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
341 error(1, 0, "serr: wrong origin: %u", serr->ee_origin);
342 if (serr->ee_errno != 0)
343 error(1, 0, "serr: wrong error code: %u", serr->ee_errno);
345 hi = serr->ee_data;
346 lo = serr->ee_info;
347 range = hi - lo + 1;
349 /* Detect notification gaps. These should not happen often, if at all.
350 * Gaps can occur due to drops, reordering and retransmissions.
352 if (lo != next_completion)
353 fprintf(stderr, "gap: %u..%u does not append to %u\n",
354 lo, hi, next_completion);
355 next_completion = hi + 1;
357 zerocopy = !(serr->ee_code & SO_EE_CODE_ZEROCOPY_COPIED);
358 if (zerocopied == -1)
359 zerocopied = zerocopy;
360 else if (zerocopied != zerocopy) {
361 fprintf(stderr, "serr: inconsistent\n");
362 zerocopied = zerocopy;
365 if (cfg_verbose >= 2)
366 fprintf(stderr, "completed: %u (h=%u l=%u)\n",
367 range, hi, lo);
369 completions += range;
370 return true;
/* Drain the errqueue: keep reading notifications until none is left */
static void do_recv_completions(int fd)
{
	for (;;) {
		if (!do_recv_completion(fd))
			break;
	}
}
379 /* Wait for all remaining completions on the errqueue */
380 static void do_recv_remaining_completions(int fd)
382 int64_t tstop = gettimeofday_ms() + cfg_waittime_ms;
384 while (completions < expected_completions &&
385 gettimeofday_ms() < tstop) {
386 if (do_poll(fd, POLLERR))
387 do_recv_completions(fd);
390 if (completions < expected_completions)
391 fprintf(stderr, "missing notifications: %lu < %lu\n",
392 completions, expected_completions);
395 static void do_tx(int domain, int type, int protocol)
397 struct iovec iov[3] = { {0} };
398 struct sockaddr_ll laddr;
399 struct msghdr msg = {0};
400 struct ethhdr eth;
401 union {
402 struct ipv6hdr ip6h;
403 struct iphdr iph;
404 } nh;
405 uint64_t tstop;
406 int fd;
408 fd = do_setup_tx(domain, type, protocol);
410 if (domain == PF_PACKET) {
411 uint16_t proto = cfg_family == PF_INET ? ETH_P_IP : ETH_P_IPV6;
413 /* sock_raw passes ll header as data */
414 if (type == SOCK_RAW) {
415 memset(eth.h_dest, 0x06, ETH_ALEN);
416 memset(eth.h_source, 0x02, ETH_ALEN);
417 eth.h_proto = htons(proto);
418 iov[0].iov_base = &eth;
419 iov[0].iov_len = sizeof(eth);
420 msg.msg_iovlen++;
423 /* both sock_raw and sock_dgram expect name */
424 memset(&laddr, 0, sizeof(laddr));
425 laddr.sll_family = AF_PACKET;
426 laddr.sll_ifindex = cfg_ifindex;
427 laddr.sll_protocol = htons(proto);
428 laddr.sll_halen = ETH_ALEN;
430 memset(laddr.sll_addr, 0x06, ETH_ALEN);
432 msg.msg_name = &laddr;
433 msg.msg_namelen = sizeof(laddr);
436 /* packet and raw sockets with hdrincl must pass network header */
437 if (domain == PF_PACKET || protocol == IPPROTO_RAW) {
438 if (cfg_family == PF_INET)
439 iov[1].iov_len = setup_iph(&nh.iph, cfg_payload_len);
440 else
441 iov[1].iov_len = setup_ip6h(&nh.ip6h, cfg_payload_len);
443 iov[1].iov_base = (void *) &nh;
444 msg.msg_iovlen++;
447 iov[2].iov_base = payload;
448 iov[2].iov_len = cfg_payload_len;
449 msg.msg_iovlen++;
450 msg.msg_iov = &iov[3 - msg.msg_iovlen];
452 tstop = gettimeofday_ms() + cfg_runtime_ms;
453 do {
454 if (cfg_cork)
455 do_sendmsg_corked(fd, &msg);
456 else
457 do_sendmsg(fd, &msg, cfg_zerocopy);
459 while (!do_poll(fd, POLLOUT)) {
460 if (cfg_zerocopy)
461 do_recv_completions(fd);
464 } while (gettimeofday_ms() < tstop);
466 if (cfg_zerocopy)
467 do_recv_remaining_completions(fd);
469 if (close(fd))
470 error(1, errno, "close");
472 fprintf(stderr, "tx=%lu (%lu MB) txc=%lu zc=%c\n",
473 packets, bytes >> 20, completions,
474 zerocopied == 1 ? 'y' : 'n');
477 static int do_setup_rx(int domain, int type, int protocol)
479 int fd;
481 /* If tx over PF_PACKET, rx over PF_INET(6)/SOCK_RAW,
482 * to recv the only copy of the packet, not a clone
484 if (domain == PF_PACKET)
485 error(1, 0, "Use PF_INET/SOCK_RAW to read");
487 if (type == SOCK_RAW && protocol == IPPROTO_RAW)
488 error(1, 0, "IPPROTO_RAW: not supported on Rx");
490 fd = socket(domain, type, protocol);
491 if (fd == -1)
492 error(1, errno, "socket r");
494 do_setsockopt(fd, SOL_SOCKET, SO_RCVBUF, 1 << 21);
495 do_setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, 1 << 16);
496 do_setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, 1);
498 if (bind(fd, (void *) &cfg_dst_addr, cfg_alen))
499 error(1, errno, "bind");
501 if (type == SOCK_STREAM) {
502 if (listen(fd, 1))
503 error(1, errno, "listen");
504 fd = do_accept(fd);
507 return fd;
510 /* Flush all outstanding bytes for the tcp receive queue */
511 static void do_flush_tcp(int fd)
513 int ret;
515 /* MSG_TRUNC flushes up to len bytes */
516 ret = recv(fd, NULL, 1 << 21, MSG_TRUNC | MSG_DONTWAIT);
517 if (ret == -1 && errno == EAGAIN)
518 return;
519 if (ret == -1)
520 error(1, errno, "flush");
521 if (!ret)
522 return;
524 packets++;
525 bytes += ret;
528 /* Flush all outstanding datagrams. Verify first few bytes of each. */
529 static void do_flush_datagram(int fd, int type)
531 int ret, off = 0;
532 char buf[64];
534 /* MSG_TRUNC will return full datagram length */
535 ret = recv(fd, buf, sizeof(buf), MSG_DONTWAIT | MSG_TRUNC);
536 if (ret == -1 && errno == EAGAIN)
537 return;
539 /* raw ipv4 return with header, raw ipv6 without */
540 if (cfg_family == PF_INET && type == SOCK_RAW) {
541 off += sizeof(struct iphdr);
542 ret -= sizeof(struct iphdr);
545 if (ret == -1)
546 error(1, errno, "recv");
547 if (ret != cfg_payload_len)
548 error(1, 0, "recv: ret=%u != %u", ret, cfg_payload_len);
549 if (ret > sizeof(buf) - off)
550 ret = sizeof(buf) - off;
551 if (memcmp(buf + off, payload, ret))
552 error(1, 0, "recv: data mismatch");
554 packets++;
555 bytes += cfg_payload_len;
558 static void do_rx(int domain, int type, int protocol)
560 uint64_t tstop;
561 int fd;
563 fd = do_setup_rx(domain, type, protocol);
565 tstop = gettimeofday_ms() + cfg_runtime_ms;
566 do {
567 if (type == SOCK_STREAM)
568 do_flush_tcp(fd);
569 else
570 do_flush_datagram(fd, type);
572 do_poll(fd, POLLIN);
574 } while (gettimeofday_ms() < tstop);
576 if (close(fd))
577 error(1, errno, "close");
579 fprintf(stderr, "rx=%lu (%lu MB)\n", packets, bytes >> 20);
582 static void do_test(int domain, int type, int protocol)
584 int i;
586 if (cfg_cork && (domain == PF_PACKET || type != SOCK_DGRAM))
587 error(1, 0, "can only cork udp sockets");
589 do_setcpu(cfg_cpu);
591 for (i = 0; i < IP_MAXPACKET; i++)
592 payload[i] = 'a' + (i % 26);
594 if (cfg_rx)
595 do_rx(domain, type, protocol);
596 else
597 do_tx(domain, type, protocol);
600 static void usage(const char *filepath)
602 error(1, 0, "Usage: %s [options] <test>", filepath);
605 static void parse_opts(int argc, char **argv)
607 const int max_payload_len = sizeof(payload) -
608 sizeof(struct ipv6hdr) -
609 sizeof(struct tcphdr) -
610 40 /* max tcp options */;
611 int c;
612 char *daddr = NULL, *saddr = NULL;
614 cfg_payload_len = max_payload_len;
616 while ((c = getopt(argc, argv, "46c:C:D:i:mp:rs:S:t:vz")) != -1) {
617 switch (c) {
618 case '4':
619 if (cfg_family != PF_UNSPEC)
620 error(1, 0, "Pass one of -4 or -6");
621 cfg_family = PF_INET;
622 cfg_alen = sizeof(struct sockaddr_in);
623 break;
624 case '6':
625 if (cfg_family != PF_UNSPEC)
626 error(1, 0, "Pass one of -4 or -6");
627 cfg_family = PF_INET6;
628 cfg_alen = sizeof(struct sockaddr_in6);
629 break;
630 case 'c':
631 cfg_cork = strtol(optarg, NULL, 0);
632 break;
633 case 'C':
634 cfg_cpu = strtol(optarg, NULL, 0);
635 break;
636 case 'D':
637 daddr = optarg;
638 break;
639 case 'i':
640 cfg_ifindex = if_nametoindex(optarg);
641 if (cfg_ifindex == 0)
642 error(1, errno, "invalid iface: %s", optarg);
643 break;
644 case 'm':
645 cfg_cork_mixed = true;
646 break;
647 case 'p':
648 cfg_port = strtoul(optarg, NULL, 0);
649 break;
650 case 'r':
651 cfg_rx = true;
652 break;
653 case 's':
654 cfg_payload_len = strtoul(optarg, NULL, 0);
655 break;
656 case 'S':
657 saddr = optarg;
658 break;
659 case 't':
660 cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000;
661 break;
662 case 'v':
663 cfg_verbose++;
664 break;
665 case 'z':
666 cfg_zerocopy = true;
667 break;
670 setup_sockaddr(cfg_family, daddr, &cfg_dst_addr);
671 setup_sockaddr(cfg_family, saddr, &cfg_src_addr);
673 if (cfg_payload_len > max_payload_len)
674 error(1, 0, "-s: payload exceeds max (%d)", max_payload_len);
675 if (cfg_cork_mixed && (!cfg_zerocopy || !cfg_cork))
676 error(1, 0, "-m: cork_mixed requires corking and zerocopy");
678 if (optind != argc - 1)
679 usage(argv[0]);
682 int main(int argc, char **argv)
684 const char *cfg_test;
686 parse_opts(argc, argv);
688 cfg_test = argv[argc - 1];
690 if (!strcmp(cfg_test, "packet"))
691 do_test(PF_PACKET, SOCK_RAW, 0);
692 else if (!strcmp(cfg_test, "packet_dgram"))
693 do_test(PF_PACKET, SOCK_DGRAM, 0);
694 else if (!strcmp(cfg_test, "raw"))
695 do_test(cfg_family, SOCK_RAW, IPPROTO_EGP);
696 else if (!strcmp(cfg_test, "raw_hdrincl"))
697 do_test(cfg_family, SOCK_RAW, IPPROTO_RAW);
698 else if (!strcmp(cfg_test, "tcp"))
699 do_test(cfg_family, SOCK_STREAM, 0);
700 else if (!strcmp(cfg_test, "udp"))
701 do_test(cfg_family, SOCK_DGRAM, 0);
702 else
703 error(1, 0, "unknown cfg_test %s", cfg_test);
705 return 0;