1 /* Evaluate MSG_ZEROCOPY
3 * Send traffic between two processes over one of the supported
9 * - SOCK_DGRAM with UDP_CORK
11 * - SOCK_RAW with IP_HDRINCL
17 * Start this program on two connected hosts, one in send mode and
18 * the other with option '-r' to put it in receiver mode.
20 * If zerocopy mode ('-z') is enabled, the sender will verify that
21 * the kernel queues completions on the error queue for all zerocopy
27 #include <arpa/inet.h>
31 #include <linux/errqueue.h>
32 #include <linux/if_packet.h>
33 #include <linux/ipv6.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <net/ethernet.h>
38 #include <netinet/ip.h>
39 #include <netinet/ip6.h>
40 #include <netinet/tcp.h>
41 #include <netinet/udp.h>
49 #include <sys/ioctl.h>
50 #include <sys/socket.h>
53 #include <sys/types.h>
57 #ifndef SO_EE_ORIGIN_ZEROCOPY
58 #define SO_EE_ORIGIN_ZEROCOPY 5
62 #define SO_ZEROCOPY 60
65 #ifndef SO_EE_CODE_ZEROCOPY_COPIED
66 #define SO_EE_CODE_ZEROCOPY_COPIED 1
70 #define MSG_ZEROCOPY 0x4000000
74 static bool cfg_cork_mixed
;
75 static int cfg_cpu
= -1; /* default: pin to last cpu */
76 static int cfg_family
= PF_UNSPEC
;
77 static int cfg_ifindex
= 1;
78 static int cfg_payload_len
;
79 static int cfg_port
= 8000;
81 static int cfg_runtime_ms
= 4200;
82 static int cfg_verbose
;
83 static int cfg_waittime_ms
= 500;
84 static bool cfg_zerocopy
;
86 static socklen_t cfg_alen
;
87 static struct sockaddr_storage cfg_dst_addr
;
88 static struct sockaddr_storage cfg_src_addr
;
90 static char payload
[IP_MAXPACKET
];
91 static long packets
, bytes
, completions
, expected_completions
;
92 static int zerocopied
= -1;
93 static uint32_t next_completion
;
/* Return the current wall-clock time in milliseconds.
 *
 * The seconds and microseconds reported by gettimeofday() are folded into
 * a single millisecond count. The return value of gettimeofday() itself
 * is not checked.
 */
static unsigned long gettimeofday_ms(void)
{
	struct timeval now;

	gettimeofday(&now, NULL);

	/* sub-second part scaled down to ms, plus whole seconds scaled up */
	return (now.tv_usec / 1000) + (now.tv_sec * 1000);
}
103 static uint16_t get_ip_csum(const uint16_t *start
, int num_words
)
105 unsigned long sum
= 0;
108 for (i
= 0; i
< num_words
; i
++)
112 sum
= (sum
& 0xFFFF) + (sum
>> 16);
117 static int do_setcpu(int cpu
)
123 if (sched_setaffinity(0, sizeof(mask
), &mask
))
124 error(1, 0, "setaffinity %d", cpu
);
127 fprintf(stderr
, "cpu: %u\n", cpu
);
/* Set an integer-valued socket option; terminate the program on failure.
 *
 * fd:      socket to configure
 * level:   option level, passed through to setsockopt()
 * optname: option name, passed through to setsockopt()
 * val:     integer value to store in the option
 *
 * If setsockopt() fails, the level, option and value are reported through
 * error() together with errno, and the process exits with status 1.
 */
static void do_setsockopt(int fd, int level, int optname, int val)
{
	const int rc = setsockopt(fd, level, optname, &val, sizeof(val));

	if (rc == 0)
		return;

	error(1, errno, "setsockopt %d.%d: %d", level, optname, val);
}
138 static int do_poll(int fd
, int events
)
147 ret
= poll(&pfd
, 1, cfg_waittime_ms
);
149 error(1, errno
, "poll");
151 return ret
&& (pfd
.revents
& events
);
154 static int do_accept(int fd
)
158 fd
= accept(fda
, NULL
, NULL
);
160 error(1, errno
, "accept");
162 error(1, errno
, "close listen sock");
167 static bool do_sendmsg(int fd
, struct msghdr
*msg
, bool do_zerocopy
)
169 int ret
, len
, i
, flags
;
172 for (i
= 0; i
< msg
->msg_iovlen
; i
++)
173 len
+= msg
->msg_iov
[i
].iov_len
;
175 flags
= MSG_DONTWAIT
;
177 flags
|= MSG_ZEROCOPY
;
179 ret
= sendmsg(fd
, msg
, flags
);
180 if (ret
== -1 && errno
== EAGAIN
)
183 error(1, errno
, "send");
184 if (cfg_verbose
&& ret
!= len
)
185 fprintf(stderr
, "send: ret=%u != %u\n", ret
, len
);
190 if (do_zerocopy
&& ret
)
191 expected_completions
++;
197 static void do_sendmsg_corked(int fd
, struct msghdr
*msg
)
199 bool do_zerocopy
= cfg_zerocopy
;
200 int i
, payload_len
, extra_len
;
202 /* split up the packet. for non-multiple, make first buffer longer */
203 payload_len
= cfg_payload_len
/ cfg_cork
;
204 extra_len
= cfg_payload_len
- (cfg_cork
* payload_len
);
206 do_setsockopt(fd
, IPPROTO_UDP
, UDP_CORK
, 1);
208 for (i
= 0; i
< cfg_cork
; i
++) {
210 /* in mixed-frags mode, alternate zerocopy and copy frags
211 * start with non-zerocopy, to ensure attach later works
214 do_zerocopy
= (i
& 1);
216 msg
->msg_iov
[0].iov_len
= payload_len
+ extra_len
;
219 do_sendmsg(fd
, msg
, do_zerocopy
);
222 do_setsockopt(fd
, IPPROTO_UDP
, UDP_CORK
, 0);
225 static int setup_iph(struct iphdr
*iph
, uint16_t payload_len
)
227 struct sockaddr_in
*daddr
= (void *) &cfg_dst_addr
;
228 struct sockaddr_in
*saddr
= (void *) &cfg_src_addr
;
230 memset(iph
, 0, sizeof(*iph
));
236 iph
->saddr
= saddr
->sin_addr
.s_addr
;
237 iph
->daddr
= daddr
->sin_addr
.s_addr
;
238 iph
->protocol
= IPPROTO_EGP
;
239 iph
->tot_len
= htons(sizeof(*iph
) + payload_len
);
240 iph
->check
= get_ip_csum((void *) iph
, iph
->ihl
<< 1);
245 static int setup_ip6h(struct ipv6hdr
*ip6h
, uint16_t payload_len
)
247 struct sockaddr_in6
*daddr
= (void *) &cfg_dst_addr
;
248 struct sockaddr_in6
*saddr
= (void *) &cfg_src_addr
;
250 memset(ip6h
, 0, sizeof(*ip6h
));
253 ip6h
->payload_len
= htons(payload_len
);
254 ip6h
->nexthdr
= IPPROTO_EGP
;
256 ip6h
->saddr
= saddr
->sin6_addr
;
257 ip6h
->daddr
= daddr
->sin6_addr
;
259 return sizeof(*ip6h
);
263 static void setup_sockaddr(int domain
, const char *str_addr
,
264 struct sockaddr_storage
*sockaddr
)
266 struct sockaddr_in6
*addr6
= (void *) sockaddr
;
267 struct sockaddr_in
*addr4
= (void *) sockaddr
;
271 memset(addr4
, 0, sizeof(*addr4
));
272 addr4
->sin_family
= AF_INET
;
273 addr4
->sin_port
= htons(cfg_port
);
275 inet_pton(AF_INET
, str_addr
, &(addr4
->sin_addr
)) != 1)
276 error(1, 0, "ipv4 parse error: %s", str_addr
);
279 memset(addr6
, 0, sizeof(*addr6
));
280 addr6
->sin6_family
= AF_INET6
;
281 addr6
->sin6_port
= htons(cfg_port
);
283 inet_pton(AF_INET6
, str_addr
, &(addr6
->sin6_addr
)) != 1)
284 error(1, 0, "ipv6 parse error: %s", str_addr
);
287 error(1, 0, "illegal domain");
291 static int do_setup_tx(int domain
, int type
, int protocol
)
295 fd
= socket(domain
, type
, protocol
);
297 error(1, errno
, "socket t");
299 do_setsockopt(fd
, SOL_SOCKET
, SO_SNDBUF
, 1 << 21);
301 do_setsockopt(fd
, SOL_SOCKET
, SO_ZEROCOPY
, 1);
303 if (domain
!= PF_PACKET
)
304 if (connect(fd
, (void *) &cfg_dst_addr
, cfg_alen
))
305 error(1, errno
, "connect");
310 static bool do_recv_completion(int fd
)
312 struct sock_extended_err
*serr
;
313 struct msghdr msg
= {};
315 uint32_t hi
, lo
, range
;
319 msg
.msg_control
= control
;
320 msg
.msg_controllen
= sizeof(control
);
322 ret
= recvmsg(fd
, &msg
, MSG_ERRQUEUE
);
323 if (ret
== -1 && errno
== EAGAIN
)
326 error(1, errno
, "recvmsg notification");
327 if (msg
.msg_flags
& MSG_CTRUNC
)
328 error(1, errno
, "recvmsg notification: truncated");
330 cm
= CMSG_FIRSTHDR(&msg
);
332 error(1, 0, "cmsg: no cmsg");
333 if (!((cm
->cmsg_level
== SOL_IP
&& cm
->cmsg_type
== IP_RECVERR
) ||
334 (cm
->cmsg_level
== SOL_IPV6
&& cm
->cmsg_type
== IPV6_RECVERR
) ||
335 (cm
->cmsg_level
== SOL_PACKET
&& cm
->cmsg_type
== PACKET_TX_TIMESTAMP
)))
336 error(1, 0, "serr: wrong type: %d.%d",
337 cm
->cmsg_level
, cm
->cmsg_type
);
339 serr
= (void *) CMSG_DATA(cm
);
340 if (serr
->ee_origin
!= SO_EE_ORIGIN_ZEROCOPY
)
341 error(1, 0, "serr: wrong origin: %u", serr
->ee_origin
);
342 if (serr
->ee_errno
!= 0)
343 error(1, 0, "serr: wrong error code: %u", serr
->ee_errno
);
349 /* Detect notification gaps. These should not happen often, if at all.
350 * Gaps can occur due to drops, reordering and retransmissions.
352 if (lo
!= next_completion
)
353 fprintf(stderr
, "gap: %u..%u does not append to %u\n",
354 lo
, hi
, next_completion
);
355 next_completion
= hi
+ 1;
357 zerocopy
= !(serr
->ee_code
& SO_EE_CODE_ZEROCOPY_COPIED
);
358 if (zerocopied
== -1)
359 zerocopied
= zerocopy
;
360 else if (zerocopied
!= zerocopy
) {
361 fprintf(stderr
, "serr: inconsistent\n");
362 zerocopied
= zerocopy
;
365 if (cfg_verbose
>= 2)
366 fprintf(stderr
, "completed: %u (h=%u l=%u)\n",
369 completions
+= range
;
/* Read all outstanding messages on the errqueue.
 *
 * Calls do_recv_completion() on fd repeatedly and stops as soon as that
 * call returns false.
 */
static void do_recv_completions(int fd)
{
	for (;;) {
		if (!do_recv_completion(fd))
			break;
	}
}
379 /* Wait for all remaining completions on the errqueue */
380 static void do_recv_remaining_completions(int fd
)
382 int64_t tstop
= gettimeofday_ms() + cfg_waittime_ms
;
384 while (completions
< expected_completions
&&
385 gettimeofday_ms() < tstop
) {
386 if (do_poll(fd
, POLLERR
))
387 do_recv_completions(fd
);
390 if (completions
< expected_completions
)
391 fprintf(stderr
, "missing notifications: %lu < %lu\n",
392 completions
, expected_completions
);
395 static void do_tx(int domain
, int type
, int protocol
)
397 struct iovec iov
[3] = { {0} };
398 struct sockaddr_ll laddr
;
399 struct msghdr msg
= {0};
408 fd
= do_setup_tx(domain
, type
, protocol
);
410 if (domain
== PF_PACKET
) {
411 uint16_t proto
= cfg_family
== PF_INET
? ETH_P_IP
: ETH_P_IPV6
;
413 /* sock_raw passes ll header as data */
414 if (type
== SOCK_RAW
) {
415 memset(eth
.h_dest
, 0x06, ETH_ALEN
);
416 memset(eth
.h_source
, 0x02, ETH_ALEN
);
417 eth
.h_proto
= htons(proto
);
418 iov
[0].iov_base
= ð
;
419 iov
[0].iov_len
= sizeof(eth
);
423 /* both sock_raw and sock_dgram expect name */
424 memset(&laddr
, 0, sizeof(laddr
));
425 laddr
.sll_family
= AF_PACKET
;
426 laddr
.sll_ifindex
= cfg_ifindex
;
427 laddr
.sll_protocol
= htons(proto
);
428 laddr
.sll_halen
= ETH_ALEN
;
430 memset(laddr
.sll_addr
, 0x06, ETH_ALEN
);
432 msg
.msg_name
= &laddr
;
433 msg
.msg_namelen
= sizeof(laddr
);
436 /* packet and raw sockets with hdrincl must pass network header */
437 if (domain
== PF_PACKET
|| protocol
== IPPROTO_RAW
) {
438 if (cfg_family
== PF_INET
)
439 iov
[1].iov_len
= setup_iph(&nh
.iph
, cfg_payload_len
);
441 iov
[1].iov_len
= setup_ip6h(&nh
.ip6h
, cfg_payload_len
);
443 iov
[1].iov_base
= (void *) &nh
;
447 iov
[2].iov_base
= payload
;
448 iov
[2].iov_len
= cfg_payload_len
;
450 msg
.msg_iov
= &iov
[3 - msg
.msg_iovlen
];
452 tstop
= gettimeofday_ms() + cfg_runtime_ms
;
455 do_sendmsg_corked(fd
, &msg
);
457 do_sendmsg(fd
, &msg
, cfg_zerocopy
);
459 while (!do_poll(fd
, POLLOUT
)) {
461 do_recv_completions(fd
);
464 } while (gettimeofday_ms() < tstop
);
467 do_recv_remaining_completions(fd
);
470 error(1, errno
, "close");
472 fprintf(stderr
, "tx=%lu (%lu MB) txc=%lu zc=%c\n",
473 packets
, bytes
>> 20, completions
,
474 zerocopied
== 1 ? 'y' : 'n');
477 static int do_setup_rx(int domain
, int type
, int protocol
)
481 /* If tx over PF_PACKET, rx over PF_INET(6)/SOCK_RAW,
482 * to recv the only copy of the packet, not a clone
484 if (domain
== PF_PACKET
)
485 error(1, 0, "Use PF_INET/SOCK_RAW to read");
487 if (type
== SOCK_RAW
&& protocol
== IPPROTO_RAW
)
488 error(1, 0, "IPPROTO_RAW: not supported on Rx");
490 fd
= socket(domain
, type
, protocol
);
492 error(1, errno
, "socket r");
494 do_setsockopt(fd
, SOL_SOCKET
, SO_RCVBUF
, 1 << 21);
495 do_setsockopt(fd
, SOL_SOCKET
, SO_RCVLOWAT
, 1 << 16);
496 do_setsockopt(fd
, SOL_SOCKET
, SO_REUSEPORT
, 1);
498 if (bind(fd
, (void *) &cfg_dst_addr
, cfg_alen
))
499 error(1, errno
, "bind");
501 if (type
== SOCK_STREAM
) {
503 error(1, errno
, "listen");
510 /* Flush all outstanding bytes for the tcp receive queue */
511 static void do_flush_tcp(int fd
)
515 /* MSG_TRUNC flushes up to len bytes */
516 ret
= recv(fd
, NULL
, 1 << 21, MSG_TRUNC
| MSG_DONTWAIT
);
517 if (ret
== -1 && errno
== EAGAIN
)
520 error(1, errno
, "flush");
528 /* Flush all outstanding datagrams. Verify first few bytes of each. */
529 static void do_flush_datagram(int fd
, int type
)
534 /* MSG_TRUNC will return full datagram length */
535 ret
= recv(fd
, buf
, sizeof(buf
), MSG_DONTWAIT
| MSG_TRUNC
);
536 if (ret
== -1 && errno
== EAGAIN
)
539 /* raw ipv4 return with header, raw ipv6 without */
540 if (cfg_family
== PF_INET
&& type
== SOCK_RAW
) {
541 off
+= sizeof(struct iphdr
);
542 ret
-= sizeof(struct iphdr
);
546 error(1, errno
, "recv");
547 if (ret
!= cfg_payload_len
)
548 error(1, 0, "recv: ret=%u != %u", ret
, cfg_payload_len
);
549 if (ret
> sizeof(buf
) - off
)
550 ret
= sizeof(buf
) - off
;
551 if (memcmp(buf
+ off
, payload
, ret
))
552 error(1, 0, "recv: data mismatch");
555 bytes
+= cfg_payload_len
;
558 static void do_rx(int domain
, int type
, int protocol
)
563 fd
= do_setup_rx(domain
, type
, protocol
);
565 tstop
= gettimeofday_ms() + cfg_runtime_ms
;
567 if (type
== SOCK_STREAM
)
570 do_flush_datagram(fd
, type
);
574 } while (gettimeofday_ms() < tstop
);
577 error(1, errno
, "close");
579 fprintf(stderr
, "rx=%lu (%lu MB)\n", packets
, bytes
>> 20);
582 static void do_test(int domain
, int type
, int protocol
)
586 if (cfg_cork
&& (domain
== PF_PACKET
|| type
!= SOCK_DGRAM
))
587 error(1, 0, "can only cork udp sockets");
591 for (i
= 0; i
< IP_MAXPACKET
; i
++)
592 payload
[i
] = 'a' + (i
% 26);
595 do_rx(domain
, type
, protocol
);
597 do_tx(domain
, type
, protocol
);
600 static void usage(const char *filepath
)
602 error(1, 0, "Usage: %s [options] <test>", filepath
);
605 static void parse_opts(int argc
, char **argv
)
607 const int max_payload_len
= sizeof(payload
) -
608 sizeof(struct ipv6hdr
) -
609 sizeof(struct tcphdr
) -
610 40 /* max tcp options */;
612 char *daddr
= NULL
, *saddr
= NULL
;
614 cfg_payload_len
= max_payload_len
;
616 while ((c
= getopt(argc
, argv
, "46c:C:D:i:mp:rs:S:t:vz")) != -1) {
619 if (cfg_family
!= PF_UNSPEC
)
620 error(1, 0, "Pass one of -4 or -6");
621 cfg_family
= PF_INET
;
622 cfg_alen
= sizeof(struct sockaddr_in
);
625 if (cfg_family
!= PF_UNSPEC
)
626 error(1, 0, "Pass one of -4 or -6");
627 cfg_family
= PF_INET6
;
628 cfg_alen
= sizeof(struct sockaddr_in6
);
631 cfg_cork
= strtol(optarg
, NULL
, 0);
634 cfg_cpu
= strtol(optarg
, NULL
, 0);
640 cfg_ifindex
= if_nametoindex(optarg
);
641 if (cfg_ifindex
== 0)
642 error(1, errno
, "invalid iface: %s", optarg
);
645 cfg_cork_mixed
= true;
648 cfg_port
= strtoul(optarg
, NULL
, 0);
654 cfg_payload_len
= strtoul(optarg
, NULL
, 0);
660 cfg_runtime_ms
= 200 + strtoul(optarg
, NULL
, 10) * 1000;
670 setup_sockaddr(cfg_family
, daddr
, &cfg_dst_addr
);
671 setup_sockaddr(cfg_family
, saddr
, &cfg_src_addr
);
673 if (cfg_payload_len
> max_payload_len
)
674 error(1, 0, "-s: payload exceeds max (%d)", max_payload_len
);
675 if (cfg_cork_mixed
&& (!cfg_zerocopy
|| !cfg_cork
))
676 error(1, 0, "-m: cork_mixed requires corking and zerocopy");
678 if (optind
!= argc
- 1)
682 int main(int argc
, char **argv
)
684 const char *cfg_test
;
686 parse_opts(argc
, argv
);
688 cfg_test
= argv
[argc
- 1];
690 if (!strcmp(cfg_test
, "packet"))
691 do_test(PF_PACKET
, SOCK_RAW
, 0);
692 else if (!strcmp(cfg_test
, "packet_dgram"))
693 do_test(PF_PACKET
, SOCK_DGRAM
, 0);
694 else if (!strcmp(cfg_test
, "raw"))
695 do_test(cfg_family
, SOCK_RAW
, IPPROTO_EGP
);
696 else if (!strcmp(cfg_test
, "raw_hdrincl"))
697 do_test(cfg_family
, SOCK_RAW
, IPPROTO_RAW
);
698 else if (!strcmp(cfg_test
, "tcp"))
699 do_test(cfg_family
, SOCK_STREAM
, 0);
700 else if (!strcmp(cfg_test
, "udp"))
701 do_test(cfg_family
, SOCK_DGRAM
, 0);
703 error(1, 0, "unknown cfg_test %s", cfg_test
);