/* Evaluate MSG_ZEROCOPY
 *
 * Send traffic between two processes over one of the supported
 * protocols. Modes:
 *
 * PF_INET/PF_INET6
 * - SOCK_STREAM
 * - SOCK_DGRAM
 * - SOCK_DGRAM with UDP_CORK
 * - SOCK_RAW
 * - SOCK_RAW with IP_HDRINCL
 *
 * PF_PACKET
 * - SOCK_DGRAM
 * - SOCK_RAW
 *
 * PF_RDS
 * - SOCK_SEQPACKET
 *
 * Start this program on two connected hosts, one in send mode and
 * the other with option '-r' to put it in receiver mode.
 *
 * If zerocopy mode ('-z') is enabled, the sender will verify that
 * the kernel queues completions on the error queue for all zerocopy
 * transfers.
 */
#define _GNU_SOURCE

#include <arpa/inet.h>
#include <errno.h>
#include <error.h>
#include <linux/errqueue.h>
#include <linux/if_packet.h>
#include <linux/ipv6.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <poll.h>
#include <sched.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>

#include <linux/rds.h>

#ifndef SO_EE_ORIGIN_ZEROCOPY
#define SO_EE_ORIGIN_ZEROCOPY		5
#endif

#ifndef SO_ZEROCOPY
#define SO_ZEROCOPY	60
#endif

#ifndef SO_EE_CODE_ZEROCOPY_COPIED
#define SO_EE_CODE_ZEROCOPY_COPIED	1
#endif

#ifndef MSG_ZEROCOPY
#define MSG_ZEROCOPY	0x4000000
#endif
/* Run-time configuration, filled in by parse_opts().
 *
 * NOTE(review): cfg_cork and cfg_rx are used elsewhere in this file but
 * their declarations were absent from this copy; the types below are
 * inferred from how they are used (integer divisor/loop bound, boolean
 * flag) - confirm against upstream.
 */
static int  cfg_cork;
static bool cfg_cork_mixed;
static int  cfg_cpu		= -1;		/* default: pin to last cpu */
static int  cfg_family		= PF_UNSPEC;
static int  cfg_ifindex		= 1;
static int  cfg_payload_len;
static int  cfg_port		= 8000;
static bool cfg_rx;
static int  cfg_runtime_ms	= 4200;
static int  cfg_verbose;
static int  cfg_waittime_ms	= 500;
static bool cfg_zerocopy;

/* Peer/local addresses and the sockaddr length matching cfg_family */
static socklen_t cfg_alen;
static struct sockaddr_storage cfg_dst_addr;
static struct sockaddr_storage cfg_src_addr;

/* Shared payload buffer and traffic/completion counters */
static char payload[IP_MAXPACKET];
static long packets, bytes, completions, expected_completions;
static int  zerocopied = -1;	/* -1: no completion seen yet */
static uint32_t next_completion;
/* Return the current wall-clock time in milliseconds.
 *
 * The arithmetic is done in unsigned long so that it cannot trigger
 * signed overflow where time_t is only as wide as long.
 */
static unsigned long gettimeofday_ms(void)
{
	struct timeval tv;

	gettimeofday(&tv, NULL);
	return ((unsigned long)tv.tv_sec * 1000UL) +
	       ((unsigned long)tv.tv_usec / 1000UL);
}
/* Compute the 16-bit one's-complement checksum over num_words words.
 *
 * @start:     first 16-bit word to sum
 * @num_words: number of 16-bit words to include
 *
 * Returns the bitwise complement of the folded sum.
 *
 * NOTE(review): the accumulate statement, the fold-loop condition and
 * the return were missing from this copy and are restored to the
 * standard algorithm implied by the visible fold expression - confirm.
 */
static uint16_t get_ip_csum(const uint16_t *start, int num_words)
{
	unsigned long sum = 0;
	int i;

	for (i = 0; i < num_words; i++)
		sum += start[i];

	/* fold carries back into the low 16 bits until none remain */
	while (sum >> 16)
		sum = (sum & 0xFFFF) + (sum >> 16);

	return (uint16_t)~sum;
}
121 static int do_setcpu(int cpu
)
127 if (sched_setaffinity(0, sizeof(mask
), &mask
))
128 error(1, 0, "setaffinity %d", cpu
);
131 fprintf(stderr
, "cpu: %u\n", cpu
);
/* Set an integer socket option; exits with a diagnostic on failure. */
static void do_setsockopt(int fd, int level, int optname, int val)
{
	if (setsockopt(fd, level, optname, &val, sizeof(val)))
		error(1, errno, "setsockopt %d.%d: %d", level, optname, val);
}
142 static int do_poll(int fd
, int events
)
151 ret
= poll(&pfd
, 1, cfg_waittime_ms
);
153 error(1, errno
, "poll");
155 return ret
&& (pfd
.revents
& events
);
/* Accept one connection on listening socket fd, then close the listener.
 *
 * Exits on failure of either step. Returns the accepted descriptor.
 *
 * NOTE(review): the saved listener descriptor, the failure guards and
 * the return were missing from this copy and are restored from context.
 */
static int do_accept(int fd)
{
	int fda = fd;

	fd = accept(fda, NULL, NULL);
	if (fd == -1)
		error(1, errno, "accept");
	if (close(fda))
		error(1, errno, "close listen sock");

	return fd;
}
/* Write an RDS zerocopy cookie control message into msg->msg_control.
 *
 * @msg:    message whose msg_control already points at a buffer of at
 *          least CMSG_SPACE(sizeof(cookie)) bytes
 * @cookie: value stored as the cmsg payload
 *
 * Exits if no control buffer is attached. That is a caller error rather
 * than a failed system call, so no errno value is reported.
 */
static void add_zcopy_cookie(struct msghdr *msg, uint32_t cookie)
{
	struct cmsghdr *cm;

	if (!msg->msg_control)
		error(1, 0, "NULL cookie");
	cm = (void *)msg->msg_control;
	cm->cmsg_len = CMSG_LEN(sizeof(cookie));
	cm->cmsg_level = SOL_RDS;
	cm->cmsg_type = RDS_CMSG_ZCOPY_COOKIE;
	memcpy(CMSG_DATA(cm), &cookie, sizeof(cookie));
}
184 static bool do_sendmsg(int fd
, struct msghdr
*msg
, bool do_zerocopy
, int domain
)
186 int ret
, len
, i
, flags
;
187 static uint32_t cookie
;
188 char ckbuf
[CMSG_SPACE(sizeof(cookie
))];
191 for (i
= 0; i
< msg
->msg_iovlen
; i
++)
192 len
+= msg
->msg_iov
[i
].iov_len
;
194 flags
= MSG_DONTWAIT
;
196 flags
|= MSG_ZEROCOPY
;
197 if (domain
== PF_RDS
) {
198 memset(&msg
->msg_control
, 0, sizeof(msg
->msg_control
));
199 msg
->msg_controllen
= CMSG_SPACE(sizeof(cookie
));
200 msg
->msg_control
= (struct cmsghdr
*)ckbuf
;
201 add_zcopy_cookie(msg
, ++cookie
);
205 ret
= sendmsg(fd
, msg
, flags
);
206 if (ret
== -1 && errno
== EAGAIN
)
209 error(1, errno
, "send");
210 if (cfg_verbose
&& ret
!= len
)
211 fprintf(stderr
, "send: ret=%u != %u\n", ret
, len
);
216 if (do_zerocopy
&& ret
)
217 expected_completions
++;
219 if (do_zerocopy
&& domain
== PF_RDS
) {
220 msg
->msg_control
= NULL
;
221 msg
->msg_controllen
= 0;
227 static void do_sendmsg_corked(int fd
, struct msghdr
*msg
)
229 bool do_zerocopy
= cfg_zerocopy
;
230 int i
, payload_len
, extra_len
;
232 /* split up the packet. for non-multiple, make first buffer longer */
233 payload_len
= cfg_payload_len
/ cfg_cork
;
234 extra_len
= cfg_payload_len
- (cfg_cork
* payload_len
);
236 do_setsockopt(fd
, IPPROTO_UDP
, UDP_CORK
, 1);
238 for (i
= 0; i
< cfg_cork
; i
++) {
240 /* in mixed-frags mode, alternate zerocopy and copy frags
241 * start with non-zerocopy, to ensure attach later works
244 do_zerocopy
= (i
& 1);
246 msg
->msg_iov
[0].iov_len
= payload_len
+ extra_len
;
249 do_sendmsg(fd
, msg
, do_zerocopy
,
250 (cfg_dst_addr
.ss_family
== AF_INET
?
251 PF_INET
: PF_INET6
));
254 do_setsockopt(fd
, IPPROTO_UDP
, UDP_CORK
, 0);
257 static int setup_iph(struct iphdr
*iph
, uint16_t payload_len
)
259 struct sockaddr_in
*daddr
= (void *) &cfg_dst_addr
;
260 struct sockaddr_in
*saddr
= (void *) &cfg_src_addr
;
262 memset(iph
, 0, sizeof(*iph
));
268 iph
->saddr
= saddr
->sin_addr
.s_addr
;
269 iph
->daddr
= daddr
->sin_addr
.s_addr
;
270 iph
->protocol
= IPPROTO_EGP
;
271 iph
->tot_len
= htons(sizeof(*iph
) + payload_len
);
272 iph
->check
= get_ip_csum((void *) iph
, iph
->ihl
<< 1);
277 static int setup_ip6h(struct ipv6hdr
*ip6h
, uint16_t payload_len
)
279 struct sockaddr_in6
*daddr
= (void *) &cfg_dst_addr
;
280 struct sockaddr_in6
*saddr
= (void *) &cfg_src_addr
;
282 memset(ip6h
, 0, sizeof(*ip6h
));
285 ip6h
->payload_len
= htons(payload_len
);
286 ip6h
->nexthdr
= IPPROTO_EGP
;
288 ip6h
->saddr
= saddr
->sin6_addr
;
289 ip6h
->daddr
= daddr
->sin6_addr
;
291 return sizeof(*ip6h
);
295 static void setup_sockaddr(int domain
, const char *str_addr
,
296 struct sockaddr_storage
*sockaddr
)
298 struct sockaddr_in6
*addr6
= (void *) sockaddr
;
299 struct sockaddr_in
*addr4
= (void *) sockaddr
;
303 memset(addr4
, 0, sizeof(*addr4
));
304 addr4
->sin_family
= AF_INET
;
305 addr4
->sin_port
= htons(cfg_port
);
307 inet_pton(AF_INET
, str_addr
, &(addr4
->sin_addr
)) != 1)
308 error(1, 0, "ipv4 parse error: %s", str_addr
);
311 memset(addr6
, 0, sizeof(*addr6
));
312 addr6
->sin6_family
= AF_INET6
;
313 addr6
->sin6_port
= htons(cfg_port
);
315 inet_pton(AF_INET6
, str_addr
, &(addr6
->sin6_addr
)) != 1)
316 error(1, 0, "ipv6 parse error: %s", str_addr
);
319 error(1, 0, "illegal domain");
323 static int do_setup_tx(int domain
, int type
, int protocol
)
327 fd
= socket(domain
, type
, protocol
);
329 error(1, errno
, "socket t");
331 do_setsockopt(fd
, SOL_SOCKET
, SO_SNDBUF
, 1 << 21);
333 do_setsockopt(fd
, SOL_SOCKET
, SO_ZEROCOPY
, 1);
335 if (domain
!= PF_PACKET
&& domain
!= PF_RDS
)
336 if (connect(fd
, (void *) &cfg_dst_addr
, cfg_alen
))
337 error(1, errno
, "connect");
339 if (domain
== PF_RDS
) {
340 if (bind(fd
, (void *) &cfg_src_addr
, cfg_alen
))
341 error(1, errno
, "bind");
347 static uint32_t do_process_zerocopy_cookies(struct rds_zcopy_cookies
*ck
)
351 if (ck
->num
> RDS_MAX_ZCOOKIES
)
352 error(1, 0, "Returned %d cookies, max expected %d\n",
353 ck
->num
, RDS_MAX_ZCOOKIES
);
354 for (i
= 0; i
< ck
->num
; i
++)
355 if (cfg_verbose
>= 2)
356 fprintf(stderr
, "%d\n", ck
->cookies
[i
]);
360 static bool do_recvmsg_completion(int fd
)
362 char cmsgbuf
[CMSG_SPACE(sizeof(struct rds_zcopy_cookies
))];
363 struct rds_zcopy_cookies
*ck
;
364 struct cmsghdr
*cmsg
;
368 memset(&msg
, 0, sizeof(msg
));
369 msg
.msg_control
= cmsgbuf
;
370 msg
.msg_controllen
= sizeof(cmsgbuf
);
372 if (recvmsg(fd
, &msg
, MSG_DONTWAIT
))
375 if (msg
.msg_flags
& MSG_CTRUNC
)
376 error(1, errno
, "recvmsg notification: truncated");
378 for (cmsg
= CMSG_FIRSTHDR(&msg
); cmsg
; cmsg
= CMSG_NXTHDR(&msg
, cmsg
)) {
379 if (cmsg
->cmsg_level
== SOL_RDS
&&
380 cmsg
->cmsg_type
== RDS_CMSG_ZCOPY_COMPLETION
) {
382 ck
= (struct rds_zcopy_cookies
*)CMSG_DATA(cmsg
);
383 completions
+= do_process_zerocopy_cookies(ck
);
387 error(0, 0, "ignoring cmsg at level %d type %d\n",
388 cmsg
->cmsg_level
, cmsg
->cmsg_type
);
393 static bool do_recv_completion(int fd
, int domain
)
395 struct sock_extended_err
*serr
;
396 struct msghdr msg
= {};
398 uint32_t hi
, lo
, range
;
402 if (domain
== PF_RDS
)
403 return do_recvmsg_completion(fd
);
405 msg
.msg_control
= control
;
406 msg
.msg_controllen
= sizeof(control
);
408 ret
= recvmsg(fd
, &msg
, MSG_ERRQUEUE
);
409 if (ret
== -1 && errno
== EAGAIN
)
412 error(1, errno
, "recvmsg notification");
413 if (msg
.msg_flags
& MSG_CTRUNC
)
414 error(1, errno
, "recvmsg notification: truncated");
416 cm
= CMSG_FIRSTHDR(&msg
);
418 error(1, 0, "cmsg: no cmsg");
419 if (!((cm
->cmsg_level
== SOL_IP
&& cm
->cmsg_type
== IP_RECVERR
) ||
420 (cm
->cmsg_level
== SOL_IPV6
&& cm
->cmsg_type
== IPV6_RECVERR
) ||
421 (cm
->cmsg_level
== SOL_PACKET
&& cm
->cmsg_type
== PACKET_TX_TIMESTAMP
)))
422 error(1, 0, "serr: wrong type: %d.%d",
423 cm
->cmsg_level
, cm
->cmsg_type
);
425 serr
= (void *) CMSG_DATA(cm
);
427 if (serr
->ee_origin
!= SO_EE_ORIGIN_ZEROCOPY
)
428 error(1, 0, "serr: wrong origin: %u", serr
->ee_origin
);
429 if (serr
->ee_errno
!= 0)
430 error(1, 0, "serr: wrong error code: %u", serr
->ee_errno
);
436 /* Detect notification gaps. These should not happen often, if at all.
437 * Gaps can occur due to drops, reordering and retransmissions.
439 if (lo
!= next_completion
)
440 fprintf(stderr
, "gap: %u..%u does not append to %u\n",
441 lo
, hi
, next_completion
);
442 next_completion
= hi
+ 1;
444 zerocopy
= !(serr
->ee_code
& SO_EE_CODE_ZEROCOPY_COPIED
);
445 if (zerocopied
== -1)
446 zerocopied
= zerocopy
;
447 else if (zerocopied
!= zerocopy
) {
448 fprintf(stderr
, "serr: inconsistent\n");
449 zerocopied
= zerocopy
;
452 if (cfg_verbose
>= 2)
453 fprintf(stderr
, "completed: %u (h=%u l=%u)\n",
456 completions
+= range
;
/* Read all outstanding messages on the errqueue.
 *
 * Keeps consuming notifications until do_recv_completion() reports that
 * nothing more is pending.
 */
static void do_recv_completions(int fd, int domain)
{
	while (do_recv_completion(fd, domain)) {}
}
466 /* Wait for all remaining completions on the errqueue */
467 static void do_recv_remaining_completions(int fd
, int domain
)
469 int64_t tstop
= gettimeofday_ms() + cfg_waittime_ms
;
471 while (completions
< expected_completions
&&
472 gettimeofday_ms() < tstop
) {
473 if (do_poll(fd
, domain
== PF_RDS
? POLLIN
: POLLERR
))
474 do_recv_completions(fd
, domain
);
477 if (completions
< expected_completions
)
478 fprintf(stderr
, "missing notifications: %lu < %lu\n",
479 completions
, expected_completions
);
482 static void do_tx(int domain
, int type
, int protocol
)
484 struct iovec iov
[3] = { {0} };
485 struct sockaddr_ll laddr
;
486 struct msghdr msg
= {0};
495 fd
= do_setup_tx(domain
, type
, protocol
);
497 if (domain
== PF_PACKET
) {
498 uint16_t proto
= cfg_family
== PF_INET
? ETH_P_IP
: ETH_P_IPV6
;
500 /* sock_raw passes ll header as data */
501 if (type
== SOCK_RAW
) {
502 memset(eth
.h_dest
, 0x06, ETH_ALEN
);
503 memset(eth
.h_source
, 0x02, ETH_ALEN
);
504 eth
.h_proto
= htons(proto
);
505 iov
[0].iov_base
= ð
;
506 iov
[0].iov_len
= sizeof(eth
);
510 /* both sock_raw and sock_dgram expect name */
511 memset(&laddr
, 0, sizeof(laddr
));
512 laddr
.sll_family
= AF_PACKET
;
513 laddr
.sll_ifindex
= cfg_ifindex
;
514 laddr
.sll_protocol
= htons(proto
);
515 laddr
.sll_halen
= ETH_ALEN
;
517 memset(laddr
.sll_addr
, 0x06, ETH_ALEN
);
519 msg
.msg_name
= &laddr
;
520 msg
.msg_namelen
= sizeof(laddr
);
523 /* packet and raw sockets with hdrincl must pass network header */
524 if (domain
== PF_PACKET
|| protocol
== IPPROTO_RAW
) {
525 if (cfg_family
== PF_INET
)
526 iov
[1].iov_len
= setup_iph(&nh
.iph
, cfg_payload_len
);
528 iov
[1].iov_len
= setup_ip6h(&nh
.ip6h
, cfg_payload_len
);
530 iov
[1].iov_base
= (void *) &nh
;
534 if (domain
== PF_RDS
) {
535 msg
.msg_name
= &cfg_dst_addr
;
536 msg
.msg_namelen
= (cfg_dst_addr
.ss_family
== AF_INET
?
537 sizeof(struct sockaddr_in
) :
538 sizeof(struct sockaddr_in6
));
541 iov
[2].iov_base
= payload
;
542 iov
[2].iov_len
= cfg_payload_len
;
544 msg
.msg_iov
= &iov
[3 - msg
.msg_iovlen
];
546 tstop
= gettimeofday_ms() + cfg_runtime_ms
;
549 do_sendmsg_corked(fd
, &msg
);
551 do_sendmsg(fd
, &msg
, cfg_zerocopy
, domain
);
553 while (!do_poll(fd
, POLLOUT
)) {
555 do_recv_completions(fd
, domain
);
558 } while (gettimeofday_ms() < tstop
);
561 do_recv_remaining_completions(fd
, domain
);
564 error(1, errno
, "close");
566 fprintf(stderr
, "tx=%lu (%lu MB) txc=%lu zc=%c\n",
567 packets
, bytes
>> 20, completions
,
568 zerocopied
== 1 ? 'y' : 'n');
571 static int do_setup_rx(int domain
, int type
, int protocol
)
575 /* If tx over PF_PACKET, rx over PF_INET(6)/SOCK_RAW,
576 * to recv the only copy of the packet, not a clone
578 if (domain
== PF_PACKET
)
579 error(1, 0, "Use PF_INET/SOCK_RAW to read");
581 if (type
== SOCK_RAW
&& protocol
== IPPROTO_RAW
)
582 error(1, 0, "IPPROTO_RAW: not supported on Rx");
584 fd
= socket(domain
, type
, protocol
);
586 error(1, errno
, "socket r");
588 do_setsockopt(fd
, SOL_SOCKET
, SO_RCVBUF
, 1 << 21);
589 do_setsockopt(fd
, SOL_SOCKET
, SO_RCVLOWAT
, 1 << 16);
590 do_setsockopt(fd
, SOL_SOCKET
, SO_REUSEPORT
, 1);
592 if (bind(fd
, (void *) &cfg_dst_addr
, cfg_alen
))
593 error(1, errno
, "bind");
595 if (type
== SOCK_STREAM
) {
597 error(1, errno
, "listen");
604 /* Flush all outstanding bytes for the tcp receive queue */
605 static void do_flush_tcp(int fd
)
609 /* MSG_TRUNC flushes up to len bytes */
610 ret
= recv(fd
, NULL
, 1 << 21, MSG_TRUNC
| MSG_DONTWAIT
);
611 if (ret
== -1 && errno
== EAGAIN
)
614 error(1, errno
, "flush");
622 /* Flush all outstanding datagrams. Verify first few bytes of each. */
623 static void do_flush_datagram(int fd
, int type
)
628 /* MSG_TRUNC will return full datagram length */
629 ret
= recv(fd
, buf
, sizeof(buf
), MSG_DONTWAIT
| MSG_TRUNC
);
630 if (ret
== -1 && errno
== EAGAIN
)
633 /* raw ipv4 return with header, raw ipv6 without */
634 if (cfg_family
== PF_INET
&& type
== SOCK_RAW
) {
635 off
+= sizeof(struct iphdr
);
636 ret
-= sizeof(struct iphdr
);
640 error(1, errno
, "recv");
641 if (ret
!= cfg_payload_len
)
642 error(1, 0, "recv: ret=%u != %u", ret
, cfg_payload_len
);
643 if (ret
> sizeof(buf
) - off
)
644 ret
= sizeof(buf
) - off
;
645 if (memcmp(buf
+ off
, payload
, ret
))
646 error(1, 0, "recv: data mismatch");
649 bytes
+= cfg_payload_len
;
652 static void do_rx(int domain
, int type
, int protocol
)
657 fd
= do_setup_rx(domain
, type
, protocol
);
659 tstop
= gettimeofday_ms() + cfg_runtime_ms
;
661 if (type
== SOCK_STREAM
)
664 do_flush_datagram(fd
, type
);
668 } while (gettimeofday_ms() < tstop
);
671 error(1, errno
, "close");
673 fprintf(stderr
, "rx=%lu (%lu MB)\n", packets
, bytes
>> 20);
676 static void do_test(int domain
, int type
, int protocol
)
680 if (cfg_cork
&& (domain
== PF_PACKET
|| type
!= SOCK_DGRAM
))
681 error(1, 0, "can only cork udp sockets");
685 for (i
= 0; i
< IP_MAXPACKET
; i
++)
686 payload
[i
] = 'a' + (i
% 26);
689 do_rx(domain
, type
, protocol
);
691 do_tx(domain
, type
, protocol
);
694 static void usage(const char *filepath
)
696 error(1, 0, "Usage: %s [options] <test>", filepath
);
699 static void parse_opts(int argc
, char **argv
)
701 const int max_payload_len
= sizeof(payload
) -
702 sizeof(struct ipv6hdr
) -
703 sizeof(struct tcphdr
) -
704 40 /* max tcp options */;
706 char *daddr
= NULL
, *saddr
= NULL
;
709 cfg_payload_len
= max_payload_len
;
711 while ((c
= getopt(argc
, argv
, "46c:C:D:i:mp:rs:S:t:vz")) != -1) {
714 if (cfg_family
!= PF_UNSPEC
)
715 error(1, 0, "Pass one of -4 or -6");
716 cfg_family
= PF_INET
;
717 cfg_alen
= sizeof(struct sockaddr_in
);
720 if (cfg_family
!= PF_UNSPEC
)
721 error(1, 0, "Pass one of -4 or -6");
722 cfg_family
= PF_INET6
;
723 cfg_alen
= sizeof(struct sockaddr_in6
);
726 cfg_cork
= strtol(optarg
, NULL
, 0);
729 cfg_cpu
= strtol(optarg
, NULL
, 0);
735 cfg_ifindex
= if_nametoindex(optarg
);
736 if (cfg_ifindex
== 0)
737 error(1, errno
, "invalid iface: %s", optarg
);
740 cfg_cork_mixed
= true;
743 cfg_port
= strtoul(optarg
, NULL
, 0);
749 cfg_payload_len
= strtoul(optarg
, NULL
, 0);
755 cfg_runtime_ms
= 200 + strtoul(optarg
, NULL
, 10) * 1000;
766 cfg_test
= argv
[argc
- 1];
767 if (strcmp(cfg_test
, "rds") == 0) {
769 error(1, 0, "-D <server addr> required for PF_RDS\n");
770 if (!cfg_rx
&& !saddr
)
771 error(1, 0, "-S <client addr> required for PF_RDS\n");
773 setup_sockaddr(cfg_family
, daddr
, &cfg_dst_addr
);
774 setup_sockaddr(cfg_family
, saddr
, &cfg_src_addr
);
776 if (cfg_payload_len
> max_payload_len
)
777 error(1, 0, "-s: payload exceeds max (%d)", max_payload_len
);
778 if (cfg_cork_mixed
&& (!cfg_zerocopy
|| !cfg_cork
))
779 error(1, 0, "-m: cork_mixed requires corking and zerocopy");
781 if (optind
!= argc
- 1)
785 int main(int argc
, char **argv
)
787 const char *cfg_test
;
789 parse_opts(argc
, argv
);
791 cfg_test
= argv
[argc
- 1];
793 if (!strcmp(cfg_test
, "packet"))
794 do_test(PF_PACKET
, SOCK_RAW
, 0);
795 else if (!strcmp(cfg_test
, "packet_dgram"))
796 do_test(PF_PACKET
, SOCK_DGRAM
, 0);
797 else if (!strcmp(cfg_test
, "raw"))
798 do_test(cfg_family
, SOCK_RAW
, IPPROTO_EGP
);
799 else if (!strcmp(cfg_test
, "raw_hdrincl"))
800 do_test(cfg_family
, SOCK_RAW
, IPPROTO_RAW
);
801 else if (!strcmp(cfg_test
, "tcp"))
802 do_test(cfg_family
, SOCK_STREAM
, 0);
803 else if (!strcmp(cfg_test
, "udp"))
804 do_test(cfg_family
, SOCK_DGRAM
, 0);
805 else if (!strcmp(cfg_test
, "rds"))
806 do_test(PF_RDS
, SOCK_SEQPACKET
, 0);
808 error(1, 0, "unknown cfg_test %s", cfg_test
);