1 // SPDX-License-Identifier: GPL-2.0
3 /* Reference program for verifying XDP metadata on real HW. Functional test
4 * only, doesn't test the performance.
7 * - UDP 9091 packets are diverted into AF_XDP
13 * - UDP 9091 packets trigger TX reply
14 * - TX HW timestamp is requested and reported back upon completion
15 * - TX checksum is requested
18 #include <test_progs.h>
19 #include <network_helpers.h>
20 #include "xdp_hw_metadata.skel.h"
24 #include <linux/kernel.h>
25 #include <linux/bits.h>
26 #include <linux/bitfield.h>
27 #include <linux/errqueue.h>
28 #include <linux/if_link.h>
29 #include <linux/net_tstamp.h>
30 #include <linux/udp.h>
31 #include <linux/sockios.h>
32 #include <linux/if_xdp.h>
41 #include "xdp_metadata.h"
44 #define UMEM_FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE
45 #define UMEM_SIZE (UMEM_FRAME_SIZE * UMEM_NUM)
46 #define XDP_FLAGS (XDP_FLAGS_DRV_MODE | XDP_FLAGS_REPLACE)
/*
 * Per-socket AF_XDP state.
 * NOTE(review): the enclosing "struct xsk {" line is not visible in this
 * excerpt; these are presumably its members, because the functions in this
 * file access them as xsk->umem, xsk->fill, xsk->comp, xsk->tx, xsk->rx and
 * xsk->socket.
 */
/* UMEM handle, filled in by xsk_umem__create() in open_xsk(). */
50 struct xsk_umem
*umem
;
/* Fill ring: open_xsk() and refill_rx() post RX frame addresses here. */
51 struct xsk_ring_prod fill
;
/* Completion ring: complete_tx() peeks and releases entries here. */
52 struct xsk_ring_cons comp
;
/* TX ring: ping_pong() reserves, fills and submits descriptors here. */
53 struct xsk_ring_prod tx
;
/* RX ring: verify_metadata() peeks and releases descriptors here. */
54 struct xsk_ring_cons rx
;
/* Socket handle from xsk_socket__create(); its fd is used for poll/sendto/recvfrom. */
55 struct xsk_socket
*socket
;
/* BPF skeleton object: opened and loaded in main(), destroyed in cleanup(). */
58 struct xdp_hw_metadata
*bpf_obj
;
/*
 * Bind flags copied into the socket configuration by open_xsk().
 * read_args() modifies them according to the command-line options.
 */
59 __u16 bind_flags
= XDP_USE_NEED_WAKEUP
| XDP_ZEROCOPY
;
/*
 * RX timestamps of the most recently verified packet. They are stored by
 * verify_xdp_metadata() and read back by complete_tx() to print deltas
 * against the TX completion time.
 */
65 __u64 last_hw_rx_timestamp
;
66 __u64 last_xdp_rx_timestamp
;
/* Intentionally empty stub; it exists only for network_helpers.c. */
void test__fail(void)
{
}
/*
 * open_xsk() - create one AF_XDP socket (UMEM, rings, socket) for a queue.
 * @ifindex:  interface index passed to xsk_socket__create()
 * @xsk:      per-socket state that gets filled in (umem_area, umem, socket,
 *            fill ring)
 * @queue_id: queue index passed to xsk_socket__create()
 *
 * Steps visible here: mmap() an anonymous UMEM_SIZE area, create the UMEM on
 * top of it, create the socket, print the TX frame addresses (first half of
 * the UMEM) and post the second half of the frames to the fill ring for RX.
 *
 * NOTE(review): the local declarations, the error checks after each call and
 * the return statements are not visible in this excerpt. main() passes -ret
 * to error() as an errno value, so the return is presumably 0 or a negative
 * errno - confirm.
 */
70 static int open_xsk(int ifindex
, struct xsk
*xsk
, __u32 queue_id
)
72 int mmap_flags
= MAP_PRIVATE
| MAP_ANONYMOUS
| MAP_NORESERVE
;
/* Socket configuration: default ring sizes plus the global bind_flags. */
73 const struct xsk_socket_config socket_config
= {
74 .rx_size
= XSK_RING_PROD__DEFAULT_NUM_DESCS
,
75 .tx_size
= XSK_RING_PROD__DEFAULT_NUM_DESCS
,
76 .bind_flags
= bind_flags
,
/*
 * UMEM configuration: default ring and frame sizes, with tx_metadata_len set
 * to the size of struct xsk_tx_metadata.
 */
78 const struct xsk_umem_config umem_config
= {
79 .fill_size
= XSK_RING_PROD__DEFAULT_NUM_DESCS
,
80 .comp_size
= XSK_RING_CONS__DEFAULT_NUM_DESCS
,
81 .frame_size
= XSK_UMEM__DEFAULT_FRAME_SIZE
,
82 .flags
= XSK_UMEM__DEFAULT_FLAGS
,
83 .tx_metadata_len
= sizeof(struct xsk_tx_metadata
),
/* Back the UMEM with an anonymous private mapping of UMEM_SIZE bytes. */
90 xsk
->umem_area
= mmap(NULL
, UMEM_SIZE
, PROT_READ
| PROT_WRITE
, mmap_flags
, -1, 0);
91 if (xsk
->umem_area
== MAP_FAILED
)
94 ret
= xsk_umem__create(&xsk
->umem
,
95 xsk
->umem_area
, UMEM_SIZE
,
102 ret
= xsk_socket__create(&xsk
->socket
, ifindex
, queue_id
,
110 /* First half of umem is for TX. This way address matches 1-to-1
111 * to the completion queue index.
114 for (i
= 0; i
< UMEM_NUM
/ 2; i
++) {
115 addr
= i
* UMEM_FRAME_SIZE
;
116 printf("%p: tx_desc[%d] -> %lx\n", xsk
, i
, addr
);
119 /* Second half of umem is for RX. */
121 ret
= xsk_ring_prod__reserve(&xsk
->fill
, UMEM_NUM
/ 2, &idx
);
122 for (i
= 0; i
< UMEM_NUM
/ 2; i
++) {
123 addr
= (UMEM_NUM
/ 2 + i
) * UMEM_FRAME_SIZE
;
124 printf("%p: rx_desc[%d] -> %lx\n", xsk
, i
, addr
);
125 *xsk_ring_prod__fill_addr(&xsk
->fill
, idx
+ i
) = addr
;
127 xsk_ring_prod__submit(&xsk
->fill
, ret
);
132 static void close_xsk(struct xsk
*xsk
)
135 xsk_umem__delete(xsk
->umem
);
137 xsk_socket__delete(xsk
->socket
);
138 munmap(xsk
->umem_area
, UMEM_SIZE
);
/*
 * refill_rx() - give one RX frame back through the fill ring.
 * @xsk:  socket whose fill ring is used
 * @addr: UMEM address of the frame to post again
 *
 * Reserves a single fill-ring slot. Only when exactly one slot was reserved
 * is the address logged, written into that slot and submitted.
 * NOTE(review): no handling for a failed reservation is visible here.
 */
141 static void refill_rx(struct xsk
*xsk
, __u64 addr
)
145 if (xsk_ring_prod__reserve(&xsk
->fill
, 1, &idx
) == 1) {
146 printf("%p: complete rx idx=%u addr=%llx\n", xsk
, idx
, addr
);
147 *xsk_ring_prod__fill_addr(&xsk
->fill
, idx
) = addr
;
148 xsk_ring_prod__submit(&xsk
->fill
, 1);
152 static int kick_tx(struct xsk
*xsk
)
154 return sendto(xsk_socket__fd(xsk
->socket
), NULL
, 0, MSG_DONTWAIT
, NULL
, 0);
157 static int kick_rx(struct xsk
*xsk
)
159 return recvfrom(xsk_socket__fd(xsk
->socket
), NULL
, 0, MSG_DONTWAIT
, NULL
, NULL
);
/*
 * gettime() - read a clock and return its value in nanoseconds.
 * @clock_id: clock to sample with clock_gettime()
 *
 * The result is tv_sec * NANOSEC_PER_SEC + tv_nsec, computed in 64 bits.
 * Failures are reported through error() using the clock_gettime() return
 * value as the exit status.
 * NOTE(review): the declarations of "t" and "res" and the condition guarding
 * the error() call are not visible in this excerpt.
 */
162 #define NANOSEC_PER_SEC 1000000000 /* 10^9 */
163 static __u64
gettime(clockid_t clock_id
)
168 /* See man clock_gettime(2) for type of clock_id's */
169 res
= clock_gettime(clock_id
, &t
);
172 error(res
, errno
, "Error with clock_gettime()");
174 return (__u64
) t
.tv_sec
* NANOSEC_PER_SEC
+ t
.tv_nsec
;
177 static void print_tstamp_delta(const char *name
, const char *refname
,
178 __u64 tstamp
, __u64 reference
)
180 __s64 delta
= (__s64
)reference
- (__s64
)tstamp
;
182 printf("%s: %llu (sec:%0.4f) delta to %s sec:%0.4f (%0.3f usec)\n",
183 name
, tstamp
, (double)tstamp
/ NANOSEC_PER_SEC
, refname
,
184 (double)delta
/ NANOSEC_PER_SEC
,
185 (double)delta
/ 1000);
188 #define VLAN_PRIO_MASK GENMASK(15, 13) /* Priority Code Point */
189 #define VLAN_DEI_MASK GENMASK(12, 12) /* Drop Eligible Indicator */
190 #define VLAN_VID_MASK GENMASK(11, 0) /* VLAN Identifier */
191 static void print_vlan_tci(__u16 tag
)
193 __u16 vlan_id
= FIELD_GET(VLAN_VID_MASK
, tag
);
194 __u8 pcp
= FIELD_GET(VLAN_PRIO_MASK
, tag
);
195 bool dei
= FIELD_GET(VLAN_DEI_MASK
, tag
);
197 printf("PCP=%u, DEI=%d, VID=0x%X\n", pcp
, dei
, vlan_id
);
/*
 * verify_xdp_metadata() - print the RX metadata stored in front of a packet.
 * @data:     start of the packet; struct xdp_meta sits immediately before it
 * @clock_id: clock sampled as the user-space reference for the RX timestamps
 *
 * For each hint (RSS hash, RX timestamp, VLAN tag) the value is printed when
 * its bit is set in meta->hint_valid; a second printf reports the per-hint
 * error code. When the timestamp hint is valid, the HW and XDP RX timestamps
 * are also saved in last_hw_rx_timestamp / last_xdp_rx_timestamp so that
 * complete_tx() can compare against them later.
 *
 * NOTE(review): the "else" lines that pair each error printf with its check
 * are not visible in this excerpt.
 */
200 static void verify_xdp_metadata(void *data
, clockid_t clock_id
)
202 struct xdp_meta
*meta
;
/* The metadata area ends exactly where the packet data begins. */
204 meta
= data
- sizeof(*meta
);
206 if (meta
->hint_valid
& XDP_META_FIELD_RSS
)
207 printf("rx_hash: 0x%X with RSS type:0x%X\n",
208 meta
->rx_hash
, meta
->rx_hash_type
);
210 printf("No rx_hash, err=%d\n", meta
->rx_hash_err
);
212 if (meta
->hint_valid
& XDP_META_FIELD_TS
) {
213 __u64 ref_tstamp
= gettime(clock_id
);
215 /* store received timestamps to calculate a delta at tx */
216 last_hw_rx_timestamp
= meta
->rx_timestamp
;
217 last_xdp_rx_timestamp
= meta
->xdp_timestamp
;
219 print_tstamp_delta("HW RX-time", "User RX-time",
220 meta
->rx_timestamp
, ref_tstamp
);
221 print_tstamp_delta("XDP RX-time", "User RX-time",
222 meta
->xdp_timestamp
, ref_tstamp
);
224 printf("No rx_timestamp, err=%d\n", meta
->rx_timestamp_err
);
227 if (meta
->hint_valid
& XDP_META_FIELD_VLAN_TAG
) {
228 printf("rx_vlan_proto: 0x%X\n", ntohs(meta
->rx_vlan_proto
));
229 printf("rx_vlan_tci: ");
230 print_vlan_tci(meta
->rx_vlan_tci
);
232 printf("No rx_vlan_tci or rx_vlan_proto, err=%d\n",
233 meta
->rx_vlan_tag_err
);
/*
 * verify_skb_metadata() - read one datagram and report its skb HW timestamp.
 * @fd: socket to receive from
 *
 * Receives a message (up to 128 payload bytes) together with its control
 * data via recvmsg(); a recvmsg() failure terminates the program through
 * error(). The SOL_SOCKET control messages are then walked and, for
 * SCM_TIMESTAMPING, ts->ts[2] is printed as the skb hwtstamp when either of
 * its fields is non-zero.
 *
 * NOTE(review): the declarations of "hdr" and "cmsg_buf", the statement under
 * the cmsg_level check, the rest of the switch and the path leading to the
 * final "not found" message are not visible in this excerpt.
 */
237 static void verify_skb_metadata(int fd
)
240 char packet_buf
[128];
242 struct scm_timestamping
*ts
;
243 struct iovec packet_iov
;
244 struct cmsghdr
*cmsg
;
247 memset(&hdr
, 0, sizeof(hdr
));
248 hdr
.msg_iov
= &packet_iov
;
250 packet_iov
.iov_base
= packet_buf
;
251 packet_iov
.iov_len
= sizeof(packet_buf
);
253 hdr
.msg_control
= cmsg_buf
;
254 hdr
.msg_controllen
= sizeof(cmsg_buf
);
256 if (recvmsg(fd
, &hdr
, 0) < 0)
257 error(1, errno
, "recvmsg");
/* Walk every control message attached to the received datagram. */
259 for (cmsg
= CMSG_FIRSTHDR(&hdr
); cmsg
!= NULL
;
260 cmsg
= CMSG_NXTHDR(&hdr
, cmsg
)) {
262 if (cmsg
->cmsg_level
!= SOL_SOCKET
)
265 switch (cmsg
->cmsg_type
) {
266 case SCM_TIMESTAMPING
:
267 ts
= (struct scm_timestamping
*)CMSG_DATA(cmsg
);
268 if (ts
->ts
[2].tv_sec
|| ts
->ts
[2].tv_nsec
) {
269 printf("found skb hwtstamp = %lu.%lu\n",
270 ts
->ts
[2].tv_sec
, ts
->ts
[2].tv_nsec
);
279 printf("skb hwtstamp is not found!\n");
/*
 * complete_tx() - consume one TX completion and print its timestamps.
 * @xsk:      socket whose completion ring is checked
 * @clock_id: clock sampled as the user-space reference time
 *
 * Peeks a single entry from the completion ring. For that entry the frame's
 * TX metadata (located right before the packet data) is inspected: when a
 * TX timestamp was written back it is compared against the current time and
 * against the RX timestamps remembered by verify_xdp_metadata(); there is
 * also a "No tx_timestamp" message. The entry is then released.
 *
 * NOTE(review): the return statements are not visible in this excerpt. The
 * caller in verify_metadata() retries this function in a loop, so it
 * presumably returns true once a completion was consumed - confirm.
 */
282 static bool complete_tx(struct xsk
*xsk
, clockid_t clock_id
)
284 struct xsk_tx_metadata
*meta
;
289 if (!xsk_ring_cons__peek(&xsk
->comp
, 1, &idx
))
292 addr
= *xsk_ring_cons__comp_addr(&xsk
->comp
, idx
);
293 data
= xsk_umem__get_data(xsk
->umem_area
, addr
);
/* TX metadata sits directly in front of the packet data. */
294 meta
= data
- sizeof(struct xsk_tx_metadata
);
296 printf("%p: complete tx idx=%u addr=%llx\n", xsk
, idx
, addr
);
298 if (meta
->completion
.tx_timestamp
) {
299 __u64 ref_tstamp
= gettime(clock_id
);
301 print_tstamp_delta("HW TX-complete-time", "User TX-complete-time",
302 meta
->completion
.tx_timestamp
, ref_tstamp
);
303 print_tstamp_delta("XDP RX-time", "User TX-complete-time",
304 last_xdp_rx_timestamp
, ref_tstamp
);
305 print_tstamp_delta("HW RX-time", "HW TX-complete-time",
306 last_hw_rx_timestamp
, meta
->completion
.tx_timestamp
);
308 printf("No tx_timestamp\n");
311 xsk_ring_cons__release(&xsk
->comp
, 1);
/*
 * swap_bytes() - exchange @len bytes between two buffers.
 * @a:   first buffer
 * @b:   second buffer
 * @len: number of bytes to exchange
 *
 * The buffers are expected not to overlap. Being a real function, each
 * argument is evaluated exactly once and the loop counter cannot clash with
 * a caller's variable.
 */
static inline void swap_bytes(void *a, void *b, size_t len)
{
	unsigned char *pa = a;
	unsigned char *pb = b;

	for (size_t i = 0; i < len; i++) {
		unsigned char tmp = pa[i];

		pa[i] = pb[i];
		pb[i] = tmp;
	}
}

/* Kept as a macro so that existing swap(a, b, len) call sites stay unchanged. */
#define swap(a, b, len) swap_bytes((a), (b), (len))
/*
 * ping_pong() - turn a received packet into a reply and queue it for TX.
 * @xsk:       socket whose TX ring is used
 * @rx_packet: received packet; it is the source of the final memcpy()
 * @clock_id:  not used in the lines visible here
 *
 * Visible flow:
 *  - reserve one TX descriptor and point it at frame idx % (UMEM_NUM / 2),
 *    offset by the size of struct xsk_tx_metadata;
 *  - zero the TX metadata in front of the data and request a TX timestamp;
 *  - locate the IPv4 or IPv6 header and the UDP header behind the Ethernet
 *    header; any other EtherType is reported and the reservation cancelled;
 *  - swap the MAC addresses, IP addresses and UDP ports;
 *  - remember the original UDP checksum, replace it with the inverted
 *    pseudo-header sum and request checksum offload through
 *    csum_start / csum_offset;
 *  - copy the packet into the TX frame, flag the descriptor with
 *    XDP_TX_METADATA and submit it.
 *
 * NOTE(review): the declarations and initialisation of eth, udph, data, len,
 * idx, ret and want_csum, the conditions selecting between the IPv4 and IPv6
 * statements, and the early returns are not visible in this excerpt.
 * Presumably eth points at rx_packet, which would mean the header rewrites
 * modify the received buffer before it is copied - confirm.
 */
324 static void ping_pong(struct xsk
*xsk
, void *rx_packet
, clockid_t clock_id
)
326 struct xsk_tx_metadata
*meta
;
327 struct ipv6hdr
*ip6h
= NULL
;
328 struct iphdr
*iph
= NULL
;
329 struct xdp_desc
*tx_desc
;
338 ret
= xsk_ring_prod__reserve(&xsk
->tx
, 1, &idx
);
340 printf("%p: failed to reserve tx slot\n", xsk
);
344 tx_desc
= xsk_ring_prod__tx_desc(&xsk
->tx
, idx
);
/* Packet data starts after the per-frame TX metadata area. */
345 tx_desc
->addr
= idx
% (UMEM_NUM
/ 2) * UMEM_FRAME_SIZE
+ sizeof(struct xsk_tx_metadata
);
346 data
= xsk_umem__get_data(xsk
->umem_area
, tx_desc
->addr
);
348 meta
= data
- sizeof(struct xsk_tx_metadata
);
349 memset(meta
, 0, sizeof(*meta
));
350 meta
->flags
= XDP_TXMD_FLAGS_TIMESTAMP
;
354 if (eth
->h_proto
== htons(ETH_P_IP
)) {
355 iph
= (void *)(eth
+ 1);
356 udph
= (void *)(iph
+ 1);
357 } else if (eth
->h_proto
== htons(ETH_P_IPV6
)) {
358 ip6h
= (void *)(eth
+ 1);
359 udph
= (void *)(ip6h
+ 1);
361 printf("%p: failed to detect IP version for ping pong %04x\n", xsk
, eth
->h_proto
);
362 xsk_ring_prod__cancel(&xsk
->tx
, 1);
368 len
+= sizeof(*ip6h
) + ntohs(ip6h
->payload_len
);
370 len
+= ntohs(iph
->tot_len
);
372 swap(eth
->h_dest
, eth
->h_source
, ETH_ALEN
);
374 swap(&iph
->saddr
, &iph
->daddr
, 4);
376 swap(&ip6h
->saddr
, &ip6h
->daddr
, 16);
377 swap(&udph
->source
, &udph
->dest
, 2);
379 want_csum
= udph
->check
;
381 udph
->check
= ~csum_ipv6_magic(&ip6h
->saddr
, &ip6h
->daddr
,
382 ntohs(udph
->len
), IPPROTO_UDP
, 0);
384 udph
->check
= ~csum_tcpudp_magic(iph
->saddr
, iph
->daddr
,
385 ntohs(udph
->len
), IPPROTO_UDP
, 0);
387 meta
->flags
|= XDP_TXMD_FLAGS_CHECKSUM
;
389 meta
->request
.csum_start
= sizeof(*eth
) + sizeof(*iph
);
391 meta
->request
.csum_start
= sizeof(*eth
) + sizeof(*ip6h
);
392 meta
->request
.csum_offset
= offsetof(struct udphdr
, check
);
394 printf("%p: ping-pong with csum=%04x (want %04x) csum_start=%d csum_offset=%d\n",
395 xsk
, ntohs(udph
->check
), ntohs(want_csum
),
396 meta
->request
.csum_start
, meta
->request
.csum_offset
);
398 memcpy(data
, rx_packet
, len
); /* don't share umem chunk for simplicity */
399 tx_desc
->options
|= XDP_TX_METADATA
;
402 xsk_ring_prod__submit(&xsk
->tx
, 1);
/*
 * verify_metadata() - poll the sockets and check metadata of what arrives.
 * @rx_xsk:    array of AF_XDP sockets, one per RX queue
 * @rxq:       number of entries in @rx_xsk
 * @server_fd: regular socket fd that is polled alongside the AF_XDP sockets
 * @clock_id:  clock passed down for the timestamp comparisons
 *
 * Visible flow: build a pollfd array with one POLLIN entry per AF_XDP socket
 * plus one for @server_fd; kick RX on every socket; poll() with a timeout of
 * 1000 and print the pkts_skip / pkts_fail / pkts_redir counters of the BPF
 * program; when @server_fd has events, run verify_skb_metadata(); for every
 * AF_XDP socket with events, peek one RX descriptor, print it, verify its
 * XDP metadata, send a reply with ping_pong(), call complete_tx() up to 500
 * times, then release the descriptor and repost its frame with refill_rx().
 *
 * NOTE(review): the enclosing loop, several conditions (for example around
 * kick_rx(), ping_pong() and kick_tx()), the trailing arguments of some
 * calls and the return statements are not visible in this excerpt.
 */
405 static int verify_metadata(struct xsk
*rx_xsk
, int rxq
, int server_fd
, clockid_t clock_id
)
407 const struct xdp_desc
*rx_desc
;
/* One slot per AF_XDP socket plus a final slot for server_fd. */
408 struct pollfd fds
[rxq
+ 1];
415 for (i
= 0; i
< rxq
; i
++) {
416 fds
[i
].fd
= xsk_socket__fd(rx_xsk
[i
].socket
);
417 fds
[i
].events
= POLLIN
;
421 fds
[rxq
].fd
= server_fd
;
422 fds
[rxq
].events
= POLLIN
;
423 fds
[rxq
].revents
= 0;
428 for (i
= 0; i
< rxq
; i
++) {
429 ret
= kick_rx(&rx_xsk
[i
]);
431 printf("kick_rx ret=%d\n", ret
);
434 ret
= poll(fds
, rxq
+ 1, 1000);
435 printf("poll: %d (%d) skip=%llu fail=%llu redir=%llu\n",
436 ret
, errno
, bpf_obj
->bss
->pkts_skip
,
437 bpf_obj
->bss
->pkts_fail
, bpf_obj
->bss
->pkts_redir
);
443 if (fds
[rxq
].revents
)
444 verify_skb_metadata(server_fd
);
446 for (i
= 0; i
< rxq
; i
++) {
447 bool first_seg
= true;
450 if (fds
[i
].revents
== 0)
453 struct xsk
*xsk
= &rx_xsk
[i
];
455 ret
= xsk_ring_cons__peek(&xsk
->rx
, 1, &idx
);
456 printf("xsk_ring_cons__peek: %d\n", ret
);
460 rx_desc
= xsk_ring_cons__rx_desc(&xsk
->rx
, idx
);
461 comp_addr
= xsk_umem__extract_addr(rx_desc
->addr
);
462 addr
= xsk_umem__add_offset_to_addr(rx_desc
->addr
);
/* A descriptor without XDP_PKT_CONTD is the last one of its packet. */
463 is_eop
= !(rx_desc
->options
& XDP_PKT_CONTD
);
464 printf("%p: rx_desc[%u]->addr=%llx addr=%llx comp_addr=%llx%s\n",
465 xsk
, idx
, rx_desc
->addr
, addr
, comp_addr
, is_eop
? " EoP" : "");
467 verify_xdp_metadata(xsk_umem__get_data(xsk
->umem_area
, addr
),
472 /* mirror first chunk back */
473 ping_pong(xsk
, xsk_umem__get_data(xsk
->umem_area
, addr
),
478 printf("kick_tx ret=%d\n", ret
);
480 for (int j
= 0; j
< 500; j
++) {
481 if (complete_tx(xsk
, clock_id
))
488 xsk_ring_cons__release(&xsk
->rx
, 1);
489 refill_rx(xsk
, comp_addr
);
/*
 * rxq_num() - ask the driver how many RX queues an interface has.
 * @ifname: interface name
 *
 * Issues an ETHTOOL_GCHANNELS request through the SIOCETHTOOL ioctl on a
 * freshly created AF_UNIX datagram socket. Failures of socket() or ioctl()
 * terminate the program through error().
 *
 * Return: rx_count + combined_count as reported by the driver.
 *
 * NOTE(review): the declaration of "ifr", the conditions guarding the
 * error() calls and any close() of "fd" are not visible in this excerpt.
 */
498 static int rxq_num(const char *ifname
)
500 struct ethtool_channels ch
= {
501 .cmd
= ETHTOOL_GCHANNELS
,
505 .ifr_data
= (void *)&ch
,
/* Copy at most IF_NAMESIZE - 1 bytes of the name. */
507 strncpy(ifr
.ifr_name
, ifname
, IF_NAMESIZE
- 1);
510 fd
= socket(AF_UNIX
, SOCK_DGRAM
, 0);
512 error(1, errno
, "socket");
514 ret
= ioctl(fd
, SIOCETHTOOL
, &ifr
);
516 error(1, errno
, "ioctl(SIOCETHTOOL)");
520 return ch
.rx_count
+ ch
.combined_count
;
/*
 * hwtstamp_ioctl() - run a HW timestamping ioctl on an interface.
 * @op:     ioctl request; callers in this file pass SIOCGHWTSTAMP or
 *          SIOCSHWTSTAMP
 * @ifname: interface name
 * @cfg:    configuration buffer handed to the ioctl through ifr_data
 *
 * Uses a freshly created AF_UNIX datagram socket as the ioctl target.
 * Failures of socket() or ioctl() terminate the program through error().
 *
 * NOTE(review): the declaration of "ifr", the conditions guarding the
 * error() calls and any close() of "fd" are not visible in this excerpt.
 */
523 static void hwtstamp_ioctl(int op
, const char *ifname
, struct hwtstamp_config
*cfg
)
526 .ifr_data
= (void *)cfg
,
528 strncpy(ifr
.ifr_name
, ifname
, IF_NAMESIZE
- 1);
531 fd
= socket(AF_UNIX
, SOCK_DGRAM
, 0);
533 error(1, errno
, "socket");
535 ret
= ioctl(fd
, op
, &ifr
);
537 error(1, errno
, "ioctl(%d)", op
);
/*
 * HW timestamping configuration and interface name captured by
 * hwtstamp_enable() so that the original setting can be put back later.
 */
542 static struct hwtstamp_config saved_hwtstamp_cfg
;
543 static const char *saved_hwtstamp_ifname
;
/*
 * hwtstamp_restore() - write the saved HW timestamping configuration back.
 *
 * Registered with atexit() by hwtstamp_enable().
 */
545 static void hwtstamp_restore(void)
547 hwtstamp_ioctl(SIOCSHWTSTAMP
, saved_hwtstamp_ifname
, &saved_hwtstamp_cfg
);
/*
 * hwtstamp_enable() - switch on HW RX timestamping for all packets.
 * @ifname: interface name
 *
 * Reads the current configuration into saved_hwtstamp_cfg, keeps a private
 * copy of the interface name, registers hwtstamp_restore() to run at exit
 * and then applies a configuration whose rx_filter is HWTSTAMP_FILTER_ALL.
 * The copied name is released in cleanup().
 *
 * NOTE(review): any further initialisers of "cfg" are not visible in this
 * excerpt, and the strdup() result is not checked in the visible lines.
 */
550 static void hwtstamp_enable(const char *ifname
)
552 struct hwtstamp_config cfg
= {
553 .rx_filter
= HWTSTAMP_FILTER_ALL
,
556 hwtstamp_ioctl(SIOCGHWTSTAMP
, ifname
, &saved_hwtstamp_cfg
);
557 saved_hwtstamp_ifname
= strdup(ifname
);
558 atexit(hwtstamp_restore
);
560 hwtstamp_ioctl(SIOCSHWTSTAMP
, ifname
, &cfg
);
/*
 * cleanup() - release everything main() set up.
 *
 * Detaches the XDP program from the interface when its fd is valid, closes
 * every AF_XDP socket in rx_xsk, destroys the BPF skeleton and frees the
 * interface name saved by hwtstamp_enable().
 *
 * NOTE(review): the declarations of "ret" and "i" and the condition guarding
 * the "failed to detach" message are not visible in this excerpt.
 */
563 static void cleanup(void)
565 LIBBPF_OPTS(bpf_xdp_attach_opts
, opts
);
/* Passing old_prog_fd asks the detach to target this specific program. */
570 opts
.old_prog_fd
= bpf_program__fd(bpf_obj
->progs
.rx
);
571 if (opts
.old_prog_fd
>= 0) {
572 printf("detaching bpf program....\n");
573 ret
= bpf_xdp_detach(ifindex
, XDP_FLAGS
, &opts
);
575 printf("failed to detach XDP program: %d\n", ret
);
579 for (i
= 0; i
< rxq
; i
++)
580 close_xsk(&rx_xsk
[i
]);
583 xdp_hw_metadata__destroy(bpf_obj
);
585 free((void *)saved_hwtstamp_ifname
);
/*
 * handle_signal() - signal handler that deliberately does nothing.
 * @sig: delivered signal number (unused)
 */
static void handle_signal(int sig)
{
	/* interrupting poll() is all we need */
}
/*
 * timestamping_enable() - set the SO_TIMESTAMPING option on a socket.
 * @fd:  socket file descriptor
 * @val: SOF_TIMESTAMPING_* flag mask to apply
 *
 * A setsockopt() failure terminates the program through error().
 * NOTE(review): the declaration of "ret" and the condition guarding the
 * error() call are not visible in this excerpt.
 */
593 static void timestamping_enable(int fd
, int val
)
597 ret
= setsockopt(fd
, SOL_SOCKET
, SO_TIMESTAMPING
, &val
, sizeof(val
));
599 error(1, errno
, "setsockopt(SO_TIMESTAMPING)");
/*
 * print_usage() - help text describing the options and how to generate test
 * traffic.
 *
 * NOTE(review): the statement that owns this string literal and the call
 * that prints it are not visible in this excerpt. The blank line ("\n\n")
 * sits after the -h entry, so it splits the option list instead of
 * separating it from the "Generate test packets" hint - probably meant to
 * follow the -r entry.
 */
602 static void print_usage(void)
605 "Usage: xdp_hw_metadata [OPTIONS] [IFNAME]\n"
606 " -c Run in copy mode (zerocopy is default)\n"
607 " -h Display this help and exit\n\n"
608 " -m Enable multi-buffer XDP for larger MTU\n"
609 " -r Don't generate AF_XDP reply (rx metadata only)\n"
610 "Generate test packets on the other machine with:\n"
611 " echo -n xdp | nc -u -q1 <dst_ip> 9091\n";
/*
 * read_args() - parse the command line into the global settings.
 * @argc: argument count from main()
 * @argv: argument vector from main()
 *
 * Accepts the options "c", "h", "m" and "r" through getopt(). The visible
 * effects are: replacing XDP_USE_NEED_WAKEUP and XDP_ZEROCOPY in bind_flags
 * with XDP_COPY, and adding XDP_USE_SG to bind_flags. The first non-option
 * argument is taken as the interface name and resolved to an index with
 * if_nametoindex().
 *
 * NOTE(review): the switch/case labels tying each statement to its option,
 * the handling of -h and -r, and the conditions guarding the error() calls
 * are not visible in this excerpt. Going by the help text, the XDP_COPY
 * statements presumably belong to -c and XDP_USE_SG to -m - confirm.
 */
616 static void read_args(int argc
, char *argv
[])
620 while ((opt
= getopt(argc
, argv
, "chmr")) != -1) {
623 bind_flags
&= ~XDP_USE_NEED_WAKEUP
;
624 bind_flags
&= ~XDP_ZEROCOPY
;
625 bind_flags
|= XDP_COPY
;
631 bind_flags
|= XDP_USE_SG
;
638 fprintf(stderr
, "Unknown option: -%c\n", optopt
);
642 error(-1, opterr
, "Command line options error");
646 if (optind
>= argc
) {
647 fprintf(stderr
, "No device name provided\n");
652 ifname
= argv
[optind
];
653 ifindex
= if_nametoindex(ifname
);
656 error(-1, errno
, "Invalid interface name");
/*
 * main() - set everything up, then run the metadata verification loop.
 *
 * Visible sequence:
 *  1. parse the arguments, query the RX queue count and enable HW
 *     timestamping on the interface;
 *  2. allocate one struct xsk per queue and open an AF_XDP socket on each;
 *  3. open the BPF skeleton, bind the "rx" program to the interface with
 *     BPF_F_XDP_DEV_BOUND_ONLY and load it;
 *  4. start a UDP server on port 9092 and enable software plus raw hardware
 *     timestamping on it;
 *  5. store every AF_XDP socket fd in the BPF "xsk" map;
 *  6. attach the XDP program, install the SIGINT handler and call
 *     verify_metadata().
 *
 * Each failure is reported through error() with exit status 1.
 *
 * NOTE(review): the remaining local declarations, the conditions guarding
 * the error() calls, the trailing arguments of bpf_xdp_attach(), and any
 * cleanup after verify_metadata() are not visible in this excerpt. The
 * buffer from malloc() is not zeroed in the visible lines.
 */
659 int main(int argc
, char *argv
[])
661 clockid_t clock_id
= CLOCK_TAI
;
666 struct bpf_program
*prog
;
668 read_args(argc
, argv
);
670 rxq
= rxq_num(ifname
);
672 printf("rxq: %d\n", rxq
);
674 hwtstamp_enable(ifname
);
676 rx_xsk
= malloc(sizeof(struct xsk
) * rxq
);
678 error(1, ENOMEM
, "malloc");
/* One AF_XDP socket per RX queue; the queue id equals the array index. */
680 for (i
= 0; i
< rxq
; i
++) {
681 printf("open_xsk(%s, %p, %d)\n", ifname
, &rx_xsk
[i
], i
);
682 ret
= open_xsk(ifindex
, &rx_xsk
[i
], i
);
684 error(1, -ret
, "open_xsk");
686 printf("xsk_socket__fd() -> %d\n", xsk_socket__fd(rx_xsk
[i
].socket
));
689 printf("open bpf program...\n");
690 bpf_obj
= xdp_hw_metadata__open();
691 if (libbpf_get_error(bpf_obj
))
692 error(1, libbpf_get_error(bpf_obj
), "xdp_hw_metadata__open");
/* The ifindex and the dev-bound flag are set before the program is loaded. */
694 prog
= bpf_object__find_program_by_name(bpf_obj
->obj
, "rx");
695 bpf_program__set_ifindex(prog
, ifindex
);
696 bpf_program__set_flags(prog
, BPF_F_XDP_DEV_BOUND_ONLY
);
698 printf("load bpf program...\n");
699 ret
= xdp_hw_metadata__load(bpf_obj
);
701 error(1, -ret
, "xdp_hw_metadata__load");
703 printf("prepare skb endpoint...\n");
704 server_fd
= start_server(AF_INET6
, SOCK_DGRAM
, NULL
, 9092, 1000);
706 error(1, errno
, "start_server");
707 timestamping_enable(server_fd
,
708 SOF_TIMESTAMPING_SOFTWARE
|
709 SOF_TIMESTAMPING_RAW_HARDWARE
);
711 printf("prepare xsk map...\n");
712 for (i
= 0; i
< rxq
; i
++) {
713 int sock_fd
= xsk_socket__fd(rx_xsk
[i
].socket
);
716 printf("map[%d] = %d\n", queue_id
, sock_fd
);
717 ret
= bpf_map_update_elem(bpf_map__fd(bpf_obj
->maps
.xsk
), &queue_id
, &sock_fd
, 0);
719 error(1, -ret
, "bpf_map_update_elem");
722 printf("attach bpf program...\n");
723 ret
= bpf_xdp_attach(ifindex
,
724 bpf_program__fd(bpf_obj
->progs
.rx
),
727 error(1, -ret
, "bpf_xdp_attach");
729 signal(SIGINT
, handle_signal
);
730 ret
= verify_metadata(rx_xsk
, rxq
, server_fd
, clock_id
);
734 error(1, -ret
, "verify_metadata");