// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)

/*
 * AF_XDP user-space access library.
 *
 * Copyright(c) 2018 - 2019 Intel Corporation.
 *
 * Author(s): Magnus Karlsson <magnus.karlsson@intel.com>
 */

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <asm/barrier.h>
#include <linux/compiler.h>
#include <linux/ethtool.h>
#include <linux/filter.h>
#include <linux/if_ether.h>
#include <linux/if_link.h>
#include <linux/if_packet.h>
#include <linux/if_xdp.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/sockios.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/types.h>

#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include "xsk.h"

#define pr_warn(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__)
struct xsk_umem {
	struct xsk_ring_prod *fill_save;
	struct xsk_ring_cons *comp_save;
	void *umem_area;
	struct xsk_umem_config config;
	int fd;
	int refcount;
	struct list_head ctx_list;
	bool rx_ring_setup_done;
	bool tx_ring_setup_done;
};

struct xsk_ctx {
	struct xsk_ring_prod *fill;
	struct xsk_ring_cons *comp;
	__u32 queue_id;
	struct xsk_umem *umem;
	int refcount;
	int ifindex;
	struct list_head list;
};

struct xsk_socket {
	struct xsk_ring_cons *rx;
	struct xsk_ring_prod *tx;
	struct xsk_socket_config config;
	struct xsk_ctx *ctx;
	int fd;
};

/* Request layout used by xsk_set_mtu() below: a netlink header followed
 * by the link message and room for the MTU attribute.
 */
struct nl_mtu_req {
	struct nlmsghdr nh;
	struct ifinfomsg msg;
	char buf[512];
};
int xsk_umem__fd(const struct xsk_umem *umem)
{
	return umem ? umem->fd : -EINVAL;
}

int xsk_socket__fd(const struct xsk_socket *xsk)
{
	return xsk ? xsk->fd : -EINVAL;
}

static bool xsk_page_aligned(void *buffer)
{
	unsigned long addr = (unsigned long)buffer;

	return !(addr & (getpagesize() - 1));
}
static void xsk_set_umem_config(struct xsk_umem_config *cfg,
				const struct xsk_umem_config *usr_cfg)
{
	if (!usr_cfg) {
		cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
		cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
		cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
		cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
		cfg->flags = XSK_UMEM__DEFAULT_FLAGS;
		cfg->tx_metadata_len = 0;
		return;
	}

	cfg->fill_size = usr_cfg->fill_size;
	cfg->comp_size = usr_cfg->comp_size;
	cfg->frame_size = usr_cfg->frame_size;
	cfg->frame_headroom = usr_cfg->frame_headroom;
	cfg->flags = usr_cfg->flags;
	cfg->tx_metadata_len = usr_cfg->tx_metadata_len;
}
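
/*
 * Illustrative sketch (not part of the library): a caller that wants
 * smaller rings and extra per-frame headroom passes a filled-in
 * xsk_umem_config as the last argument of xsk_umem__create() instead of
 * NULL. The values below are arbitrary examples, not recommendations.
 *
 *	struct xsk_umem_config cfg = {
 *		.fill_size = 2048,
 *		.comp_size = 2048,
 *		.frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
 *		.frame_headroom = 256,
 *		.flags = XSK_UMEM__DEFAULT_FLAGS,
 *	};
 *
 *	err = xsk_umem__create(&umem, buffer, size, &fill, &comp, &cfg);
 */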
static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
				     const struct xsk_socket_config *usr_cfg)
{
	if (!usr_cfg) {
		cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
		cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
		cfg->bind_flags = 0;
		return 0;
	}

	cfg->rx_size = usr_cfg->rx_size;
	cfg->tx_size = usr_cfg->tx_size;
	cfg->bind_flags = usr_cfg->bind_flags;

	return 0;
}
static int xsk_get_mmap_offsets(int fd, struct xdp_mmap_offsets *off)
{
	socklen_t optlen;
	int err;

	optlen = sizeof(*off);
	err = getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, off, &optlen);
	if (err)
		return err;

	if (optlen == sizeof(*off))
		return 0;

	return -EINVAL;
}
static int xsk_create_umem_rings(struct xsk_umem *umem, int fd,
				 struct xsk_ring_prod *fill,
				 struct xsk_ring_cons *comp)
{
	struct xdp_mmap_offsets off;
	void *map;
	int err;

	err = setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING,
			 &umem->config.fill_size,
			 sizeof(umem->config.fill_size));
	if (err)
		return -errno;

	err = setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
			 &umem->config.comp_size,
			 sizeof(umem->config.comp_size));
	if (err)
		return -errno;

	err = xsk_get_mmap_offsets(fd, &off);
	if (err)
		return -errno;

	map = mmap(NULL, off.fr.desc + umem->config.fill_size * sizeof(__u64),
		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
		   XDP_UMEM_PGOFF_FILL_RING);
	if (map == MAP_FAILED)
		return -errno;

	fill->mask = umem->config.fill_size - 1;
	fill->size = umem->config.fill_size;
	fill->producer = map + off.fr.producer;
	fill->consumer = map + off.fr.consumer;
	fill->flags = map + off.fr.flags;
	fill->ring = map + off.fr.desc;
	fill->cached_cons = umem->config.fill_size;

	map = mmap(NULL, off.cr.desc + umem->config.comp_size * sizeof(__u64),
		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
		   XDP_UMEM_PGOFF_COMPLETION_RING);
	if (map == MAP_FAILED) {
		err = -errno;
		goto out_mmap;
	}

	comp->mask = umem->config.comp_size - 1;
	comp->size = umem->config.comp_size;
	comp->producer = map + off.cr.producer;
	comp->consumer = map + off.cr.consumer;
	comp->flags = map + off.cr.flags;
	comp->ring = map + off.cr.desc;

	return 0;

out_mmap:
	munmap(map, off.fr.desc + umem->config.fill_size * sizeof(__u64));
	return err;
}
int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area,
		     __u64 size, struct xsk_ring_prod *fill,
		     struct xsk_ring_cons *comp,
		     const struct xsk_umem_config *usr_config)
{
	struct xdp_umem_reg mr;
	struct xsk_umem *umem;
	int err;

	if (!umem_area || !umem_ptr || !fill || !comp)
		return -EFAULT;
	if (!size && !xsk_page_aligned(umem_area))
		return -EINVAL;

	umem = calloc(1, sizeof(*umem));
	if (!umem)
		return -ENOMEM;

	umem->fd = socket(AF_XDP, SOCK_RAW | SOCK_CLOEXEC, 0);
	if (umem->fd < 0) {
		err = -errno;
		goto out_umem_alloc;
	}

	umem->umem_area = umem_area;
	INIT_LIST_HEAD(&umem->ctx_list);
	xsk_set_umem_config(&umem->config, usr_config);

	memset(&mr, 0, sizeof(mr));
	mr.addr = (uintptr_t)umem_area;
	mr.len = size;
	mr.chunk_size = umem->config.frame_size;
	mr.headroom = umem->config.frame_headroom;
	mr.flags = umem->config.flags;
	mr.tx_metadata_len = umem->config.tx_metadata_len;

	err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
	if (err) {
		err = -errno;
		goto out_socket;
	}

	err = xsk_create_umem_rings(umem, umem->fd, fill, comp);
	if (err)
		goto out_socket;

	umem->fill_save = fill;
	umem->comp_save = comp;
	*umem_ptr = umem;
	return 0;

out_socket:
	close(umem->fd);
out_umem_alloc:
	free(umem);
	return err;
}
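
/*
 * Illustrative usage sketch (not part of this file; NUM_FRAMES and
 * handle_error() are placeholders): register a page-aligned buffer as a
 * umem with the default configuration. On success the fill and
 * completion rings passed in are ready to use.
 *
 *	struct xsk_ring_prod fq;
 *	struct xsk_ring_cons cq;
 *	struct xsk_umem *umem;
 *	__u64 size = NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE;
 *	void *bufs;
 *	int err;
 *
 *	if (posix_memalign(&bufs, getpagesize(), size))
 *		handle_error(-ENOMEM);
 *	err = xsk_umem__create(&umem, bufs, size, &fq, &cq, NULL);
 *	if (err)
 *		handle_error(err);
 */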
bool xsk_is_in_mode(u32 ifindex, int mode)
{
	LIBBPF_OPTS(bpf_xdp_query_opts, opts);
	int ret;

	ret = bpf_xdp_query(ifindex, mode, &opts);
	if (ret) {
		printf("XDP mode query returned error %s\n", strerror(errno));
		return false;
	}

	if (mode == XDP_FLAGS_DRV_MODE)
		return opts.attach_mode == XDP_ATTACHED_DRV;
	else if (mode == XDP_FLAGS_SKB_MODE)
		return opts.attach_mode == XDP_ATTACHED_SKB;

	return false;
}
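
/*
 * Illustrative sketch: a caller that requires native (driver-mode) XDP
 * can gate on this helper before continuing; "ifindex" is assumed to
 * come from elsewhere, e.g. if_nametoindex().
 *
 *	if (!xsk_is_in_mode(ifindex, XDP_FLAGS_DRV_MODE))
 *		printf("XDP program is not attached in driver mode\n");
 */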
/* Lifted from netlink.c in tools/lib/bpf */
static int netlink_recvmsg(int sock, struct msghdr *mhdr, int flags)
{
	int len;

	do {
		len = recvmsg(sock, mhdr, flags);
	} while (len < 0 && (errno == EINTR || errno == EAGAIN));

	if (len < 0)
		return -errno;
	return len;
}

/* Lifted from netlink.c in tools/lib/bpf */
static int alloc_iov(struct iovec *iov, int len)
{
	void *nbuf;

	nbuf = realloc(iov->iov_base, len);
	if (!nbuf)
		return -ENOMEM;

	iov->iov_base = nbuf;
	iov->iov_len = len;
	return 0;
}

/* Original version lifted from netlink.c in tools/lib/bpf */
static int netlink_recv(int sock)
{
	struct iovec iov = {};
	struct msghdr mhdr = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
	};
	bool multipart = true;
	struct nlmsgerr *err;
	struct nlmsghdr *nh;
	int len, ret;

	ret = alloc_iov(&iov, 4096);
	if (ret)
		goto done;

	while (multipart) {
		multipart = false;
		len = netlink_recvmsg(sock, &mhdr, MSG_PEEK | MSG_TRUNC);
		if (len < 0) {
			ret = len;
			goto done;
		}

		if (len > iov.iov_len) {
			ret = alloc_iov(&iov, len);
			if (ret)
				goto done;
		}

		len = netlink_recvmsg(sock, &mhdr, 0);
		if (len < 0) {
			ret = len;
			goto done;
		}

		for (nh = (struct nlmsghdr *)iov.iov_base; NLMSG_OK(nh, len);
		     nh = NLMSG_NEXT(nh, len)) {
			if (nh->nlmsg_flags & NLM_F_MULTI)
				multipart = true;
			switch (nh->nlmsg_type) {
			case NLMSG_ERROR:
				err = (struct nlmsgerr *)NLMSG_DATA(nh);
				if (!err->error)
					continue;
				ret = err->error;
				goto done;
			case NLMSG_DONE:
				ret = 0;
				goto done;
			default:
				break;
			}
		}
	}

	ret = 0;
done:
	free(iov.iov_base);
	return ret;
}
int xsk_set_mtu(int ifindex, int mtu)
{
	struct nl_mtu_req req;
	struct rtattr *rta;
	int fd, ret;

	fd = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE);
	if (fd < 0)
		return fd;

	memset(&req, 0, sizeof(req));
	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	req.nh.nlmsg_type = RTM_NEWLINK;
	req.msg.ifi_family = AF_UNSPEC;
	req.msg.ifi_index = ifindex;
	rta = (struct rtattr *)(((char *)&req) + NLMSG_ALIGN(req.nh.nlmsg_len));
	rta->rta_type = IFLA_MTU;
	rta->rta_len = RTA_LENGTH(sizeof(unsigned int));
	req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + RTA_LENGTH(sizeof(mtu));
	memcpy(RTA_DATA(rta), &mtu, sizeof(mtu));

	ret = send(fd, &req, req.nh.nlmsg_len, 0);
	if (ret < 0) {
		close(fd);
		return errno;
	}

	ret = netlink_recv(fd);
	close(fd);
	return ret;
}
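
/*
 * Illustrative sketch: raising the interface MTU, e.g. before a
 * multi-buffer test. 9000 is just an example jumbo value; the helper
 * returns 0 on success and a non-zero error code otherwise.
 *
 *	if (xsk_set_mtu(ifindex, 9000))
 *		printf("failed to set MTU\n");
 */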
int xsk_attach_xdp_program(struct bpf_program *prog, int ifindex, u32 xdp_flags)
{
	int prog_fd;

	prog_fd = bpf_program__fd(prog);
	return bpf_xdp_attach(ifindex, prog_fd, xdp_flags, NULL);
}

void xsk_detach_xdp_program(int ifindex, u32 xdp_flags)
{
	bpf_xdp_detach(ifindex, xdp_flags, NULL);
}

void xsk_clear_xskmap(struct bpf_map *map)
{
	u32 index = 0;
	int map_fd;

	map_fd = bpf_map__fd(map);
	bpf_map_delete_elem(map_fd, &index);
}

int xsk_update_xskmap(struct bpf_map *map, struct xsk_socket *xsk, u32 index)
{
	int map_fd, sock_fd;

	map_fd = bpf_map__fd(map);
	sock_fd = xsk_socket__fd(xsk);

	return bpf_map_update_elem(map_fd, &index, &sock_fd, 0);
}
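
/*
 * Illustrative sketch: after creating an AF_XDP socket, its fd has to
 * be inserted into the XSKMAP used by the XDP program so redirected
 * packets reach it. "xsks_map" and "obj" are placeholders for whatever
 * map and bpf_object the caller actually loaded.
 *
 *	struct bpf_map *map;
 *
 *	map = bpf_object__find_map_by_name(obj, "xsks_map");
 *	if (!map || xsk_update_xskmap(map, xsk, 0))
 *		printf("failed to insert socket into XSKMAP\n");
 */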
static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, int ifindex,
				   __u32 queue_id)
{
	struct xsk_ctx *ctx;

	if (list_empty(&umem->ctx_list))
		return NULL;

	list_for_each_entry(ctx, &umem->ctx_list, list) {
		if (ctx->ifindex == ifindex && ctx->queue_id == queue_id) {
			ctx->refcount++;
			return ctx;
		}
	}

	return NULL;
}
static void xsk_put_ctx(struct xsk_ctx *ctx, bool unmap)
{
	struct xsk_umem *umem = ctx->umem;
	struct xdp_mmap_offsets off;
	int err;

	if (--ctx->refcount)
		return;

	if (!unmap)
		goto out_free;

	err = xsk_get_mmap_offsets(umem->fd, &off);
	if (err)
		goto out_free;

	munmap(ctx->fill->ring - off.fr.desc, off.fr.desc + umem->config.fill_size *
	       sizeof(__u64));
	munmap(ctx->comp->ring - off.cr.desc, off.cr.desc + umem->config.comp_size *
	       sizeof(__u64));

out_free:
	list_del(&ctx->list);
	free(ctx);
}
static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk,
				      struct xsk_umem *umem, int ifindex,
				      __u32 queue_id,
				      struct xsk_ring_prod *fill,
				      struct xsk_ring_cons *comp)
{
	struct xsk_ctx *ctx;
	int err;

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx)
		return NULL;

	if (!umem->fill_save) {
		err = xsk_create_umem_rings(umem, xsk->fd, fill, comp);
		if (err) {
			free(ctx);
			return NULL;
		}
	} else if (umem->fill_save != fill || umem->comp_save != comp) {
		/* Copy over rings to new structs. */
		memcpy(fill, umem->fill_save, sizeof(*fill));
		memcpy(comp, umem->comp_save, sizeof(*comp));
	}

	ctx->ifindex = ifindex;
	ctx->refcount = 1;
	ctx->umem = umem;
	ctx->queue_id = queue_id;

	ctx->fill = fill;
	ctx->comp = comp;
	list_add(&ctx->list, &umem->ctx_list);
	return ctx;
}
int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
			      int ifindex,
			      __u32 queue_id, struct xsk_umem *umem,
			      struct xsk_ring_cons *rx,
			      struct xsk_ring_prod *tx,
			      struct xsk_ring_prod *fill,
			      struct xsk_ring_cons *comp,
			      const struct xsk_socket_config *usr_config)
{
	bool unmap, rx_setup_done = false, tx_setup_done = false;
	void *rx_map = NULL, *tx_map = NULL;
	struct sockaddr_xdp sxdp = {};
	struct xdp_mmap_offsets off;
	struct xsk_socket *xsk;
	struct xsk_ctx *ctx;
	int err;

	if (!umem || !xsk_ptr || !(rx || tx))
		return -EFAULT;

	unmap = umem->fill_save != fill;

	xsk = calloc(1, sizeof(*xsk));
	if (!xsk)
		return -ENOMEM;

	err = xsk_set_xdp_socket_config(&xsk->config, usr_config);
	if (err)
		goto out_xsk_alloc;

	if (umem->refcount++ > 0) {
		xsk->fd = socket(AF_XDP, SOCK_RAW | SOCK_CLOEXEC, 0);
		if (xsk->fd < 0) {
			err = -errno;
			goto out_xsk_alloc;
		}
	} else {
		xsk->fd = umem->fd;
		rx_setup_done = umem->rx_ring_setup_done;
		tx_setup_done = umem->tx_ring_setup_done;
	}

	ctx = xsk_get_ctx(umem, ifindex, queue_id);
	if (!ctx) {
		if (!fill || !comp) {
			err = -EFAULT;
			goto out_socket;
		}

		ctx = xsk_create_ctx(xsk, umem, ifindex, queue_id, fill, comp);
		if (!ctx) {
			err = -ENOMEM;
			goto out_socket;
		}
	}
	xsk->ctx = ctx;

	if (rx && !rx_setup_done) {
		err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING,
				 &xsk->config.rx_size,
				 sizeof(xsk->config.rx_size));
		if (err) {
			err = -errno;
			goto out_put_ctx;
		}
		if (xsk->fd == umem->fd)
			umem->rx_ring_setup_done = true;
	}
	if (tx && !tx_setup_done) {
		err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING,
				 &xsk->config.tx_size,
				 sizeof(xsk->config.tx_size));
		if (err) {
			err = -errno;
			goto out_put_ctx;
		}
		if (xsk->fd == umem->fd)
			umem->tx_ring_setup_done = true;
	}

	err = xsk_get_mmap_offsets(xsk->fd, &off);
	if (err) {
		err = -errno;
		goto out_put_ctx;
	}

	if (rx) {
		rx_map = mmap(NULL, off.rx.desc +
			      xsk->config.rx_size * sizeof(struct xdp_desc),
			      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
			      xsk->fd, XDP_PGOFF_RX_RING);
		if (rx_map == MAP_FAILED) {
			err = -errno;
			goto out_put_ctx;
		}

		rx->mask = xsk->config.rx_size - 1;
		rx->size = xsk->config.rx_size;
		rx->producer = rx_map + off.rx.producer;
		rx->consumer = rx_map + off.rx.consumer;
		rx->flags = rx_map + off.rx.flags;
		rx->ring = rx_map + off.rx.desc;
		rx->cached_prod = *rx->producer;
		rx->cached_cons = *rx->consumer;
	}
	xsk->rx = rx;

	if (tx) {
		tx_map = mmap(NULL, off.tx.desc +
			      xsk->config.tx_size * sizeof(struct xdp_desc),
			      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
			      xsk->fd, XDP_PGOFF_TX_RING);
		if (tx_map == MAP_FAILED) {
			err = -errno;
			goto out_mmap_rx;
		}

		tx->mask = xsk->config.tx_size - 1;
		tx->size = xsk->config.tx_size;
		tx->producer = tx_map + off.tx.producer;
		tx->consumer = tx_map + off.tx.consumer;
		tx->flags = tx_map + off.tx.flags;
		tx->ring = tx_map + off.tx.desc;
		tx->cached_prod = *tx->producer;
		/* cached_cons is r->size bigger than the real consumer pointer
		 * See xsk_prod_nb_free
		 */
		tx->cached_cons = *tx->consumer + xsk->config.tx_size;
	}
	xsk->tx = tx;

	sxdp.sxdp_family = PF_XDP;
	sxdp.sxdp_ifindex = ctx->ifindex;
	sxdp.sxdp_queue_id = ctx->queue_id;
	if (umem->refcount > 1) {
		sxdp.sxdp_flags |= XDP_SHARED_UMEM;
		sxdp.sxdp_shared_umem_fd = umem->fd;
	} else {
		sxdp.sxdp_flags = xsk->config.bind_flags;
	}

	err = bind(xsk->fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
	if (err) {
		err = -errno;
		goto out_mmap_tx;
	}

	*xsk_ptr = xsk;
	umem->fill_save = NULL;
	umem->comp_save = NULL;
	return 0;

out_mmap_tx:
	if (tx_map)
		munmap(tx_map, off.tx.desc +
		       xsk->config.tx_size * sizeof(struct xdp_desc));
out_mmap_rx:
	if (rx_map)
		munmap(rx_map, off.rx.desc +
		       xsk->config.rx_size * sizeof(struct xdp_desc));
out_put_ctx:
	xsk_put_ctx(ctx, unmap);
out_socket:
	if (--umem->refcount)
		close(xsk->fd);
out_xsk_alloc:
	free(xsk);
	return err;
}
int xsk_socket__create(struct xsk_socket **xsk_ptr, int ifindex,
		       __u32 queue_id, struct xsk_umem *umem,
		       struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
		       const struct xsk_socket_config *usr_config)
{
	if (!umem)
		return -EFAULT;

	return xsk_socket__create_shared(xsk_ptr, ifindex, queue_id, umem,
					 rx, tx, umem->fill_save,
					 umem->comp_save, usr_config);
}
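
/*
 * Illustrative end-to-end sketch (queue 0, copy mode; "umem", "fq" and
 * "ifindex" are assumed to come from the xsk_umem__create() example
 * above and are not defined here): create a socket and pre-fill the
 * fill ring so the kernel has buffers to receive into.
 *
 *	struct xsk_socket_config cfg = {
 *		.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
 *		.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
 *		.bind_flags = XDP_COPY,
 *	};
 *	struct xsk_ring_cons rx;
 *	struct xsk_ring_prod tx;
 *	struct xsk_socket *xsk;
 *	__u32 idx;
 *	int i, err;
 *
 *	err = xsk_socket__create(&xsk, ifindex, 0, umem, &rx, &tx, &cfg);
 *	if (err)
 *		handle_error(err);
 *
 *	if (xsk_ring_prod__reserve(&fq, XSK_RING_PROD__DEFAULT_NUM_DESCS,
 *				   &idx) == XSK_RING_PROD__DEFAULT_NUM_DESCS) {
 *		for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS; i++)
 *			*xsk_ring_prod__fill_addr(&fq, idx + i) =
 *				i * XSK_UMEM__DEFAULT_FRAME_SIZE;
 *		xsk_ring_prod__submit(&fq, XSK_RING_PROD__DEFAULT_NUM_DESCS);
 *	}
 */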
int xsk_umem__delete(struct xsk_umem *umem)
{
	struct xdp_mmap_offsets off;
	int err;

	if (!umem)
		return 0;

	if (umem->refcount)
		return -EBUSY;

	err = xsk_get_mmap_offsets(umem->fd, &off);
	if (!err && umem->fill_save && umem->comp_save) {
		munmap(umem->fill_save->ring - off.fr.desc,
		       off.fr.desc + umem->config.fill_size * sizeof(__u64));
		munmap(umem->comp_save->ring - off.cr.desc,
		       off.cr.desc + umem->config.comp_size * sizeof(__u64));
	}

	close(umem->fd);
	free(umem);

	return 0;
}
void xsk_socket__delete(struct xsk_socket *xsk)
{
	size_t desc_sz = sizeof(struct xdp_desc);
	struct xdp_mmap_offsets off;
	struct xsk_umem *umem;
	struct xsk_ctx *ctx;
	int err;

	if (!xsk)
		return;

	ctx = xsk->ctx;
	umem = ctx->umem;

	xsk_put_ctx(ctx, true);

	err = xsk_get_mmap_offsets(xsk->fd, &off);
	if (!err) {
		if (xsk->rx)
			munmap(xsk->rx->ring - off.rx.desc,
			       off.rx.desc + xsk->config.rx_size * desc_sz);
		if (xsk->tx)
			munmap(xsk->tx->ring - off.tx.desc,
			       off.tx.desc + xsk->config.tx_size * desc_sz);
	}

	umem->refcount--;
	/* Do not close an fd that also has an associated umem connected
	 * to it.
	 */
	if (xsk->fd != umem->fd)
		close(xsk->fd);
	free(xsk);
}
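
/*
 * Teardown order matters: every socket sharing a umem must be deleted
 * before the umem itself, since xsk_umem__delete() refuses to tear down
 * a umem whose refcount is still non-zero. Illustrative sketch ("bufs"
 * is the caller-owned buffer registered at umem creation):
 *
 *	xsk_socket__delete(xsk);	// once per socket
 *	xsk_umem__delete(umem);		// returns -EBUSY if sockets remain
 *	free(bufs);			// the library never frees the buffer
 */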