// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets allow a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *	      Magnus Karlsson <magnus.karlsson@intel.com>
 */

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <net/xdp_sock.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"

#define TX_BATCH_SIZE 16
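
/* True once the socket has an RX ring, a umem and a fill queue, i.e.
 * everything the XDP redirect path needs before an XSKMAP entry may
 * point at this socket.
 */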
bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
{
	return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
	       READ_ONCE(xs->umem->fq);
}

u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
{
	return xskq_peek_addr(umem->fq, addr);
}
EXPORT_SYMBOL(xsk_umem_peek_addr);

void xsk_umem_discard_addr(struct xdp_umem *umem)
{
	xskq_discard_addr(umem->fq);
}
EXPORT_SYMBOL(xsk_umem_discard_addr);
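
/* RX path for copy mode: reserve a frame address from the fill queue,
 * copy the packet (and any metadata) into the umem and post a
 * descriptor on the RX ring.
 */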
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	void *to_buf, *from_buf;
	u32 metalen;
	u64 addr;
	int err;

	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	addr += xs->umem->headroom;

	if (unlikely(xdp_data_meta_unsupported(xdp))) {
		from_buf = xdp->data;
		metalen = 0;
	} else {
		from_buf = xdp->data_meta;
		metalen = xdp->data - xdp->data_meta;
	}

	to_buf = xdp_umem_get_data(xs->umem, addr);
	memcpy(to_buf, from_buf, len + metalen);
	addr += metalen;
	err = xskq_produce_batch_desc(xs->rx, addr, len);
	if (!err) {
		xskq_discard_addr(xs->umem->fq);
		xdp_return_buff(xdp);
		return 0;
	}

	xs->rx_dropped++;
	return err;
}
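
/* RX path for zero-copy mode: the frame already lives in the umem, so
 * only a descriptor carrying the buffer handle is posted on the RX ring.
 */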
static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);

	if (err)
		xs->rx_dropped++;

	return err;
}
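
/* Entry point from the XDP redirect path. Verifies that the packet
 * arrived on the device and queue the socket is bound to, then
 * dispatches to the zero-copy or copy receive routine.
 */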
int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	u32 len;

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	len = xdp->data_end - xdp->data;

	return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
		__xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
}

void xsk_flush(struct xdp_sock *xs)
{
	xskq_produce_flush_desc(xs->rx);
	xs->sk.sk_data_ready(&xs->sk);
}
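
/* Receive path for generic (skb based) XDP: copies the frame into the
 * umem, posts the descriptor and flushes/wakes the socket directly.
 */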
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	u32 metalen = xdp->data - xdp->data_meta;
	u32 len = xdp->data_end - xdp->data;
	void *buffer;
	u64 addr;
	int err;

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	addr += xs->umem->headroom;

	buffer = xdp_umem_get_data(xs->umem, addr);
	memcpy(buffer, xdp->data_meta, len + metalen);
	addr += metalen;
	err = xskq_produce_batch_desc(xs->rx, addr, len);
	if (!err) {
		xskq_discard_addr(xs->umem->fq);
		xsk_flush(xs);
		return 0;
	}

	xs->rx_dropped++;
	return err;
}
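
/* Driver-facing TX completion helper: makes nb_entries completed
 * addresses visible to userspace on the completion ring.
 */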
void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
{
	xskq_produce_flush_addr_n(umem->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_umem_complete_tx);

void xsk_umem_consume_tx_done(struct xdp_umem *umem)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
		xs->sk.sk_write_space(&xs->sk);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_umem_consume_tx_done);
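
/* Fetch the next TX descriptor to send for a zero-copy capable driver:
 * reserves a completion entry, hands back DMA address and length, and
 * consumes the descriptor from the socket's TX ring.
 */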
bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
{
	struct xdp_desc desc;
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
		if (!xskq_peek_desc(xs->tx, &desc))
			continue;

		if (xskq_produce_addr_lazy(umem->cq, desc.addr))
			goto out;

		*dma = xdp_umem_get_dma(umem, desc.addr);
		*len = desc.len;

		xskq_discard_desc(xs->tx);
		rcu_read_unlock();
		return true;
	}

out:
	rcu_read_unlock();
	return false;
}
EXPORT_SYMBOL(xsk_umem_consume_tx);
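
/* Zero-copy TX: kick the driver so it picks descriptors up from the TX
 * ring via ndo_xsk_async_xmit().
 */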
static int xsk_zc_xmit(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev = xs->dev;

	return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id);
}
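
/* skb destructor for copy-mode TX: once the skb is freed, the frame
 * address is published on the completion ring so userspace can reuse it.
 */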
static void xsk_destruct_skb(struct sk_buff *skb)
{
	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
	struct xdp_sock *xs = xdp_sk(skb->sk);
	unsigned long flags;

	spin_lock_irqsave(&xs->tx_completion_lock, flags);
	WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
	spin_unlock_irqrestore(&xs->tx_completion_lock, flags);

	sock_wfree(skb);
}
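
/* Copy-mode TX: for each descriptor on the TX ring, allocate an skb,
 * copy the frame out of the umem and send it directly on the bound
 * queue. Completions are signalled from the skb destructor.
 */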
static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
			    size_t total_len)
{
	u32 max_batch = TX_BATCH_SIZE;
	struct xdp_sock *xs = xdp_sk(sk);
	bool sent_frame = false;
	struct xdp_desc desc;
	struct sk_buff *skb;
	int err = 0;

	mutex_lock(&xs->mutex);

	while (xskq_peek_desc(xs->tx, &desc)) {
		char *buffer;
		u64 addr;
		u32 len;

		if (max_batch-- == 0) {
			err = -EAGAIN;
			goto out;
		}

		if (xskq_reserve_addr(xs->umem->cq))
			goto out;

		if (xs->queue_id >= xs->dev->real_num_tx_queues)
			goto out;

		len = desc.len;
		skb = sock_alloc_send_skb(sk, len, 1, &err);
		if (unlikely(!skb)) {
			err = -EAGAIN;
			goto out;
		}

		skb_put(skb, len);
		addr = desc.addr;
		buffer = xdp_umem_get_data(xs->umem, addr);
		err = skb_store_bits(skb, 0, buffer, len);
		if (unlikely(err)) {
			kfree_skb(skb);
			goto out;
		}

		skb->dev = xs->dev;
		skb->priority = sk->sk_priority;
		skb->mark = sk->sk_mark;
		skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
		skb->destructor = xsk_destruct_skb;

		err = dev_direct_xmit(skb, xs->queue_id);
		xskq_discard_desc(xs->tx);
		/* Ignore NET_XMIT_CN as packet might have been sent */
		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
			/* SKB completed but not sent */
			err = -EBUSY;
			goto out;
		}

		sent_frame = true;
	}

out:
	if (sent_frame)
		sk->sk_write_space(sk);

	mutex_unlock(&xs->mutex);
	return err;
}
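
/* sendmsg() on an AF_XDP socket only kicks transmission of descriptors
 * already placed on the TX ring; no data is taken from the msghdr.
 */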
static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!xs->dev))
		return -ENXIO;
	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;
	if (unlikely(!xs->tx))
		return -ENOBUFS;
	if (need_wait)
		return -EOPNOTSUPP;

	return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len);
}

static unsigned int xsk_poll(struct file *file, struct socket *sock,
			     struct poll_table_struct *wait)
{
	unsigned int mask = datagram_poll(file, sock, wait);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (xs->rx && !xskq_empty_desc(xs->rx))
		mask |= POLLIN | POLLRDNORM;
	if (xs->tx && !xskq_full_desc(xs->tx))
		mask |= POLLOUT | POLLWRNORM;

	return mask;
}
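
/* Allocate one of the four rings (RX, TX, fill, completion). Ring sizes
 * must be a non-zero power of two and a ring can only be created once.
 */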
static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
			  bool umem_queue)
{
	struct xsk_queue *q;

	if (entries == 0 || *queue || !is_power_of_2(entries))
		return -EINVAL;

	q = xskq_create(entries, umem_queue);
	if (!q)
		return -ENOMEM;

	/* Make sure queue is ready before it can be seen by others */
	smp_wmb();
	*queue = q;
	return 0;
}
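
/* Release path: unhash the socket, detach it from the device and umem,
 * and free the RX/TX rings.
 */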
static int xsk_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net *net;

	if (!sk)
		return 0;

	net = sock_net(sk);

	mutex_lock(&net->xdp.lock);
	sk_del_node_init_rcu(sk);
	mutex_unlock(&net->xdp.lock);

	local_bh_disable();
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	local_bh_enable();

	if (xs->dev) {
		struct net_device *dev = xs->dev;

		/* Wait for driver to stop using the xdp socket. */
		xdp_del_sk_umem(xs->umem, xs);
		xs->dev = NULL;
		synchronize_net();
		dev_put(dev);
	}

	xskq_destroy(xs->rx);
	xskq_destroy(xs->tx);

	sock_orphan(sk);
	sock->sk = NULL;

	sk_refcnt_debug_release(sk);
	sock_put(sk);

	return 0;
}

static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
	struct socket *sock;
	int err;

	sock = sockfd_lookup(fd, &err);
	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	if (sock->sk->sk_family != PF_XDP) {
		sockfd_put(sock);
		return ERR_PTR(-ENOPROTOOPT);
	}

	return sock;
}
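
/* bind() attaches the socket to a <netdev, queue_id> pair. With
 * XDP_SHARED_UMEM the umem is inherited from an already bound socket;
 * otherwise the socket's own umem is registered with the device. At
 * least one of the RX/TX rings must have been created beforehand.
 */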
static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev;
	u32 flags, qid;
	int err = 0;

	if (addr_len < sizeof(struct sockaddr_xdp))
		return -EINVAL;
	if (sxdp->sxdp_family != AF_XDP)
		return -EINVAL;

	flags = sxdp->sxdp_flags;
	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY))
		return -EINVAL;

	mutex_lock(&xs->mutex);
	if (xs->dev) {
		err = -EBUSY;
		goto out_release;
	}

	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
	if (!dev) {
		err = -ENODEV;
		goto out_release;
	}

	if (!xs->rx && !xs->tx) {
		err = -EINVAL;
		goto out_unlock;
	}

	qid = sxdp->sxdp_queue_id;

	if (flags & XDP_SHARED_UMEM) {
		struct xdp_sock *umem_xs;
		struct socket *sock;

		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) {
			/* Cannot specify flags for shared sockets. */
			err = -EINVAL;
			goto out_unlock;
		}

		if (xs->umem) {
			/* We have already our own. */
			err = -EINVAL;
			goto out_unlock;
		}

		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
		if (IS_ERR(sock)) {
			err = PTR_ERR(sock);
			goto out_unlock;
		}

		umem_xs = xdp_sk(sock->sk);
		if (!umem_xs->umem) {
			/* No umem to inherit. */
			err = -EBADF;
			sockfd_put(sock);
			goto out_unlock;
		} else if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
			err = -EINVAL;
			sockfd_put(sock);
			goto out_unlock;
		}

		xdp_get_umem(umem_xs->umem);
		xs->umem = umem_xs->umem;
		sockfd_put(sock);
	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
		err = -EINVAL;
		goto out_unlock;
	} else {
		/* This xsk has its own umem. */
		xskq_set_umem(xs->umem->fq, xs->umem->size,
			      xs->umem->chunk_mask);
		xskq_set_umem(xs->umem->cq, xs->umem->size,
			      xs->umem->chunk_mask);

		err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
		if (err)
			goto out_unlock;
	}

	xs->dev = dev;
	xs->zc = xs->umem->zc;
	xs->queue_id = qid;
	xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
	xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
	xdp_add_sk_umem(xs->umem, xs);

out_unlock:
	if (err)
		dev_put(dev);
out_release:
	mutex_unlock(&xs->mutex);
	return err;
}
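
/* Socket options create the rings (XDP_RX_RING, XDP_TX_RING,
 * XDP_UMEM_FILL_RING, XDP_UMEM_COMPLETION_RING) and register the umem
 * (XDP_UMEM_REG).
 */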
static int xsk_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int err;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	switch (optname) {
	case XDP_RX_RING:
	case XDP_TX_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (optlen < sizeof(entries))
			return -EINVAL;
		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
		err = xsk_init_queue(entries, q, false);
		mutex_unlock(&xs->mutex);
		return err;
	}
	case XDP_UMEM_REG:
	{
		struct xdp_umem_reg mr;
		struct xdp_umem *umem;

		if (copy_from_user(&mr, optval, sizeof(mr)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}

		umem = xdp_umem_create(&mr);
		if (IS_ERR(umem)) {
			mutex_unlock(&xs->mutex);
			return PTR_ERR(umem);
		}

		/* Make sure umem is ready before it can be seen by others */
		smp_wmb();
		xs->umem = umem;
		mutex_unlock(&xs->mutex);
		return 0;
	}
	case XDP_UMEM_FILL_RING:
	case XDP_UMEM_COMPLETION_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (!xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EINVAL;
		}

		q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
			&xs->umem->cq;
		err = xsk_init_queue(entries, q, true);
		mutex_unlock(&xs->mutex);
		return err;
	}
	default:
		break;
	}

	return -ENOPROTOOPT;
}
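
/* getsockopt() exposes per-socket statistics and the mmap offsets of
 * the ring producer/consumer pointers and descriptor arrays.
 */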
static int xsk_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int len;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case XDP_STATISTICS:
	{
		struct xdp_statistics stats;

		if (len < sizeof(stats))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		stats.rx_dropped = xs->rx_dropped;
		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
		mutex_unlock(&xs->mutex);

		if (copy_to_user(optval, &stats, sizeof(stats)))
			return -EFAULT;
		if (put_user(sizeof(stats), optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_MMAP_OFFSETS:
	{
		struct xdp_mmap_offsets off;

		if (len < sizeof(off))
			return -EINVAL;

		off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
		off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
		off.rx.desc	= offsetof(struct xdp_rxtx_ring, desc);
		off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
		off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
		off.tx.desc	= offsetof(struct xdp_rxtx_ring, desc);

		off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
		off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
		off.fr.desc	= offsetof(struct xdp_umem_ring, desc);
		off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
		off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
		off.cr.desc	= offsetof(struct xdp_umem_ring, desc);

		len = sizeof(off);
		if (copy_to_user(optval, &off, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	default:
		break;
	}

	return -EOPNOTSUPP;
}
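
/* mmap() maps one of the rings into userspace; the page offset encodes
 * which ring (RX, TX, fill or completion) is requested.
 */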
static int xsk_mmap(struct file *file, struct socket *sock,
		    struct vm_area_struct *vma)
{
	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
	unsigned long size = vma->vm_end - vma->vm_start;
	struct xdp_sock *xs = xdp_sk(sock->sk);
	struct xsk_queue *q = NULL;
	struct xdp_umem *umem;
	unsigned long pfn;
	struct page *qpg;

	if (offset == XDP_PGOFF_RX_RING) {
		q = READ_ONCE(xs->rx);
	} else if (offset == XDP_PGOFF_TX_RING) {
		q = READ_ONCE(xs->tx);
	} else {
		umem = READ_ONCE(xs->umem);
		if (!umem)
			return -EINVAL;

		/* Matches the smp_wmb() in XDP_UMEM_REG */
		smp_rmb();
		if (offset == XDP_UMEM_PGOFF_FILL_RING)
			q = READ_ONCE(umem->fq);
		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
			q = READ_ONCE(umem->cq);
	}

	if (!q)
		return -EINVAL;

	/* Matches the smp_wmb() in xsk_init_queue */
	smp_rmb();
	qpg = virt_to_head_page(q->ring);
	if (size > (PAGE_SIZE << compound_order(qpg)))
		return -EINVAL;

	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn,
			       size, vma->vm_page_prot);
}

static struct proto xsk_proto = {
	.name =		"XDP",
	.owner =	THIS_MODULE,
	.obj_size =	sizeof(struct xdp_sock),
};

static const struct proto_ops xsk_proto_ops = {
	.family		= PF_XDP,
	.owner		= THIS_MODULE,
	.release	= xsk_release,
	.bind		= xsk_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= xsk_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= xsk_setsockopt,
	.getsockopt	= xsk_getsockopt,
	.sendmsg	= xsk_sendmsg,
	.recvmsg	= sock_no_recvmsg,
	.mmap		= xsk_mmap,
	.sendpage	= sock_no_sendpage,
};

static void xsk_destruct(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (!sock_flag(sk, SOCK_DEAD))
		return;

	xdp_put_umem(xs->umem);

	sk_refcnt_debug_dec(sk);
}

static int xsk_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct sock *sk;
	struct xdp_sock *xs;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_RAW)
		return -ESOCKTNOSUPPORT;

	if (protocol)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock->ops = &xsk_proto_ops;

	sock_init_data(sock, sk);

	sk->sk_family = PF_XDP;

	sk->sk_destruct = xsk_destruct;
	sk_refcnt_debug_inc(sk);

	sock_set_flag(sk, SOCK_RCU_FREE);

	xs = xdp_sk(sk);
	mutex_init(&xs->mutex);
	spin_lock_init(&xs->tx_completion_lock);

	mutex_lock(&net->xdp.lock);
	sk_add_node_rcu(sk, &net->xdp.list);
	mutex_unlock(&net->xdp.lock);

	local_bh_disable();
	sock_prot_inuse_add(net, &xsk_proto, 1);
	local_bh_enable();

	return 0;
}

static const struct net_proto_family xsk_family_ops = {
	.family = PF_XDP,
	.create = xsk_create,
	.owner	= THIS_MODULE,
};

static int __net_init xsk_net_init(struct net *net)
{
	mutex_init(&net->xdp.lock);
	INIT_HLIST_HEAD(&net->xdp.list);
	return 0;
}

static void __net_exit xsk_net_exit(struct net *net)
{
	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
}

static struct pernet_operations xsk_net_ops = {
	.init = xsk_net_init,
	.exit = xsk_net_exit,
};

static int __init xsk_init(void)
{
	int err;

	err = proto_register(&xsk_proto, 0 /* no slab */);
	if (err)
		goto out;

	err = sock_register(&xsk_family_ops);
	if (err)
		goto out_proto;

	err = register_pernet_subsys(&xsk_net_ops);
	if (err)
		goto out_sk;

	return 0;

out_sk:
	sock_unregister(PF_XDP);
out_proto:
	proto_unregister(&xsk_proto);
out:
	return err;
}

fs_initcall(xsk_init);