// SPDX-License-Identifier: GPL-2.0
/* XDP user-space packet buffer
 * Copyright(c) 2018 Intel Corporation.
 */

#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/mm.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/idr.h>
#include <linux/vmalloc.h>

#include "xdp_umem.h"
#include "xsk_queue.h"

#define XDP_UMEM_MIN_CHUNK_SIZE 2048

static DEFINE_IDA(umem_ida);

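/* Sockets sharing the umem are kept on umem->xsk_list. The list is
 * updated with the RCU list helpers under xsk_list_lock so that it can
 * be walked without taking the lock.
 */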
void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
	unsigned long flags;

	if (!xs->tx)
		return;

	spin_lock_irqsave(&umem->xsk_list_lock, flags);
	list_add_rcu(&xs->list, &umem->xsk_list);
	spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
}

void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
	unsigned long flags;

	if (!xs->tx)
		return;

	spin_lock_irqsave(&umem->xsk_list_lock, flags);
	list_del_rcu(&xs->list);
	spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
}

/* The umem is stored both in the _rx struct and the _tx struct as we do
 * not know if the device has more tx queues than rx, or the opposite.
 * This might also change during run time.
 */
static int xdp_reg_umem_at_qid(struct net_device *dev, struct xdp_umem *umem,
			       u16 queue_id)
{
	if (queue_id >= max_t(unsigned int,
			      dev->real_num_rx_queues,
			      dev->real_num_tx_queues))
		return -EINVAL;

	if (queue_id < dev->real_num_rx_queues)
		dev->_rx[queue_id].umem = umem;
	if (queue_id < dev->real_num_tx_queues)
		dev->_tx[queue_id].umem = umem;

	return 0;
}

struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
				       u16 queue_id)
{
	if (queue_id < dev->real_num_rx_queues)
		return dev->_rx[queue_id].umem;
	if (queue_id < dev->real_num_tx_queues)
		return dev->_tx[queue_id].umem;

	return NULL;
}
EXPORT_SYMBOL(xdp_get_umem_from_qid);

static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id)
{
	if (queue_id < dev->real_num_rx_queues)
		dev->_rx[queue_id].umem = NULL;
	if (queue_id < dev->real_num_tx_queues)
		dev->_tx[queue_id].umem = NULL;
}

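/* Bind the umem to one queue of a netdev. Zero-copy is requested from the
 * driver through ndo_bpf(XDP_SETUP_XSK_UMEM); if the driver lacks support
 * we fall back to copy mode, unless XDP_ZEROCOPY was explicitly asked for.
 */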
int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
			u16 queue_id, u16 flags)
{
	bool force_zc, force_copy;
	struct netdev_bpf bpf;
	int err = 0;

	ASSERT_RTNL();

	force_zc = flags & XDP_ZEROCOPY;
	force_copy = flags & XDP_COPY;

	if (force_zc && force_copy)
		return -EINVAL;

	if (xdp_get_umem_from_qid(dev, queue_id))
		return -EBUSY;

	err = xdp_reg_umem_at_qid(dev, umem, queue_id);
	if (err)
		return err;

	umem->dev = dev;
	umem->queue_id = queue_id;

	if (flags & XDP_USE_NEED_WAKEUP) {
		umem->flags |= XDP_UMEM_USES_NEED_WAKEUP;
		/* Tx needs to be explicitly woken up the first time.
		 * Also for supporting drivers that do not implement this
		 * feature. They will always have to call sendto().
		 */
		xsk_set_tx_need_wakeup(umem);
	}

	dev_hold(dev);

	if (force_copy)
		/* For copy-mode, we are done. */
		return 0;

	if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_wakeup) {
		err = -EOPNOTSUPP;
		goto err_unreg_umem;
	}

	bpf.command = XDP_SETUP_XSK_UMEM;
	bpf.xsk.umem = umem;
	bpf.xsk.queue_id = queue_id;

	err = dev->netdev_ops->ndo_bpf(dev, &bpf);
	if (err)
		goto err_unreg_umem;

	umem->zc = true;
	return 0;

err_unreg_umem:
	if (!force_zc)
		err = 0; /* fallback to copy mode */
	if (err)
		xdp_clear_umem_at_qid(dev, queue_id);
	return err;
}

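/* Undo xdp_umem_assign_dev(): tell the driver to drop the umem if
 * zero-copy was enabled, clear the queue pointers and release the
 * reference on the netdev.
 */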
void xdp_umem_clear_dev(struct xdp_umem *umem)
{
	struct netdev_bpf bpf;
	int err;

	ASSERT_RTNL();

	if (!umem->dev)
		return;

	if (umem->zc) {
		bpf.command = XDP_SETUP_XSK_UMEM;
		bpf.xsk.umem = NULL;
		bpf.xsk.queue_id = umem->queue_id;

		err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);

		if (err)
			WARN(1, "failed to disable umem!\n");
	}

	xdp_clear_umem_at_qid(umem->dev, umem->queue_id);

	dev_put(umem->dev);
	umem->dev = NULL;
	umem->zc = false;
}

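/* Highmem pages have no permanent kernel mapping, so they are mapped one
 * page at a time with vmap(); lowmem pages are addressed directly through
 * page_address().
 */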
static void xdp_umem_unmap_pages(struct xdp_umem *umem)
{
	unsigned int i;

	for (i = 0; i < umem->npgs; i++)
		if (PageHighMem(umem->pgs[i]))
			vunmap(umem->pages[i].addr);
}

static int xdp_umem_map_pages(struct xdp_umem *umem)
{
	unsigned int i;
	void *addr;

	for (i = 0; i < umem->npgs; i++) {
		if (PageHighMem(umem->pgs[i]))
			addr = vmap(&umem->pgs[i], 1, VM_MAP, PAGE_KERNEL);
		else
			addr = page_address(umem->pgs[i]);

		if (!addr) {
			xdp_umem_unmap_pages(umem);
			return -ENOMEM;
		}

		umem->pages[i].addr = addr;
	}

	return 0;
}

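/* Drop the pins taken by xdp_umem_pin_pages() and free the page array.
 * The pages are marked dirty as user space may have written packet data
 * into them.
 */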
static void xdp_umem_unpin_pages(struct xdp_umem *umem)
{
	unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true);

	kfree(umem->pgs);
	umem->pgs = NULL;
}

static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
{
	if (umem->user) {
		atomic_long_sub(umem->npgs, &umem->user->locked_vm);
		free_uid(umem->user);
	}
}

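/* Tear down the umem: detach it from the device, destroy the fill and
 * completion rings, unmap and unpin the user pages and return the memlock
 * accounting.
 */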
static void xdp_umem_release(struct xdp_umem *umem)
{
	rtnl_lock();
	xdp_umem_clear_dev(umem);
	rtnl_unlock();

	ida_simple_remove(&umem_ida, umem->id);

	if (umem->fq) {
		xskq_destroy(umem->fq);
		umem->fq = NULL;
	}

	if (umem->cq) {
		xskq_destroy(umem->cq);
		umem->cq = NULL;
	}

	xsk_reuseq_destroy(umem);

	xdp_umem_unmap_pages(umem);
	xdp_umem_unpin_pages(umem);

	kvfree(umem->pages);
	umem->pages = NULL;

	xdp_umem_unaccount_pages(umem);
	kfree(umem);
}

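/* Releasing the umem sleeps (rtnl is taken and pages are unpinned), so the
 * final xdp_put_umem() defers the actual release to a workqueue.
 */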
static void xdp_umem_release_deferred(struct work_struct *work)
{
	struct xdp_umem *umem = container_of(work, struct xdp_umem, work);

	xdp_umem_release(umem);
}

void xdp_get_umem(struct xdp_umem *umem)
{
	refcount_inc(&umem->users);
}

void xdp_put_umem(struct xdp_umem *umem)
{
	if (!umem)
		return;

	if (refcount_dec_and_test(&umem->users)) {
		INIT_WORK(&umem->work, xdp_umem_release_deferred);
		schedule_work(&umem->work);
	}
}

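/* Pin the user memory with FOLL_LONGTERM so the pages stay resident for
 * the lifetime of the umem. A partial pin is undone and reported as an
 * error.
 */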
static int xdp_umem_pin_pages(struct xdp_umem *umem)
{
	unsigned int gup_flags = FOLL_WRITE;
	long npgs;
	int err;

	umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs),
			    GFP_KERNEL | __GFP_NOWARN);
	if (!umem->pgs)
		return -ENOMEM;

	down_read(&current->mm->mmap_sem);
	npgs = pin_user_pages(umem->address, umem->npgs,
			      gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL);
	up_read(&current->mm->mmap_sem);

	if (npgs != umem->npgs) {
		if (npgs >= 0) {
			umem->npgs = npgs;
			/* Handle partially pinned pages. */
			goto out_pin;
		}
		err = npgs;
		goto out_pgs;
	}
	return 0;

out_pin:
	xdp_umem_unpin_pages(umem);
out_pgs:
	kfree(umem->pgs);
	umem->pgs = NULL;
	return err;
}

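/* Charge the pinned pages against the user's RLIMIT_MEMLOCK. locked_vm is
 * updated with a cmpxchg loop, so no lock is needed. CAP_IPC_LOCK bypasses
 * the limit.
 */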
static int xdp_umem_account_pages(struct xdp_umem *umem)
{
	unsigned long lock_limit, new_npgs, old_npgs;

	if (capable(CAP_IPC_LOCK))
		return 0;

	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	umem->user = get_uid(current_user());

	do {
		old_npgs = atomic_long_read(&umem->user->locked_vm);
		new_npgs = old_npgs + umem->npgs;
		if (new_npgs > lock_limit) {
			free_uid(umem->user);
			umem->user = NULL;
			return -ENOBUFS;
		}
	} while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs,
				     new_npgs) != old_npgs);
	return 0;
}

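/* Validate the registration request from user space (chunk size and
 * alignment, flags, headroom), then account, pin and map the memory area
 * it describes.
 */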
static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
{
	bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
	u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
	u64 npgs, addr = mr->addr, size = mr->len;
	unsigned int chunks, chunks_per_page;
	int err;

	if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
		/* Strictly speaking we could support this, if:
		 * - huge pages, or
		 * - using an IOMMU, or
		 * - making sure the memory area is consecutive
		 * but for now, we simply say "computer says no".
		 */
		return -EINVAL;
	}

	if (mr->flags & ~(XDP_UMEM_UNALIGNED_CHUNK_FLAG |
			XDP_UMEM_USES_NEED_WAKEUP))
		return -EINVAL;

	if (!unaligned_chunks && !is_power_of_2(chunk_size))
		return -EINVAL;

	if (!PAGE_ALIGNED(addr)) {
		/* Memory area has to be page size aligned. For
		 * simplicity, this might change.
		 */
		return -EINVAL;
	}

	if ((addr + size) < addr)
		return -EINVAL;

	npgs = div_u64(size, PAGE_SIZE);
	if (npgs > U32_MAX)
		return -EINVAL;

	chunks = (unsigned int)div_u64(size, chunk_size);
	if (chunks == 0)
		return -EINVAL;

	if (!unaligned_chunks) {
		chunks_per_page = PAGE_SIZE / chunk_size;
		if (chunks < chunks_per_page || chunks % chunks_per_page)
			return -EINVAL;
	}

	if (headroom >= chunk_size - XDP_PACKET_HEADROOM)
		return -EINVAL;

	umem->address = (unsigned long)addr;
	umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK
					    : ~((u64)chunk_size - 1);
	umem->size = size;
	umem->headroom = headroom;
	umem->chunk_size_nohr = chunk_size - headroom;
	umem->npgs = (u32)npgs;
	umem->pgs = NULL;
	umem->user = NULL;
	umem->flags = mr->flags;
	INIT_LIST_HEAD(&umem->xsk_list);
	spin_lock_init(&umem->xsk_list_lock);

	refcount_set(&umem->users, 1);

	err = xdp_umem_account_pages(umem);
	if (err)
		return err;

	err = xdp_umem_pin_pages(umem);
	if (err)
		goto out_account;

	umem->pages = kvcalloc(umem->npgs, sizeof(*umem->pages),
			       GFP_KERNEL_ACCOUNT);
	if (!umem->pages) {
		err = -ENOMEM;
		goto out_pin;
	}

	err = xdp_umem_map_pages(umem);
	if (!err)
		return 0;

	kvfree(umem->pages);

out_pin:
	xdp_umem_unpin_pages(umem);
out_account:
	xdp_umem_unaccount_pages(umem);
	return err;
}

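/* Allocate a umem, give it an id from the IDA and register the user memory
 * described by @mr. Returns an ERR_PTR on failure.
 */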
struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr)
{
	struct xdp_umem *umem;
	int err;

	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
	if (!umem)
		return ERR_PTR(-ENOMEM);

	err = ida_simple_get(&umem_ida, 0, 0, GFP_KERNEL);
	if (err < 0) {
		kfree(umem);
		return ERR_PTR(err);
	}
	umem->id = err;

	err = xdp_umem_reg(umem, mr);
	if (err) {
		ida_simple_remove(&umem_ida, umem->id);
		kfree(umem);
		return ERR_PTR(err);
	}

	return umem;
}

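/* A umem can only be bound once both its fill and completion rings exist. */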
bool xdp_umem_validate_queues(struct xdp_umem *umem)
{
	return umem->fq && umem->cq;
}