// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include "linux/filter.h"
#include <linux/btf_ids.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include "range_tree.h"
/*
 * bpf_arena is a sparsely populated shared memory region between bpf program and
 * user space process.
 *
 * For example on x86-64 the values could be:
 * user_vm_start 7f7d26200000     // picked by mmap()
 * kern_vm_start ffffc90001e69000 // picked by get_vm_area()
 * For user space all pointers within the arena are normal 8-byte addresses.
 * In this example 7f7d26200000 is the address of the first page (pgoff=0).
 * The bpf program will access it as: kern_vm_start + lower_32bit_of_user_ptr
 * (u32)7f7d26200000 -> 26200000
 * hence
 * ffffc90001e69000 + 26200000 == ffffc90028069000 is "pgoff=0" within 4Gb
 * kernel memory region.
 *
 * BPF JITs generate the following code to access arena:
 *   mov eax, eax  // eax has lower 32-bit of user pointer
 *   mov word ptr [rax + r12 + off], bx
 * where r12 == kern_vm_start and off is s16.
 * Hence allocate 4Gb + GUARD_SZ/2 on each side.
 *
 * Initially kernel vm_area and user vma are not populated.
 * User space can fault-in any address which will insert the page
 * into kernel and user vma.
 * bpf program can allocate a page via bpf_arena_alloc_pages() kfunc
 * which will insert it into kernel vm_area.
 * The later fault-in from user space will populate that page into user vma.
 */
/* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */
#define GUARD_SZ (1ull << sizeof_field(struct bpf_insn, off) * 8)
#define KERN_VM_SZ (SZ_4G + GUARD_SZ)
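
/*
 * Illustrative sketch (not part of the original source): the address
 * translation described in the comment above, spelled out as plain C.
 * The concrete addresses are the example values from that comment and are
 * assumptions used for illustration only.
 *
 *	u64 user_vm_start = 0x7f7d26200000ULL;     // picked by mmap()
 *	u64 kern_vm_start = 0xffffc90001e69000ULL; // picked by get_vm_area()
 *	u64 user_ptr      = 0x7f7d26200000ULL;     // pointer into the arena
 *
 *	// What the JIT effectively computes: keep only the lower 32 bits of
 *	// the user pointer and add the kernel base of the 4Gb region.
 *	u64 kern_ptr = kern_vm_start + (u32)user_ptr;
 *	// (u32)0x7f7d26200000 == 0x26200000, so kern_ptr == 0xffffc90028069000,
 *	// i.e. "pgoff=0" of the arena as seen from the kernel side.
 */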
struct bpf_arena {
	struct bpf_map map;
	u64 user_vm_start;
	u64 user_vm_end;
	struct vm_struct *kern_vm;
	struct range_tree rt;
	struct list_head vma_list;
	struct mutex lock;
};
u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
{
	return arena ? (u64) (long) arena->kern_vm->addr + GUARD_SZ / 2 : 0;
}
u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
{
	return arena ? arena->user_vm_start : 0;
}
static long arena_map_peek_elem(struct bpf_map *map, void *value)
{
	return -EOPNOTSUPP;
}
static long arena_map_push_elem(struct bpf_map *map, void *value, u64 flags)
{
	return -EOPNOTSUPP;
}
static long arena_map_pop_elem(struct bpf_map *map, void *value)
{
	return -EOPNOTSUPP;
}
static long arena_map_delete_elem(struct bpf_map *map, void *value)
{
	return -EOPNOTSUPP;
}
static int arena_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	return -EOPNOTSUPP;
}
static long compute_pgoff(struct bpf_arena *arena, long uaddr)
{
	return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT;
}
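
/*
 * Illustrative sketch (not part of the original source): a worked example of
 * compute_pgoff() with the sample user_vm_start from the header comment.
 * The user address below is an assumption picked for illustration, and a
 * 4Kb PAGE_SIZE (PAGE_SHIFT == 12) is assumed.
 *
 *	user_vm_start = 0x7f7d26200000, uaddr = 0x7f7d26203000
 *	(u32)uaddr         = 0x26203000
 *	(u32)user_vm_start = 0x26200000
 *	difference         = 0x3000
 *	0x3000 >> 12       = 3, i.e. the page at pgoff=3 within the arena
 */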
static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
{
	struct vm_struct *kern_vm;
	int numa_node = bpf_map_attr_numa_node(attr);
	struct bpf_arena *arena;
	u64 vm_range;

	if (!bpf_jit_supports_arena())
		return ERR_PTR(-EOPNOTSUPP);

	if (attr->key_size || attr->value_size || attr->max_entries == 0 ||
	    /* BPF_F_MMAPABLE must be set */
	    !(attr->map_flags & BPF_F_MMAPABLE) ||
	    /* No unsupported flags present */
	    (attr->map_flags & ~(BPF_F_SEGV_ON_FAULT | BPF_F_MMAPABLE | BPF_F_NO_USER_CONV)))
		return ERR_PTR(-EINVAL);

	if (attr->map_extra & ~PAGE_MASK)
		/* If non-zero the map_extra is an expected user VMA start address */
		return ERR_PTR(-EINVAL);

	vm_range = (u64)attr->max_entries * PAGE_SIZE;
	if (vm_range > SZ_4G)
		return ERR_PTR(-E2BIG);

	if ((attr->map_extra >> 32) != ((attr->map_extra + vm_range - 1) >> 32))
		/* user vma must not cross 32-bit boundary */
		return ERR_PTR(-ERANGE);

	kern_vm = get_vm_area(KERN_VM_SZ, VM_SPARSE | VM_USERMAP);
	if (!kern_vm)
		return ERR_PTR(-ENOMEM);

	arena = bpf_map_area_alloc(sizeof(*arena), numa_node);
	if (!arena)
		goto err;

	arena->kern_vm = kern_vm;
	arena->user_vm_start = attr->map_extra;
	if (arena->user_vm_start)
		arena->user_vm_end = arena->user_vm_start + vm_range;

	INIT_LIST_HEAD(&arena->vma_list);
	bpf_map_init_from_attr(&arena->map, attr);
	range_tree_init(&arena->rt);
	range_tree_set(&arena->rt, 0, attr->max_entries);
	mutex_init(&arena->lock);

	return &arena->map;
err:
	free_vm_area(kern_vm);
	return ERR_PTR(-ENOMEM);
}
static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
{
	struct page *page;
	pte_t pte;

	pte = ptep_get(ptep);
	if (!pte_present(pte)) /* sanity check */
		return 0;
	page = pte_page(pte);
	/*
	 * We do not update pte here:
	 * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug
	 * 2. TLB flushing is batched or deferred. Even if we clear pte,
	 * the TLB entries can stick around and continue to permit access to
	 * the freed page. So it all relies on 1.
	 */
	__free_page(page);
	return 0;
}
static void arena_map_free(struct bpf_map *map)
{
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

	/*
	 * Check that user vma-s are not around when bpf map is freed.
	 * mmap() holds vm_file which holds bpf_map refcnt.
	 * munmap() must have happened on vma followed by arena_vm_close()
	 * which would clear arena->vma_list.
	 */
	if (WARN_ON_ONCE(!list_empty(&arena->vma_list)))
		return;

	/*
	 * free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area().
	 * It unmaps everything from vmalloc area and clears pgtables.
	 * Call apply_to_existing_page_range() first to find populated ptes and
	 * free those pages.
	 */
	apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
				     KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
	free_vm_area(arena->kern_vm);
	range_tree_destroy(&arena->rt);
	bpf_map_area_free(arena);
}
static void *arena_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-EINVAL);
}
static long arena_map_update_elem(struct bpf_map *map, void *key,
				  void *value, u64 flags)
{
	return -EOPNOTSUPP;
}
static int arena_map_check_btf(const struct bpf_map *map, const struct btf *btf,
			       const struct btf_type *key_type, const struct btf_type *value_type)
{
	return 0;
}
static u64 arena_map_mem_usage(const struct bpf_map *map)
{
	return 0;
}
struct vma_list {
	struct vm_area_struct *vma;
	struct list_head head;
	atomic_t mmap_count;
};
static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma)
{
	struct vma_list *vml;

	vml = kmalloc(sizeof(*vml), GFP_KERNEL);
	if (!vml)
		return -ENOMEM;
	atomic_set(&vml->mmap_count, 1);
	vma->vm_private_data = vml;
	vml->vma = vma;
	list_add(&vml->head, &arena->vma_list);
	return 0;
}
static void arena_vm_open(struct vm_area_struct *vma)
{
	struct vma_list *vml = vma->vm_private_data;

	atomic_inc(&vml->mmap_count);
}
static void arena_vm_close(struct vm_area_struct *vma)
{
	struct bpf_map *map = vma->vm_file->private_data;
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
	struct vma_list *vml = vma->vm_private_data;

	if (!atomic_dec_and_test(&vml->mmap_count))
		return;
	guard(mutex)(&arena->lock);
	/* update link list under lock */
	list_del(&vml->head);
	vma->vm_private_data = NULL;
	kfree(vml);
}
#define MT_ENTRY ((void *)&arena_map_ops) /* unused. has to be valid pointer */
static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
{
	struct bpf_map *map = vmf->vma->vm_file->private_data;
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
	struct page *page;
	long kbase, kaddr;
	int ret;

	kbase = bpf_arena_get_kern_vm_start(arena);
	kaddr = kbase + (u32)(vmf->address);

	guard(mutex)(&arena->lock);
	page = vmalloc_to_page((void *)kaddr);
	if (page)
		/* already have a page vmap-ed */
		goto out;

	if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
		/* User space requested to segfault when page is not allocated by bpf prog */
		return VM_FAULT_SIGSEGV;

	ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
	if (ret)
		return VM_FAULT_SIGSEGV;

	/* Account into memcg of the process that created bpf_arena */
	ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page);
	if (ret) {
		range_tree_set(&arena->rt, vmf->pgoff, 1);
		return VM_FAULT_SIGSEGV;
	}

	ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page);
	if (ret) {
		range_tree_set(&arena->rt, vmf->pgoff, 1);
		__free_page(page);
		return VM_FAULT_SIGSEGV;
	}
out:
	page_ref_add(page, 1);
	vmf->page = page;
	return 0;
}
static const struct vm_operations_struct arena_vm_ops = {
	.open		= arena_vm_open,
	.close		= arena_vm_close,
	.fault		= arena_vm_fault,
};
static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long addr,
					     unsigned long len, unsigned long pgoff,
					     unsigned long flags)
{
	struct bpf_map *map = filp->private_data;
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
	long ret;

	if (pgoff)
		return -EINVAL;
	if (len > SZ_4G)
		return -E2BIG;

	/* if user_vm_start was specified at arena creation time */
	if (arena->user_vm_start) {
		if (len > arena->user_vm_end - arena->user_vm_start)
			return -E2BIG;
		if (len != arena->user_vm_end - arena->user_vm_start)
			return -EINVAL;
		if (addr != arena->user_vm_start)
			return -EINVAL;
	}

	ret = mm_get_unmapped_area(current->mm, filp, addr, len * 2, 0, flags);
	if (IS_ERR_VALUE(ret))
		return ret;
	if ((ret >> 32) == ((ret + len - 1) >> 32))
		return ret;
	if (WARN_ON_ONCE(arena->user_vm_start))
		/* checks at map creation time should prevent this */
		return -EFAULT;
	return round_up(ret, SZ_4G);
}
static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
{
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

	guard(mutex)(&arena->lock);
	if (arena->user_vm_start && arena->user_vm_start != vma->vm_start)
		/*
		 * If map_extra was not specified at arena creation time then
		 * 1st user process can do mmap(NULL, ...) to pick user_vm_start
		 * 2nd user process must pass the same addr to mmap(addr, MAP_FIXED..);
		 *   or
		 * specify addr in map_extra and
		 * use the same addr later with mmap(addr, MAP_FIXED..);
		 */
		return -EBUSY;

	if (arena->user_vm_end && arena->user_vm_end != vma->vm_end)
		/* all user processes must have the same size of mmap-ed region */
		return -EBUSY;

	/* Earlier checks should prevent this */
	if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > SZ_4G || vma->vm_pgoff))
		return -EFAULT;

	if (remember_vma(arena, vma))
		return -ENOMEM;

	arena->user_vm_start = vma->vm_start;
	arena->user_vm_end = vma->vm_end;
	/*
	 * bpf_map_mmap() checks that it's being mmaped as VM_SHARED and
	 * clears VM_MAYEXEC. Set VM_DONTEXPAND as well to avoid
	 * potential change of user_vm_start.
	 */
	vm_flags_set(vma, VM_DONTEXPAND);
	vma->vm_ops = &arena_vm_ops;
	return 0;
}
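
/*
 * Illustrative sketch (not part of the original source): how user space is
 * expected to mmap() an arena given the rules enforced above. The libbpf
 * calls, the map size and the variable names are assumptions picked for
 * illustration.
 *
 *	LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_MMAPABLE);
 *	__u32 max_entries = 1024; // number of pages backing the arena
 *	int fd = bpf_map_create(BPF_MAP_TYPE_ARENA, "arena", 0, 0,
 *				max_entries, &opts);
 *
 *	// map_extra == 0 at creation: the first process lets the kernel pick
 *	// user_vm_start ...
 *	void *base = mmap(NULL, max_entries * 4096ull, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 *	// ... and every other process must map the same range at the same
 *	// address, e.g. mmap(base, max_entries * 4096ull, ...,
 *	//                    MAP_SHARED | MAP_FIXED, fd, 0);
 */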
static int arena_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off)
{
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

	if ((u64)off > arena->user_vm_end - arena->user_vm_start)
		return -ERANGE;
	*imm = (unsigned long)arena->user_vm_start;
	return 0;
}
BTF_ID_LIST_SINGLE(bpf_arena_map_btf_ids, struct, bpf_arena)
const struct bpf_map_ops arena_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc = arena_map_alloc,
	.map_free = arena_map_free,
	.map_direct_value_addr = arena_map_direct_value_addr,
	.map_mmap = arena_map_mmap,
	.map_get_unmapped_area = arena_get_unmapped_area,
	.map_get_next_key = arena_map_get_next_key,
	.map_push_elem = arena_map_push_elem,
	.map_peek_elem = arena_map_peek_elem,
	.map_pop_elem = arena_map_pop_elem,
	.map_lookup_elem = arena_map_lookup_elem,
	.map_update_elem = arena_map_update_elem,
	.map_delete_elem = arena_map_delete_elem,
	.map_check_btf = arena_map_check_btf,
	.map_mem_usage = arena_map_mem_usage,
	.map_btf_id = &bpf_arena_map_btf_ids[0],
};
static u64 clear_lo32(u64 val)
{
	return val & ~(u64)~0U;
}
/*
 * Allocate pages and vmap them into kernel vmalloc area.
 * Later the pages will be mmaped into user space vma.
 */
static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id)
{
	/* user_vm_end/start are fixed before bpf prog runs */
	long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
	u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena);
	struct page **pages;
	long pgoff = 0;
	u32 uaddr32;
	int ret, i;

	if (page_cnt > page_cnt_max)
		return 0;

	if (uaddr) {
		if (uaddr & ~PAGE_MASK)
			return 0;
		pgoff = compute_pgoff(arena, uaddr);
		if (pgoff > page_cnt_max - page_cnt)
			/* requested address will be outside of user VMA */
			return 0;
	}

	/* zeroing is needed, since alloc_pages_bulk_array() only fills in non-zero entries */
	pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		return 0;

	guard(mutex)(&arena->lock);

	if (uaddr) {
		ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
		if (ret)
			goto out_free_pages;
		ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
	} else {
		ret = pgoff = range_tree_find(&arena->rt, page_cnt);
		if (pgoff >= 0)
			ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
	}
	if (ret)
		goto out_free_pages;

	ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO,
				  node_id, page_cnt, pages);
	if (ret)
		goto out;

	uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE);
	/* Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1
	 * will not overflow 32-bit. Lower 32-bit need to represent
	 * contiguous user address range.
	 * Map these pages at kern_vm_start base.
	 * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
	 * lower 32-bit and it's ok.
	 */
	ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32,
				kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages);
	if (ret) {
		for (i = 0; i < page_cnt; i++)
			__free_page(pages[i]);
		goto out;
	}
	kvfree(pages);
	return clear_lo32(arena->user_vm_start) + uaddr32;
out:
	range_tree_set(&arena->rt, pgoff, page_cnt);
out_free_pages:
	kvfree(pages);
	return 0;
}
/*
 * If page is present in vmalloc area, unmap it from vmalloc area,
 * unmap it from all user space vma-s,
 * and free it.
 */
static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
{
	struct vma_list *vml;

	list_for_each_entry(vml, &arena->vma_list, head)
		zap_page_range_single(vml->vma, uaddr,
				      PAGE_SIZE * page_cnt, NULL);
}
static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
{
	u64 full_uaddr, uaddr_end;
	long kaddr, pgoff, i;
	struct page *page;

	/* only aligned lower 32-bit are relevant */
	uaddr = (u32)uaddr;
	uaddr &= PAGE_MASK;
	full_uaddr = clear_lo32(arena->user_vm_start) + uaddr;
	uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT));
	if (full_uaddr >= uaddr_end)
		return;

	page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT;

	guard(mutex)(&arena->lock);

	pgoff = compute_pgoff(arena, uaddr);
	range_tree_set(&arena->rt, pgoff, page_cnt);

	if (page_cnt > 1)
		/* bulk zap if multiple pages being freed */
		zap_pages(arena, full_uaddr, page_cnt);

	kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr;
	for (i = 0; i < page_cnt; i++, kaddr += PAGE_SIZE, full_uaddr += PAGE_SIZE) {
		page = vmalloc_to_page((void *)kaddr);
		if (!page)
			continue;
		if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */
			/* Optimization for the common case of page_cnt==1:
			 * If page wasn't mapped into some user vma there
			 * is no need to call zap_pages which is slow. When
			 * page_cnt is big it's faster to do the batched zap.
			 */
			zap_pages(arena, full_uaddr, 1);
		vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE);
		__free_page(page);
	}
}
__bpf_kfunc_start_defs();

__bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt,
					int node_id, u64 flags)
{
	struct bpf_map *map = p__map;
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

	if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
		return NULL;

	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id);
}
__bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
{
	struct bpf_map *map = p__map;
	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

	if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign)
		return;
	arena_free_pages(arena, (long)ptr__ign, page_cnt);
}
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(arena_kfuncs)
BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE)
BTF_KFUNCS_END(arena_kfuncs)
static const struct btf_kfunc_id_set common_kfunc_set = {
	.owner = THIS_MODULE,
	.set   = &arena_kfuncs,
};
static int __init kfunc_init(void)
{
	return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
}
late_initcall(kfunc_init);
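
/*
 * Illustrative sketch (not part of the original source): how a bpf program
 * would call the kfuncs registered above. The arena map definition and the
 * bpf_arena_alloc_pages()/bpf_arena_free_pages() prototypes normally come
 * from the program's own headers; the names "arena", "p" and the use of
 * NUMA_NO_NODE below are assumptions picked for illustration.
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_ARENA);
 *		__uint(map_flags, BPF_F_MMAPABLE);
 *		__uint(max_entries, 1000); // number of pages
 *	} arena SEC(".maps");
 *
 *	SEC("syscall")
 *	int alloc_one_page(void *ctx)
 *	{
 *		// allocate one page anywhere in the arena, on any NUMA node
 *		void *p = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
 *
 *		if (!p)
 *			return 1;
 *		bpf_arena_free_pages(&arena, p, 1);
 *		return 0;
 *	}
 */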