1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2016 Facebook
5 #include <linux/jhash.h>
6 #include <linux/filter.h>
7 #include <linux/stacktrace.h>
8 #include <linux/perf_event.h>
10 #include <linux/pagemap.h>
11 #include <linux/irq_work.h>
12 #include "percpu_freelist.h"
14 #define STACK_CREATE_FLAG_MASK \
15 (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
18 struct stack_map_bucket
{
19 struct pcpu_freelist_node fnode
;
25 struct bpf_stack_map
{
28 struct pcpu_freelist freelist
;
30 struct stack_map_bucket
*buckets
[];
33 /* irq_work to run up_read() for build_id lookup in nmi context */
34 struct stack_map_irq_work
{
35 struct irq_work irq_work
;
36 struct rw_semaphore
*sem
;
39 static void do_up_read(struct irq_work
*entry
)
41 struct stack_map_irq_work
*work
;
43 work
= container_of(entry
, struct stack_map_irq_work
, irq_work
);
44 up_read_non_owner(work
->sem
);
48 static DEFINE_PER_CPU(struct stack_map_irq_work
, up_read_work
);
50 static inline bool stack_map_use_build_id(struct bpf_map
*map
)
52 return (map
->map_flags
& BPF_F_STACK_BUILD_ID
);
55 static inline int stack_map_data_size(struct bpf_map
*map
)
57 return stack_map_use_build_id(map
) ?
58 sizeof(struct bpf_stack_build_id
) : sizeof(u64
);
61 static int prealloc_elems_and_freelist(struct bpf_stack_map
*smap
)
63 u32 elem_size
= sizeof(struct stack_map_bucket
) + smap
->map
.value_size
;
66 smap
->elems
= bpf_map_area_alloc(elem_size
* smap
->map
.max_entries
,
71 err
= pcpu_freelist_init(&smap
->freelist
);
75 pcpu_freelist_populate(&smap
->freelist
, smap
->elems
, elem_size
,
76 smap
->map
.max_entries
);
80 bpf_map_area_free(smap
->elems
);
84 /* Called from syscall */
85 static struct bpf_map
*stack_map_alloc(union bpf_attr
*attr
)
87 u32 value_size
= attr
->value_size
;
88 struct bpf_stack_map
*smap
;
89 struct bpf_map_memory mem
;
93 if (!capable(CAP_SYS_ADMIN
))
94 return ERR_PTR(-EPERM
);
96 if (attr
->map_flags
& ~STACK_CREATE_FLAG_MASK
)
97 return ERR_PTR(-EINVAL
);
99 /* check sanity of attributes */
100 if (attr
->max_entries
== 0 || attr
->key_size
!= 4 ||
101 value_size
< 8 || value_size
% 8)
102 return ERR_PTR(-EINVAL
);
104 BUILD_BUG_ON(sizeof(struct bpf_stack_build_id
) % sizeof(u64
));
105 if (attr
->map_flags
& BPF_F_STACK_BUILD_ID
) {
106 if (value_size
% sizeof(struct bpf_stack_build_id
) ||
107 value_size
/ sizeof(struct bpf_stack_build_id
)
108 > sysctl_perf_event_max_stack
)
109 return ERR_PTR(-EINVAL
);
110 } else if (value_size
/ 8 > sysctl_perf_event_max_stack
)
111 return ERR_PTR(-EINVAL
);
113 /* hash table size must be power of 2 */
114 n_buckets
= roundup_pow_of_two(attr
->max_entries
);
116 cost
= n_buckets
* sizeof(struct stack_map_bucket
*) + sizeof(*smap
);
117 cost
+= n_buckets
* (value_size
+ sizeof(struct stack_map_bucket
));
118 err
= bpf_map_charge_init(&mem
, cost
);
122 smap
= bpf_map_area_alloc(cost
, bpf_map_attr_numa_node(attr
));
124 bpf_map_charge_finish(&mem
);
125 return ERR_PTR(-ENOMEM
);
128 bpf_map_init_from_attr(&smap
->map
, attr
);
129 smap
->map
.value_size
= value_size
;
130 smap
->n_buckets
= n_buckets
;
132 err
= get_callchain_buffers(sysctl_perf_event_max_stack
);
136 err
= prealloc_elems_and_freelist(smap
);
140 bpf_map_charge_move(&smap
->map
.memory
, &mem
);
145 put_callchain_buffers();
147 bpf_map_charge_finish(&mem
);
148 bpf_map_area_free(smap
);
152 #define BPF_BUILD_ID 3
154 * Parse build id from the note segment. This logic can be shared between
155 * 32-bit and 64-bit systems, because Elf32_Nhdr and Elf64_Nhdr are identical.
158 static inline int stack_map_parse_build_id(void *page_addr
,
159 unsigned char *build_id
,
161 Elf32_Word note_size
)
163 Elf32_Word note_offs
= 0, new_offs
;
165 /* check for overflow */
166 if (note_start
< page_addr
|| note_start
+ note_size
< note_start
)
169 /* only supports a note segment that fits in the first page */
170 if (note_start
+ note_size
> page_addr
+ PAGE_SIZE
)
173 while (note_offs
+ sizeof(Elf32_Nhdr
) < note_size
) {
174 Elf32_Nhdr
*nhdr
= (Elf32_Nhdr
*)(note_start
+ note_offs
);
176 if (nhdr
->n_type
== BPF_BUILD_ID
&&
177 nhdr
->n_namesz
== sizeof("GNU") &&
178 nhdr
->n_descsz
> 0 &&
179 nhdr
->n_descsz
<= BPF_BUILD_ID_SIZE
) {
181 note_start
+ note_offs
+
182 ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr
),
184 memset(build_id
+ nhdr
->n_descsz
, 0,
185 BPF_BUILD_ID_SIZE
- nhdr
->n_descsz
);
188 new_offs
= note_offs
+ sizeof(Elf32_Nhdr
) +
189 ALIGN(nhdr
->n_namesz
, 4) + ALIGN(nhdr
->n_descsz
, 4);
190 if (new_offs
<= note_offs
) /* overflow */
192 note_offs
= new_offs
;
197 /* Parse build ID from 32-bit ELF */
198 static int stack_map_get_build_id_32(void *page_addr
,
199 unsigned char *build_id
)
201 Elf32_Ehdr
*ehdr
= (Elf32_Ehdr
*)page_addr
;
205 /* only supports phdr that fits in one page */
207 (PAGE_SIZE
- sizeof(Elf32_Ehdr
)) / sizeof(Elf32_Phdr
))
210 phdr
= (Elf32_Phdr
*)(page_addr
+ sizeof(Elf32_Ehdr
));
212 for (i
= 0; i
< ehdr
->e_phnum
; ++i
)
213 if (phdr
[i
].p_type
== PT_NOTE
)
214 return stack_map_parse_build_id(page_addr
, build_id
,
215 page_addr
+ phdr
[i
].p_offset
,
220 /* Parse build ID from 64-bit ELF */
221 static int stack_map_get_build_id_64(void *page_addr
,
222 unsigned char *build_id
)
224 Elf64_Ehdr
*ehdr
= (Elf64_Ehdr
*)page_addr
;
228 /* only supports phdr that fits in one page */
230 (PAGE_SIZE
- sizeof(Elf64_Ehdr
)) / sizeof(Elf64_Phdr
))
233 phdr
= (Elf64_Phdr
*)(page_addr
+ sizeof(Elf64_Ehdr
));
235 for (i
= 0; i
< ehdr
->e_phnum
; ++i
)
236 if (phdr
[i
].p_type
== PT_NOTE
)
237 return stack_map_parse_build_id(page_addr
, build_id
,
238 page_addr
+ phdr
[i
].p_offset
,
243 /* Parse build ID of ELF file mapped to vma */
244 static int stack_map_get_build_id(struct vm_area_struct
*vma
,
245 unsigned char *build_id
)
252 /* only works for page backed storage */
256 page
= find_get_page(vma
->vm_file
->f_mapping
, 0);
258 return -EFAULT
; /* page not mapped */
261 page_addr
= kmap_atomic(page
);
262 ehdr
= (Elf32_Ehdr
*)page_addr
;
264 /* compare magic: 0x7f followed by "ELF" */
265 if (memcmp(ehdr
->e_ident
, ELFMAG
, SELFMAG
) != 0)
268 /* only supports executable files and shared object files */
269 if (ehdr
->e_type
!= ET_EXEC
&& ehdr
->e_type
!= ET_DYN
)
272 if (ehdr
->e_ident
[EI_CLASS
] == ELFCLASS32
)
273 ret
= stack_map_get_build_id_32(page_addr
, build_id
);
274 else if (ehdr
->e_ident
[EI_CLASS
] == ELFCLASS64
)
275 ret
= stack_map_get_build_id_64(page_addr
, build_id
);
277 kunmap_atomic(page_addr
);
282 static void stack_map_get_build_id_offset(struct bpf_stack_build_id
*id_offs
,
283 u64
*ips
, u32 trace_nr
, bool user
)
286 struct vm_area_struct
*vma
;
287 bool irq_work_busy
= false;
288 struct stack_map_irq_work
*work
= NULL
;
290 if (irqs_disabled()) {
291 work
= this_cpu_ptr(&up_read_work
);
292 if (atomic_read(&work
->irq_work
.flags
) & IRQ_WORK_BUSY
)
293 /* cannot queue more up_read, fall back */
294 irq_work_busy
= true;
298 * We cannot do up_read() when the irq is disabled, because of
299 * risk to deadlock with rq_lock. To do build_id lookup when the
300 * irqs are disabled, we need to run up_read() in irq_work. We use
301 * a percpu variable to do the irq_work. If the irq_work is
302 * already used by another lookup, we fall back to report ips.
304 * Same fallback is used for kernel stack (!user) on a stackmap with build_id.
307 if (!user
|| !current
|| !current
->mm
|| irq_work_busy
||
308 down_read_trylock(¤t
->mm
->mmap_sem
) == 0) {
309 /* cannot access current->mm, fall back to ips */
310 for (i
= 0; i
< trace_nr
; i
++) {
311 id_offs
[i
].status
= BPF_STACK_BUILD_ID_IP
;
312 id_offs
[i
].ip
= ips
[i
];
313 memset(id_offs
[i
].build_id
, 0, BPF_BUILD_ID_SIZE
);
318 for (i
= 0; i
< trace_nr
; i
++) {
319 vma
= find_vma(current
->mm
, ips
[i
]);
320 if (!vma
|| stack_map_get_build_id(vma
, id_offs
[i
].build_id
)) {
321 /* per entry fall back to ips */
322 id_offs
[i
].status
= BPF_STACK_BUILD_ID_IP
;
323 id_offs
[i
].ip
= ips
[i
];
324 memset(id_offs
[i
].build_id
, 0, BPF_BUILD_ID_SIZE
);
327 id_offs
[i
].offset
= (vma
->vm_pgoff
<< PAGE_SHIFT
) + ips
[i
]
329 id_offs
[i
].status
= BPF_STACK_BUILD_ID_VALID
;
333 up_read(¤t
->mm
->mmap_sem
);
335 work
->sem
= ¤t
->mm
->mmap_sem
;
336 irq_work_queue(&work
->irq_work
);
338 * The irq_work will release the mmap_sem with
339 * up_read_non_owner(). The rwsem_release() is called
340 * here to release the lock from lockdep's perspective.
342 rwsem_release(¤t
->mm
->mmap_sem
.dep_map
, _RET_IP_
);
346 BPF_CALL_3(bpf_get_stackid
, struct pt_regs
*, regs
, struct bpf_map
*, map
,
349 struct bpf_stack_map
*smap
= container_of(map
, struct bpf_stack_map
, map
);
350 struct perf_callchain_entry
*trace
;
351 struct stack_map_bucket
*bucket
, *new_bucket
, *old_bucket
;
352 u32 max_depth
= map
->value_size
/ stack_map_data_size(map
);
353 /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
354 u32 init_nr
= sysctl_perf_event_max_stack
- max_depth
;
355 u32 skip
= flags
& BPF_F_SKIP_FIELD_MASK
;
356 u32 hash
, id
, trace_nr
, trace_len
;
357 bool user
= flags
& BPF_F_USER_STACK
;
362 if (unlikely(flags
& ~(BPF_F_SKIP_FIELD_MASK
| BPF_F_USER_STACK
|
363 BPF_F_FAST_STACK_CMP
| BPF_F_REUSE_STACKID
)))
366 trace
= get_perf_callchain(regs
, init_nr
, kernel
, user
,
367 sysctl_perf_event_max_stack
, false, false);
369 if (unlikely(!trace
))
370 /* couldn't fetch the stack trace */
373 /* get_perf_callchain() guarantees that trace->nr >= init_nr
374 * and trace->nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
376 trace_nr
= trace
->nr
- init_nr
;
378 if (trace_nr
<= skip
)
379 /* skipping more than usable stack trace */
383 trace_len
= trace_nr
* sizeof(u64
);
384 ips
= trace
->ip
+ skip
+ init_nr
;
385 hash
= jhash2((u32
*)ips
, trace_len
/ sizeof(u32
), 0);
386 id
= hash
& (smap
->n_buckets
- 1);
387 bucket
= READ_ONCE(smap
->buckets
[id
]);
389 hash_matches
= bucket
&& bucket
->hash
== hash
;
391 if (hash_matches
&& flags
& BPF_F_FAST_STACK_CMP
)
394 if (stack_map_use_build_id(map
)) {
395 /* for build_id+offset, pop a bucket before slow cmp */
396 new_bucket
= (struct stack_map_bucket
*)
397 pcpu_freelist_pop(&smap
->freelist
);
398 if (unlikely(!new_bucket
))
400 new_bucket
->nr
= trace_nr
;
401 stack_map_get_build_id_offset(
402 (struct bpf_stack_build_id
*)new_bucket
->data
,
403 ips
, trace_nr
, user
);
404 trace_len
= trace_nr
* sizeof(struct bpf_stack_build_id
);
405 if (hash_matches
&& bucket
->nr
== trace_nr
&&
406 memcmp(bucket
->data
, new_bucket
->data
, trace_len
) == 0) {
407 pcpu_freelist_push(&smap
->freelist
, &new_bucket
->fnode
);
410 if (bucket
&& !(flags
& BPF_F_REUSE_STACKID
)) {
411 pcpu_freelist_push(&smap
->freelist
, &new_bucket
->fnode
);
415 if (hash_matches
&& bucket
->nr
== trace_nr
&&
416 memcmp(bucket
->data
, ips
, trace_len
) == 0)
418 if (bucket
&& !(flags
& BPF_F_REUSE_STACKID
))
421 new_bucket
= (struct stack_map_bucket
*)
422 pcpu_freelist_pop(&smap
->freelist
);
423 if (unlikely(!new_bucket
))
425 memcpy(new_bucket
->data
, ips
, trace_len
);
428 new_bucket
->hash
= hash
;
429 new_bucket
->nr
= trace_nr
;
431 old_bucket
= xchg(&smap
->buckets
[id
], new_bucket
);
433 pcpu_freelist_push(&smap
->freelist
, &old_bucket
->fnode
);
437 const struct bpf_func_proto bpf_get_stackid_proto
= {
438 .func
= bpf_get_stackid
,
440 .ret_type
= RET_INTEGER
,
441 .arg1_type
= ARG_PTR_TO_CTX
,
442 .arg2_type
= ARG_CONST_MAP_PTR
,
443 .arg3_type
= ARG_ANYTHING
,
446 BPF_CALL_4(bpf_get_stack
, struct pt_regs
*, regs
, void *, buf
, u32
, size
,
449 u32 init_nr
, trace_nr
, copy_len
, elem_size
, num_elem
;
450 bool user_build_id
= flags
& BPF_F_USER_BUILD_ID
;
451 u32 skip
= flags
& BPF_F_SKIP_FIELD_MASK
;
452 bool user
= flags
& BPF_F_USER_STACK
;
453 struct perf_callchain_entry
*trace
;
458 if (unlikely(flags
& ~(BPF_F_SKIP_FIELD_MASK
| BPF_F_USER_STACK
|
459 BPF_F_USER_BUILD_ID
)))
461 if (kernel
&& user_build_id
)
464 elem_size
= (user
&& user_build_id
) ? sizeof(struct bpf_stack_build_id
)
466 if (unlikely(size
% elem_size
))
469 num_elem
= size
/ elem_size
;
470 if (sysctl_perf_event_max_stack
< num_elem
)
473 init_nr
= sysctl_perf_event_max_stack
- num_elem
;
474 trace
= get_perf_callchain(regs
, init_nr
, kernel
, user
,
475 sysctl_perf_event_max_stack
, false, false);
476 if (unlikely(!trace
))
479 trace_nr
= trace
->nr
- init_nr
;
484 trace_nr
= (trace_nr
<= num_elem
) ? trace_nr
: num_elem
;
485 copy_len
= trace_nr
* elem_size
;
486 ips
= trace
->ip
+ skip
+ init_nr
;
487 if (user
&& user_build_id
)
488 stack_map_get_build_id_offset(buf
, ips
, trace_nr
, user
);
490 memcpy(buf
, ips
, copy_len
);
493 memset(buf
+ copy_len
, 0, size
- copy_len
);
499 memset(buf
, 0, size
);
503 const struct bpf_func_proto bpf_get_stack_proto
= {
504 .func
= bpf_get_stack
,
506 .ret_type
= RET_INTEGER
,
507 .arg1_type
= ARG_PTR_TO_CTX
,
508 .arg2_type
= ARG_PTR_TO_UNINIT_MEM
,
509 .arg3_type
= ARG_CONST_SIZE_OR_ZERO
,
510 .arg4_type
= ARG_ANYTHING
,
513 /* Called from eBPF program */
514 static void *stack_map_lookup_elem(struct bpf_map
*map
, void *key
)
516 return ERR_PTR(-EOPNOTSUPP
);
519 /* Called from syscall */
520 int bpf_stackmap_copy(struct bpf_map
*map
, void *key
, void *value
)
522 struct bpf_stack_map
*smap
= container_of(map
, struct bpf_stack_map
, map
);
523 struct stack_map_bucket
*bucket
, *old_bucket
;
524 u32 id
= *(u32
*)key
, trace_len
;
526 if (unlikely(id
>= smap
->n_buckets
))
529 bucket
= xchg(&smap
->buckets
[id
], NULL
);
533 trace_len
= bucket
->nr
* stack_map_data_size(map
);
534 memcpy(value
, bucket
->data
, trace_len
);
535 memset(value
+ trace_len
, 0, map
->value_size
- trace_len
);
537 old_bucket
= xchg(&smap
->buckets
[id
], bucket
);
539 pcpu_freelist_push(&smap
->freelist
, &old_bucket
->fnode
);
543 static int stack_map_get_next_key(struct bpf_map
*map
, void *key
,
546 struct bpf_stack_map
*smap
= container_of(map
,
547 struct bpf_stack_map
, map
);
550 WARN_ON_ONCE(!rcu_read_lock_held());
556 if (id
>= smap
->n_buckets
|| !smap
->buckets
[id
])
562 while (id
< smap
->n_buckets
&& !smap
->buckets
[id
])
565 if (id
>= smap
->n_buckets
)
568 *(u32
*)next_key
= id
;
572 static int stack_map_update_elem(struct bpf_map
*map
, void *key
, void *value
,
578 /* Called from syscall or from eBPF program */
579 static int stack_map_delete_elem(struct bpf_map
*map
, void *key
)
581 struct bpf_stack_map
*smap
= container_of(map
, struct bpf_stack_map
, map
);
582 struct stack_map_bucket
*old_bucket
;
583 u32 id
= *(u32
*)key
;
585 if (unlikely(id
>= smap
->n_buckets
))
588 old_bucket
= xchg(&smap
->buckets
[id
], NULL
);
590 pcpu_freelist_push(&smap
->freelist
, &old_bucket
->fnode
);
597 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
598 static void stack_map_free(struct bpf_map
*map
)
600 struct bpf_stack_map
*smap
= container_of(map
, struct bpf_stack_map
, map
);
602 /* wait for bpf programs to complete before freeing stack map */
605 bpf_map_area_free(smap
->elems
);
606 pcpu_freelist_destroy(&smap
->freelist
);
607 bpf_map_area_free(smap
);
608 put_callchain_buffers();
611 const struct bpf_map_ops stack_trace_map_ops
= {
612 .map_alloc
= stack_map_alloc
,
613 .map_free
= stack_map_free
,
614 .map_get_next_key
= stack_map_get_next_key
,
615 .map_lookup_elem
= stack_map_lookup_elem
,
616 .map_update_elem
= stack_map_update_elem
,
617 .map_delete_elem
= stack_map_delete_elem
,
618 .map_check_btf
= map_check_no_btf
,
621 static int __init
stack_map_init(void)
624 struct stack_map_irq_work
*work
;
626 for_each_possible_cpu(cpu
) {
627 work
= per_cpu_ptr(&up_read_work
, cpu
);
628 init_irq_work(&work
->irq_work
, do_up_read
);
632 subsys_initcall(stack_map_init
);