// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2016 Facebook
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
#include <linux/elf.h>
#include <linux/pagemap.h>
#include <linux/irq_work.h>
#include <linux/btf_ids.h>
#include "percpu_freelist.h"

#define STACK_CREATE_FLAG_MASK					\
	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY |	\
	 BPF_F_STACK_BUILD_ID)

struct stack_map_bucket {
	struct pcpu_freelist_node fnode;
	u32 hash;
	u32 nr;
	u64 data[];
};

struct bpf_stack_map {
	struct bpf_map map;
	void *elems;
	struct pcpu_freelist freelist;
	u32 n_buckets;
	struct stack_map_bucket *buckets[];
};

/* irq_work to run up_read() for build_id lookup in nmi context */
struct stack_map_irq_work {
	struct irq_work irq_work;
	struct mm_struct *mm;
};

static void do_up_read(struct irq_work *entry)
{
	struct stack_map_irq_work *work;

	if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
		return;

	work = container_of(entry, struct stack_map_irq_work, irq_work);
	mmap_read_unlock_non_owner(work->mm);
}

static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);
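
/*
 * A bucket's value area holds either an array of u64 instruction pointers
 * or, when the map was created with BPF_F_STACK_BUILD_ID, an array of
 * struct bpf_stack_build_id entries (build ID + file offset). The two
 * helpers below report which layout a map uses and the per-entry size.
 */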

static inline bool stack_map_use_build_id(struct bpf_map *map)
{
	return (map->map_flags & BPF_F_STACK_BUILD_ID);
}

static inline int stack_map_data_size(struct bpf_map *map)
{
	return stack_map_use_build_id(map) ?
		sizeof(struct bpf_stack_build_id) : sizeof(u64);
}
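
/*
 * All buckets are preallocated and recycled through a per-cpu freelist,
 * so bpf_get_stackid() never has to allocate memory while recording a
 * stack trace (it may run in NMI context).
 */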

static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
	u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
	int err;

	smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
					 smap->map.numa_node);
	if (!smap->elems)
		return -ENOMEM;

	err = pcpu_freelist_init(&smap->freelist);
	if (err)
		goto free_elems;

	pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
			       smap->map.max_entries);
	return 0;

free_elems:
	bpf_map_area_free(smap->elems);
	return err;
}
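
/*
 * Attribute sanity below: key_size must be 4 (the stack id), value_size
 * must be a multiple of 8 (or of sizeof(struct bpf_stack_build_id) when
 * BPF_F_STACK_BUILD_ID is set), and the implied maximum stack depth may
 * not exceed sysctl_perf_event_max_stack.
 */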

/* Called from syscall */
static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
{
	u32 value_size = attr->value_size;
	struct bpf_stack_map *smap;
	u64 cost, n_buckets;
	int err;

	if (!bpf_capable())
		return ERR_PTR(-EPERM);

	if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
		return ERR_PTR(-EINVAL);

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    value_size < 8 || value_size % 8)
		return ERR_PTR(-EINVAL);

	BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64));
	if (attr->map_flags & BPF_F_STACK_BUILD_ID) {
		if (value_size % sizeof(struct bpf_stack_build_id) ||
		    value_size / sizeof(struct bpf_stack_build_id)
		    > sysctl_perf_event_max_stack)
			return ERR_PTR(-EINVAL);
	} else if (value_size / 8 > sysctl_perf_event_max_stack)
		return ERR_PTR(-EINVAL);

	/* hash table size must be power of 2 */
	n_buckets = roundup_pow_of_two(attr->max_entries);

	cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
	cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
	smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
	if (!smap)
		return ERR_PTR(-ENOMEM);

	bpf_map_init_from_attr(&smap->map, attr);
	smap->map.value_size = value_size;
	smap->n_buckets = n_buckets;

	err = get_callchain_buffers(sysctl_perf_event_max_stack);
	if (err)
		goto free_smap;

	err = prealloc_elems_and_freelist(smap);
	if (err)
		goto put_buffers;

	return &smap->map;

put_buffers:
	put_callchain_buffers();
free_smap:
	bpf_map_area_free(smap);
	return ERR_PTR(err);
}
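
/*
 * A build ID note is a regular ELF note: an Elf{32,64}_Nhdr followed by
 * the 4-byte-aligned owner name ("GNU") and the 4-byte-aligned descriptor,
 * which carries the build ID itself (at most BPF_BUILD_ID_SIZE bytes).
 */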

#define BPF_BUILD_ID 3
/*
 * Parse build id from the note segment. This logic can be shared between
 * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are
 * identical.
 */
static inline int stack_map_parse_build_id(void *page_addr,
					   unsigned char *build_id,
					   void *note_start,
					   Elf32_Word note_size)
{
	Elf32_Word note_offs = 0, new_offs;

	/* check for overflow */
	if (note_start < page_addr || note_start + note_size < note_start)
		return -EINVAL;

	/* only supports note that fits in the first page */
	if (note_start + note_size > page_addr + PAGE_SIZE)
		return -EINVAL;

	while (note_offs + sizeof(Elf32_Nhdr) < note_size) {
		Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);

		if (nhdr->n_type == BPF_BUILD_ID &&
		    nhdr->n_namesz == sizeof("GNU") &&
		    nhdr->n_descsz > 0 &&
		    nhdr->n_descsz <= BPF_BUILD_ID_SIZE) {
			memcpy(build_id,
			       note_start + note_offs +
			       ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
			       nhdr->n_descsz);
			memset(build_id + nhdr->n_descsz, 0,
			       BPF_BUILD_ID_SIZE - nhdr->n_descsz);
			return 0;
		}
		new_offs = note_offs + sizeof(Elf32_Nhdr) +
			ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
		if (new_offs <= note_offs)	/* overflow */
			break;
		note_offs = new_offs;
	}
	return -EINVAL;
}
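
/*
 * The 32-bit and 64-bit parsers below only differ in the Ehdr/Phdr types:
 * both walk the program headers in the first page of the mapped file and
 * try every PT_NOTE segment until a build ID note is found.
 */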

/* Parse build ID from 32-bit ELF */
static int stack_map_get_build_id_32(void *page_addr,
				     unsigned char *build_id)
{
	Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr;
	Elf32_Phdr *phdr;
	int i;

	/* only supports phdr that fits in one page */
	if (ehdr->e_phnum >
	    (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr))
		return -EINVAL;

	phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr));

	for (i = 0; i < ehdr->e_phnum; ++i) {
		if (phdr[i].p_type == PT_NOTE &&
		    !stack_map_parse_build_id(page_addr, build_id,
					      page_addr + phdr[i].p_offset,
					      phdr[i].p_filesz))
			return 0;
	}
	return -EINVAL;
}

/* Parse build ID from 64-bit ELF */
static int stack_map_get_build_id_64(void *page_addr,
				     unsigned char *build_id)
{
	Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr;
	Elf64_Phdr *phdr;
	int i;

	/* only supports phdr that fits in one page */
	if (ehdr->e_phnum >
	    (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr))
		return -EINVAL;

	phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));

	for (i = 0; i < ehdr->e_phnum; ++i) {
		if (phdr[i].p_type == PT_NOTE &&
		    !stack_map_parse_build_id(page_addr, build_id,
					      page_addr + phdr[i].p_offset,
					      phdr[i].p_filesz))
			return 0;
	}
	return -EINVAL;
}

/* Parse build ID of ELF file mapped to vma */
static int stack_map_get_build_id(struct vm_area_struct *vma,
				  unsigned char *build_id)
{
	Elf32_Ehdr *ehdr;
	struct page *page;
	void *page_addr;
	int ret;

	/* only works for page backed storage  */
	if (!vma->vm_file)
		return -EINVAL;

	page = find_get_page(vma->vm_file->f_mapping, 0);
	if (!page)
		return -EFAULT;	/* page not mapped */

	ret = -EINVAL;
	page_addr = kmap_atomic(page);
	ehdr = (Elf32_Ehdr *)page_addr;

	/* compare magic x7f "ELF" */
	if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
		goto out;

	/* only support executable file and shared object file */
	if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN)
		goto out;

	if (ehdr->e_ident[EI_CLASS] == ELFCLASS32)
		ret = stack_map_get_build_id_32(page_addr, build_id);
	else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64)
		ret = stack_map_get_build_id_64(page_addr, build_id);
out:
	kunmap_atomic(page_addr);
	put_page(page);
	return ret;
}
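
/*
 * Convert an array of instruction pointers into bpf_stack_build_id
 * entries: on success an entry carries the mapping's build ID and the
 * ip's offset into the file (BPF_STACK_BUILD_ID_VALID); if the vma or
 * its build ID cannot be resolved, the raw ip is reported instead
 * (BPF_STACK_BUILD_ID_IP).
 */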

static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
					  u64 *ips, u32 trace_nr, bool user)
{
	int i;
	struct vm_area_struct *vma;
	bool irq_work_busy = false;
	struct stack_map_irq_work *work = NULL;

	if (irqs_disabled()) {
		if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
			work = this_cpu_ptr(&up_read_work);
			if (irq_work_is_busy(&work->irq_work)) {
				/* cannot queue more up_read, fallback */
				irq_work_busy = true;
			}
		} else {
			/*
			 * PREEMPT_RT does not allow to trylock mmap sem in
			 * interrupt disabled context. Force the fallback code.
			 */
			irq_work_busy = true;
		}
	}

	/*
	 * We cannot do up_read() when the irq is disabled, because of
	 * risk to deadlock with rq_lock. To do build_id lookup when the
	 * irqs are disabled, we need to run up_read() in irq_work. We use
	 * a percpu variable to do the irq_work. If the irq_work is
	 * already used by another lookup, we fall back to report ips.
	 *
	 * Same fallback is used for kernel stack (!user) on a stackmap
	 * with build_id.
	 */
	if (!user || !current || !current->mm || irq_work_busy ||
	    !mmap_read_trylock_non_owner(current->mm)) {
		/* cannot access current->mm, fall back to ips */
		for (i = 0; i < trace_nr; i++) {
			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
			id_offs[i].ip = ips[i];
			memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE);
		}
		return;
	}

	for (i = 0; i < trace_nr; i++) {
		vma = find_vma(current->mm, ips[i]);
		if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) {
			/* per entry fall back to ips */
			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
			id_offs[i].ip = ips[i];
			memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE);
			continue;
		}
		id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
			- vma->vm_start;
		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
	}

	if (!work) {
		mmap_read_unlock_non_owner(current->mm);
	} else {
		work->mm = current->mm;
		irq_work_queue(&work->irq_work);
	}
}
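
/*
 * Capture a kernel stack for an arbitrary task with stack_trace_save_tsk().
 * __bpf_get_stack() uses this for bpf_get_task_stack(), where
 * get_perf_callchain() is not suitable since it walks the callchain of
 * the current context.
 */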

static struct perf_callchain_entry *
get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
{
#ifdef CONFIG_STACKTRACE
	struct perf_callchain_entry *entry;
	int rctx;

	entry = get_callchain_entry(&rctx);

	if (!entry)
		return NULL;

	entry->nr = init_nr +
		stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr),
				     sysctl_perf_event_max_stack - init_nr, 0);

	/* stack_trace_save_tsk() works on unsigned long array, while
	 * perf_callchain_entry uses u64 array. For 32-bit systems, it is
	 * necessary to fix this mismatch.
	 */
	if (__BITS_PER_LONG != 64) {
		unsigned long *from = (unsigned long *) entry->ip;
		u64 *to = entry->ip;
		int i;

		/* copy data from the end to avoid using extra buffer */
		for (i = entry->nr - 1; i >= (int)init_nr; i--)
			to[i] = (u64)(from[i]);
	}

	put_callchain_entry(rctx);

	return entry;
#else /* CONFIG_STACKTRACE */
	return NULL;
#endif
}
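
/*
 * Core of bpf_get_stackid(): hash the captured ips with jhash2 and index
 * into the bucket array. A matching bucket deduplicates the stack (with a
 * full memcmp() unless BPF_F_FAST_STACK_CMP is set); otherwise a bucket is
 * taken from the freelist, filled and installed with xchg(), and any
 * displaced bucket is recycled. Without BPF_F_REUSE_STACKID a hash
 * collision with a different stack returns -EEXIST.
 */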

static long __bpf_get_stackid(struct bpf_map *map,
			      struct perf_callchain_entry *trace, u64 flags)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
	u32 max_depth = map->value_size / stack_map_data_size(map);
	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
	u32 hash, id, trace_nr, trace_len;
	bool user = flags & BPF_F_USER_STACK;
	u64 *ips;
	bool hash_matches;

	/* get_perf_callchain() guarantees that trace->nr >= init_nr
	 * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
	 */
	trace_nr = trace->nr - init_nr;

	if (trace_nr <= skip)
		/* skipping more than usable stack trace */
		return -EFAULT;

	trace_nr -= skip;
	trace_len = trace_nr * sizeof(u64);
	ips = trace->ip + skip + init_nr;
	hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
	id = hash & (smap->n_buckets - 1);
	bucket = READ_ONCE(smap->buckets[id]);

	hash_matches = bucket && bucket->hash == hash;
	/* fast cmp */
	if (hash_matches && flags & BPF_F_FAST_STACK_CMP)
		return id;

	if (stack_map_use_build_id(map)) {
		/* for build_id+offset, pop a bucket before slow cmp */
		new_bucket = (struct stack_map_bucket *)
			pcpu_freelist_pop(&smap->freelist);
		if (unlikely(!new_bucket))
			return -ENOMEM;
		new_bucket->nr = trace_nr;
		stack_map_get_build_id_offset(
			(struct bpf_stack_build_id *)new_bucket->data,
			ips, trace_nr, user);
		trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
		if (hash_matches && bucket->nr == trace_nr &&
		    memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
			pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
			return id;
		}
		if (bucket && !(flags & BPF_F_REUSE_STACKID)) {
			pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
			return -EEXIST;
		}
	} else {
		if (hash_matches && bucket->nr == trace_nr &&
		    memcmp(bucket->data, ips, trace_len) == 0)
			return id;
		if (bucket && !(flags & BPF_F_REUSE_STACKID))
			return -EEXIST;

		new_bucket = (struct stack_map_bucket *)
			pcpu_freelist_pop(&smap->freelist);
		if (unlikely(!new_bucket))
			return -ENOMEM;
		memcpy(new_bucket->data, ips, trace_len);
	}

	new_bucket->hash = hash;
	new_bucket->nr = trace_nr;

	old_bucket = xchg(&smap->buckets[id], new_bucket);
	if (old_bucket)
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
	return id;
}

BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
	   u64, flags)
{
	u32 max_depth = map->value_size / stack_map_data_size(map);
	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
	bool user = flags & BPF_F_USER_STACK;
	struct perf_callchain_entry *trace;
	bool kernel = !user;

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
		return -EINVAL;

	trace = get_perf_callchain(regs, init_nr, kernel, user,
				   sysctl_perf_event_max_stack, false, false);

	if (unlikely(!trace))
		/* couldn't fetch the stack trace */
		return -EFAULT;

	return __bpf_get_stackid(map, trace, flags);
}

const struct bpf_func_proto bpf_get_stackid_proto = {
	.func		= bpf_get_stackid,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};
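
/*
 * Illustrative BPF-side usage only (not part of this file; the map and
 * program names below are examples):
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_STACK_TRACE);
 *		__uint(max_entries, 1024);
 *		__uint(key_size, sizeof(__u32));
 *		__uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(__u64));
 *	} stack_traces SEC(".maps");
 *
 *	SEC("perf_event")
 *	int do_sample(struct bpf_perf_event_data *ctx)
 *	{
 *		long id = bpf_get_stackid(ctx, &stack_traces,
 *					  BPF_F_USER_STACK);
 *
 *		if (id < 0)	// e.g. -EEXIST on a hash collision
 *			return 0;
 *		// id is the key under which the user stack was stored
 *		return 0;
 *	}
 */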

static __u64 count_kernel_ip(struct perf_callchain_entry *trace)
{
	__u64 nr_kernel = 0;

	while (nr_kernel < trace->nr) {
		if (trace->ip[nr_kernel] == PERF_CONTEXT_USER)
			break;
		nr_kernel++;
	}
	return nr_kernel;
}

BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
	   struct bpf_map *, map, u64, flags)
{
	struct perf_event *event = ctx->event;
	struct perf_callchain_entry *trace;
	bool kernel, user;
	__u64 nr_kernel;
	int ret;

	/* perf_sample_data doesn't have callchain, use bpf_get_stackid */
	if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
		return bpf_get_stackid((unsigned long)(ctx->regs),
				       (unsigned long) map, flags, 0, 0);

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
		return -EINVAL;

	user = flags & BPF_F_USER_STACK;
	kernel = !user;

	trace = ctx->data->callchain;
	if (unlikely(!trace))
		return -EFAULT;

	nr_kernel = count_kernel_ip(trace);

	if (kernel) {
		__u64 nr = trace->nr;

		trace->nr = nr_kernel;
		ret = __bpf_get_stackid(map, trace, flags);

		/* restore nr */
		trace->nr = nr;
	} else { /* user */
		u64 skip = flags & BPF_F_SKIP_FIELD_MASK;

		skip += nr_kernel;
		if (skip > BPF_F_SKIP_FIELD_MASK)
			return -EFAULT;

		flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
		ret = __bpf_get_stackid(map, trace, flags);
	}
	return ret;
}

const struct bpf_func_proto bpf_get_stackid_proto_pe = {
	.func		= bpf_get_stackid_pe,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};
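
/*
 * Common implementation behind bpf_get_stack(), bpf_get_task_stack() and
 * bpf_get_stack_pe(): capture (or reuse) a callchain and copy either raw
 * ips or build_id+offset entries straight into the supplied buffer,
 * zero-filling any unused tail. Returns the number of bytes copied or a
 * negative error.
 */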

static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
			    struct perf_callchain_entry *trace_in,
			    void *buf, u32 size, u64 flags)
{
	u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
	bool user_build_id = flags & BPF_F_USER_BUILD_ID;
	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
	bool user = flags & BPF_F_USER_STACK;
	struct perf_callchain_entry *trace;
	bool kernel = !user;
	int err = -EINVAL;
	u64 *ips;

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_USER_BUILD_ID)))
		goto clear;
	if (kernel && user_build_id)
		goto clear;

	elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
					    : sizeof(u64);
	if (unlikely(size % elem_size))
		goto clear;

	/* cannot get valid user stack for task without user_mode regs */
	if (task && user && !user_mode(regs))
		goto err_fault;

	num_elem = size / elem_size;
	if (sysctl_perf_event_max_stack < num_elem)
		init_nr = 0;
	else
		init_nr = sysctl_perf_event_max_stack - num_elem;

	if (trace_in)
		trace = trace_in;
	else if (kernel && task)
		trace = get_callchain_entry_for_task(task, init_nr);
	else
		trace = get_perf_callchain(regs, init_nr, kernel, user,
					   sysctl_perf_event_max_stack,
					   false, false);
	if (unlikely(!trace))
		goto err_fault;

	trace_nr = trace->nr - init_nr;
	if (trace_nr < skip)
		goto err_fault;

	trace_nr -= skip;
	trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
	copy_len = trace_nr * elem_size;
	ips = trace->ip + skip + init_nr;
	if (user && user_build_id)
		stack_map_get_build_id_offset(buf, ips, trace_nr, user);
	else
		memcpy(buf, ips, copy_len);

	if (size > copy_len)
		memset(buf + copy_len, 0, size - copy_len);
	return copy_len;

err_fault:
	err = -EFAULT;
clear:
	memset(buf, 0, size);
	return err;
}

BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
	   u64, flags)
{
	return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
}

const struct bpf_func_proto bpf_get_stack_proto = {
	.func		= bpf_get_stack,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};

BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
	   u32, size, u64, flags)
{
	struct pt_regs *regs = task_pt_regs(task);

	return __bpf_get_stack(regs, task, NULL, buf, size, flags);
}

BTF_ID_LIST_SINGLE(bpf_get_task_stack_btf_ids, struct, task_struct)

const struct bpf_func_proto bpf_get_task_stack_proto = {
	.func		= bpf_get_task_stack,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_BTF_ID,
	.arg1_btf_id	= &bpf_get_task_stack_btf_ids[0],
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};

BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
	   void *, buf, u32, size, u64, flags)
{
	struct pt_regs *regs = (struct pt_regs *)(ctx->regs);
	struct perf_event *event = ctx->event;
	struct perf_callchain_entry *trace;
	bool kernel, user;
	int err = -EINVAL;
	__u64 nr_kernel;

	if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
		return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_USER_BUILD_ID)))
		goto clear;

	user = flags & BPF_F_USER_STACK;
	kernel = !user;

	err = -EFAULT;
	trace = ctx->data->callchain;
	if (unlikely(!trace))
		goto clear;

	nr_kernel = count_kernel_ip(trace);

	if (kernel) {
		__u64 nr = trace->nr;

		trace->nr = nr_kernel;
		err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);

		/* restore nr */
		trace->nr = nr;
	} else { /* user */
		u64 skip = flags & BPF_F_SKIP_FIELD_MASK;

		skip += nr_kernel;
		if (skip > BPF_F_SKIP_FIELD_MASK)
			goto clear;

		flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
		err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
	}
	return err;

clear:
	memset(buf, 0, size);
	return err;
}

const struct bpf_func_proto bpf_get_stack_proto_pe = {
	.func		= bpf_get_stack_pe,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};

/* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-EOPNOTSUPP);
}
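
/*
 * The syscall-side copy below temporarily removes the bucket with xchg()
 * so a concurrent bpf_get_stackid() cannot recycle it mid-copy; the
 * bucket is put back afterwards and anything that replaced it in the
 * meantime is returned to the freelist.
 */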

/* Called from syscall */
int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct stack_map_bucket *bucket, *old_bucket;
	u32 id = *(u32 *)key, trace_len;

	if (unlikely(id >= smap->n_buckets))
		return -ENOENT;

	bucket = xchg(&smap->buckets[id], NULL);
	if (!bucket)
		return -ENOENT;

	trace_len = bucket->nr * stack_map_data_size(map);
	memcpy(value, bucket->data, trace_len);
	memset(value + trace_len, 0, map->value_size - trace_len);

	old_bucket = xchg(&smap->buckets[id], bucket);
	if (old_bucket)
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
	return 0;
}

static int stack_map_get_next_key(struct bpf_map *map, void *key,
				  void *next_key)
{
	struct bpf_stack_map *smap = container_of(map,
						  struct bpf_stack_map, map);
	u32 id;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!key) {
		id = 0;
	} else {
		id = *(u32 *)key;
		if (id >= smap->n_buckets || !smap->buckets[id])
			id = 0;
		else
			id++;
	}

	while (id < smap->n_buckets && !smap->buckets[id])
		id++;

	if (id >= smap->n_buckets)
		return -ENOENT;

	*(u32 *)next_key = id;
	return 0;
}
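
/*
 * Stack map entries can only be created by bpf_get_stackid(); updates
 * through the syscall interface are rejected.
 */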

static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
				 u64 map_flags)
{
	return -EINVAL;
}

/* Called from syscall or from eBPF program */
static int stack_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct stack_map_bucket *old_bucket;
	u32 id = *(u32 *)key;

	if (unlikely(id >= smap->n_buckets))
		return -E2BIG;

	old_bucket = xchg(&smap->buckets[id], NULL);
	if (old_bucket) {
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
		return 0;
	} else {
		return -ENOENT;
	}
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void stack_map_free(struct bpf_map *map)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);

	bpf_map_area_free(smap->elems);
	pcpu_freelist_destroy(&smap->freelist);
	bpf_map_area_free(smap);
	put_callchain_buffers();
}

static int stack_trace_map_btf_id;
const struct bpf_map_ops stack_trace_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc = stack_map_alloc,
	.map_free = stack_map_free,
	.map_get_next_key = stack_map_get_next_key,
	.map_lookup_elem = stack_map_lookup_elem,
	.map_update_elem = stack_map_update_elem,
	.map_delete_elem = stack_map_delete_elem,
	.map_check_btf = map_check_no_btf,
	.map_btf_name = "bpf_stack_map",
	.map_btf_id = &stack_trace_map_btf_id,
};

static int __init stack_map_init(void)
{
	int cpu;
	struct stack_map_irq_work *work;

	for_each_possible_cpu(cpu) {
		work = per_cpu_ptr(&up_read_work, cpu);
		init_irq_work(&work->irq_work, do_up_read);
	}
	return 0;
}
subsys_initcall(stack_map_init);