/* Copyright (c) 2016 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
#include "percpu_freelist.h"
struct stack_map_bucket {
	struct pcpu_freelist_node fnode;
	u32 hash;
	u32 nr;
	u64 ip[];
};
struct bpf_stack_map {
	struct bpf_map map;
	void *elems;
	struct pcpu_freelist freelist;
	u32 n_buckets;
	struct stack_map_bucket *buckets[];
};
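
/* All stack_map_bucket elements are preallocated in one contiguous area
 * (smap->elems) and handed out through a per-cpu freelist, so storing a
 * stack trace from BPF program context never calls a memory allocator.
 * buckets[] is a power-of-two sized array indexed by the jhash2() of the
 * instruction pointers; each slot points at the bucket currently holding
 * that stack trace, or is NULL.
 */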
static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
	u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
	int err;

	smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries);
	if (!smap->elems)
		return -ENOMEM;

	err = pcpu_freelist_init(&smap->freelist);
	if (err)
		goto free_elems;

	pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
			       smap->map.max_entries);
	return 0;

free_elems:
	bpf_map_area_free(smap->elems);
	return err;
}
/* Called from syscall */
static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
{
	u32 value_size = attr->value_size;
	struct bpf_stack_map *smap;
	u64 cost, n_buckets;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return ERR_PTR(-EPERM);

	if (attr->map_flags)
		return ERR_PTR(-EINVAL);

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    value_size < 8 || value_size % 8 ||
	    value_size / 8 > sysctl_perf_event_max_stack)
		return ERR_PTR(-EINVAL);

	/* hash table size must be power of 2 */
	n_buckets = roundup_pow_of_two(attr->max_entries);

	cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
	if (cost >= U32_MAX - PAGE_SIZE)
		return ERR_PTR(-E2BIG);

	smap = bpf_map_area_alloc(cost);
	if (!smap)
		return ERR_PTR(-ENOMEM);

	err = -E2BIG;
	cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
	if (cost >= U32_MAX - PAGE_SIZE)
		goto free_smap;

	smap->map.map_type = attr->map_type;
	smap->map.key_size = attr->key_size;
	smap->map.value_size = value_size;
	smap->map.max_entries = attr->max_entries;
	smap->n_buckets = n_buckets;
	smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	err = bpf_map_precharge_memlock(smap->map.pages);
	if (err)
		goto free_smap;

	err = get_callchain_buffers(sysctl_perf_event_max_stack);
	if (err)
		goto free_smap;

	err = prealloc_elems_and_freelist(smap);
	if (err)
		goto put_buffers;

	return &smap->map;

put_buffers:
	put_callchain_buffers();
free_smap:
	bpf_map_area_free(smap);
	return ERR_PTR(err);
}
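
/* bpf_get_stackid() backs the BPF_FUNC_get_stackid helper call.  It
 * collects the current call chain with get_perf_callchain(), hashes the
 * instruction pointers with jhash2() and files the trace into a bucket
 * of the stack map.  The return value is the bucket id (usable as a key
 * into the map) or a negative errno.  'flags' carries the number of
 * frames to skip in its low bits plus the BPF_F_USER_STACK,
 * BPF_F_FAST_STACK_CMP and BPF_F_REUSE_STACKID modifiers.
 */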
BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
	   u64, flags)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct perf_callchain_entry *trace;
	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
	u32 max_depth = map->value_size / 8;
	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
	u32 hash, id, trace_nr, trace_len;
	bool user = flags & BPF_F_USER_STACK;
	bool kernel = !user;
	u64 *ips;

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
		return -EINVAL;

	trace = get_perf_callchain(regs, init_nr, kernel, user,
				   sysctl_perf_event_max_stack, false, false);

	if (unlikely(!trace))
		/* couldn't fetch the stack trace */
		return -EFAULT;

	/* get_perf_callchain() guarantees that trace->nr >= init_nr
	 * and trace->nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
	 */
	trace_nr = trace->nr - init_nr;

	if (trace_nr <= skip)
		/* skipping more than usable stack trace */
		return -EFAULT;

	trace_nr -= skip;
	trace_len = trace_nr * sizeof(u64);
	ips = trace->ip + skip + init_nr;
	hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
	id = hash & (smap->n_buckets - 1);
	bucket = READ_ONCE(smap->buckets[id]);

	if (bucket && bucket->hash == hash) {
		if (flags & BPF_F_FAST_STACK_CMP)
			return id;
		if (bucket->nr == trace_nr &&
		    memcmp(bucket->ip, ips, trace_len) == 0)
			return id;
	}

	/* this call stack is not in the map, try to add it */
	if (bucket && !(flags & BPF_F_REUSE_STACKID))
		return -EEXIST;

	new_bucket = (struct stack_map_bucket *)
		pcpu_freelist_pop(&smap->freelist);
	if (unlikely(!new_bucket))
		return -ENOMEM;

	memcpy(new_bucket->ip, ips, trace_len);
	new_bucket->hash = hash;
	new_bucket->nr = trace_nr;

	old_bucket = xchg(&smap->buckets[id], new_bucket);
	if (old_bucket)
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
	return id;
}
const struct bpf_func_proto bpf_get_stackid_proto = {
	.func		= bpf_get_stackid,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};
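
/* Minimal usage sketch (not part of this file): from an eBPF program
 * attached to e.g. a kprobe or perf event, the helper is invoked as
 *
 *	int id = bpf_get_stackid(ctx, &stack_traces, BPF_F_USER_STACK);
 *
 * where "stack_traces" is a hypothetical BPF_MAP_TYPE_STACK_TRACE map
 * declared by that program.  A non-negative return value is the stack
 * id; user space can later fetch the array of u64 instruction pointers
 * by calling bpf_map_lookup_elem() on the map with that id as the key.
 */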
/* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
	/* stack contents are read from the syscall side via bpf_stackmap_copy() */
	return NULL;
}
/* Called from syscall */
int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct stack_map_bucket *bucket, *old_bucket;
	u32 id = *(u32 *)key, trace_len;

	if (unlikely(id >= smap->n_buckets))
		return -ENOENT;

	/* take the bucket out of the table so that a concurrent
	 * bpf_get_stackid() cannot recycle it while we copy
	 */
	bucket = xchg(&smap->buckets[id], NULL);
	if (!bucket)
		return -ENOENT;

	trace_len = bucket->nr * sizeof(u64);
	memcpy(value, bucket->ip, trace_len);
	memset(value + trace_len, 0, map->value_size - trace_len);

	old_bucket = xchg(&smap->buckets[id], bucket);
	if (old_bucket)
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
	return 0;
}
static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	return -EINVAL;
}
static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
				 u64 map_flags)
{
	return -EINVAL;
}
/* Called from syscall or from eBPF program */
static int stack_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct stack_map_bucket *old_bucket;
	u32 id = *(u32 *)key;

	if (unlikely(id >= smap->n_buckets))
		return -E2BIG;

	old_bucket = xchg(&smap->buckets[id], NULL);
	if (old_bucket) {
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
		return 0;
	} else {
		return -ENOENT;
	}
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void stack_map_free(struct bpf_map *map)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);

	/* wait for bpf programs to complete before freeing stack map */
	synchronize_rcu();

	bpf_map_area_free(smap->elems);
	pcpu_freelist_destroy(&smap->freelist);
	bpf_map_area_free(smap);
	put_callchain_buffers();
}
static const struct bpf_map_ops stack_map_ops = {
	.map_alloc = stack_map_alloc,
	.map_free = stack_map_free,
	.map_get_next_key = stack_map_get_next_key,
	.map_lookup_elem = stack_map_lookup_elem,
	.map_update_elem = stack_map_update_elem,
	.map_delete_elem = stack_map_delete_elem,
};
static struct bpf_map_type_list stack_map_type __read_mostly = {
	.ops = &stack_map_ops,
	.type = BPF_MAP_TYPE_STACK_TRACE,
};
static int __init register_stack_map(void)
{
	bpf_register_map_type(&stack_map_type);
	return 0;
}
late_initcall(register_stack_map);