/* Copyright (c) 2016 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/vmalloc.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
#include "percpu_freelist.h"

struct stack_map_bucket {
	struct pcpu_freelist_node fnode;
	u32 hash;
	u32 nr;
	u64 ip[];
};

struct bpf_stack_map {
	struct bpf_map map;
	void *elems;
	struct pcpu_freelist freelist;
	u32 n_buckets;
	struct stack_map_bucket *buckets[];
};

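/* Every bucket element is allocated up front and handed to a per-cpu
 * freelist, so bpf_get_stackid() can grab a free bucket without having
 * to allocate memory from tracing or NMI context.
 */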
static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
	u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
	int err;

	smap->elems = vzalloc(elem_size * smap->map.max_entries);
	if (!smap->elems)
		return -ENOMEM;

	err = pcpu_freelist_init(&smap->freelist);
	if (err)
		goto free_elems;

	pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
			       smap->map.max_entries);
	return 0;

free_elems:
	vfree(smap->elems);
	return err;
}

/* Called from syscall */
static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
{
	u32 value_size = attr->value_size;
	struct bpf_stack_map *smap;
	u64 cost, n_buckets;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return ERR_PTR(-EPERM);

	if (attr->map_flags)
		return ERR_PTR(-EINVAL);

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    value_size < 8 || value_size % 8 ||
	    value_size / 8 > sysctl_perf_event_max_stack)
		return ERR_PTR(-EINVAL);

	/* hash table size must be power of 2 */
	n_buckets = roundup_pow_of_two(attr->max_entries);

	cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
	if (cost >= U32_MAX - PAGE_SIZE)
		return ERR_PTR(-E2BIG);

	smap = kzalloc(cost, GFP_USER | __GFP_NOWARN);
	if (!smap) {
		smap = vzalloc(cost);
		if (!smap)
			return ERR_PTR(-ENOMEM);
	}

	err = -E2BIG;
	cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
	if (cost >= U32_MAX - PAGE_SIZE)
		goto free_smap;

	smap->map.map_type = attr->map_type;
	smap->map.key_size = attr->key_size;
	smap->map.value_size = value_size;
	smap->map.max_entries = attr->max_entries;
	smap->n_buckets = n_buckets;
	smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	err = bpf_map_precharge_memlock(smap->map.pages);
	if (err)
		goto free_smap;

	err = get_callchain_buffers(sysctl_perf_event_max_stack);
	if (err)
		goto free_smap;

	err = prealloc_elems_and_freelist(smap);
	if (err)
		goto put_buffers;

	return &smap->map;

put_buffers:
	put_callchain_buffers();
free_smap:
	kvfree(smap);
	return ERR_PTR(err);
}

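/* bpf_get_stackid() helper: snapshot the current call chain with
 * get_perf_callchain(), hash the instruction pointers with jhash2() and
 * use the hash to index the bucket table. On a match the existing stack
 * id is returned; otherwise a free bucket is popped from the per-cpu
 * freelist and the new trace is stored there.
 */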
BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
	   u64, flags)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct perf_callchain_entry *trace;
	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
	u32 max_depth = map->value_size / 8;
	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
	u32 hash, id, trace_nr, trace_len;
	bool user = flags & BPF_F_USER_STACK;
	bool kernel = !user;
	u64 *ips;

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
		return -EINVAL;

	trace = get_perf_callchain(regs, init_nr, kernel, user,
				   sysctl_perf_event_max_stack, false, false);

	if (unlikely(!trace))
		/* couldn't fetch the stack trace */
		return -EFAULT;

	/* get_perf_callchain() guarantees that trace->nr >= init_nr
	 * and trace->nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
	 */
	trace_nr = trace->nr - init_nr;

	if (trace_nr <= skip)
		/* skipping more than usable stack trace */
		return -EFAULT;

	trace_nr -= skip;
	trace_len = trace_nr * sizeof(u64);
	ips = trace->ip + skip + init_nr;
	hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
	id = hash & (smap->n_buckets - 1);
	bucket = READ_ONCE(smap->buckets[id]);

	if (bucket && bucket->hash == hash) {
		if (flags & BPF_F_FAST_STACK_CMP)
			return id;
		if (bucket->nr == trace_nr &&
		    memcmp(bucket->ip, ips, trace_len) == 0)
			return id;
	}

	/* this call stack is not in the map, try to add it */
	if (bucket && !(flags & BPF_F_REUSE_STACKID))
		return -EEXIST;

	new_bucket = (struct stack_map_bucket *)
		pcpu_freelist_pop(&smap->freelist);
	if (unlikely(!new_bucket))
		return -ENOMEM;

	memcpy(new_bucket->ip, ips, trace_len);
	new_bucket->hash = hash;
	new_bucket->nr = trace_nr;

	old_bucket = xchg(&smap->buckets[id], new_bucket);
	if (old_bucket)
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
	return id;
}

const struct bpf_func_proto bpf_get_stackid_proto = {
	.func = bpf_get_stackid,
	.gpl_only = true,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_CTX,
	.arg2_type = ARG_CONST_MAP_PTR,
	.arg3_type = ARG_ANYTHING,
};

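/* Illustrative usage, not part of the original file: a tracing program
 * typically calls the helper as
 *
 *	id = bpf_get_stackid(ctx, &stack_trace_map, BPF_F_USER_STACK);
 *
 * (stack_trace_map being whatever BPF_MAP_TYPE_STACK_TRACE map the
 * program declares) and stores the returned id; user space later reads
 * the instruction pointers via the BPF_MAP_LOOKUP_ELEM command, which
 * ends up in bpf_stackmap_copy() below.
 */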
/* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-EOPNOTSUPP);
}

/* Called from syscall */
int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct stack_map_bucket *bucket, *old_bucket;
	u32 id = *(u32 *)key, trace_len;

	if (unlikely(id >= smap->n_buckets))
		return -ENOENT;

	/* take the bucket out of the table while copying, so a concurrent
	 * bpf_get_stackid() cannot recycle it under us
	 */
	bucket = xchg(&smap->buckets[id], NULL);
	if (!bucket)
		return -ENOENT;

	trace_len = bucket->nr * sizeof(u64);
	memcpy(value, bucket->ip, trace_len);
	memset(value + trace_len, 0, map->value_size - trace_len);

	old_bucket = xchg(&smap->buckets[id], bucket);
	if (old_bucket)
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
	return 0;
}

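/* stack ids are created only by bpf_get_stackid(); iterating or updating
 * the map from user space is not supported, so these callbacks reject
 * the operation.
 */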
static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	return -EINVAL;
}

static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
				 u64 map_flags)
{
	return -EINVAL;
}

/* Called from syscall or from eBPF program */
static int stack_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct stack_map_bucket *old_bucket;
	u32 id = *(u32 *)key;

	if (unlikely(id >= smap->n_buckets))
		return -E2BIG;

	old_bucket = xchg(&smap->buckets[id], NULL);
	if (old_bucket) {
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
		return 0;
	} else {
		return -ENOENT;
	}
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void stack_map_free(struct bpf_map *map)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);

	/* wait for bpf programs to complete before freeing stack map */
	synchronize_rcu();

	vfree(smap->elems);
	pcpu_freelist_destroy(&smap->freelist);
	kvfree(smap);
	put_callchain_buffers();
}

static const struct bpf_map_ops stack_map_ops = {
	.map_alloc = stack_map_alloc,
	.map_free = stack_map_free,
	.map_get_next_key = stack_map_get_next_key,
	.map_lookup_elem = stack_map_lookup_elem,
	.map_update_elem = stack_map_update_elem,
	.map_delete_elem = stack_map_delete_elem,
};

static struct bpf_map_type_list stack_map_type __read_mostly = {
	.ops = &stack_map_ops,
	.type = BPF_MAP_TYPE_STACK_TRACE,
};

static int __init register_stack_map(void)
{
	bpf_register_map_type(&stack_map_type);
	return 0;
}
late_initcall(register_stack_map);