/* Copyright (c) 2016 Facebook
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
#include "percpu_freelist.h"

#define STACK_CREATE_FLAG_MASK \
	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)

struct stack_map_bucket {
	struct pcpu_freelist_node fnode;
	u32 hash;
	u32 nr;
	u64 ip[];
};

struct bpf_stack_map {
	struct bpf_map map;
	void *elems;
	struct pcpu_freelist freelist;
	u32 n_buckets;
	struct stack_map_bucket *buckets[];
};
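
/*
 * Layout note (added for clarity): each stack_map_bucket holds one
 * de-duplicated trace; ip[] is sized by map->value_size, so a bucket stores
 * up to value_size / 8 instruction pointers (e.g. 127 frames for
 * value_size == 1016 with the default sysctl_perf_event_max_stack of 127).
 * buckets[] is the hash table indexed by the jhash2() of a trace, and elems
 * backs the pre-allocated per-cpu freelist of buckets.
 */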

static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
	u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
	int err;

	smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
					 smap->map.numa_node);
	if (!smap->elems)
		return -ENOMEM;

	err = pcpu_freelist_init(&smap->freelist);
	if (err)
		goto free_elems;

	pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
			       smap->map.max_entries);
	return 0;

free_elems:
	bpf_map_area_free(smap->elems);
	return err;
}
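
/*
 * Sizing sketch (added for clarity, numbers are illustrative): with
 * value_size == 1016 (a 127-frame trace) and max_entries == 10000,
 * elem_size is sizeof(struct stack_map_bucket) + 1016 bytes and the elems
 * area comes to roughly 10 MB, carved up into a per-cpu freelist so that
 * bpf_get_stackid() never has to allocate memory at trace time.
 */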

/* Called from syscall */
static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
{
	u32 value_size = attr->value_size;
	struct bpf_stack_map *smap;
	u64 cost, n_buckets;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return ERR_PTR(-EPERM);

	if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
		return ERR_PTR(-EINVAL);

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    value_size < 8 || value_size % 8 ||
	    value_size / 8 > sysctl_perf_event_max_stack)
		return ERR_PTR(-EINVAL);

	/* hash table size must be power of 2 */
	n_buckets = roundup_pow_of_two(attr->max_entries);

	cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
	if (cost >= U32_MAX - PAGE_SIZE)
		return ERR_PTR(-E2BIG);

	smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
	if (!smap)
		return ERR_PTR(-ENOMEM);

	err = -E2BIG;
	cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
	if (cost >= U32_MAX - PAGE_SIZE)
		goto free_smap;

	bpf_map_init_from_attr(&smap->map, attr);
	smap->map.value_size = value_size;
	smap->n_buckets = n_buckets;
	smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	err = bpf_map_precharge_memlock(smap->map.pages);
	if (err)
		goto free_smap;

	err = get_callchain_buffers(sysctl_perf_event_max_stack);
	if (err)
		goto free_smap;

	err = prealloc_elems_and_freelist(smap);
	if (err)
		goto put_buffers;

	return &smap->map;

put_buffers:
	put_callchain_buffers();
free_smap:
	bpf_map_area_free(smap);
	return ERR_PTR(err);
}
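
/*
 * Usage sketch (not part of the original file): creating this map type from
 * user space. The helper name and signature follow the classic libbpf
 * bpf_create_map() of this era; treat the exact API as an assumption.
 *
 *	int fd = bpf_create_map(BPF_MAP_TYPE_STACK_TRACE,
 *				sizeof(__u32),		// key: stack id
 *				127 * sizeof(__u64),	// value: up to 127 ips
 *				10000,			// max_entries
 *				0);			// map_flags
 *
 * key_size must be 4 and value_size a non-zero multiple of 8, matching the
 * sanity checks in stack_map_alloc() above.
 */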

BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
	   u64, flags)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct perf_callchain_entry *trace;
	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
	u32 max_depth = map->value_size / 8;
	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
	u32 hash, id, trace_nr, trace_len;
	bool user = flags & BPF_F_USER_STACK;
	bool kernel = !user;
	u64 *ips;

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
		return -EINVAL;

	trace = get_perf_callchain(regs, init_nr, kernel, user,
				   sysctl_perf_event_max_stack, false, false);

	if (unlikely(!trace))
		/* couldn't fetch the stack trace */
		return -EFAULT;

	/* get_perf_callchain() guarantees that trace->nr >= init_nr
	 * and trace->nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
	 */
	trace_nr = trace->nr - init_nr;

	if (trace_nr <= skip)
		/* skipping more than usable stack trace */
		return -EFAULT;

	trace_nr -= skip;
	trace_len = trace_nr * sizeof(u64);
	ips = trace->ip + skip + init_nr;
	hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
	id = hash & (smap->n_buckets - 1);
	bucket = READ_ONCE(smap->buckets[id]);

	if (bucket && bucket->hash == hash) {
		if (flags & BPF_F_FAST_STACK_CMP)
			return id;
		if (bucket->nr == trace_nr &&
		    memcmp(bucket->ip, ips, trace_len) == 0)
			return id;
	}

	/* this call stack is not in the map, try to add it */
	if (bucket && !(flags & BPF_F_REUSE_STACKID))
		return -EEXIST;

	new_bucket = (struct stack_map_bucket *)
		pcpu_freelist_pop(&smap->freelist);
	if (unlikely(!new_bucket))
		return -ENOMEM;

	memcpy(new_bucket->ip, ips, trace_len);
	new_bucket->hash = hash;
	new_bucket->nr = trace_nr;

	old_bucket = xchg(&smap->buckets[id], new_bucket);
	if (old_bucket)
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
	return id;
}

const struct bpf_func_proto bpf_get_stackid_proto = {
	.func		= bpf_get_stackid,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};
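
/*
 * Usage sketch (not part of the original file): how a BPF program of this
 * era typically calls the helper, following the samples/bpf conventions
 * (struct bpf_map_def, SEC(), bpf_helpers.h); section and function names
 * below are illustrative, not taken from this file.
 *
 *	struct bpf_map_def SEC("maps") stackmap = {
 *		.type = BPF_MAP_TYPE_STACK_TRACE,
 *		.key_size = sizeof(u32),
 *		.value_size = 127 * sizeof(u64),
 *		.max_entries = 10000,
 *	};
 *
 *	SEC("kprobe/try_to_wake_up")
 *	int waker(struct pt_regs *ctx)
 *	{
 *		int id = bpf_get_stackid(ctx, &stackmap, BPF_F_USER_STACK);
 *		// id >= 0 is a stack id stored in the map; negative values
 *		// are the errors returned above (-EFAULT, -EEXIST, -ENOMEM)
 *		return 0;
 *	}
 */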

/* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-EOPNOTSUPP);
}

/* Called from syscall */
int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct stack_map_bucket *bucket, *old_bucket;
	u32 id = *(u32 *)key, trace_len;

	if (unlikely(id >= smap->n_buckets))
		return -ENOENT;

	bucket = xchg(&smap->buckets[id], NULL);
	if (!bucket)
		return -ENOENT;

	trace_len = bucket->nr * sizeof(u64);
	memcpy(value, bucket->ip, trace_len);
	memset(value + trace_len, 0, map->value_size - trace_len);

	old_bucket = xchg(&smap->buckets[id], bucket);
	if (old_bucket)
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
	return 0;
}
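
/*
 * Usage sketch (not part of the original file): user space reads a trace by
 * stack id through the ordinary map-lookup syscall path, which dispatches to
 * bpf_stackmap_copy(); the libbpf helper name is an assumption.
 *
 *	__u64 ips[127] = {};
 *
 *	if (bpf_map_lookup_elem(map_fd, &stack_id, ips) == 0) {
 *		// ips[] now holds bucket->nr frames, zero-padded to value_size
 *	}
 */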

static int stack_map_get_next_key(struct bpf_map *map, void *key,
				  void *next_key)
{
	struct bpf_stack_map *smap = container_of(map,
						  struct bpf_stack_map, map);
	u32 id;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!key) {
		id = 0;
	} else {
		id = *(u32 *)key;
		if (id >= smap->n_buckets || !smap->buckets[id])
			id = 0;
		else
			id++;
	}

	while (id < smap->n_buckets && !smap->buckets[id])
		id++;

	if (id >= smap->n_buckets)
		return -ENOENT;

	*(u32 *)next_key = id;
	return 0;
}
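
/*
 * Usage sketch (not part of the original file): user space can walk all
 * populated stack ids with the get-next-key syscall; a NULL key starts from
 * the first occupied bucket. The libbpf helper name is an assumption and
 * handle_stack_id() is a placeholder.
 *
 *	__u32 key, next;
 *	int err;
 *
 *	for (err = bpf_map_get_next_key(map_fd, NULL, &next); !err;
 *	     key = next, err = bpf_map_get_next_key(map_fd, &key, &next))
 *		handle_stack_id(next);
 */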

static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
				 u64 map_flags)
{
	return -EINVAL;
}

/* Called from syscall or from eBPF program */
static int stack_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct stack_map_bucket *old_bucket;
	u32 id = *(u32 *)key;

	if (unlikely(id >= smap->n_buckets))
		return -E2BIG;

	old_bucket = xchg(&smap->buckets[id], NULL);
	if (old_bucket) {
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
		return 0;
	} else {
		return -ENOENT;
	}
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void stack_map_free(struct bpf_map *map)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);

	/* wait for bpf programs to complete before freeing stack map */
	synchronize_rcu();

	bpf_map_area_free(smap->elems);
	pcpu_freelist_destroy(&smap->freelist);
	bpf_map_area_free(smap);
	put_callchain_buffers();
}

const struct bpf_map_ops stack_map_ops = {
	.map_alloc = stack_map_alloc,
	.map_free = stack_map_free,
	.map_get_next_key = stack_map_get_next_key,
	.map_lookup_elem = stack_map_lookup_elem,
	.map_update_elem = stack_map_update_elem,
	.map_delete_elem = stack_map_delete_elem,
};
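
/*
 * Note (added for clarity): the generic BPF syscall layer drives this map
 * type through the ops above. .map_update_elem rejects updates (-EINVAL)
 * because entries are only created from BPF programs via bpf_get_stackid(),
 * .map_lookup_elem is likewise unusable from programs (-EOPNOTSUPP), and
 * syscall-side reads go through bpf_stackmap_copy() instead.
 */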