/* Copyright (c) 2016 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
#include "percpu_freelist.h"
struct stack_map_bucket {
	struct pcpu_freelist_node fnode;
	u32 hash;
	u32 nr;
	u64 ip[];
};

struct bpf_stack_map {
	struct bpf_map map;
	void *elems;
	struct pcpu_freelist freelist;
	u32 n_buckets;
	struct stack_map_bucket *buckets[];
};
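/* The stack map is a hash table sized to a power of two, with one
 * bucket pointer per slot and no chaining: bpf_get_stackid() indexes
 * buckets[] with the low bits of the jhash of the captured call chain,
 * and a colliding slot is either reused (BPF_F_REUSE_STACKID) or the
 * update fails with -EEXIST.  Each bucket stores the hash, the number
 * of frames and the instruction pointers.  All buckets come from one
 * pre-allocated area (elems) managed through a per-cpu freelist, so no
 * allocation happens at trace-collection time.
 */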
static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
	u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
	int err;

	smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries);
	if (!smap->elems)
		return -ENOMEM;

	err = pcpu_freelist_init(&smap->freelist);
	if (err)
		goto free_elems;

	pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
			       smap->map.max_entries);
	return 0;

free_elems:
	bpf_map_area_free(smap->elems);
	return err;
}
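/* stack_map_alloc() sizes everything from the user-supplied attributes:
 * the key is always a 4-byte stack id, the value is an array of up to
 * value_size / 8 instruction pointers, and max_entries is rounded up
 * to a power of two to size the bucket array.  As a purely illustrative
 * example, a map meant to hold traces of up to 127 frames would be
 * created with key_size = 4 and value_size = 127 * 8.
 */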
/* Called from syscall */
static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
{
	u32 value_size = attr->value_size;
	struct bpf_stack_map *smap;
	u64 cost, n_buckets;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return ERR_PTR(-EPERM);

	if (attr->map_flags)
		return ERR_PTR(-EINVAL);

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    value_size < 8 || value_size % 8 ||
	    value_size / 8 > sysctl_perf_event_max_stack)
		return ERR_PTR(-EINVAL);

	/* hash table size must be power of 2 */
	n_buckets = roundup_pow_of_two(attr->max_entries);

	cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
	if (cost >= U32_MAX - PAGE_SIZE)
		return ERR_PTR(-E2BIG);

	smap = bpf_map_area_alloc(cost);
	if (!smap)
		return ERR_PTR(-ENOMEM);

	err = -E2BIG;
	cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
	if (cost >= U32_MAX - PAGE_SIZE)
		goto free_smap;

	smap->map.map_type = attr->map_type;
	smap->map.key_size = attr->key_size;
	smap->map.value_size = value_size;
	smap->map.max_entries = attr->max_entries;
	smap->map.map_flags = attr->map_flags;
	smap->n_buckets = n_buckets;
	smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	err = bpf_map_precharge_memlock(smap->map.pages);
	if (err)
		goto free_smap;

	err = get_callchain_buffers(sysctl_perf_event_max_stack);
	if (err)
		goto free_smap;

	err = prealloc_elems_and_freelist(smap);
	if (err)
		goto put_buffers;

	return &smap->map;

put_buffers:
	put_callchain_buffers();
free_smap:
	bpf_map_area_free(smap);
	return ERR_PTR(err);
}
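/* bpf_get_stackid() is called from a BPF program.  It captures the
 * current call chain with get_perf_callchain(), hashes the instruction
 * pointers with jhash2() and stores the trace in
 * buckets[hash & (n_buckets - 1)].  The return value is that bucket
 * index (the "stackid"); user space can later use it as the lookup key
 * handled by bpf_stackmap_copy() below.
 */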
BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
	   u64, flags)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct perf_callchain_entry *trace;
	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
	u32 max_depth = map->value_size / 8;
	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
	u32 hash, id, trace_nr, trace_len;
	bool user = flags & BPF_F_USER_STACK;
	bool kernel = !user;
	u64 *ips;

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
		return -EINVAL;

	trace = get_perf_callchain(regs, init_nr, kernel, user,
				   sysctl_perf_event_max_stack, false, false);

	if (unlikely(!trace))
		/* couldn't fetch the stack trace */
		return -EFAULT;

	/* get_perf_callchain() guarantees that trace->nr >= init_nr
	 * and trace->nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
	 */
	trace_nr = trace->nr - init_nr;

	if (trace_nr <= skip)
		/* skipping more than usable stack trace */
		return -EFAULT;

	trace_nr -= skip;
	trace_len = trace_nr * sizeof(u64);
	ips = trace->ip + skip + init_nr;
	hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
	id = hash & (smap->n_buckets - 1);
	bucket = READ_ONCE(smap->buckets[id]);

	if (bucket && bucket->hash == hash) {
		if (flags & BPF_F_FAST_STACK_CMP)
			return id;
		if (bucket->nr == trace_nr &&
		    memcmp(bucket->ip, ips, trace_len) == 0)
			return id;
	}

	/* this call stack is not in the map, try to add it */
	if (bucket && !(flags & BPF_F_REUSE_STACKID))
		return -EEXIST;

	new_bucket = (struct stack_map_bucket *)
		pcpu_freelist_pop(&smap->freelist);
	if (unlikely(!new_bucket))
		return -ENOMEM;

	memcpy(new_bucket->ip, ips, trace_len);
	new_bucket->hash = hash;
	new_bucket->nr = trace_nr;

	old_bucket = xchg(&smap->buckets[id], new_bucket);
	if (old_bucket)
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
	return id;
}
const struct bpf_func_proto bpf_get_stackid_proto = {
	.func		= bpf_get_stackid,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};
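/* Illustrative use from a tracing BPF program (a sketch, not part of
 * this file): with a BPF_MAP_TYPE_STACK_TRACE map called stack_map,
 *
 *	int id = bpf_get_stackid(ctx, &stack_map, BPF_F_USER_STACK);
 *
 * returns a stackid for the current user stack (or a negative error),
 * which the program can emit so that user space can read the trace
 * back via map lookup with id as the 4-byte key.
 */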
/* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
	return NULL;
}
/* Called from syscall */
int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct stack_map_bucket *bucket, *old_bucket;
	u32 id = *(u32 *)key, trace_len;

	if (unlikely(id >= smap->n_buckets))
		return -ENOENT;

	/* take the bucket out of the table so it cannot be recycled
	 * underneath us while we copy from it
	 */
	bucket = xchg(&smap->buckets[id], NULL);
	if (!bucket)
		return -ENOENT;

	trace_len = bucket->nr * sizeof(u64);
	memcpy(value, bucket->ip, trace_len);
	memset(value + trace_len, 0, map->value_size - trace_len);

	old_bucket = xchg(&smap->buckets[id], bucket);
	if (old_bucket)
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
	return 0;
}
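/* Stack map entries are only ever created by bpf_get_stackid(), so the
 * get_next_key and update_elem callbacks below reject the operation.
 */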
static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	return -EINVAL;
}

static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
				 u64 map_flags)
{
	return -EINVAL;
}
/* Called from syscall or from eBPF program */
static int stack_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct stack_map_bucket *old_bucket;
	u32 id = *(u32 *)key;

	if (unlikely(id >= smap->n_buckets))
		return -E2BIG;

	old_bucket = xchg(&smap->buckets[id], NULL);
	if (old_bucket) {
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
		return 0;
	} else {
		return -ENOENT;
	}
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void stack_map_free(struct bpf_map *map)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);

	/* wait for bpf programs to complete before freeing stack map */
	synchronize_rcu();

	bpf_map_area_free(smap->elems);
	pcpu_freelist_destroy(&smap->freelist);
	bpf_map_area_free(smap);
	put_callchain_buffers();
}
static const struct bpf_map_ops stack_map_ops = {
	.map_alloc = stack_map_alloc,
	.map_free = stack_map_free,
	.map_get_next_key = stack_map_get_next_key,
	.map_lookup_elem = stack_map_lookup_elem,
	.map_update_elem = stack_map_update_elem,
	.map_delete_elem = stack_map_delete_elem,
};
static struct bpf_map_type_list stack_map_type __read_mostly = {
	.ops = &stack_map_ops,
	.type = BPF_MAP_TYPE_STACK_TRACE,
};
static int __init register_stack_map(void)
{
	bpf_register_map_type(&stack_map_type);
	return 0;
}
late_initcall(register_stack_map);