1 // SPDX-License-Identifier: GPL-2.0
2 // Copyright (c) 2019 Facebook
8 #include <linux/ptrace.h>
9 #include <linux/sched.h>
10 #include <linux/types.h>
11 #include <bpf/bpf_helpers.h>
/* Local stand-in for pid_t: an unsigned 32-bit process id. */
typedef uint32_t pid_t;
/*
 * Empty placeholder: this file has no real task_struct definition and only
 * passes pointers to it around.
 */
struct task_struct {
};
16 #define TASK_COMM_LEN 16
17 #define PERF_MAX_STACK_DEPTH 127
19 #define STROBE_TYPE_INVALID 0
20 #define STROBE_TYPE_INT 1
21 #define STROBE_TYPE_STR 2
22 #define STROBE_TYPE_MAP 3
24 #define STACK_TABLE_EPOCH_SHIFT 20
25 #define STROBE_MAX_STR_LEN 1
26 #define STROBE_MAX_CFGS 32
27 #define STROBE_MAX_PAYLOAD \
28 (STROBE_MAX_STRS * STROBE_MAX_STR_LEN + \
29 STROBE_MAX_MAPS * (1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN)
31 struct strobe_value_header
{
33 * meaning depends on type:
34 * 1. int: 0, if value not set, 1 otherwise
35 * 2. str: 1 always, whether value is set or not is determined by ptr
36 * 3. map: 1 always, pointer points to additional struct with number
37 * of entries (up to STROBE_MAX_MAP_ENTRIES)
41 * _reserved might be used for some future fields/flags, but we always
42 * want to keep strobe_value_header to be 8 bytes, so BPF can read 16
43 * bytes in one go and get both header and value
49 * strobe_value_generic is used from BPF probe only, but needs to be a union
50 * of strobe_value_int/strobe_value_str/strobe_value_map
52 struct strobe_value_generic
{
53 struct strobe_value_header header
;
60 struct strobe_value_int
{
61 struct strobe_value_header header
;
65 struct strobe_value_str
{
66 struct strobe_value_header header
;
70 struct strobe_value_map
{
71 struct strobe_value_header header
;
72 const struct strobe_map_raw
* value
;
75 struct strobe_map_entry
{
81 * Map of C-string key/value pairs with fixed maximum capacity. Each map has
82 * corresponding int64 ID, which application can use (or ignore) in whatever
83 * way appropriate. Map is "write-only", there is no way to get data out of
84 * map. Map is intended to be used to provide metadata for profilers and is
85 * not to be used for internal in-app communication. All methods are
88 struct strobe_map_raw
{
90 * general purpose unique ID that's up to application to decide
91 * whether and how to use; for request metadata use case id is unique
92 * request ID that's used to match metadata with stack traces on
93 * Strobelight backend side
96 /* number of used entries in map */
99 * having volatile doesn't change anything on BPF side, but clang
100 * emits warnings for passing `volatile const char *` into
101 * bpf_probe_read_user_str that expects just `const char *`
105 * key/value entries, each consisting of 2 pointers to key and value
108 struct strobe_map_entry entries
[STROBE_MAX_MAP_ENTRIES
];
111 /* Following values define supported values of TLS mode */
112 #define TLS_NOT_SET -1
113 #define TLS_LOCAL_EXEC 0
114 #define TLS_IMM_EXEC 1
115 #define TLS_GENERAL_DYN 2
118 * structure that universally represents TLS location (both for static
119 * executables and shared libraries)
121 struct strobe_value_loc
{
123 * tls_mode defines what TLS mode was used for particular metavariable:
124 * - -1 (TLS_NOT_SET) - no metavariable;
125 * - 0 (TLS_LOCAL_EXEC) - Local Executable mode;
126 * - 1 (TLS_IMM_EXEC) - Immediate Executable mode;
127 * - 2 (TLS_GENERAL_DYN) - General Dynamic mode;
128 * Local Dynamic mode is not yet supported, because never seen in
129 * practice. Mode defines how offset field is interpreted. See
130 * calc_location() below for details.
134 * TLS_LOCAL_EXEC: offset from thread pointer (fs:0 for x86-64,
135 * tpidr_el0 for aarch64).
136 * TLS_IMM_EXEC: absolute address of GOT entry containing offset
137 * from thread pointer;
138 * TLS_GENERAL_DYN: absolute address of double GOT entry
139 * containing tls_index_t struct;
144 struct strobemeta_cfg
{
145 int64_t req_meta_idx
;
146 struct strobe_value_loc int_locs
[STROBE_MAX_INTS
];
147 struct strobe_value_loc str_locs
[STROBE_MAX_STRS
];
148 struct strobe_value_loc map_locs
[STROBE_MAX_MAPS
];
151 struct strobe_map_descr
{
155 * cnt <0 - map value isn't set;
156 * 0 - map has id set, but no key/value entries
160 * both key_lens[i] and val_lens[i] should be >0 for present key/value
163 uint16_t key_lens
[STROBE_MAX_MAP_ENTRIES
];
164 uint16_t val_lens
[STROBE_MAX_MAP_ENTRIES
];
167 struct strobemeta_payload
{
168 /* req_id has valid request ID, if req_meta_valid == 1 */
170 uint8_t req_meta_valid
;
172 * mask has Nth bit set to 1, if Nth metavar was present and
175 uint64_t int_vals_set_mask
;
176 int64_t int_vals
[STROBE_MAX_INTS
];
177 /* len is >0 for present values */
178 uint16_t str_lens
[STROBE_MAX_STRS
];
179 /* if map_descrs[i].cnt == -1, metavar is not present/set */
180 struct strobe_map_descr map_descrs
[STROBE_MAX_MAPS
];
182 * payload has compactly packed values of str and map variables in the
183 * form: strval1\0strval2\0map1key1\0map1val1\0map2key1\0map2val1\0
184 * (and so on); str_lens[i], key_lens[i] and val_lens[i] determines
187 char payload
[STROBE_MAX_PAYLOAD
];
190 struct strobelight_bpf_sample
{
192 char comm
[TASK_COMM_LEN
];
197 struct strobemeta_payload metadata
;
199 * makes it possible to pass (<real payload size> + 1) as data size to
200 * perf_submit() to avoid perf_submit's paranoia about passing zero as
201 * size, as it deduces that <real payload size> might be
202 * **theoretically** zero
204 char dummy_safeguard
;
208 __uint(type
, BPF_MAP_TYPE_PERF_EVENT_ARRAY
);
209 __uint(max_entries
, 32);
210 __uint(key_size
, sizeof(int));
211 __uint(value_size
, sizeof(int));
212 } samples
SEC(".maps");
215 __uint(type
, BPF_MAP_TYPE_STACK_TRACE
);
216 __uint(max_entries
, 16);
217 __uint(key_size
, sizeof(uint32_t));
218 __uint(value_size
, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH
);
219 } stacks_0
SEC(".maps");
222 __uint(type
, BPF_MAP_TYPE_STACK_TRACE
);
223 __uint(max_entries
, 16);
224 __uint(key_size
, sizeof(uint32_t));
225 __uint(value_size
, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH
);
226 } stacks_1
SEC(".maps");
229 __uint(type
, BPF_MAP_TYPE_PERCPU_ARRAY
);
230 __uint(max_entries
, 1);
231 __type(key
, uint32_t);
232 __type(value
, struct strobelight_bpf_sample
);
233 } sample_heap
SEC(".maps");
236 __uint(type
, BPF_MAP_TYPE_PERCPU_ARRAY
);
237 __uint(max_entries
, STROBE_MAX_CFGS
);
239 __type(value
, struct strobemeta_cfg
);
240 } strobemeta_cfgs
SEC(".maps");
242 /* Type for the dtv. */
243 /* https://github.com/lattera/glibc/blob/master/nptl/sysdeps/x86_64/tls.h#L34 */
252 /* Partial definition for tcbhead_t */
253 /* https://github.com/bminor/glibc/blob/master/sysdeps/x86_64/nptl/tls.h#L42 */
260 * TLS module/offset information for shared library case.
261 * For x86-64, this is mapped onto two entries in GOT.
262 * For aarch64, this is pointed to by second GOT entry.
269 static __always_inline
void *calc_location(struct strobe_value_loc
*loc
,
274 * - -1 (TLS_NOT_SET), if no metavar is present;
275 * - 0 (TLS_LOCAL_EXEC), if metavar uses Local Executable mode of TLS
276 * (offset from fs:0 for x86-64 or tpidr_el0 for aarch64);
277 * - 1 (TLS_IMM_EXEC), if metavar uses Immediate Executable mode of TLS;
278 * - 2 (TLS_GENERAL_DYN), if metavar uses General Dynamic mode of TLS;
279 * This schema allows to use something like:
280 * (tls_mode + 1) * (tls_base + offset)
281 * to get NULL for "no metavar" location, or correct pointer for local
282 * executable mode without doing extra ifs.
284 if (loc
->tls_mode
<= TLS_LOCAL_EXEC
) {
285 /* static executable is simple, we just have offset from
287 void *addr
= tls_base
+ loc
->offset
;
288 /* multiply by (tls_mode + 1) to get NULL, if we have no
289 * metavar in this slot */
290 return (void *)((loc
->tls_mode
+ 1) * (int64_t)addr
);
293 * Other modes are more complicated; we need to jump through a few hoops.
295 * For immediate executable mode (currently supported only for aarch64):
296 * - loc->offset is pointing to a GOT entry containing fixed offset
297 * relative to tls_base;
299 * For general dynamic mode:
300 * - loc->offset is pointing to a beginning of double GOT entries;
301 * - (for aarch64 only) second entry points to tls_index_t struct;
302 * - (for x86-64 only) two GOT entries are already tls_index_t;
303 * - tls_index_t->module is used to find start of TLS section in
304 * which variable resides;
305 * - tls_index_t->offset provides offset within that TLS section,
306 * pointing to value of variable.
308 struct tls_index tls_index
;
312 bpf_probe_read_user(&tls_index
, sizeof(struct tls_index
),
313 (void *)loc
->offset
);
314 /* valid module index is always positive */
315 if (tls_index
.module
> 0) {
316 /* dtv = ((struct tcbhead *)tls_base)->dtv[tls_index.module] */
317 bpf_probe_read_user(&dtv
, sizeof(dtv
),
318 &((struct tcbhead
*)tls_base
)->dtv
);
319 dtv
+= tls_index
.module
;
323 bpf_probe_read_user(&tls_ptr
, sizeof(void *), dtv
);
324 /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */
325 return tls_ptr
&& tls_ptr
!= (void *)-1
326 ? tls_ptr
+ tls_index
.offset
330 static __always_inline
void read_int_var(struct strobemeta_cfg
*cfg
,
331 size_t idx
, void *tls_base
,
332 struct strobe_value_generic
*value
,
333 struct strobemeta_payload
*data
)
335 void *location
= calc_location(&cfg
->int_locs
[idx
], tls_base
);
339 bpf_probe_read_user(value
, sizeof(struct strobe_value_generic
), location
);
340 data
->int_vals
[idx
] = value
->val
;
341 if (value
->header
.len
)
342 data
->int_vals_set_mask
|= (1 << idx
);
345 static __always_inline
uint64_t read_str_var(struct strobemeta_cfg
*cfg
,
346 size_t idx
, void *tls_base
,
347 struct strobe_value_generic
*value
,
348 struct strobemeta_payload
*data
,
354 data
->str_lens
[idx
] = 0;
355 location
= calc_location(&cfg
->str_locs
[idx
], tls_base
);
359 bpf_probe_read_user(value
, sizeof(struct strobe_value_generic
), location
);
360 len
= bpf_probe_read_user_str(payload
, STROBE_MAX_STR_LEN
, value
->ptr
);
362 * if bpf_probe_read_user_str returns error (<0), due to casting to
363 * unsigned int, it will become big number, so next check is
364 * sufficient to check for errors AND prove to BPF verifier, that
365 * bpf_probe_read_user_str won't return anything bigger than
368 if (len
> STROBE_MAX_STR_LEN
)
371 data
->str_lens
[idx
] = len
;
375 static __always_inline
void *read_map_var(struct strobemeta_cfg
*cfg
,
376 size_t idx
, void *tls_base
,
377 struct strobe_value_generic
*value
,
378 struct strobemeta_payload
*data
,
381 struct strobe_map_descr
* descr
= &data
->map_descrs
[idx
];
382 struct strobe_map_raw map
;
387 descr
->tag_len
= 0; /* presume no tag is set */
388 descr
->cnt
= -1; /* presume no value is set */
390 location
= calc_location(&cfg
->map_locs
[idx
], tls_base
);
394 bpf_probe_read_user(value
, sizeof(struct strobe_value_generic
), location
);
395 if (bpf_probe_read_user(&map
, sizeof(struct strobe_map_raw
), value
->ptr
))
399 descr
->cnt
= map
.cnt
;
400 if (cfg
->req_meta_idx
== idx
) {
401 data
->req_id
= map
.id
;
402 data
->req_meta_valid
= 1;
405 len
= bpf_probe_read_user_str(payload
, STROBE_MAX_STR_LEN
, map
.tag
);
406 if (len
<= STROBE_MAX_STR_LEN
) {
407 descr
->tag_len
= len
;
412 #pragma clang loop unroll(disable)
416 for (int i
= 0; i
< STROBE_MAX_MAP_ENTRIES
; ++i
) {
420 descr
->key_lens
[i
] = 0;
421 len
= bpf_probe_read_user_str(payload
, STROBE_MAX_STR_LEN
,
423 if (len
<= STROBE_MAX_STR_LEN
) {
424 descr
->key_lens
[i
] = len
;
427 descr
->val_lens
[i
] = 0;
428 len
= bpf_probe_read_user_str(payload
, STROBE_MAX_STR_LEN
,
430 if (len
<= STROBE_MAX_STR_LEN
) {
431 descr
->val_lens
[i
] = len
;
440 * read_strobe_meta returns NULL, if no metadata was read; otherwise returns
441 * pointer to *right after* payload ends
443 static __always_inline
void *read_strobe_meta(struct task_struct
*task
,
444 struct strobemeta_payload
*data
)
446 pid_t pid
= bpf_get_current_pid_tgid() >> 32;
447 struct strobe_value_generic value
= {0};
448 struct strobemeta_cfg
*cfg
;
449 void *tls_base
, *payload
;
451 cfg
= bpf_map_lookup_elem(&strobemeta_cfgs
, &pid
);
455 data
->int_vals_set_mask
= 0;
456 data
->req_meta_valid
= 0;
457 payload
= data
->payload
;
459 * we don't have struct task_struct definition, it should be:
460 * tls_base = (void *)task->thread.fsbase;
462 tls_base
= (void *)task
;
465 #pragma clang loop unroll(disable)
469 for (int i
= 0; i
< STROBE_MAX_INTS
; ++i
) {
470 read_int_var(cfg
, i
, tls_base
, &value
, data
);
473 #pragma clang loop unroll(disable)
477 for (int i
= 0; i
< STROBE_MAX_STRS
; ++i
) {
478 payload
+= read_str_var(cfg
, i
, tls_base
, &value
, data
, payload
);
481 #pragma clang loop unroll(disable)
485 for (int i
= 0; i
< STROBE_MAX_MAPS
; ++i
) {
486 payload
= read_map_var(cfg
, i
, tls_base
, &value
, data
, payload
);
489 * return pointer right after end of payload, so it's possible to
490 * calculate exact amount of useful data that needs to be sent
495 SEC("raw_tracepoint/kfree_skb")
496 int on_event(struct pt_regs
*ctx
) {
497 pid_t pid
= bpf_get_current_pid_tgid() >> 32;
498 struct strobelight_bpf_sample
* sample
;
499 struct task_struct
*task
;
504 sample
= bpf_map_lookup_elem(&sample_heap
, &zero
);
506 return 0; /* this will never happen */
509 bpf_get_current_comm(&sample
->comm
, TASK_COMM_LEN
);
510 ktime_ns
= bpf_ktime_get_ns();
511 sample
->ktime
= ktime_ns
;
513 task
= (struct task_struct
*)bpf_get_current_task();
514 sample_end
= read_strobe_meta(task
, &sample
->metadata
);
515 sample
->has_meta
= sample_end
!= NULL
;
516 sample_end
= sample_end
? : &sample
->metadata
;
518 if ((ktime_ns
>> STACK_TABLE_EPOCH_SHIFT
) & 1) {
519 sample
->kernel_stack_id
= bpf_get_stackid(ctx
, &stacks_1
, 0);
520 sample
->user_stack_id
= bpf_get_stackid(ctx
, &stacks_1
, BPF_F_USER_STACK
);
522 sample
->kernel_stack_id
= bpf_get_stackid(ctx
, &stacks_0
, 0);
523 sample
->user_stack_id
= bpf_get_stackid(ctx
, &stacks_0
, BPF_F_USER_STACK
);
526 uint64_t sample_size
= sample_end
- (void *)sample
;
527 /* should always be true */
528 if (sample_size
< sizeof(struct strobelight_bpf_sample
))
529 bpf_perf_event_output(ctx
, &samples
, 0, sample
, 1 + sample_size
);
533 char _license
[] SEC("license") = "GPL";