// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
/* task->flags for off-cpu analysis */
#define PF_KTHREAD		0x00200000	/* I am a kernel thread */

/* task->state for off-cpu analysis */
#define TASK_INTERRUPTIBLE	0x0001
#define TASK_UNINTERRUPTIBLE	0x0002

/* create a new thread */
#define CLONE_THREAD		0x10000

#define MAX_STACKS		32	/* max frames kept per stack trace */
#define MAX_ENTRIES		102400
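/*
 * Key/value layouts used by the maps below.  These definitions are
 * reconstructed from how the maps are declared and accessed in this
 * file; the exact field order is an assumption.
 */
struct tstamp_data {
	__u32 stack_id;
	__u32 state;
	__u64 timestamp;
};

struct offcpu_key {
	__u32 pid;
	__u32 tgid;
	__u32 stack_id;
	__u32 state;
	__u64 cgroup_id;
};

/*
 * User stack traces captured at sched-out time; bpf_get_stackid()
 * deduplicates identical stacks and returns an index into this map.
 */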
struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, MAX_STACKS * sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");
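/*
 * Per-task scratch storage recording when a task went off-cpu, in what
 * state, and with which stack; allocated lazily (BPF_F_NO_PREALLOC).
 */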
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct tstamp_data);
} tstamp SEC(".maps");
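/*
 * Aggregated off-cpu time in nanoseconds per (pid, tgid, stack_id,
 * state, cgroup_id) key; this map is only written here, so it is
 * presumably read back by the userspace side of the tool.
 */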
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct offcpu_key));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} off_cpu SEC(".maps");
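/* CPUs to profile, populated by userspace; only consulted when has_cpu is set */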
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cpu_filter SEC(".maps");
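/* pids (or tgids, when uses_tgid is set) to profile; only consulted when has_task is set */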
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} task_filter SEC(".maps");
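/* cgroup ids to profile; only consulted when has_cgroup is set */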
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cgroup_filter SEC(".maps");
/* new kernel task_struct definition */
struct task_struct___new {
	long __state;
} __attribute__((preserve_access_index));
/* old kernel task_struct definition */
struct task_struct___old {
	long state;
} __attribute__((preserve_access_index));
const volatile int has_cpu = 0;
const volatile int has_task = 0;
const volatile int has_cgroup = 0;
const volatile int uses_tgid = 0;

const volatile bool has_prev_state = false;
const volatile bool needs_cgroup = false;
const volatile bool uses_cgroup_v1 = false;

int perf_subsys_id = -1;
/*
 * Old kernels call it task_struct->state while newer ones call it
 * '__state'.  Use the BPF CO-RE "ignored suffix rule" to handle both,
 * as described in:
 *
 * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
 */
static inline int get_task_state(struct task_struct *t)
{
	/* recast pointer to capture new type for compiler */
	struct task_struct___new *t_new = (void *)t;

	if (bpf_core_field_exists(t_new->__state)) {
		return BPF_CORE_READ(t_new, __state);
	} else {
		/* recast pointer to capture old type for compiler */
		struct task_struct___old *t_old = (void *)t;

		return BPF_CORE_READ(t_old, state);
	}
}
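/*
 * Resolve the task's cgroup id: the kernfs node id of the cgroup on
 * the default (v2) hierarchy, or of the perf_event controller's
 * cgroup when running on cgroup v1.
 */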
static inline __u64 get_cgroup_id(struct task_struct *t)
{
	struct cgroup *cgrp;

	if (!uses_cgroup_v1)
		return BPF_CORE_READ(t, cgroups, dfl_cgrp, kn, id);

	if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
						     perf_event_cgrp_id);
#else
		perf_subsys_id = perf_event_cgrp_id;
#endif
	}

	cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_subsys_id], cgroup);
	return BPF_CORE_READ(cgrp, kn, id);
}
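/*
 * Filter at sched-out time: skip kernel threads (no user stack) and
 * states other than (un)interruptible sleep, then apply the optional
 * CPU, task and cgroup filters set up by userspace.
 */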
static inline int can_record(struct task_struct *t, int state)
{
	/* kernel threads don't have user stack */
	if (t->flags & PF_KTHREAD)
		return 0;

	if (state != TASK_INTERRUPTIBLE &&
	    state != TASK_UNINTERRUPTIBLE)
		return 0;

	if (has_cpu) {
		__u32 cpu = bpf_get_smp_processor_id();
		__u8 *ok;

		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
		if (!ok)
			return 0;
	}

	if (has_task) {
		__u8 *ok;
		__u32 pid;

		if (uses_tgid)
			pid = t->tgid;
		else
			pid = t->pid;

		ok = bpf_map_lookup_elem(&task_filter, &pid);
		if (!ok)
			return 0;
	}

	if (has_cgroup) {
		__u64 cgrp_id = get_cgroup_id(t);
		__u8 *ok;

		ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id);
		if (!ok)
			return 0;
	}

	return 1;
}
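/*
 * Core of the profiler, run at every context switch: stamp @prev as it
 * goes off-cpu, and if @next carries a pending timestamp, charge the
 * elapsed time to its (pid, tgid, stack, state, cgroup) key.
 */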
static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
			struct task_struct *next, int state)
{
	__u64 ts;
	__u32 stack_id;
	struct tstamp_data *pelem;

	ts = bpf_ktime_get_ns();

	if (!can_record(prev, state))
		goto next;

	stack_id = bpf_get_stackid(ctx, &stacks,
				   BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);

	pelem = bpf_task_storage_get(&tstamp, prev, NULL,
				     BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!pelem)
		goto next;

	pelem->timestamp = ts;
	pelem->state = state;
	pelem->stack_id = stack_id;

next:
	pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);

	if (pelem && pelem->timestamp) {
		struct offcpu_key key = {
			.pid = next->pid,
			.tgid = next->tgid,
			.stack_id = pelem->stack_id,
			.state = pelem->state,
			.cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
		};
		__u64 delta = ts - pelem->timestamp;
		__u64 *total;

		total = bpf_map_lookup_elem(&off_cpu, &key);
		if (total)
			*total += delta;
		else
			bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);

		/* prevent the timestamp from being reused later */
		pelem->timestamp = 0;
	}

	return 0;
}
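/*
 * task_newtask hook, used when filtering by tgid: if the forking task
 * is already in task_filter and the child is a new process rather than
 * a thread (no CLONE_THREAD), record the child's tgid too, so that
 * descendants of filtered processes keep being profiled.
 */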
SEC("tp_btf/task_newtask")
int on_newtask(u64 *ctx)
{
	struct task_struct *task;
	u64 clone_flags;
	u32 pid;
	u8 val = 1;

	if (!uses_tgid)
		return 0;

	task = (struct task_struct *)bpf_get_current_task();

	pid = BPF_CORE_READ(task, tgid);
	if (!bpf_map_lookup_elem(&task_filter, &pid))
		return 0;

	task = (struct task_struct *)ctx[0];
	clone_flags = ctx[1];

	pid = task->tgid;
	if (!(clone_flags & CLONE_THREAD))
		bpf_map_update_elem(&task_filter, &pid, &val, BPF_NOEXIST);

	return 0;
}
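/*
 * sched_switch hook: for tp_btf the raw arguments are (preempt, prev,
 * next[, prev_state]).  Newer kernels pass prev's state as the fourth
 * argument, which userspace signals via has_prev_state; otherwise the
 * state is read from the task_struct through CO-RE.
 */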
SEC("tp_btf/sched_switch")
int on_switch(u64 *ctx)
{
	struct task_struct *prev, *next;
	int prev_state;

	prev = (struct task_struct *)ctx[1];
	next = (struct task_struct *)ctx[2];

	if (has_prev_state)
		prev_state = (int)ctx[3];
	else
		prev_state = get_task_state(prev);

	return off_cpu_stat(ctx, prev, next, prev_state & 0xff);
}
char LICENSE[] SEC("license") = "Dual BSD/GPL";