// SPDX-License-Identifier: GPL-2.0
#include "util/cgroup.h"
#include "util/debug.h"
#include "util/evlist.h"
#include "util/machine.h"
#include "util/map.h"
#include "util/symbol.h"
#include "util/target.h"
#include "util/thread.h"
#include "util/thread_map.h"
#include "util/lock-contention.h"
#include <linux/zalloc.h>
#include <linux/string.h>
#include <bpf/bpf.h>
#include <inttypes.h>

#include "bpf_skel/lock_contention.skel.h"
#include "bpf_skel/lock_data.h"
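
/*
 * BPF-based backend for 'perf lock contention'.  The helpers below open and
 * configure the lock_contention BPF skeleton, start and stop data collection,
 * convert the BPF map contents into lock_stat entries and tear everything
 * down again.
 *
 * A rough sketch of the expected calling sequence (simplified; the actual
 * call sites and error handling live in the lock contention command, so
 * treat this as an illustration only):
 *
 *	struct lock_contention con = { ... };
 *
 *	lock_contention_prepare(&con);	// open, configure, load and attach
 *	lock_contention_start();	// enable the BPF collection
 *	// ... run the workload or wait ...
 *	lock_contention_stop();		// disable and mark the end timestamp
 *	lock_contention_read(&con);	// fold BPF maps into lock_stat
 *	lock_contention_finish(&con);	// destroy the skeleton, drop cgroups
 */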

static struct lock_contention_bpf *skel;

int lock_contention_prepare(struct lock_contention *con)
{
	int i, fd;
	int ncpus = 1, ntasks = 1, ntypes = 1, naddrs = 1, ncgrps = 1;
	struct evlist *evlist = con->evlist;
	struct target *target = con->target;

	skel = lock_contention_bpf__open();
	if (!skel) {
		pr_err("Failed to open lock-contention BPF skeleton\n");
		return -1;
	}

	bpf_map__set_value_size(skel->maps.stacks, con->max_stack * sizeof(u64));
	bpf_map__set_max_entries(skel->maps.lock_stat, con->map_nr_entries);
	bpf_map__set_max_entries(skel->maps.tstamp, con->map_nr_entries);

	if (con->aggr_mode == LOCK_AGGR_TASK)
		bpf_map__set_max_entries(skel->maps.task_data, con->map_nr_entries);
	else
		bpf_map__set_max_entries(skel->maps.task_data, 1);

	if (con->save_callstack)
		bpf_map__set_max_entries(skel->maps.stacks, con->map_nr_entries);
	else
		bpf_map__set_max_entries(skel->maps.stacks, 1);

	if (target__has_cpu(target)) {
		skel->rodata->has_cpu = 1;
		ncpus = perf_cpu_map__nr(evlist->core.user_requested_cpus);
	}
	if (target__has_task(target)) {
		skel->rodata->has_task = 1;
		ntasks = perf_thread_map__nr(evlist->core.threads);
	}
	if (con->filters->nr_types) {
		skel->rodata->has_type = 1;
		ntypes = con->filters->nr_types;
	}
	if (con->filters->nr_cgrps) {
		skel->rodata->has_cgroup = 1;
		ncgrps = con->filters->nr_cgrps;
	}

	/* resolve lock name filters to addr */
	if (con->filters->nr_syms) {
		struct symbol *sym;
		struct map *kmap;
		unsigned long *addrs;

		for (i = 0; i < con->filters->nr_syms; i++) {
			sym = machine__find_kernel_symbol_by_name(con->machine,
								  con->filters->syms[i],
								  &kmap);
			if (sym == NULL) {
				pr_warning("ignore unknown symbol: %s\n",
					   con->filters->syms[i]);
				continue;
			}

			addrs = realloc(con->filters->addrs,
					(con->filters->nr_addrs + 1) * sizeof(*addrs));
			if (addrs == NULL) {
				pr_warning("memory allocation failure\n");
				continue;
			}

			addrs[con->filters->nr_addrs++] = map__unmap_ip(kmap, sym->start);
			con->filters->addrs = addrs;
		}
		naddrs = con->filters->nr_addrs;
		skel->rodata->has_addr = 1;
	}

	bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus);
	bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
	bpf_map__set_max_entries(skel->maps.type_filter, ntypes);
	bpf_map__set_max_entries(skel->maps.addr_filter, naddrs);
	bpf_map__set_max_entries(skel->maps.cgroup_filter, ncgrps);

	skel->rodata->stack_skip = con->stack_skip;
	skel->rodata->aggr_mode = con->aggr_mode;
	skel->rodata->needs_callstack = con->save_callstack;
	skel->rodata->lock_owner = con->owner;

	if (con->aggr_mode == LOCK_AGGR_CGROUP || con->filters->nr_cgrps) {
		if (cgroup_is_v2("perf_event"))
			skel->rodata->use_cgroup_v2 = 1;
	}

	if (lock_contention_bpf__load(skel) < 0) {
		pr_err("Failed to load lock-contention BPF skeleton\n");
		return -1;
	}

	if (target__has_cpu(target)) {
		u32 cpu;
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.cpu_filter);

		for (i = 0; i < ncpus; i++) {
			cpu = perf_cpu_map__cpu(evlist->core.user_requested_cpus, i).cpu;
			bpf_map_update_elem(fd, &cpu, &val, BPF_ANY);
		}
	}

	if (target__has_task(target)) {
		u32 pid;
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.task_filter);

		for (i = 0; i < ntasks; i++) {
			pid = perf_thread_map__pid(evlist->core.threads, i);
			bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
		}
	}

	if (target__none(target) && evlist->workload.pid > 0) {
		u32 pid = evlist->workload.pid;
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.task_filter);
		bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
	}

	if (con->filters->nr_types) {
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.type_filter);

		for (i = 0; i < con->filters->nr_types; i++)
			bpf_map_update_elem(fd, &con->filters->types[i], &val, BPF_ANY);
	}

	if (con->filters->nr_addrs) {
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.addr_filter);

		for (i = 0; i < con->filters->nr_addrs; i++)
			bpf_map_update_elem(fd, &con->filters->addrs[i], &val, BPF_ANY);
	}

	if (con->filters->nr_cgrps) {
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.cgroup_filter);

		for (i = 0; i < con->filters->nr_cgrps; i++)
			bpf_map_update_elem(fd, &con->filters->cgrps[i], &val, BPF_ANY);
	}

	if (con->aggr_mode == LOCK_AGGR_CGROUP)
		read_all_cgroups(&con->cgroups);

	bpf_program__set_autoload(skel->progs.collect_lock_syms, false);

	lock_contention_bpf__attach(skel);
	return 0;
}

/*
 * Run the BPF program directly using BPF_PROG_TEST_RUN to update the end
 * timestamp in ktime so that it can calculate delta easily.
 */
static void mark_end_timestamp(void)
{
	DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
		.flags = BPF_F_TEST_RUN_ON_CPU,
	);
	int prog_fd = bpf_program__fd(skel->progs.end_timestamp);

	bpf_prog_test_run_opts(prog_fd, &opts);
}
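
/*
 * Fold a still-pending contention entry (one that had a lock:contention_begin
 * but no matching lock:contention_end before the session stopped) into the
 * lock_stat map, using end_ts as the end of its wait.
 */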
static void update_lock_stat(int map_fd, int pid, u64 end_ts,
			     enum lock_aggr_mode aggr_mode,
			     struct tstamp_data *ts_data)
{
	u64 delta;
	struct contention_key stat_key = {};
	struct contention_data stat_data;

	if (ts_data->timestamp >= end_ts)
		return;

	delta = end_ts - ts_data->timestamp;

	switch (aggr_mode) {
	case LOCK_AGGR_CALLER:
		stat_key.stack_id = ts_data->stack_id;
		break;
	case LOCK_AGGR_TASK:
		stat_key.pid = pid;
		break;
	case LOCK_AGGR_ADDR:
		stat_key.lock_addr_or_cgroup = ts_data->lock;
		break;
	case LOCK_AGGR_CGROUP:
	default:
		return;
	}

	if (bpf_map_lookup_elem(map_fd, &stat_key, &stat_data) < 0)
		return;

	stat_data.total_time += delta;
	stat_data.count++;

	if (delta > stat_data.max_time)
		stat_data.max_time = delta;
	if (delta < stat_data.min_time)
		stat_data.min_time = delta;

	bpf_map_update_elem(map_fd, &stat_key, &stat_data, BPF_EXIST);
}

/*
 * Account entries in the tstamp map (which didn't see the corresponding
 * lock:contention_end tracepoint) using end_ts.
 */
static void account_end_timestamp(struct lock_contention *con)
{
	int ts_fd, stat_fd;
	int *prev_key, key;
	u64 end_ts = skel->bss->end_ts;
	int total_cpus;
	enum lock_aggr_mode aggr_mode = con->aggr_mode;
	struct tstamp_data ts_data, *cpu_data;

	/* Iterate per-task tstamp map (key = TID) */
	ts_fd = bpf_map__fd(skel->maps.tstamp);
	stat_fd = bpf_map__fd(skel->maps.lock_stat);

	prev_key = NULL;
	while (!bpf_map_get_next_key(ts_fd, prev_key, &key)) {
		if (bpf_map_lookup_elem(ts_fd, &key, &ts_data) == 0) {
			int pid = key;

			/* with owner tracking, flags carries the owner's TID */
			if (aggr_mode == LOCK_AGGR_TASK && con->owner)
				pid = ts_data.flags;

			update_lock_stat(stat_fd, pid, end_ts, aggr_mode,
					 &ts_data);
		}

		prev_key = &key;
	}

	/* Now it'll check per-cpu tstamp map which doesn't have TID. */
	if (aggr_mode == LOCK_AGGR_TASK || aggr_mode == LOCK_AGGR_CGROUP)
		return;

	total_cpus = cpu__max_cpu().cpu;
	ts_fd = bpf_map__fd(skel->maps.tstamp_cpu);

	cpu_data = calloc(total_cpus, sizeof(*cpu_data));
	if (cpu_data == NULL)
		return;

	prev_key = NULL;
	while (!bpf_map_get_next_key(ts_fd, prev_key, &key)) {
		if (bpf_map_lookup_elem(ts_fd, &key, cpu_data) < 0)
			goto out;

		for (int i = 0; i < total_cpus; i++) {
			if (cpu_data[i].lock == 0)
				continue;

			update_lock_stat(stat_fd, -1, end_ts, aggr_mode,
					 &cpu_data[i]);
		}

		prev_key = &key;
	}
out:
	free(cpu_data);
}
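
/*
 * Collection is gated by the 'enabled' flag in the BPF program's BSS; the
 * BPF tracepoint handlers are expected to skip recording while it is zero.
 * Stopping also records the end timestamp used by account_end_timestamp().
 */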
int lock_contention_start(void)
{
	skel->bss->enabled = 1;
	return 0;
}

int lock_contention_stop(void)
{
	skel->bss->enabled = 0;
	mark_end_timestamp();
	return 0;
}
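
/*
 * Resolve a display name for a contention entry depending on the aggregation
 * mode: task comm for per-task mode, lock symbol (or a few well-known special
 * cases) for per-address mode, cgroup name for cgroup mode, and the calling
 * function for the default caller mode.
 */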
static const char *lock_contention_get_name(struct lock_contention *con,
					    struct contention_key *key,
					    u64 *stack_trace, u32 flags)
{
	int idx = 0;
	u64 addr;
	const char *name = "";
	static char name_buf[KSYM_NAME_LEN];
	struct symbol *sym;
	struct map *kmap;
	struct machine *machine = con->machine;

	if (con->aggr_mode == LOCK_AGGR_TASK) {
		struct contention_task_data task;
		int pid = key->pid;
		int task_fd = bpf_map__fd(skel->maps.task_data);

		/* do not update idle comm which contains CPU number */
		if (pid) {
			struct thread *t = machine__findnew_thread(machine, /*pid=*/-1, pid);

			if (t == NULL)
				return name;
			if (!bpf_map_lookup_elem(task_fd, &pid, &task) &&
			    thread__set_comm(t, task.comm, /*timestamp=*/0))
				name = task.comm;
		}
		return name;
	}

	if (con->aggr_mode == LOCK_AGGR_ADDR) {
		int lock_fd = bpf_map__fd(skel->maps.lock_syms);

		/* per-process locks set upper bits of the flags */
		if (flags & LCD_F_MMAP_LOCK)
			return "mmap_lock";
		if (flags & LCD_F_SIGHAND_LOCK)
			return "siglock";

		/* global locks with symbols */
		sym = machine__find_kernel_symbol(machine, key->lock_addr_or_cgroup, &kmap);
		if (sym)
			return sym->name;

		/* try semi-global locks collected separately */
		if (!bpf_map_lookup_elem(lock_fd, &key->lock_addr_or_cgroup, &flags)) {
			if (flags == LOCK_CLASS_RQLOCK)
				return "rq_lock";
		}

		return "";
	}

	if (con->aggr_mode == LOCK_AGGR_CGROUP) {
		u64 cgrp_id = key->lock_addr_or_cgroup;
		struct cgroup *cgrp = __cgroup__find(&con->cgroups, cgrp_id);

		if (cgrp)
			return cgrp->name;

		snprintf(name_buf, sizeof(name_buf), "cgroup:%" PRIu64 "", cgrp_id);
		return name_buf;
	}

	/* LOCK_AGGR_CALLER: skip lock internal functions */
	while (machine__is_lock_function(machine, stack_trace[idx]) &&
	       idx < con->max_stack - 1)
		idx++;

	addr = stack_trace[idx];
	sym = machine__find_kernel_symbol(machine, addr, &kmap);

	if (sym) {
		unsigned long offset;

		offset = map__map_ip(kmap, addr) - sym->start;

		if (offset == 0)
			return sym->name;

		snprintf(name_buf, sizeof(name_buf), "%s+%#lx", sym->name, offset);
	} else {
		snprintf(name_buf, sizeof(name_buf), "%#lx", (unsigned long)addr);
	}

	return name_buf;
}
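
/*
 * Drain the BPF side into the tool's lock_stat list: walk the lock_stat map,
 * optionally fetch and filter the saved callstack, then create or update a
 * lock_stat entry keyed by stack id, task, lock address or cgroup id
 * depending on the aggregation mode.
 */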
int lock_contention_read(struct lock_contention *con)
{
	int fd, stack, err = 0;
	struct contention_key *prev_key, key = {};
	struct contention_data data = {};
	struct lock_stat *st = NULL;
	struct machine *machine = con->machine;
	u64 *stack_trace;
	size_t stack_size = con->max_stack * sizeof(*stack_trace);

	fd = bpf_map__fd(skel->maps.lock_stat);
	stack = bpf_map__fd(skel->maps.stacks);

	con->fails.task = skel->bss->task_fail;
	con->fails.stack = skel->bss->stack_fail;
	con->fails.time = skel->bss->time_fail;
	con->fails.data = skel->bss->data_fail;

	stack_trace = zalloc(stack_size);
	if (stack_trace == NULL)
		return -1;

	account_end_timestamp(con);

	if (con->aggr_mode == LOCK_AGGR_TASK) {
		struct thread *idle = machine__findnew_thread(machine,
							      /*pid=*/0,
							      /*tid=*/0);
		thread__set_comm(idle, "swapper", /*timestamp=*/0);
	}

	if (con->aggr_mode == LOCK_AGGR_ADDR) {
		DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
			.flags = BPF_F_TEST_RUN_ON_CPU,
		);
		/* run collect_lock_syms to fill the lock_syms map used above */
		int prog_fd = bpf_program__fd(skel->progs.collect_lock_syms);

		bpf_prog_test_run_opts(prog_fd, &opts);
	}

	/* make sure it loads the kernel map */
	maps__load_first(machine->kmaps);

	prev_key = NULL;
	while (!bpf_map_get_next_key(fd, prev_key, &key)) {
		s64 ls_key;
		const char *name;

		/* to handle errors in the loop body */
		err = -1;

		bpf_map_lookup_elem(fd, &key, &data);
		if (con->save_callstack) {
			bpf_map_lookup_elem(stack, &key.stack_id, stack_trace);

			if (!match_callstack_filter(machine, stack_trace)) {
				con->nr_filtered += data.count;
				goto next;
			}
		}

		switch (con->aggr_mode) {
		case LOCK_AGGR_CALLER:
			ls_key = key.stack_id;
			break;
		case LOCK_AGGR_TASK:
			ls_key = key.pid;
			break;
		case LOCK_AGGR_ADDR:
		case LOCK_AGGR_CGROUP:
			ls_key = key.lock_addr_or_cgroup;
			break;
		default:
			goto next;
		}

		st = lock_stat_find(ls_key);
		if (st != NULL) {
			st->wait_time_total += data.total_time;
			if (st->wait_time_max < data.max_time)
				st->wait_time_max = data.max_time;
			if (st->wait_time_min > data.min_time)
				st->wait_time_min = data.min_time;

			st->nr_contended += data.count;
			if (st->nr_contended)
				st->avg_wait_time = st->wait_time_total / st->nr_contended;
			goto next;
		}

		name = lock_contention_get_name(con, &key, stack_trace, data.flags);
		st = lock_stat_findnew(ls_key, name, data.flags);
		if (st == NULL)
			break;

		st->nr_contended = data.count;
		st->wait_time_total = data.total_time;
		st->wait_time_max = data.max_time;
		st->wait_time_min = data.min_time;

		if (data.count)
			st->avg_wait_time = data.total_time / data.count;

		if (con->aggr_mode == LOCK_AGGR_CALLER && verbose > 0) {
			st->callstack = memdup(stack_trace, stack_size);
			if (st->callstack == NULL)
				break;
		}

next:
		prev_key = &key;

		/* we're fine now, reset the error */
		err = 0;
	}

	free(stack_trace);
	return err;
}
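
/*
 * Tear-down path: disable and destroy the BPF skeleton (if it was opened) and
 * release the cgroup objects collected by read_all_cgroups().
 */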
int lock_contention_finish(struct lock_contention *con)
{
	if (skel) {
		skel->bss->enabled = 0;
		lock_contention_bpf__destroy(skel);
	}

	while (!RB_EMPTY_ROOT(&con->cgroups)) {
		struct rb_node *node = rb_first(&con->cgroups);
		struct cgroup *cgrp = rb_entry(node, struct cgroup, node);

		rb_erase(node, &con->cgroups);
		cgroup__put(cgrp);
	}

	return 0;
}