// SPDX-License-Identifier: GPL-2.0
#include "util/cgroup.h"
#include "util/debug.h"
#include "util/evlist.h"
#include "util/machine.h"
#include "util/map.h"
#include "util/symbol.h"
#include "util/target.h"
#include "util/thread.h"
#include "util/thread_map.h"
#include "util/lock-contention.h"
#include <linux/zalloc.h>
#include <linux/string.h>
#include <bpf/bpf.h>
#include <inttypes.h>

#include "bpf_skel/lock_contention.skel.h"
#include "bpf_skel/lock_data.h"
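
/*
 * BPF-based backend for 'perf lock contention'.  The helpers below open and
 * configure the lock_contention BPF skeleton, start and stop data collection,
 * convert the BPF map contents into lock_stat entries and tear everything
 * down again.
 *
 * A rough sketch of the expected calling sequence (simplified; the actual
 * call sites and error handling live in the lock contention command, so
 * treat this as an illustration only):
 *
 *	struct lock_contention con = { ... };
 *
 *	lock_contention_prepare(&con);	// open, configure, load and attach
 *	lock_contention_start();	// enable the BPF collection
 *	// ... run the workload or wait ...
 *	lock_contention_stop();		// disable and mark the end timestamp
 *	lock_contention_read(&con);	// fold BPF maps into lock_stat
 *	lock_contention_finish(&con);	// destroy the skeleton, drop cgroups
 */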

static struct lock_contention_bpf *skel;

int lock_contention_prepare(struct lock_contention *con)
{
	int i, fd;
	int ncpus = 1, ntasks = 1, ntypes = 1, naddrs = 1, ncgrps = 1;
	struct evlist *evlist = con->evlist;
	struct target *target = con->target;

	skel = lock_contention_bpf__open();
	if (!skel) {
		pr_err("Failed to open lock-contention BPF skeleton\n");
		return -1;
	}

	bpf_map__set_value_size(skel->maps.stacks, con->max_stack * sizeof(u64));
	bpf_map__set_max_entries(skel->maps.lock_stat, con->map_nr_entries);
	bpf_map__set_max_entries(skel->maps.tstamp, con->map_nr_entries);

	if (con->aggr_mode == LOCK_AGGR_TASK)
		bpf_map__set_max_entries(skel->maps.task_data, con->map_nr_entries);
	else
		bpf_map__set_max_entries(skel->maps.task_data, 1);

	if (con->save_callstack)
		bpf_map__set_max_entries(skel->maps.stacks, con->map_nr_entries);
	else
		bpf_map__set_max_entries(skel->maps.stacks, 1);

	if (target__has_cpu(target)) {
		skel->rodata->has_cpu = 1;
		ncpus = perf_cpu_map__nr(evlist->core.user_requested_cpus);
	}
	if (target__has_task(target)) {
		skel->rodata->has_task = 1;
		ntasks = perf_thread_map__nr(evlist->core.threads);
	}
	if (con->filters->nr_types) {
		skel->rodata->has_type = 1;
		ntypes = con->filters->nr_types;
	}
	if (con->filters->nr_cgrps) {
		skel->rodata->has_cgroup = 1;
		ncgrps = con->filters->nr_cgrps;
	}

	/* resolve lock name filters to addr */
	if (con->filters->nr_syms) {
		struct symbol *sym;
		struct map *kmap;
		unsigned long *addrs;

		for (i = 0; i < con->filters->nr_syms; i++) {
			sym = machine__find_kernel_symbol_by_name(con->machine,
								  con->filters->syms[i],
								  &kmap);
			if (sym == NULL) {
				pr_warning("ignore unknown symbol: %s\n",
					   con->filters->syms[i]);
				continue;
			}

			addrs = realloc(con->filters->addrs,
					(con->filters->nr_addrs + 1) * sizeof(*addrs));
			if (addrs == NULL) {
				pr_warning("memory allocation failure\n");
				continue;
			}

			addrs[con->filters->nr_addrs++] = map__unmap_ip(kmap, sym->start);
			con->filters->addrs = addrs;
		}
		naddrs = con->filters->nr_addrs;
		skel->rodata->has_addr = 1;
	}

	bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus);
	bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
	bpf_map__set_max_entries(skel->maps.type_filter, ntypes);
	bpf_map__set_max_entries(skel->maps.addr_filter, naddrs);
	bpf_map__set_max_entries(skel->maps.cgroup_filter, ncgrps);

	skel->rodata->stack_skip = con->stack_skip;
	skel->rodata->aggr_mode = con->aggr_mode;
	skel->rodata->needs_callstack = con->save_callstack;
	skel->rodata->lock_owner = con->owner;

	if (con->aggr_mode == LOCK_AGGR_CGROUP || con->filters->nr_cgrps) {
		if (cgroup_is_v2("perf_event"))
			skel->rodata->use_cgroup_v2 = 1;
	}

	if (lock_contention_bpf__load(skel) < 0) {
		pr_err("Failed to load lock-contention BPF skeleton\n");
		return -1;
	}

	if (target__has_cpu(target)) {
		u32 cpu;
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.cpu_filter);

		for (i = 0; i < ncpus; i++) {
			cpu = perf_cpu_map__cpu(evlist->core.user_requested_cpus, i).cpu;
			bpf_map_update_elem(fd, &cpu, &val, BPF_ANY);
		}
	}

	if (target__has_task(target)) {
		u32 pid;
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.task_filter);

		for (i = 0; i < ntasks; i++) {
			pid = perf_thread_map__pid(evlist->core.threads, i);
			bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
		}
	}

	if (target__none(target) && evlist->workload.pid > 0) {
		u32 pid = evlist->workload.pid;
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.task_filter);
		bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
	}

	if (con->filters->nr_types) {
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.type_filter);

		for (i = 0; i < con->filters->nr_types; i++)
			bpf_map_update_elem(fd, &con->filters->types[i], &val, BPF_ANY);
	}

	if (con->filters->nr_addrs) {
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.addr_filter);

		for (i = 0; i < con->filters->nr_addrs; i++)
			bpf_map_update_elem(fd, &con->filters->addrs[i], &val, BPF_ANY);
	}

	if (con->filters->nr_cgrps) {
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.cgroup_filter);

		for (i = 0; i < con->filters->nr_cgrps; i++)
			bpf_map_update_elem(fd, &con->filters->cgrps[i], &val, BPF_ANY);
	}

	if (con->aggr_mode == LOCK_AGGR_CGROUP)
		read_all_cgroups(&con->cgroups);

	bpf_program__set_autoload(skel->progs.collect_lock_syms, false);

	lock_contention_bpf__attach(skel);
	return 0;
}

/*
 * Run the BPF program directly using BPF_PROG_TEST_RUN to update the end
 * timestamp in ktime so that it can calculate delta easily.
 */
static void mark_end_timestamp(void)
{
	DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
		.flags = BPF_F_TEST_RUN_ON_CPU,
	);
	int prog_fd = bpf_program__fd(skel->progs.end_timestamp);

	bpf_prog_test_run_opts(prog_fd, &opts);
}
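
/*
 * Fold a still-pending contention entry (one that had a lock:contention_begin
 * but no matching lock:contention_end before the session stopped) into the
 * lock_stat map, using end_ts as the end of its wait.
 */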
static void update_lock_stat(int map_fd, int pid, u64 end_ts,
			     enum lock_aggr_mode aggr_mode,
			     struct tstamp_data *ts_data)
{
	u64 delta;
	struct contention_key stat_key = {};
	struct contention_data stat_data;

	if (ts_data->timestamp >= end_ts)
		return;

	delta = end_ts - ts_data->timestamp;

	switch (aggr_mode) {
	case LOCK_AGGR_CALLER:
		stat_key.stack_id = ts_data->stack_id;
		break;
	case LOCK_AGGR_TASK:
		stat_key.pid = pid;
		break;
	case LOCK_AGGR_ADDR:
		stat_key.lock_addr_or_cgroup = ts_data->lock;
		break;
	case LOCK_AGGR_CGROUP:
	default:
		return;
	}

	if (bpf_map_lookup_elem(map_fd, &stat_key, &stat_data) < 0)
		return;

	stat_data.total_time += delta;
	stat_data.count++;

	if (delta > stat_data.max_time)
		stat_data.max_time = delta;
	if (delta < stat_data.min_time)
		stat_data.min_time = delta;

	bpf_map_update_elem(map_fd, &stat_key, &stat_data, BPF_EXIST);
}

/*
 * Account entries in the tstamp map (which didn't see the corresponding
 * lock:contention_end tracepoint) using end_ts.
 */
static void account_end_timestamp(struct lock_contention *con)
{
	int ts_fd, stat_fd;
	int *prev_key, key;
	u64 end_ts = skel->bss->end_ts;
	int total_cpus;
	enum lock_aggr_mode aggr_mode = con->aggr_mode;
	struct tstamp_data ts_data, *cpu_data;

	/* Iterate per-task tstamp map (key = TID) */
	ts_fd = bpf_map__fd(skel->maps.tstamp);
	stat_fd = bpf_map__fd(skel->maps.lock_stat);

	prev_key = NULL;
	while (!bpf_map_get_next_key(ts_fd, prev_key, &key)) {
		if (bpf_map_lookup_elem(ts_fd, &key, &ts_data) == 0) {
			int pid = key;

			/* with owner tracking, flags carries the owner's TID */
			if (aggr_mode == LOCK_AGGR_TASK && con->owner)
				pid = ts_data.flags;

			update_lock_stat(stat_fd, pid, end_ts, aggr_mode,
					 &ts_data);
		}

		prev_key = &key;
	}

	/* Now it'll check per-cpu tstamp map which doesn't have TID. */
	if (aggr_mode == LOCK_AGGR_TASK || aggr_mode == LOCK_AGGR_CGROUP)
		return;

	total_cpus = cpu__max_cpu().cpu;
	ts_fd = bpf_map__fd(skel->maps.tstamp_cpu);

	cpu_data = calloc(total_cpus, sizeof(*cpu_data));
	if (cpu_data == NULL)
		return;

	prev_key = NULL;
	while (!bpf_map_get_next_key(ts_fd, prev_key, &key)) {
		if (bpf_map_lookup_elem(ts_fd, &key, cpu_data) < 0)
			goto out;

		for (int i = 0; i < total_cpus; i++) {
			if (cpu_data[i].lock == 0)
				continue;

			update_lock_stat(stat_fd, -1, end_ts, aggr_mode,
					 &cpu_data[i]);
		}

		prev_key = &key;
	}
out:
	free(cpu_data);
}
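
/*
 * Collection is gated by the 'enabled' flag in the BPF program's BSS; the
 * BPF tracepoint handlers are expected to skip recording while it is zero.
 * Stopping also records the end timestamp used by account_end_timestamp().
 */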
int lock_contention_start(void)
{
	skel->bss->enabled = 1;
	return 0;
}

int lock_contention_stop(void)
{
	skel->bss->enabled = 0;
	mark_end_timestamp();
	return 0;
}
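
/*
 * Resolve a display name for a contention entry depending on the aggregation
 * mode: task comm for per-task mode, lock symbol (or a few well-known special
 * cases) for per-address mode, cgroup name for cgroup mode, and the calling
 * function for the default caller mode.
 */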
static const char *lock_contention_get_name(struct lock_contention *con,
					    struct contention_key *key,
					    u64 *stack_trace, u32 flags)
{
	int idx = 0;
	u64 addr;
	const char *name = "";
	static char name_buf[KSYM_NAME_LEN];
	struct symbol *sym;
	struct map *kmap;
	struct machine *machine = con->machine;

	if (con->aggr_mode == LOCK_AGGR_TASK) {
		struct contention_task_data task;
		int pid = key->pid;
		int task_fd = bpf_map__fd(skel->maps.task_data);

		/* do not update idle comm which contains CPU number */
		if (pid) {
			struct thread *t = machine__findnew_thread(machine, /*pid=*/-1, pid);

			if (t == NULL)
				return name;
			if (!bpf_map_lookup_elem(task_fd, &pid, &task) &&
			    thread__set_comm(t, task.comm, /*timestamp=*/0))
				name = task.comm;
		}
		return name;
	}

	if (con->aggr_mode == LOCK_AGGR_ADDR) {
		int lock_fd = bpf_map__fd(skel->maps.lock_syms);

		/* per-process locks set upper bits of the flags */
		if (flags & LCD_F_MMAP_LOCK)
			return "mmap_lock";
		if (flags & LCD_F_SIGHAND_LOCK)
			return "siglock";

		/* global locks with symbols */
		sym = machine__find_kernel_symbol(machine, key->lock_addr_or_cgroup, &kmap);
		if (sym)
			return sym->name;

		/* try semi-global locks collected separately */
		if (!bpf_map_lookup_elem(lock_fd, &key->lock_addr_or_cgroup, &flags)) {
			if (flags == LOCK_CLASS_RQLOCK)
				return "rq_lock";
		}

		return "";
	}

	if (con->aggr_mode == LOCK_AGGR_CGROUP) {
		u64 cgrp_id = key->lock_addr_or_cgroup;
		struct cgroup *cgrp = __cgroup__find(&con->cgroups, cgrp_id);

		if (cgrp)
			return cgrp->name;

		snprintf(name_buf, sizeof(name_buf), "cgroup:%" PRIu64 "", cgrp_id);
		return name_buf;
	}

	/* LOCK_AGGR_CALLER: skip lock internal functions */
	while (machine__is_lock_function(machine, stack_trace[idx]) &&
	       idx < con->max_stack - 1)
		idx++;

	addr = stack_trace[idx];
	sym = machine__find_kernel_symbol(machine, addr, &kmap);

	if (sym) {
		unsigned long offset;

		offset = map__map_ip(kmap, addr) - sym->start;

		if (offset == 0)
			return sym->name;

		snprintf(name_buf, sizeof(name_buf), "%s+%#lx", sym->name, offset);
	} else {
		snprintf(name_buf, sizeof(name_buf), "%#lx", (unsigned long)addr);
	}

	return name_buf;
}
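
/*
 * Drain the BPF side into the tool's lock_stat list: walk the lock_stat map,
 * optionally fetch and filter the saved callstack, then create or update a
 * lock_stat entry keyed by stack id, task, lock address or cgroup id
 * depending on the aggregation mode.
 */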
int lock_contention_read(struct lock_contention *con)
{
	int fd, stack, err = 0;
	struct contention_key *prev_key, key = {};
	struct contention_data data = {};
	struct lock_stat *st = NULL;
	struct machine *machine = con->machine;
	u64 *stack_trace;
	size_t stack_size = con->max_stack * sizeof(*stack_trace);

	fd = bpf_map__fd(skel->maps.lock_stat);
	stack = bpf_map__fd(skel->maps.stacks);

	con->fails.task = skel->bss->task_fail;
	con->fails.stack = skel->bss->stack_fail;
	con->fails.time = skel->bss->time_fail;
	con->fails.data = skel->bss->data_fail;

	stack_trace = zalloc(stack_size);
	if (stack_trace == NULL)
		return -1;

	account_end_timestamp(con);

	if (con->aggr_mode == LOCK_AGGR_TASK) {
		struct thread *idle = machine__findnew_thread(machine,
							      /*pid=*/0,
							      /*tid=*/0);
		thread__set_comm(idle, "swapper", /*timestamp=*/0);
	}

	if (con->aggr_mode == LOCK_AGGR_ADDR) {
		DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
			.flags = BPF_F_TEST_RUN_ON_CPU,
		);
		/* run collect_lock_syms to fill the lock_syms map used above */
		int prog_fd = bpf_program__fd(skel->progs.collect_lock_syms);

		bpf_prog_test_run_opts(prog_fd, &opts);
	}

	/* make sure it loads the kernel map */
	maps__load_first(machine->kmaps);

	prev_key = NULL;
	while (!bpf_map_get_next_key(fd, prev_key, &key)) {
		s64 ls_key;
		const char *name;

		/* to handle errors in the loop body */
		err = -1;

		bpf_map_lookup_elem(fd, &key, &data);
		if (con->save_callstack) {
			bpf_map_lookup_elem(stack, &key.stack_id, stack_trace);

			if (!match_callstack_filter(machine, stack_trace)) {
				con->nr_filtered += data.count;
				goto next;
			}
		}

		switch (con->aggr_mode) {
		case LOCK_AGGR_CALLER:
			ls_key = key.stack_id;
			break;
		case LOCK_AGGR_TASK:
			ls_key = key.pid;
			break;
		case LOCK_AGGR_ADDR:
		case LOCK_AGGR_CGROUP:
			ls_key = key.lock_addr_or_cgroup;
			break;
		default:
			goto next;
		}

		st = lock_stat_find(ls_key);
		if (st != NULL) {
			st->wait_time_total += data.total_time;
			if (st->wait_time_max < data.max_time)
				st->wait_time_max = data.max_time;
			if (st->wait_time_min > data.min_time)
				st->wait_time_min = data.min_time;

			st->nr_contended += data.count;
			if (st->nr_contended)
				st->avg_wait_time = st->wait_time_total / st->nr_contended;
			goto next;
		}

		name = lock_contention_get_name(con, &key, stack_trace, data.flags);
		st = lock_stat_findnew(ls_key, name, data.flags);
		if (st == NULL)
			break;

		st->nr_contended = data.count;
		st->wait_time_total = data.total_time;
		st->wait_time_max = data.max_time;
		st->wait_time_min = data.min_time;

		if (data.count)
			st->avg_wait_time = data.total_time / data.count;

		if (con->aggr_mode == LOCK_AGGR_CALLER && verbose > 0) {
			st->callstack = memdup(stack_trace, stack_size);
			if (st->callstack == NULL)
				break;
		}

next:
		prev_key = &key;

		/* we're fine now, reset the error */
		err = 0;
	}

	free(stack_trace);
	return err;
}
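
/*
 * Tear-down path: disable and destroy the BPF skeleton (if it was opened) and
 * release the cgroup objects collected by read_all_cgroups().
 */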
int lock_contention_finish(struct lock_contention *con)
{
	if (skel) {
		skel->bss->enabled = 0;
		lock_contention_bpf__destroy(skel);
	}

	while (!RB_EMPTY_ROOT(&con->cgroups)) {
		struct rb_node *node = rb_first(&con->cgroups);
		struct cgroup *cgrp = rb_entry(node, struct cgroup, node);

		rb_erase(node, &con->cgroups);
		cgroup__put(cgrp);
	}

	return 0;
}