Merge branch 'locking-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
[cris-mirror.git] / tools / perf / builtin-trace.c
blobe7f1b182fc153f227f8541e485c47a27115a111e
/*
 * builtin-trace.c
 *
 * Builtin 'trace' command:
 *
 * Display a continuously updated trace of any workload, CPU, specific PID,
 * system wide, etc.  Default format is loosely strace like, but any other
 * event may be specified using --event.
 *
 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Initially based on the 'trace' prototype by Thomas Gleixner:
 *
 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/env.h"
25 #include "util/event.h"
26 #include "util/evlist.h"
27 #include <subcmd/exec-cmd.h>
28 #include "util/machine.h"
29 #include "util/path.h"
30 #include "util/session.h"
31 #include "util/thread.h"
32 #include <subcmd/parse-options.h>
33 #include "util/strlist.h"
34 #include "util/intlist.h"
35 #include "util/thread_map.h"
36 #include "util/stat.h"
37 #include "trace/beauty/beauty.h"
38 #include "trace-event.h"
39 #include "util/parse-events.h"
40 #include "util/bpf-loader.h"
41 #include "callchain.h"
42 #include "print_binary.h"
43 #include "string2.h"
44 #include "syscalltbl.h"
45 #include "rb_resort.h"
47 #include <errno.h>
48 #include <inttypes.h>
49 #include <poll.h>
50 #include <signal.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <linux/err.h>
54 #include <linux/filter.h>
55 #include <linux/kernel.h>
56 #include <linux/random.h>
57 #include <linux/stringify.h>
58 #include <linux/time64.h>
59 #include <fcntl.h>
61 #include "sane_ctype.h"
63 #ifndef O_CLOEXEC
64 # define O_CLOEXEC 02000000
65 #endif
67 #ifndef F_LINUX_SPECIFIC_BASE
68 # define F_LINUX_SPECIFIC_BASE 1024
69 #endif
71 struct trace {
72 struct perf_tool tool;
73 struct syscalltbl *sctbl;
74 struct {
75 int max;
76 struct syscall *table;
77 struct {
78 struct perf_evsel *sys_enter,
79 *sys_exit;
80 } events;
81 } syscalls;
82 struct record_opts opts;
83 struct perf_evlist *evlist;
84 struct machine *host;
85 struct thread *current;
86 u64 base_time;
87 FILE *output;
88 unsigned long nr_events;
89 struct strlist *ev_qualifier;
90 struct {
91 size_t nr;
92 int *entries;
93 } ev_qualifier_ids;
94 struct {
95 size_t nr;
96 pid_t *entries;
97 } filter_pids;
98 double duration_filter;
99 double runtime_ms;
100 struct {
101 u64 vfs_getname,
102 proc_getname;
103 } stats;
104 unsigned int max_stack;
105 unsigned int min_stack;
106 bool not_ev_qualifier;
107 bool live;
108 bool full_time;
109 bool sched;
110 bool multiple_threads;
111 bool summary;
112 bool summary_only;
113 bool show_comm;
114 bool print_sample;
115 bool show_tool_stats;
116 bool trace_syscalls;
117 bool kernel_syscallchains;
118 bool force;
119 bool vfs_getname;
120 int trace_pgfaults;
121 int open_id;
124 struct tp_field {
125 int offset;
126 union {
127 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
128 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
132 #define TP_UINT_FIELD(bits) \
133 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
135 u##bits value; \
136 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
137 return value; \
140 TP_UINT_FIELD(8);
141 TP_UINT_FIELD(16);
142 TP_UINT_FIELD(32);
143 TP_UINT_FIELD(64);
145 #define TP_UINT_FIELD__SWAPPED(bits) \
146 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
148 u##bits value; \
149 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
150 return bswap_##bits(value);\
153 TP_UINT_FIELD__SWAPPED(16);
154 TP_UINT_FIELD__SWAPPED(32);
155 TP_UINT_FIELD__SWAPPED(64);
157 static int tp_field__init_uint(struct tp_field *field,
158 struct format_field *format_field,
159 bool needs_swap)
161 field->offset = format_field->offset;
163 switch (format_field->size) {
164 case 1:
165 field->integer = tp_field__u8;
166 break;
167 case 2:
168 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
169 break;
170 case 4:
171 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
172 break;
173 case 8:
174 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
175 break;
176 default:
177 return -1;
180 return 0;
183 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
185 return sample->raw_data + field->offset;
188 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
190 field->offset = format_field->offset;
191 field->pointer = tp_field__ptr;
192 return 0;
195 struct syscall_tp {
196 struct tp_field id;
197 union {
198 struct tp_field args, ret;
202 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
203 struct tp_field *field,
204 const char *name)
206 struct format_field *format_field = perf_evsel__field(evsel, name);
208 if (format_field == NULL)
209 return -1;
211 return tp_field__init_uint(field, format_field, evsel->needs_swap);
/*
 * Initialise the integer accessor for member 'name' of the struct
 * syscall_tp hanging off evsel->priv; the member name is also used as the
 * tracepoint field name.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv; \
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
/*
 * Look up the tracepoint field @name in @evsel and initialise @field as a
 * pointer accessor for it.
 *
 * Returns -1 if the field does not exist, otherwise whatever
 * tp_field__init_ptr() returns.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	return fmt != NULL ? tp_field__init_ptr(field, fmt) : -1;
}
/*
 * Initialise the pointer accessor for member 'name' of the struct
 * syscall_tp hanging off evsel->priv; the member name is also used as the
 * tracepoint field name.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv; \
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
234 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
236 zfree(&evsel->priv);
237 perf_evsel__delete(evsel);
240 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
242 evsel->priv = malloc(sizeof(struct syscall_tp));
243 if (evsel->priv != NULL) {
244 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
245 goto out_delete;
247 evsel->handler = handler;
248 return 0;
251 return -ENOMEM;
253 out_delete:
254 zfree(&evsel->priv);
255 return -ENOENT;
/*
 * Create the evsel for the raw_syscalls:@direction tracepoint, falling back
 * to syscalls:@direction, and prepare it with perf_evsel__init_syscall_tp().
 *
 * Returns the new evsel, or NULL if neither tracepoint could be created or
 * the initialisation failed.
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	if (IS_ERR(evsel)) {
		/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
		evsel = perf_evsel__newtp("syscalls", direction);
		if (IS_ERR(evsel))
			return NULL;
	}

	if (perf_evsel__init_syscall_tp(evsel, handler) == 0)
		return evsel;

	perf_evsel__delete_priv(evsel);
	return NULL;
}
/*
 * Read member 'name' of the struct syscall_tp stored in evsel->priv from
 * 'sample', through its integer accessor.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* Same as above, but through the pointer accessor. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
287 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
289 int idx = val - sa->offset;
291 if (idx < 0 || idx >= sa->nr_entries)
292 return scnprintf(bf, size, intfmt, val);
294 return scnprintf(bf, size, "%s", sa->entries[idx]);
297 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
298 const char *intfmt,
299 struct syscall_arg *arg)
301 return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
/* String table formatter that prints unknown values as a decimal ("%d"). */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	const char *fallback_fmt = "%d";

	return __syscall_arg__scnprintf_strarray(bf, size, fallback_fmt, arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
/* A list of string tables that are tried one after the other. */
struct strarrays {
	int		nr_entries;	/* number of tables in 'entries' */
	struct strarray	**entries;
};

/* Define strarrays__<array> wrapping an array of struct strarray pointers. */
#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries    = array, \
}
322 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
323 struct syscall_arg *arg)
325 struct strarrays *sas = arg->parm;
326 int i;
328 for (i = 0; i < sas->nr_entries; ++i) {
329 struct strarray *sa = sas->entries[i];
330 int idx = arg->val - sa->offset;
332 if (idx >= 0 && idx < sa->nr_entries) {
333 if (sa->entries[idx] == NULL)
334 break;
335 return scnprintf(bf, size, "%s", sa->entries[idx]);
339 return scnprintf(bf, size, "%d", arg->val);
342 #ifndef AT_FDCWD
343 #define AT_FDCWD -100
344 #endif
346 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
347 struct syscall_arg *arg)
349 int fd = arg->val;
351 if (fd == AT_FDCWD)
352 return scnprintf(bf, size, "CWD");
354 return syscall_arg__scnprintf_fd(bf, size, arg);
357 #define SCA_FDAT syscall_arg__scnprintf_fd_at
359 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
360 struct syscall_arg *arg);
362 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
364 size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
366 return scnprintf(bf, size, "%#lx", arg->val);
369 size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
371 return scnprintf(bf, size, "%d", arg->val);
374 size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
376 return scnprintf(bf, size, "%ld", arg->val);
379 static const char *bpf_cmd[] = {
380 "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
381 "MAP_GET_NEXT_KEY", "PROG_LOAD",
383 static DEFINE_STRARRAY(bpf_cmd);
385 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
386 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
388 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
389 static DEFINE_STRARRAY(itimers);
391 static const char *keyctl_options[] = {
392 "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
393 "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
394 "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
395 "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
396 "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
398 static DEFINE_STRARRAY(keyctl_options);
400 static const char *whences[] = { "SET", "CUR", "END",
401 #ifdef SEEK_DATA
402 "DATA",
403 #endif
404 #ifdef SEEK_HOLE
405 "HOLE",
406 #endif
408 static DEFINE_STRARRAY(whences);
410 static const char *fcntl_cmds[] = {
411 "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
412 "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
413 "SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
414 "GETOWNER_UIDS",
416 static DEFINE_STRARRAY(fcntl_cmds);
418 static const char *fcntl_linux_specific_cmds[] = {
419 "SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
420 "SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
421 "GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
424 static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);
426 static struct strarray *fcntl_cmds_arrays[] = {
427 &strarray__fcntl_cmds,
428 &strarray__fcntl_linux_specific_cmds,
431 static DEFINE_STRARRAYS(fcntl_cmds_arrays);
433 static const char *rlimit_resources[] = {
434 "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
435 "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
436 "RTTIME",
438 static DEFINE_STRARRAY(rlimit_resources);
440 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
441 static DEFINE_STRARRAY(sighow);
443 static const char *clockid[] = {
444 "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
445 "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
446 "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
448 static DEFINE_STRARRAY(clockid);
450 static const char *socket_families[] = {
451 "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
452 "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
453 "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
454 "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
455 "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
456 "ALG", "NFC", "VSOCK",
458 static DEFINE_STRARRAY(socket_families);
460 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
461 struct syscall_arg *arg)
463 size_t printed = 0;
464 int mode = arg->val;
466 if (mode == F_OK) /* 0 */
467 return scnprintf(bf, size, "F");
468 #define P_MODE(n) \
469 if (mode & n##_OK) { \
470 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
471 mode &= ~n##_OK; \
474 P_MODE(R);
475 P_MODE(W);
476 P_MODE(X);
477 #undef P_MODE
479 if (mode)
480 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
482 return printed;
485 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
487 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
488 struct syscall_arg *arg);
490 #define SCA_FILENAME syscall_arg__scnprintf_filename
492 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
493 struct syscall_arg *arg)
495 int printed = 0, flags = arg->val;
497 #define P_FLAG(n) \
498 if (flags & O_##n) { \
499 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
500 flags &= ~O_##n; \
503 P_FLAG(CLOEXEC);
504 P_FLAG(NONBLOCK);
505 #undef P_FLAG
507 if (flags)
508 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
510 return printed;
513 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
515 #ifndef GRND_NONBLOCK
516 #define GRND_NONBLOCK 0x0001
517 #endif
518 #ifndef GRND_RANDOM
519 #define GRND_RANDOM 0x0002
520 #endif
522 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
523 struct syscall_arg *arg)
525 int printed = 0, flags = arg->val;
527 #define P_FLAG(n) \
528 if (flags & GRND_##n) { \
529 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
530 flags &= ~GRND_##n; \
533 P_FLAG(RANDOM);
534 P_FLAG(NONBLOCK);
535 #undef P_FLAG
537 if (flags)
538 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
540 return printed;
543 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
/*
 * Initializer for a struct syscall_arg_fmt that prints the argument through
 * the string table strarray__<array>.  'name' is not used by the expansion;
 * it only labels the argument at the use site.
 */
#define STRARRAY(name, array) \
	{ .scnprintf	= SCA_STRARRAY, \
	  .parm		= &strarray__##array, }
549 #include "trace/beauty/arch_errno_names.c"
550 #include "trace/beauty/eventfd.c"
551 #include "trace/beauty/futex_op.c"
552 #include "trace/beauty/futex_val3.c"
553 #include "trace/beauty/mmap.c"
554 #include "trace/beauty/mode_t.c"
555 #include "trace/beauty/msg_flags.c"
556 #include "trace/beauty/open_flags.c"
557 #include "trace/beauty/perf_event_open.c"
558 #include "trace/beauty/pid.c"
559 #include "trace/beauty/sched_policy.c"
560 #include "trace/beauty/seccomp.c"
561 #include "trace/beauty/signum.c"
562 #include "trace/beauty/socket_type.c"
563 #include "trace/beauty/waitid_options.c"
/* How to pretty print one syscall argument. */
struct syscall_arg_fmt {
	/* formatter writing the argument into bf, returns the printed length */
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void	   *parm;	/* formatter private data, e.g. a string table */
	const char *name;
	bool	   show_zero;
};
572 static struct syscall_fmt {
573 const char *name;
574 const char *alias;
575 struct syscall_arg_fmt arg[6];
576 u8 nr_args;
577 bool errpid;
578 bool timeout;
579 bool hexret;
580 } syscall_fmts[] = {
581 { .name = "access",
582 .arg = { [1] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, },
583 { .name = "bpf",
584 .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
585 { .name = "brk", .hexret = true,
586 .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
587 { .name = "clock_gettime",
588 .arg = { [0] = STRARRAY(clk_id, clockid), }, },
589 { .name = "clone", .errpid = true, .nr_args = 5,
590 .arg = { [0] = { .name = "flags", .scnprintf = SCA_CLONE_FLAGS, },
591 [1] = { .name = "child_stack", .scnprintf = SCA_HEX, },
592 [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
593 [3] = { .name = "child_tidptr", .scnprintf = SCA_HEX, },
594 [4] = { .name = "tls", .scnprintf = SCA_HEX, }, }, },
595 { .name = "close",
596 .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
597 { .name = "epoll_ctl",
598 .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
599 { .name = "eventfd2",
600 .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
601 { .name = "fchmodat",
602 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
603 { .name = "fchownat",
604 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
605 { .name = "fcntl",
606 .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
607 .parm = &strarrays__fcntl_cmds_arrays,
608 .show_zero = true, },
609 [2] = { .scnprintf = SCA_FCNTL_ARG, /* arg */ }, }, },
610 { .name = "flock",
611 .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
612 { .name = "fstat", .alias = "newfstat", },
613 { .name = "fstatat", .alias = "newfstatat", },
614 { .name = "futex",
615 .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
616 [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
617 { .name = "futimesat",
618 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
619 { .name = "getitimer",
620 .arg = { [0] = STRARRAY(which, itimers), }, },
621 { .name = "getpid", .errpid = true, },
622 { .name = "getpgid", .errpid = true, },
623 { .name = "getppid", .errpid = true, },
624 { .name = "getrandom",
625 .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
626 { .name = "getrlimit",
627 .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
628 { .name = "gettid", .errpid = true, },
629 { .name = "ioctl",
630 .arg = {
631 #if defined(__i386__) || defined(__x86_64__)
633 * FIXME: Make this available to all arches.
635 [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
636 [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
637 #else
638 [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
639 #endif
640 { .name = "kcmp", .nr_args = 5,
641 .arg = { [0] = { .name = "pid1", .scnprintf = SCA_PID, },
642 [1] = { .name = "pid2", .scnprintf = SCA_PID, },
643 [2] = { .name = "type", .scnprintf = SCA_KCMP_TYPE, },
644 [3] = { .name = "idx1", .scnprintf = SCA_KCMP_IDX, },
645 [4] = { .name = "idx2", .scnprintf = SCA_KCMP_IDX, }, }, },
646 { .name = "keyctl",
647 .arg = { [0] = STRARRAY(option, keyctl_options), }, },
648 { .name = "kill",
649 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
650 { .name = "linkat",
651 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
652 { .name = "lseek",
653 .arg = { [2] = STRARRAY(whence, whences), }, },
654 { .name = "lstat", .alias = "newlstat", },
655 { .name = "madvise",
656 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
657 [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
658 { .name = "mkdirat",
659 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
660 { .name = "mknodat",
661 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
662 { .name = "mlock",
663 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
664 { .name = "mlockall",
665 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
666 { .name = "mmap", .hexret = true,
667 /* The standard mmap maps to old_mmap on s390x */
668 #if defined(__s390x__)
669 .alias = "old_mmap",
670 #endif
671 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ },
672 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ },
673 [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, },
674 { .name = "mprotect",
675 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
676 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, }, },
677 { .name = "mq_unlink",
678 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
679 { .name = "mremap", .hexret = true,
680 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ },
681 [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
682 [4] = { .scnprintf = SCA_HEX, /* new_addr */ }, }, },
683 { .name = "munlock",
684 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
685 { .name = "munmap",
686 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
687 { .name = "name_to_handle_at",
688 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
689 { .name = "newfstatat",
690 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
691 { .name = "open",
692 .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
693 { .name = "open_by_handle_at",
694 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
695 [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
696 { .name = "openat",
697 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
698 [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
699 { .name = "perf_event_open",
700 .arg = { [2] = { .scnprintf = SCA_INT, /* cpu */ },
701 [3] = { .scnprintf = SCA_FD, /* group_fd */ },
702 [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
703 { .name = "pipe2",
704 .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
705 { .name = "pkey_alloc",
706 .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS, /* access_rights */ }, }, },
707 { .name = "pkey_free",
708 .arg = { [0] = { .scnprintf = SCA_INT, /* key */ }, }, },
709 { .name = "pkey_mprotect",
710 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
711 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ },
712 [3] = { .scnprintf = SCA_INT, /* pkey */ }, }, },
713 { .name = "poll", .timeout = true, },
714 { .name = "ppoll", .timeout = true, },
715 { .name = "prctl", .alias = "arch_prctl",
716 .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
717 [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
718 [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
719 { .name = "pread", .alias = "pread64", },
720 { .name = "preadv", .alias = "pread", },
721 { .name = "prlimit64",
722 .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
723 { .name = "pwrite", .alias = "pwrite64", },
724 { .name = "readlinkat",
725 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
726 { .name = "recvfrom",
727 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
728 { .name = "recvmmsg",
729 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
730 { .name = "recvmsg",
731 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
732 { .name = "renameat",
733 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
734 { .name = "rt_sigaction",
735 .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
736 { .name = "rt_sigprocmask",
737 .arg = { [0] = STRARRAY(how, sighow), }, },
738 { .name = "rt_sigqueueinfo",
739 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
740 { .name = "rt_tgsigqueueinfo",
741 .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
742 { .name = "sched_setscheduler",
743 .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
744 { .name = "seccomp",
745 .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP, /* op */ },
746 [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
747 { .name = "select", .timeout = true, },
748 { .name = "sendmmsg",
749 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
750 { .name = "sendmsg",
751 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
752 { .name = "sendto",
753 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
754 { .name = "set_tid_address", .errpid = true, },
755 { .name = "setitimer",
756 .arg = { [0] = STRARRAY(which, itimers), }, },
757 { .name = "setrlimit",
758 .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
759 { .name = "socket",
760 .arg = { [0] = STRARRAY(family, socket_families),
761 [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
762 { .name = "socketpair",
763 .arg = { [0] = STRARRAY(family, socket_families),
764 [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
765 { .name = "stat", .alias = "newstat", },
766 { .name = "statx",
767 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fdat */ },
768 [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
769 [3] = { .scnprintf = SCA_STATX_MASK, /* mask */ }, }, },
770 { .name = "swapoff",
771 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
772 { .name = "swapon",
773 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
774 { .name = "symlinkat",
775 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
776 { .name = "tgkill",
777 .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
778 { .name = "tkill",
779 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
780 { .name = "uname", .alias = "newuname", },
781 { .name = "unlinkat",
782 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
783 { .name = "utimensat",
784 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
785 { .name = "wait4", .errpid = true,
786 .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
787 { .name = "waitid", .errpid = true,
788 .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
791 static int syscall_fmt__cmp(const void *name, const void *fmtp)
793 const struct syscall_fmt *fmt = fmtp;
794 return strcmp(name, fmt->name);
797 static struct syscall_fmt *syscall_fmt__find(const char *name)
799 const int nmemb = ARRAY_SIZE(syscall_fmts);
800 return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
/* One entry of the trace->syscalls.table array. */
struct syscall {
	struct event_format	*tp_format;
	int			nr_args;
	struct format_field	*args;
	const char		*name;
	bool			is_exit;
	struct syscall_fmt	*fmt;		/* formatting overrides, see syscall_fmts[] */
	struct syscall_arg_fmt	*arg_fmt;
};
814 * We need to have this 'calculated' boolean because in some cases we really
815 * don't know what is the duration of a syscall, for instance, when we start
816 * a session and some threads are waiting for a syscall to finish, say 'poll',
817 * in which case all we can do is to print "( ? ) for duration and for the
818 * start timestamp.
820 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
822 double duration = (double)t / NSEC_PER_MSEC;
823 size_t printed = fprintf(fp, "(");
825 if (!calculated)
826 printed += fprintf(fp, " ");
827 else if (duration >= 1.0)
828 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
829 else if (duration >= 0.01)
830 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
831 else
832 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
833 return printed + fprintf(fp, "): ");
837 * filename.ptr: The filename char pointer that will be vfs_getname'd
838 * filename.entry_str_pos: Where to insert the string translated from
839 * filename.ptr by the vfs_getname tracepoint/kprobe.
840 * ret_scnprintf: syscall args may set this to a different syscall return
841 * formatter, for instance, fcntl may return fds, file flags, etc.
843 struct thread_trace {
844 u64 entry_time;
845 bool entry_pending;
846 unsigned long nr_events;
847 unsigned long pfmaj, pfmin;
848 char *entry_str;
849 double runtime_ms;
850 size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
851 struct {
852 unsigned long ptr;
853 short int entry_str_pos;
854 bool pending_open;
855 unsigned int namelen;
856 char *name;
857 } filename;
858 struct {
859 int max;
860 char **table;
861 } paths;
863 struct intlist *syscall_stats;
866 static struct thread_trace *thread_trace__new(void)
868 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
870 if (ttrace)
871 ttrace->paths.max = -1;
873 ttrace->syscall_stats = intlist__new(NULL);
875 return ttrace;
878 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
880 struct thread_trace *ttrace;
882 if (thread == NULL)
883 goto fail;
885 if (thread__priv(thread) == NULL)
886 thread__set_priv(thread, thread_trace__new());
888 if (thread__priv(thread) == NULL)
889 goto fail;
891 ttrace = thread__priv(thread);
892 ++ttrace->nr_events;
894 return ttrace;
895 fail:
896 color_fprintf(fp, PERF_COLOR_RED,
897 "WARNING: not enough memory, dropping samples!\n");
898 return NULL;
902 void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
903 size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
905 struct thread_trace *ttrace = thread__priv(arg->thread);
907 ttrace->ret_scnprintf = ret_scnprintf;
910 #define TRACE_PFMAJ (1 << 0)
911 #define TRACE_PFMIN (1 << 1)
913 static const size_t trace__entry_str_size = 2048;
915 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
917 struct thread_trace *ttrace = thread__priv(thread);
919 if (fd > ttrace->paths.max) {
920 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
922 if (npath == NULL)
923 return -1;
925 if (ttrace->paths.max != -1) {
926 memset(npath + ttrace->paths.max + 1, 0,
927 (fd - ttrace->paths.max) * sizeof(char *));
928 } else {
929 memset(npath, 0, (fd + 1) * sizeof(char *));
932 ttrace->paths.table = npath;
933 ttrace->paths.max = fd;
936 ttrace->paths.table[fd] = strdup(pathname);
938 return ttrace->paths.table[fd] != NULL ? 0 : -1;
941 static int thread__read_fd_path(struct thread *thread, int fd)
943 char linkname[PATH_MAX], pathname[PATH_MAX];
944 struct stat st;
945 int ret;
947 if (thread->pid_ == thread->tid) {
948 scnprintf(linkname, sizeof(linkname),
949 "/proc/%d/fd/%d", thread->pid_, fd);
950 } else {
951 scnprintf(linkname, sizeof(linkname),
952 "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
955 if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
956 return -1;
958 ret = readlink(linkname, pathname, sizeof(pathname));
960 if (ret < 0 || ret > st.st_size)
961 return -1;
963 pathname[ret] = '\0';
964 return trace__set_fd_pathname(thread, fd, pathname);
967 static const char *thread__fd_path(struct thread *thread, int fd,
968 struct trace *trace)
970 struct thread_trace *ttrace = thread__priv(thread);
972 if (ttrace == NULL)
973 return NULL;
975 if (fd < 0)
976 return NULL;
978 if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
979 if (!trace->live)
980 return NULL;
981 ++trace->stats.proc_getname;
982 if (thread__read_fd_path(thread, fd))
983 return NULL;
986 return ttrace->paths.table[fd];
989 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
991 int fd = arg->val;
992 size_t printed = scnprintf(bf, size, "%d", fd);
993 const char *path = thread__fd_path(arg->thread, fd, arg->trace);
995 if (path)
996 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
998 return printed;
1001 size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1003 size_t printed = scnprintf(bf, size, "%d", fd);
1004 struct thread *thread = machine__find_thread(trace->host, pid, pid);
1006 if (thread) {
1007 const char *path = thread__fd_path(thread, fd, trace);
1009 if (path)
1010 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1012 thread__put(thread);
1015 return printed;
1018 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1019 struct syscall_arg *arg)
1021 int fd = arg->val;
1022 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1023 struct thread_trace *ttrace = thread__priv(arg->thread);
1025 if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1026 zfree(&ttrace->paths.table[fd]);
1028 return printed;
1031 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1032 unsigned long ptr)
1034 struct thread_trace *ttrace = thread__priv(thread);
1036 ttrace->filename.ptr = ptr;
1037 ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1040 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1041 struct syscall_arg *arg)
1043 unsigned long ptr = arg->val;
1045 if (!arg->trace->vfs_getname)
1046 return scnprintf(bf, size, "%#x", ptr);
1048 thread__set_filename_pos(arg->thread, bf, ptr);
1049 return 0;
1052 static bool trace__filter_duration(struct trace *trace, double t)
1054 return t < (trace->duration_filter * NSEC_PER_MSEC);
1057 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1059 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1061 return fprintf(fp, "%10.3f ", ts);
1065 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1066 * using ttrace->entry_time for a thread that receives a sys_exit without
1067 * first having received a sys_enter ("poll" issued before tracing session
1068 * starts, lost sys_enter exit due to ring buffer overflow).
1070 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1072 if (tstamp > 0)
1073 return __trace__fprintf_tstamp(trace, tstamp, fp);
1075 return fprintf(fp, " ? ");
/*
 * Flags set from sig_handler(). They are stored to asynchronously from a
 * signal handler, so they are volatile sig_atomic_t rather than plain bool:
 * that is the object type the C standard allows a handler to write.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

/*
 * Signal handler: flag that processing should stop and remember whether
 * the signal was SIGINT.
 */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1087 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1088 u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1090 size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1091 printed += fprintf_duration(duration, duration_calculated, fp);
1093 if (trace->multiple_threads) {
1094 if (trace->show_comm)
1095 printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1096 printed += fprintf(fp, "%d ", thread->tid);
1099 return printed;
1102 static int trace__process_event(struct trace *trace, struct machine *machine,
1103 union perf_event *event, struct perf_sample *sample)
1105 int ret = 0;
1107 switch (event->header.type) {
1108 case PERF_RECORD_LOST:
1109 color_fprintf(trace->output, PERF_COLOR_RED,
1110 "LOST %" PRIu64 " events!\n", event->lost.lost);
1111 ret = machine__process_lost_event(machine, event, sample);
1112 break;
1113 default:
1114 ret = machine__process_event(machine, event, sample);
1115 break;
1118 return ret;
1121 static int trace__tool_process(struct perf_tool *tool,
1122 union perf_event *event,
1123 struct perf_sample *sample,
1124 struct machine *machine)
1126 struct trace *trace = container_of(tool, struct trace, tool);
1127 return trace__process_event(trace, machine, event, sample);
1130 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1132 struct machine *machine = vmachine;
1134 if (machine->kptr_restrict_warned)
1135 return NULL;
1137 if (symbol_conf.kptr_restrict) {
1138 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1139 "Check /proc/sys/kernel/kptr_restrict.\n\n"
1140 "Kernel samples will not be resolved.\n");
1141 machine->kptr_restrict_warned = true;
1142 return NULL;
1145 return machine__resolve_kernel_addr(vmachine, addrp, modp);
1148 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1150 int err = symbol__init(NULL);
1152 if (err)
1153 return err;
1155 trace->host = machine__new_host();
1156 if (trace->host == NULL)
1157 return -ENOMEM;
1159 err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
1160 if (err < 0)
1161 goto out;
1163 err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1164 evlist->threads, trace__tool_process, false,
1165 trace->opts.proc_map_timeout, 1);
1166 out:
1167 if (err)
1168 symbol__exit();
1170 return err;
1173 static void trace__symbols__exit(struct trace *trace)
1175 machine__exit(trace->host);
1176 trace->host = NULL;
1178 symbol__exit();
1181 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1183 int idx;
1185 if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1186 nr_args = sc->fmt->nr_args;
1188 sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1189 if (sc->arg_fmt == NULL)
1190 return -1;
1192 for (idx = 0; idx < nr_args; ++idx) {
1193 if (sc->fmt)
1194 sc->arg_fmt[idx] = sc->fmt->arg[idx];
1197 sc->nr_args = nr_args;
1198 return 0;
1201 static int syscall__set_arg_fmts(struct syscall *sc)
1203 struct format_field *field;
1204 int idx = 0, len;
1206 for (field = sc->args; field; field = field->next, ++idx) {
1207 if (sc->fmt && sc->fmt->arg[idx].scnprintf)
1208 continue;
1210 if (strcmp(field->type, "const char *") == 0 &&
1211 (strcmp(field->name, "filename") == 0 ||
1212 strcmp(field->name, "path") == 0 ||
1213 strcmp(field->name, "pathname") == 0))
1214 sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
1215 else if (field->flags & FIELD_IS_POINTER)
1216 sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
1217 else if (strcmp(field->type, "pid_t") == 0)
1218 sc->arg_fmt[idx].scnprintf = SCA_PID;
1219 else if (strcmp(field->type, "umode_t") == 0)
1220 sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
1221 else if ((strcmp(field->type, "int") == 0 ||
1222 strcmp(field->type, "unsigned int") == 0 ||
1223 strcmp(field->type, "long") == 0) &&
1224 (len = strlen(field->name)) >= 2 &&
1225 strcmp(field->name + len - 2, "fd") == 0) {
1227 * /sys/kernel/tracing/events/syscalls/sys_enter*
1228 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1229 * 65 int
1230 * 23 unsigned int
1231 * 7 unsigned long
1233 sc->arg_fmt[idx].scnprintf = SCA_FD;
1237 return 0;
1240 static int trace__read_syscall_info(struct trace *trace, int id)
1242 char tp_name[128];
1243 struct syscall *sc;
1244 const char *name = syscalltbl__name(trace->sctbl, id);
1246 if (name == NULL)
1247 return -1;
1249 if (id > trace->syscalls.max) {
1250 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1252 if (nsyscalls == NULL)
1253 return -1;
1255 if (trace->syscalls.max != -1) {
1256 memset(nsyscalls + trace->syscalls.max + 1, 0,
1257 (id - trace->syscalls.max) * sizeof(*sc));
1258 } else {
1259 memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1262 trace->syscalls.table = nsyscalls;
1263 trace->syscalls.max = id;
1266 sc = trace->syscalls.table + id;
1267 sc->name = name;
1269 sc->fmt = syscall_fmt__find(sc->name);
1271 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1272 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1274 if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1275 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1276 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1279 if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
1280 return -1;
1282 if (IS_ERR(sc->tp_format))
1283 return -1;
1285 sc->args = sc->tp_format->format.fields;
1287 * We need to check and discard the first variable '__syscall_nr'
1288 * or 'nr' that mean the syscall number. It is needless here.
1289 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
1291 if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1292 sc->args = sc->args->next;
1293 --sc->nr_args;
1296 sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1298 return syscall__set_arg_fmts(sc);
1301 static int trace__validate_ev_qualifier(struct trace *trace)
1303 int err = 0, i;
1304 size_t nr_allocated;
1305 struct str_node *pos;
1307 trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1308 trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1309 sizeof(trace->ev_qualifier_ids.entries[0]));
1311 if (trace->ev_qualifier_ids.entries == NULL) {
1312 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1313 trace->output);
1314 err = -EINVAL;
1315 goto out;
1318 nr_allocated = trace->ev_qualifier_ids.nr;
1319 i = 0;
1321 strlist__for_each_entry(pos, trace->ev_qualifier) {
1322 const char *sc = pos->s;
1323 int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;
1325 if (id < 0) {
1326 id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
1327 if (id >= 0)
1328 goto matches;
1330 if (err == 0) {
1331 fputs("Error:\tInvalid syscall ", trace->output);
1332 err = -EINVAL;
1333 } else {
1334 fputs(", ", trace->output);
1337 fputs(sc, trace->output);
1339 matches:
1340 trace->ev_qualifier_ids.entries[i++] = id;
1341 if (match_next == -1)
1342 continue;
1344 while (1) {
1345 id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
1346 if (id < 0)
1347 break;
1348 if (nr_allocated == trace->ev_qualifier_ids.nr) {
1349 void *entries;
1351 nr_allocated += 8;
1352 entries = realloc(trace->ev_qualifier_ids.entries,
1353 nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
1354 if (entries == NULL) {
1355 err = -ENOMEM;
1356 fputs("\nError:\t Not enough memory for parsing\n", trace->output);
1357 goto out_free;
1359 trace->ev_qualifier_ids.entries = entries;
1361 trace->ev_qualifier_ids.nr++;
1362 trace->ev_qualifier_ids.entries[i++] = id;
1366 if (err < 0) {
1367 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1368 "\nHint:\tand: 'man syscalls'\n", trace->output);
1369 out_free:
1370 zfree(&trace->ev_qualifier_ids.entries);
1371 trace->ev_qualifier_ids.nr = 0;
1373 out:
1374 return err;
1378 * args is to be interpreted as a series of longs but we need to handle
1379 * 8-byte unaligned accesses. args points to raw_data within the event
1380 * and raw_data is guaranteed to be 8-byte unaligned because it is
1381 * preceded by raw_size which is a u32. So we need to copy args to a temp
1382 * variable to read it. Most notably this avoids extended load instructions
1383 * on unaligned addresses
1385 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1387 unsigned long val;
1388 unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1390 memcpy(&val, p, sizeof(val));
1391 return val;
1394 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1395 struct syscall_arg *arg)
1397 if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1398 return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1400 return scnprintf(bf, size, "arg%d: ", arg->idx);
1403 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1404 struct syscall_arg *arg, unsigned long val)
1406 if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1407 arg->val = val;
1408 if (sc->arg_fmt[arg->idx].parm)
1409 arg->parm = sc->arg_fmt[arg->idx].parm;
1410 return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1412 return scnprintf(bf, size, "%ld", val);
1415 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1416 unsigned char *args, struct trace *trace,
1417 struct thread *thread)
1419 size_t printed = 0;
1420 unsigned long val;
1421 u8 bit = 1;
1422 struct syscall_arg arg = {
1423 .args = args,
1424 .idx = 0,
1425 .mask = 0,
1426 .trace = trace,
1427 .thread = thread,
1429 struct thread_trace *ttrace = thread__priv(thread);
1432 * Things like fcntl will set this in its 'cmd' formatter to pick the
1433 * right formatter for the return value (an fd? file flags?), which is
1434 * not needed for syscalls that always return a given type, say an fd.
1436 ttrace->ret_scnprintf = NULL;
1438 if (sc->args != NULL) {
1439 struct format_field *field;
1441 for (field = sc->args; field;
1442 field = field->next, ++arg.idx, bit <<= 1) {
1443 if (arg.mask & bit)
1444 continue;
1446 val = syscall_arg__val(&arg, arg.idx);
1449 * Suppress this argument if its value is zero and
1450 * and we don't have a string associated in an
1451 * strarray for it.
1453 if (val == 0 &&
1454 !(sc->arg_fmt &&
1455 (sc->arg_fmt[arg.idx].show_zero ||
1456 sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
1457 sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
1458 sc->arg_fmt[arg.idx].parm))
1459 continue;
1461 printed += scnprintf(bf + printed, size - printed,
1462 "%s%s: ", printed ? ", " : "", field->name);
1463 printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1465 } else if (IS_ERR(sc->tp_format)) {
1467 * If we managed to read the tracepoint /format file, then we
1468 * may end up not having any args, like with gettid(), so only
1469 * print the raw args when we didn't manage to read it.
1471 while (arg.idx < sc->nr_args) {
1472 if (arg.mask & bit)
1473 goto next_arg;
1474 val = syscall_arg__val(&arg, arg.idx);
1475 if (printed)
1476 printed += scnprintf(bf + printed, size - printed, ", ");
1477 printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
1478 printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1479 next_arg:
1480 ++arg.idx;
1481 bit <<= 1;
1485 return printed;
/*
 * Signature of the per-evsel sample handlers (stored in evsel->handler and
 * invoked with the trace state, the evsel, the raw event and the parsed
 * sample).
 */
typedef int (*tracepoint_handler)(struct trace *trace,
				  struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1492 static struct syscall *trace__syscall_info(struct trace *trace,
1493 struct perf_evsel *evsel, int id)
1496 if (id < 0) {
1499 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1500 * before that, leaving at a higher verbosity level till that is
1501 * explained. Reproduced with plain ftrace with:
1503 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1504 * grep "NR -1 " /t/trace_pipe
1506 * After generating some load on the machine.
1508 if (verbose > 1) {
1509 static u64 n;
1510 fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1511 id, perf_evsel__name(evsel), ++n);
1513 return NULL;
1516 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1517 trace__read_syscall_info(trace, id))
1518 goto out_cant_read;
1520 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1521 goto out_cant_read;
1523 return &trace->syscalls.table[id];
1525 out_cant_read:
1526 if (verbose > 0) {
1527 fprintf(trace->output, "Problems reading syscall %d", id);
1528 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1529 fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1530 fputs(" information\n", trace->output);
1532 return NULL;
1535 static void thread__update_stats(struct thread_trace *ttrace,
1536 int id, struct perf_sample *sample)
1538 struct int_node *inode;
1539 struct stats *stats;
1540 u64 duration = 0;
1542 inode = intlist__findnew(ttrace->syscall_stats, id);
1543 if (inode == NULL)
1544 return;
1546 stats = inode->priv;
1547 if (stats == NULL) {
1548 stats = malloc(sizeof(struct stats));
1549 if (stats == NULL)
1550 return;
1551 init_stats(stats);
1552 inode->priv = stats;
1555 if (ttrace->entry_time && sample->time > ttrace->entry_time)
1556 duration = sample->time - ttrace->entry_time;
1558 update_stats(stats, duration);
1561 static int trace__printf_interrupted_entry(struct trace *trace)
1563 struct thread_trace *ttrace;
1564 size_t printed;
1566 if (trace->current == NULL)
1567 return 0;
1569 ttrace = thread__priv(trace->current);
1571 if (!ttrace->entry_pending)
1572 return 0;
1574 printed = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1575 printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1576 ttrace->entry_pending = false;
1578 return printed;
1581 static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1582 struct perf_sample *sample, struct thread *thread)
1584 int printed = 0;
1586 if (trace->print_sample) {
1587 double ts = (double)sample->time / NSEC_PER_MSEC;
1589 printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1590 perf_evsel__name(evsel), ts,
1591 thread__comm_str(thread),
1592 sample->pid, sample->tid, sample->cpu);
1595 return printed;
1598 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1599 union perf_event *event __maybe_unused,
1600 struct perf_sample *sample)
1602 char *msg;
1603 void *args;
1604 size_t printed = 0;
1605 struct thread *thread;
1606 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1607 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1608 struct thread_trace *ttrace;
1610 if (sc == NULL)
1611 return -1;
1613 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1614 ttrace = thread__trace(thread, trace->output);
1615 if (ttrace == NULL)
1616 goto out_put;
1618 trace__fprintf_sample(trace, evsel, sample, thread);
1620 args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1622 if (ttrace->entry_str == NULL) {
1623 ttrace->entry_str = malloc(trace__entry_str_size);
1624 if (!ttrace->entry_str)
1625 goto out_put;
1628 if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1629 trace__printf_interrupted_entry(trace);
1631 ttrace->entry_time = sample->time;
1632 msg = ttrace->entry_str;
1633 printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1635 printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1636 args, trace, thread);
1638 if (sc->is_exit) {
1639 if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
1640 trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1641 fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1643 } else {
1644 ttrace->entry_pending = true;
1645 /* See trace__vfs_getname & trace__sys_exit */
1646 ttrace->filename.pending_open = false;
1649 if (trace->current != thread) {
1650 thread__put(trace->current);
1651 trace->current = thread__get(thread);
1653 err = 0;
1654 out_put:
1655 thread__put(thread);
1656 return err;
1659 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1660 struct perf_sample *sample,
1661 struct callchain_cursor *cursor)
1663 struct addr_location al;
1664 int max_stack = evsel->attr.sample_max_stack ?
1665 evsel->attr.sample_max_stack :
1666 trace->max_stack;
1668 if (machine__resolve(trace->host, &al, sample) < 0 ||
1669 thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
1670 return -1;
1672 return 0;
1675 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1677 /* TODO: user-configurable print_opts */
1678 const unsigned int print_opts = EVSEL__PRINT_SYM |
1679 EVSEL__PRINT_DSO |
1680 EVSEL__PRINT_UNKNOWN_AS_ADDR;
1682 return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
/*
 * Map errno value @err to its name, using the architecture recorded in the
 * perf_env associated with @evsel.
 */
static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
	const char *arch_name = perf_env__arch(perf_evsel__env(evsel));

	return arch_syscalls__strerrno(arch_name, err);
}
1693 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1694 union perf_event *event __maybe_unused,
1695 struct perf_sample *sample)
1697 long ret;
1698 u64 duration = 0;
1699 bool duration_calculated = false;
1700 struct thread *thread;
1701 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1702 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1703 struct thread_trace *ttrace;
1705 if (sc == NULL)
1706 return -1;
1708 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1709 ttrace = thread__trace(thread, trace->output);
1710 if (ttrace == NULL)
1711 goto out_put;
1713 trace__fprintf_sample(trace, evsel, sample, thread);
1715 if (trace->summary)
1716 thread__update_stats(ttrace, id, sample);
1718 ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1720 if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1721 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1722 ttrace->filename.pending_open = false;
1723 ++trace->stats.vfs_getname;
1726 if (ttrace->entry_time) {
1727 duration = sample->time - ttrace->entry_time;
1728 if (trace__filter_duration(trace, duration))
1729 goto out;
1730 duration_calculated = true;
1731 } else if (trace->duration_filter)
1732 goto out;
1734 if (sample->callchain) {
1735 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1736 if (callchain_ret == 0) {
1737 if (callchain_cursor.nr < trace->min_stack)
1738 goto out;
1739 callchain_ret = 1;
1743 if (trace->summary_only)
1744 goto out;
1746 trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
1748 if (ttrace->entry_pending) {
1749 fprintf(trace->output, "%-70s", ttrace->entry_str);
1750 } else {
1751 fprintf(trace->output, " ... [");
1752 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1753 fprintf(trace->output, "]: %s()", sc->name);
1756 if (sc->fmt == NULL) {
1757 if (ret < 0)
1758 goto errno_print;
1759 signed_print:
1760 fprintf(trace->output, ") = %ld", ret);
1761 } else if (ret < 0) {
1762 errno_print: {
1763 char bf[STRERR_BUFSIZE];
1764 const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1765 *e = errno_to_name(evsel, -ret);
1767 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1769 } else if (ret == 0 && sc->fmt->timeout)
1770 fprintf(trace->output, ") = 0 Timeout");
1771 else if (ttrace->ret_scnprintf) {
1772 char bf[1024];
1773 struct syscall_arg arg = {
1774 .val = ret,
1775 .thread = thread,
1776 .trace = trace,
1778 ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
1779 ttrace->ret_scnprintf = NULL;
1780 fprintf(trace->output, ") = %s", bf);
1781 } else if (sc->fmt->hexret)
1782 fprintf(trace->output, ") = %#lx", ret);
1783 else if (sc->fmt->errpid) {
1784 struct thread *child = machine__find_thread(trace->host, ret, ret);
1786 if (child != NULL) {
1787 fprintf(trace->output, ") = %ld", ret);
1788 if (child->comm_set)
1789 fprintf(trace->output, " (%s)", thread__comm_str(child));
1790 thread__put(child);
1792 } else
1793 goto signed_print;
1795 fputc('\n', trace->output);
1797 if (callchain_ret > 0)
1798 trace__fprintf_callchain(trace, sample);
1799 else if (callchain_ret < 0)
1800 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1801 out:
1802 ttrace->entry_pending = false;
1803 err = 0;
1804 out_put:
1805 thread__put(thread);
1806 return err;
1809 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1810 union perf_event *event __maybe_unused,
1811 struct perf_sample *sample)
1813 struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1814 struct thread_trace *ttrace;
1815 size_t filename_len, entry_str_len, to_move;
1816 ssize_t remaining_space;
1817 char *pos;
1818 const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1820 if (!thread)
1821 goto out;
1823 ttrace = thread__priv(thread);
1824 if (!ttrace)
1825 goto out_put;
1827 filename_len = strlen(filename);
1828 if (filename_len == 0)
1829 goto out_put;
1831 if (ttrace->filename.namelen < filename_len) {
1832 char *f = realloc(ttrace->filename.name, filename_len + 1);
1834 if (f == NULL)
1835 goto out_put;
1837 ttrace->filename.namelen = filename_len;
1838 ttrace->filename.name = f;
1841 strcpy(ttrace->filename.name, filename);
1842 ttrace->filename.pending_open = true;
1844 if (!ttrace->filename.ptr)
1845 goto out_put;
1847 entry_str_len = strlen(ttrace->entry_str);
1848 remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1849 if (remaining_space <= 0)
1850 goto out_put;
1852 if (filename_len > (size_t)remaining_space) {
1853 filename += filename_len - remaining_space;
1854 filename_len = remaining_space;
1857 to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1858 pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1859 memmove(pos + filename_len, pos, to_move);
1860 memcpy(pos, filename, filename_len);
1862 ttrace->filename.ptr = 0;
1863 ttrace->filename.entry_str_pos = 0;
1864 out_put:
1865 thread__put(thread);
1866 out:
1867 return 0;
1870 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1871 union perf_event *event __maybe_unused,
1872 struct perf_sample *sample)
1874 u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1875 double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1876 struct thread *thread = machine__findnew_thread(trace->host,
1877 sample->pid,
1878 sample->tid);
1879 struct thread_trace *ttrace = thread__trace(thread, trace->output);
1881 if (ttrace == NULL)
1882 goto out_dump;
1884 ttrace->runtime_ms += runtime_ms;
1885 trace->runtime_ms += runtime_ms;
1886 out_put:
1887 thread__put(thread);
1888 return 0;
1890 out_dump:
1891 fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1892 evsel->name,
1893 perf_evsel__strval(evsel, sample, "comm"),
1894 (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1895 runtime,
1896 perf_evsel__intval(evsel, sample, "vruntime"));
1897 goto out_put;
1900 static int bpf_output__printer(enum binary_printer_ops op,
1901 unsigned int val, void *extra __maybe_unused, FILE *fp)
1903 unsigned char ch = (unsigned char)val;
1905 switch (op) {
1906 case BINARY_PRINT_CHAR_DATA:
1907 return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1908 case BINARY_PRINT_DATA_BEGIN:
1909 case BINARY_PRINT_LINE_BEGIN:
1910 case BINARY_PRINT_ADDR:
1911 case BINARY_PRINT_NUM_DATA:
1912 case BINARY_PRINT_NUM_PAD:
1913 case BINARY_PRINT_SEP:
1914 case BINARY_PRINT_CHAR_PAD:
1915 case BINARY_PRINT_LINE_END:
1916 case BINARY_PRINT_DATA_END:
1917 default:
1918 break;
1921 return 0;
1924 static void bpf_output__fprintf(struct trace *trace,
1925 struct perf_sample *sample)
1927 binary__fprintf(sample->raw_data, sample->raw_size, 8,
1928 bpf_output__printer, NULL, trace->output);
1931 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1932 union perf_event *event __maybe_unused,
1933 struct perf_sample *sample)
1935 int callchain_ret = 0;
1937 if (sample->callchain) {
1938 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1939 if (callchain_ret == 0) {
1940 if (callchain_cursor.nr < trace->min_stack)
1941 goto out;
1942 callchain_ret = 1;
1946 trace__printf_interrupted_entry(trace);
1947 trace__fprintf_tstamp(trace, sample->time, trace->output);
1949 if (trace->trace_syscalls)
1950 fprintf(trace->output, "( ): ");
1952 fprintf(trace->output, "%s:", evsel->name);
1954 if (perf_evsel__is_bpf_output(evsel)) {
1955 bpf_output__fprintf(trace, sample);
1956 } else if (evsel->tp_format) {
1957 event_format__fprintf(evsel->tp_format, sample->cpu,
1958 sample->raw_data, sample->raw_size,
1959 trace->output);
1962 fprintf(trace->output, ")\n");
1964 if (callchain_ret > 0)
1965 trace__fprintf_callchain(trace, sample);
1966 else if (callchain_ret < 0)
1967 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1968 out:
1969 return 0;
1972 static void print_location(FILE *f, struct perf_sample *sample,
1973 struct addr_location *al,
1974 bool print_dso, bool print_sym)
1977 if ((verbose > 0 || print_dso) && al->map)
1978 fprintf(f, "%s@", al->map->dso->long_name);
1980 if ((verbose > 0 || print_sym) && al->sym)
1981 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1982 al->addr - al->sym->start);
1983 else if (al->map)
1984 fprintf(f, "0x%" PRIx64, al->addr);
1985 else
1986 fprintf(f, "0x%" PRIx64, sample->addr);
1989 static int trace__pgfault(struct trace *trace,
1990 struct perf_evsel *evsel,
1991 union perf_event *event __maybe_unused,
1992 struct perf_sample *sample)
1994 struct thread *thread;
1995 struct addr_location al;
1996 char map_type = 'd';
1997 struct thread_trace *ttrace;
1998 int err = -1;
1999 int callchain_ret = 0;
2001 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2003 if (sample->callchain) {
2004 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2005 if (callchain_ret == 0) {
2006 if (callchain_cursor.nr < trace->min_stack)
2007 goto out_put;
2008 callchain_ret = 1;
2012 ttrace = thread__trace(thread, trace->output);
2013 if (ttrace == NULL)
2014 goto out_put;
2016 if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2017 ttrace->pfmaj++;
2018 else
2019 ttrace->pfmin++;
2021 if (trace->summary_only)
2022 goto out;
2024 thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
2025 sample->ip, &al);
2027 trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2029 fprintf(trace->output, "%sfault [",
2030 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2031 "maj" : "min");
2033 print_location(trace->output, sample, &al, false, true);
2035 fprintf(trace->output, "] => ");
2037 thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
2038 sample->addr, &al);
2040 if (!al.map) {
2041 thread__find_addr_location(thread, sample->cpumode,
2042 MAP__FUNCTION, sample->addr, &al);
2044 if (al.map)
2045 map_type = 'x';
2046 else
2047 map_type = '?';
2050 print_location(trace->output, sample, &al, true, false);
2052 fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2054 if (callchain_ret > 0)
2055 trace__fprintf_callchain(trace, sample);
2056 else if (callchain_ret < 0)
2057 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2058 out:
2059 err = 0;
2060 out_put:
2061 thread__put(thread);
2062 return err;
2065 static void trace__set_base_time(struct trace *trace,
2066 struct perf_evsel *evsel,
2067 struct perf_sample *sample)
2070 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2071 * and don't use sample->time unconditionally, we may end up having
2072 * some other event in the future without PERF_SAMPLE_TIME for good
2073 * reason, i.e. we may not be interested in its timestamps, just in
2074 * it taking place, picking some piece of information when it
2075 * appears in our event stream (vfs_getname comes to mind).
2077 if (trace->base_time == 0 && !trace->full_time &&
2078 (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2079 trace->base_time = sample->time;
2082 static int trace__process_sample(struct perf_tool *tool,
2083 union perf_event *event,
2084 struct perf_sample *sample,
2085 struct perf_evsel *evsel,
2086 struct machine *machine __maybe_unused)
2088 struct trace *trace = container_of(tool, struct trace, tool);
2089 struct thread *thread;
2090 int err = 0;
2092 tracepoint_handler handler = evsel->handler;
2094 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2095 if (thread && thread__is_filtered(thread))
2096 goto out;
2098 trace__set_base_time(trace, evsel, sample);
2100 if (handler) {
2101 ++trace->nr_events;
2102 handler(trace, evsel, event, sample);
2104 out:
2105 thread__put(thread);
2106 return err;
2109 static int trace__record(struct trace *trace, int argc, const char **argv)
2111 unsigned int rec_argc, i, j;
2112 const char **rec_argv;
2113 const char * const record_args[] = {
2114 "record",
2115 "-R",
2116 "-m", "1024",
2117 "-c", "1",
2120 const char * const sc_args[] = { "-e", };
2121 unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2122 const char * const majpf_args[] = { "-e", "major-faults" };
2123 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2124 const char * const minpf_args[] = { "-e", "minor-faults" };
2125 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2127 /* +1 is for the event string below */
2128 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2129 majpf_args_nr + minpf_args_nr + argc;
2130 rec_argv = calloc(rec_argc + 1, sizeof(char *));
2132 if (rec_argv == NULL)
2133 return -ENOMEM;
2135 j = 0;
2136 for (i = 0; i < ARRAY_SIZE(record_args); i++)
2137 rec_argv[j++] = record_args[i];
2139 if (trace->trace_syscalls) {
2140 for (i = 0; i < sc_args_nr; i++)
2141 rec_argv[j++] = sc_args[i];
2143 /* event string may be different for older kernels - e.g., RHEL6 */
2144 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2145 rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2146 else if (is_valid_tracepoint("syscalls:sys_enter"))
2147 rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2148 else {
2149 pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2150 free(rec_argv);
2151 return -1;
2155 if (trace->trace_pgfaults & TRACE_PFMAJ)
2156 for (i = 0; i < majpf_args_nr; i++)
2157 rec_argv[j++] = majpf_args[i];
2159 if (trace->trace_pgfaults & TRACE_PFMIN)
2160 for (i = 0; i < minpf_args_nr; i++)
2161 rec_argv[j++] = minpf_args[i];
2163 for (i = 0; i < (unsigned int)argc; i++)
2164 rec_argv[j++] = argv[i];
2166 return cmd_record(j, rec_argv);
2169 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2171 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2173 struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2175 if (IS_ERR(evsel))
2176 return false;
2178 if (perf_evsel__field(evsel, "pathname") == NULL) {
2179 perf_evsel__delete(evsel);
2180 return false;
2183 evsel->handler = trace__vfs_getname;
2184 perf_evlist__add(evlist, evsel);
2185 return true;
2188 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2190 struct perf_evsel *evsel;
2191 struct perf_event_attr attr = {
2192 .type = PERF_TYPE_SOFTWARE,
2193 .mmap_data = 1,
2196 attr.config = config;
2197 attr.sample_period = 1;
2199 event_attr_init(&attr);
2201 evsel = perf_evsel__new(&attr);
2202 if (evsel)
2203 evsel->handler = trace__pgfault;
2205 return evsel;
2208 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2210 const u32 type = event->header.type;
2211 struct perf_evsel *evsel;
2213 if (type != PERF_RECORD_SAMPLE) {
2214 trace__process_event(trace, trace->host, event, sample);
2215 return;
2218 evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2219 if (evsel == NULL) {
2220 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2221 return;
2224 trace__set_base_time(trace, evsel, sample);
2226 if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2227 sample->raw_data == NULL) {
2228 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2229 perf_evsel__name(evsel), sample->tid,
2230 sample->cpu, sample->raw_size);
2231 } else {
2232 tracepoint_handler handler = evsel->handler;
2233 handler(trace, evsel, event, sample);
2237 static int trace__add_syscall_newtp(struct trace *trace)
2239 int ret = -1;
2240 struct perf_evlist *evlist = trace->evlist;
2241 struct perf_evsel *sys_enter, *sys_exit;
2243 sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2244 if (sys_enter == NULL)
2245 goto out;
2247 if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2248 goto out_delete_sys_enter;
2250 sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2251 if (sys_exit == NULL)
2252 goto out_delete_sys_enter;
2254 if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2255 goto out_delete_sys_exit;
2257 perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
2258 perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
2260 perf_evlist__add(evlist, sys_enter);
2261 perf_evlist__add(evlist, sys_exit);
2263 if (callchain_param.enabled && !trace->kernel_syscallchains) {
2265 * We're interested only in the user space callchain
2266 * leading to the syscall, allow overriding that for
2267 * debugging reasons using --kernel_syscall_callchains
2269 sys_exit->attr.exclude_callchain_kernel = 1;
2272 trace->syscalls.events.sys_enter = sys_enter;
2273 trace->syscalls.events.sys_exit = sys_exit;
2275 ret = 0;
2276 out:
2277 return ret;
2279 out_delete_sys_exit:
2280 perf_evsel__delete_priv(sys_exit);
2281 out_delete_sys_enter:
2282 perf_evsel__delete_priv(sys_enter);
2283 goto out;
2286 static int trace__set_ev_qualifier_filter(struct trace *trace)
2288 int err = -1;
2289 struct perf_evsel *sys_exit;
2290 char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2291 trace->ev_qualifier_ids.nr,
2292 trace->ev_qualifier_ids.entries);
2294 if (filter == NULL)
2295 goto out_enomem;
2297 if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2298 filter)) {
2299 sys_exit = trace->syscalls.events.sys_exit;
2300 err = perf_evsel__append_tp_filter(sys_exit, filter);
2303 free(filter);
2304 out:
2305 return err;
2306 out_enomem:
2307 errno = ENOMEM;
2308 goto out;
2311 static int trace__set_filter_loop_pids(struct trace *trace)
2313 unsigned int nr = 1;
2314 pid_t pids[32] = {
2315 getpid(),
2317 struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2319 while (thread && nr < ARRAY_SIZE(pids)) {
2320 struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2322 if (parent == NULL)
2323 break;
2325 if (!strcmp(thread__comm_str(parent), "sshd")) {
2326 pids[nr++] = parent->tid;
2327 break;
2329 thread = parent;
2332 return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
/*
 * Live tracing entry point: populate trace->evlist with the events selected
 * in @trace, optionally fork the workload given in @argv, then read and
 * handle events from the ring buffers until interrupted or done.
 *
 * A workload is forked when @argc > 0.
 *
 * The evlist is deleted and trace->evlist set to NULL before returning, on
 * success and on failure alike. Returns the last value stored in 'err'.
 */
2335 static int trace__run(struct trace *trace, int argc, const char **argv)
2337 struct perf_evlist *evlist = trace->evlist;
2338 struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2339 int err = -1, i;
2340 unsigned long before;
2341 const bool forks = argc > 0;
2342 bool draining = false;
2344 trace->live = true;
/* Add the events requested: syscalls, vfs_getname, page faults, sched. */
2346 if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2347 goto out_error_raw_syscalls;
2349 if (trace->trace_syscalls)
2350 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2352 if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2353 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2354 if (pgfault_maj == NULL)
2355 goto out_error_mem;
2356 perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2357 perf_evlist__add(evlist, pgfault_maj);
2360 if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2361 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2362 if (pgfault_min == NULL)
2363 goto out_error_mem;
2364 perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2365 perf_evlist__add(evlist, pgfault_min);
2368 if (trace->sched &&
2369 perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2370 trace__sched_stat_runtime))
2371 goto out_error_sched_stat_runtime;
/* Resolve the target into maps and initialize symbol handling. */
2373 err = perf_evlist__create_maps(evlist, &trace->opts.target);
2374 if (err < 0) {
2375 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2376 goto out_delete_evlist;
2379 err = trace__symbols_init(trace, evlist);
2380 if (err < 0) {
2381 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2382 goto out_delete_evlist;
2385 perf_evlist__config(evlist, &trace->opts, &callchain_param);
2387 signal(SIGCHLD, sig_handler);
2388 signal(SIGINT, sig_handler);
/* The workload is only prepared here; it is started after the mmap below. */
2390 if (forks) {
2391 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2392 argv, false, NULL);
2393 if (err < 0) {
2394 fprintf(trace->output, "Couldn't run the workload!\n");
2395 goto out_delete_evlist;
2399 err = perf_evlist__open(evlist);
2400 if (err < 0)
2401 goto out_error_open;
2403 err = bpf__apply_obj_config();
2404 if (err) {
2405 char errbuf[BUFSIZ];
2407 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2408 pr_err("ERROR: Apply config to BPF failed: %s\n",
2409 errbuf);
2410 goto out_error_open;
2414 * Better not use !target__has_task() here because we need to cover the
2415 * case where no threads were specified in the command line, but a
2416 * workload was, and in that case we will fill in the thread_map when
2417 * we fork the workload in perf_evlist__prepare_workload.
/* User supplied --filter-pids win over the automatic self/sshd filter. */
2419 if (trace->filter_pids.nr > 0)
2420 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2421 else if (thread_map__pid(evlist->threads, 0) == -1)
2422 err = trace__set_filter_loop_pids(trace);
2424 if (err < 0)
2425 goto out_error_mem;
2427 if (trace->ev_qualifier_ids.nr > 0) {
2428 err = trace__set_ev_qualifier_filter(trace);
2429 if (err < 0)
2430 goto out_errno;
2432 pr_debug("event qualifier tracepoint filter: %s\n",
2433 trace->syscalls.events.sys_exit->filter);
/* On failure 'evsel' is used by the out_error_apply_filters message. */
2436 err = perf_evlist__apply_filters(evlist, &evsel);
2437 if (err < 0)
2438 goto out_error_apply_filters;
2440 err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
2441 if (err < 0)
2442 goto out_error_mmap;
/* With an initial delay the events are enabled only after the usleep(). */
2444 if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2445 perf_evlist__enable(evlist);
2447 if (forks)
2448 perf_evlist__start_workload(evlist);
2450 if (trace->opts.initial_delay) {
2451 usleep(trace->opts.initial_delay * 1000);
2452 perf_evlist__enable(evlist);
2455 trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2456 evlist->threads->nr > 1 ||
2457 perf_evlist__first(evlist)->attr.inherit;
2460 * Now that we already used evsel->attr to ask the kernel to setup the
2461 * events, lets reuse evsel->attr.sample_max_stack as the limit in
2462 * trace__resolve_callchain(), allowing per-event max-stack settings
2463 * to override an explicitely set --max-stack global setting.
2465 evlist__for_each_entry(evlist, evsel) {
2466 if ((evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
2467 evsel->attr.sample_max_stack == 0)
2468 evsel->attr.sample_max_stack = trace->max_stack;
/*
 * Main loop: one pass over every mmap, consuming all pending events.
 * 'before' lets the code after the pass tell whether anything was read.
 */
2470 again:
2471 before = trace->nr_events;
2473 for (i = 0; i < evlist->nr_mmaps; i++) {
2474 union perf_event *event;
2476 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2477 struct perf_sample sample;
2479 ++trace->nr_events;
2481 err = perf_evlist__parse_sample(evlist, event, &sample);
2482 if (err) {
2483 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2484 goto next_event;
2487 trace__handle_event(trace, event, &sample);
2488 next_event:
2489 perf_evlist__mmap_consume(evlist, i);
2491 if (interrupted)
2492 goto out_disable;
/* Once 'done' is set, disable the events once and keep reading what is left. */
2494 if (done && !draining) {
2495 perf_evlist__disable(evlist);
2496 draining = true;
/*
 * Nothing read in this pass: unless draining, poll the evlist, blocking
 * with no timeout (-1) until 'done' is set and for 100 (presumably ms)
 * afterwards.
 */
2501 if (trace->nr_events == before) {
2502 int timeout = done ? 100 : -1;
2504 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2505 if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2506 draining = true;
2508 goto again;
2510 } else {
2511 goto again;
2514 out_disable:
2515 thread__zput(trace->current);
2517 perf_evlist__disable(evlist);
/* The summary and tool stats are only printed when 'err' is zero. */
2519 if (!err) {
2520 if (trace->summary)
2521 trace__fprintf_thread_summary(trace, trace->output);
2523 if (trace->show_tool_stats) {
2524 fprintf(trace->output, "Stats:\n "
2525 " vfs_getname : %" PRIu64 "\n"
2526 " proc_getname: %" PRIu64 "\n",
2527 trace->stats.vfs_getname,
2528 trace->stats.proc_getname);
/* Common exit: every path, success or error, ends up here. */
2532 out_delete_evlist:
2533 trace__symbols__exit(trace);
2535 perf_evlist__delete(evlist);
2536 trace->evlist = NULL;
2537 trace->live = false;
2538 return err;
/* Error labels: format a message into errbuf, print it, then clean up. */
2540 char errbuf[BUFSIZ];
2542 out_error_sched_stat_runtime:
2543 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2544 goto out_error;
2546 out_error_raw_syscalls:
2547 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2548 goto out_error;
2550 out_error_mmap:
2551 perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2552 goto out_error;
2554 out_error_open:
2555 perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2557 out_error:
2558 fprintf(trace->output, "%s\n", errbuf);
2559 goto out_delete_evlist;
2561 out_error_apply_filters:
2562 fprintf(trace->output,
2563 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2564 evsel->filter, perf_evsel__name(evsel), errno,
2565 str_error_r(errno, errbuf, sizeof(errbuf)));
2566 goto out_delete_evlist;
2568 out_error_mem:
2569 fprintf(trace->output, "Not enough memory to run!\n");
2570 goto out_delete_evlist;
2572 out_errno:
2573 fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2574 goto out_delete_evlist;
2577 static int trace__replay(struct trace *trace)
2579 const struct perf_evsel_str_handler handlers[] = {
2580 { "probe:vfs_getname", trace__vfs_getname, },
2582 struct perf_data data = {
2583 .file = {
2584 .path = input_name,
2586 .mode = PERF_DATA_MODE_READ,
2587 .force = trace->force,
2589 struct perf_session *session;
2590 struct perf_evsel *evsel;
2591 int err = -1;
2593 trace->tool.sample = trace__process_sample;
2594 trace->tool.mmap = perf_event__process_mmap;
2595 trace->tool.mmap2 = perf_event__process_mmap2;
2596 trace->tool.comm = perf_event__process_comm;
2597 trace->tool.exit = perf_event__process_exit;
2598 trace->tool.fork = perf_event__process_fork;
2599 trace->tool.attr = perf_event__process_attr;
2600 trace->tool.tracing_data = perf_event__process_tracing_data;
2601 trace->tool.build_id = perf_event__process_build_id;
2602 trace->tool.namespaces = perf_event__process_namespaces;
2604 trace->tool.ordered_events = true;
2605 trace->tool.ordering_requires_timestamps = true;
2607 /* add tid to output */
2608 trace->multiple_threads = true;
2610 session = perf_session__new(&data, false, &trace->tool);
2611 if (session == NULL)
2612 return -1;
2614 if (trace->opts.target.pid)
2615 symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2617 if (trace->opts.target.tid)
2618 symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2620 if (symbol__init(&session->header.env) < 0)
2621 goto out;
2623 trace->host = &session->machines.host;
2625 err = perf_session__set_tracepoints_handlers(session, handlers);
2626 if (err)
2627 goto out;
2629 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2630 "raw_syscalls:sys_enter");
2631 /* older kernels have syscalls tp versus raw_syscalls */
2632 if (evsel == NULL)
2633 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2634 "syscalls:sys_enter");
2636 if (evsel &&
2637 (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2638 perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2639 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2640 goto out;
2643 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2644 "raw_syscalls:sys_exit");
2645 if (evsel == NULL)
2646 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2647 "syscalls:sys_exit");
2648 if (evsel &&
2649 (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2650 perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2651 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2652 goto out;
2655 evlist__for_each_entry(session->evlist, evsel) {
2656 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2657 (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2658 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2659 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2660 evsel->handler = trace__pgfault;
2663 setup_pager();
2665 err = perf_session__process_events(session);
2666 if (err)
2667 pr_err("Failed to process events, error %d", err);
2669 else if (trace->summary)
2670 trace__fprintf_thread_summary(trace, trace->output);
2672 out:
2673 perf_session__delete(session);
2675 return err;
/*
 * Print the banner that precedes the per-thread summary to @fp.
 *
 * Returns what fprintf() returned, converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2687 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2688 struct stats *stats;
2689 double msecs;
2690 int syscall;
2693 struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2694 struct stats *stats = source->priv;
2696 entry->syscall = source->i;
2697 entry->stats = stats;
2698 entry->msecs = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
2701 static size_t thread__dump_stats(struct thread_trace *ttrace,
2702 struct trace *trace, FILE *fp)
2704 size_t printed = 0;
2705 struct syscall *sc;
2706 struct rb_node *nd;
2707 DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2709 if (syscall_stats == NULL)
2710 return 0;
2712 printed += fprintf(fp, "\n");
2714 printed += fprintf(fp, " syscall calls total min avg max stddev\n");
2715 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
2716 printed += fprintf(fp, " --------------- -------- --------- --------- --------- --------- ------\n");
2718 resort_rb__for_each_entry(nd, syscall_stats) {
2719 struct stats *stats = syscall_stats_entry->stats;
2720 if (stats) {
2721 double min = (double)(stats->min) / NSEC_PER_MSEC;
2722 double max = (double)(stats->max) / NSEC_PER_MSEC;
2723 double avg = avg_stats(stats);
2724 double pct;
2725 u64 n = (u64) stats->n;
2727 pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2728 avg /= NSEC_PER_MSEC;
2730 sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2731 printed += fprintf(fp, " %-15s", sc->name);
2732 printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2733 n, syscall_stats_entry->msecs, min, avg);
2734 printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2738 resort_rb__delete(syscall_stats);
2739 printed += fprintf(fp, "\n\n");
2741 return printed;
2744 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2746 size_t printed = 0;
2747 struct thread_trace *ttrace = thread__priv(thread);
2748 double ratio;
2750 if (ttrace == NULL)
2751 return 0;
2753 ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2755 printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2756 printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2757 printed += fprintf(fp, "%.1f%%", ratio);
2758 if (ttrace->pfmaj)
2759 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2760 if (ttrace->pfmin)
2761 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2762 if (trace->sched)
2763 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2764 else if (fputc('\n', fp) != EOF)
2765 ++printed;
2767 printed += thread__dump_stats(ttrace, trace, fp);
2769 return printed;
2772 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2774 return ttrace ? ttrace->nr_events : 0;
2777 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2778 struct thread *thread;
2781 entry->thread = rb_entry(nd, struct thread, rb_node);
2784 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2786 size_t printed = trace__fprintf_threads_header(fp);
2787 struct rb_node *nd;
2788 int i;
2790 for (i = 0; i < THREADS__TABLE_SIZE; i++) {
2791 DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);
2793 if (threads == NULL) {
2794 fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2795 return 0;
2798 resort_rb__for_each_entry(nd, threads)
2799 printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2801 resort_rb__delete(threads);
2803 return printed;
2806 static int trace__set_duration(const struct option *opt, const char *str,
2807 int unset __maybe_unused)
2809 struct trace *trace = opt->value;
2811 trace->duration_filter = atof(str);
2812 return 0;
2815 static int trace__set_filter_pids(const struct option *opt, const char *str,
2816 int unset __maybe_unused)
2818 int ret = -1;
2819 size_t i;
2820 struct trace *trace = opt->value;
2822 * FIXME: introduce a intarray class, plain parse csv and create a
2823 * { int nr, int entries[] } struct...
2825 struct intlist *list = intlist__new(str);
2827 if (list == NULL)
2828 return -1;
2830 i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2831 trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2833 if (trace->filter_pids.entries == NULL)
2834 goto out;
2836 trace->filter_pids.entries[0] = getpid();
2838 for (i = 1; i < trace->filter_pids.nr; ++i)
2839 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2841 intlist__delete(list);
2842 ret = 0;
2843 out:
2844 return ret;
2847 static int trace__open_output(struct trace *trace, const char *filename)
2849 struct stat st;
2851 if (!stat(filename, &st) && st.st_size) {
2852 char oldname[PATH_MAX];
2854 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2855 unlink(oldname);
2856 rename(filename, oldname);
2859 trace->output = fopen(filename, "w");
2861 return trace->output == NULL ? -errno : 0;
2864 static int parse_pagefaults(const struct option *opt, const char *str,
2865 int unset __maybe_unused)
2867 int *trace_pgfaults = opt->value;
2869 if (strcmp(str, "all") == 0)
2870 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2871 else if (strcmp(str, "maj") == 0)
2872 *trace_pgfaults |= TRACE_PFMAJ;
2873 else if (strcmp(str, "min") == 0)
2874 *trace_pgfaults |= TRACE_PFMIN;
2875 else
2876 return -1;
2878 return 0;
2881 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2883 struct perf_evsel *evsel;
2885 evlist__for_each_entry(evlist, evsel)
2886 evsel->handler = handler;
2890 * XXX: Hackish, just splitting the combined -e+--event (syscalls
2891 * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2892 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2894 * It'd be better to introduce a parse_options() variant that would return a
2895 * list with the terms it didn't match to an event...
2897 static int trace__parse_events_option(const struct option *opt, const char *str,
2898 int unset __maybe_unused)
2900 struct trace *trace = (struct trace *)opt->value;
2901 const char *s = str;
2902 char *sep = NULL, *lists[2] = { NULL, NULL, };
2903 int len = strlen(str) + 1, err = -1, list, idx;
2904 char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2905 char group_name[PATH_MAX];
2907 if (strace_groups_dir == NULL)
2908 return -1;
2910 if (*s == '!') {
2911 ++s;
2912 trace->not_ev_qualifier = true;
2915 while (1) {
2916 if ((sep = strchr(s, ',')) != NULL)
2917 *sep = '\0';
2919 list = 0;
2920 if (syscalltbl__id(trace->sctbl, s) >= 0 ||
2921 syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
2922 list = 1;
2923 } else {
2924 path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2925 if (access(group_name, R_OK) == 0)
2926 list = 1;
2929 if (lists[list]) {
2930 sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2931 } else {
2932 lists[list] = malloc(len);
2933 if (lists[list] == NULL)
2934 goto out;
2935 strcpy(lists[list], s);
2938 if (!sep)
2939 break;
2941 *sep = ',';
2942 s = sep + 1;
2945 if (lists[1] != NULL) {
2946 struct strlist_config slist_config = {
2947 .dirname = strace_groups_dir,
2950 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2951 if (trace->ev_qualifier == NULL) {
2952 fputs("Not enough memory to parse event qualifier", trace->output);
2953 goto out;
2956 if (trace__validate_ev_qualifier(trace))
2957 goto out;
2960 err = 0;
2962 if (lists[0]) {
2963 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2964 "event selector. use 'perf list' to list available events",
2965 parse_events_option);
2966 err = parse_events_option(&o, lists[0], 0);
2968 out:
2969 if (sep)
2970 *sep = ',';
2972 return err;
/*
 * Entry point of 'perf trace'.
 *
 * Sets up a struct trace with its defaults, parses the command line (the
 * only subcommand is "record"), derives a few settings from the parsed
 * options and then dispatches to trace__record(), trace__replay() (when an
 * input file was given) or trace__run() (live tracing).
 *
 * Returns the value of the function it dispatched to, or a negative error
 * when setup fails before that.
 */
2975 int cmd_trace(int argc, const char **argv)
2977 const char *trace_usage[] = {
2978 "perf trace [<options>] [<command>]",
2979 "perf trace [<options>] -- <command> [<options>]",
2980 "perf trace record [<options>] [<command>]",
2981 "perf trace record [<options>] -- <command> [<options>]",
2982 NULL
/*
 * Defaults. The UINT_MAX values of mmap_pages and max_stack are compared
 * against after option parsing to detect that the user did not set them.
 */
2984 struct trace trace = {
2985 .syscalls = {
2986 . max = -1,
2988 .opts = {
2989 .target = {
2990 .uid = UINT_MAX,
2991 .uses_mmap = true,
2993 .user_freq = UINT_MAX,
2994 .user_interval = ULLONG_MAX,
2995 .no_buffering = true,
2996 .mmap_pages = UINT_MAX,
2997 .proc_map_timeout = 500,
2999 .output = stderr,
3000 .show_comm = true,
3001 .trace_syscalls = true,
3002 .kernel_syscallchains = false,
3003 .max_stack = UINT_MAX,
3005 const char *output_name = NULL;
/* Both -e/--event and --expr go through trace__parse_events_option(). */
3006 const struct option trace_options[] = {
3007 OPT_CALLBACK('e', "event", &trace, "event",
3008 "event/syscall selector. use 'perf list' to list available events",
3009 trace__parse_events_option),
3010 OPT_BOOLEAN(0, "comm", &trace.show_comm,
3011 "show the thread COMM next to its id"),
3012 OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3013 OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
3014 trace__parse_events_option),
3015 OPT_STRING('o', "output", &output_name, "file", "output file name"),
3016 OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3017 OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3018 "trace events on existing process id"),
3019 OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3020 "trace events on existing thread id"),
3021 OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3022 "pids to filter (by the kernel)", trace__set_filter_pids),
3023 OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3024 "system-wide collection from all CPUs"),
3025 OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3026 "list of cpus to monitor"),
3027 OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3028 "child tasks do not inherit counters"),
3029 OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3030 "number of mmap data pages",
3031 perf_evlist__parse_mmap_pages),
3032 OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3033 "user to profile"),
3034 OPT_CALLBACK(0, "duration", &trace, "float",
3035 "show only events with duration > N.M ms",
3036 trace__set_duration),
3037 OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3038 OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3039 OPT_BOOLEAN('T', "time", &trace.full_time,
3040 "Show full timestamp, not time relative to first start"),
3041 OPT_BOOLEAN('s', "summary", &trace.summary_only,
3042 "Show only syscall summary with statistics"),
3043 OPT_BOOLEAN('S', "with-summary", &trace.summary,
3044 "Show all syscalls and summary with statistics"),
3045 OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3046 "Trace pagefaults", parse_pagefaults, "maj"),
3047 OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3048 OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3049 OPT_CALLBACK(0, "call-graph", &trace.opts,
3050 "record_mode[,record_size]", record_callchain_help,
3051 &record_parse_callchain_opt),
3052 OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3053 "Show the kernel callchains on the syscall exit path"),
3054 OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3055 "Set the minimum stack depth when parsing the callchain, "
3056 "anything below the specified depth will be ignored."),
3057 OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3058 "Set the maximum stack depth when parsing the callchain, "
3059 "anything beyond the specified depth will be ignored. "
3060 "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3061 OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
3062 "print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
3063 OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3064 "per thread proc mmap processing timeout in ms"),
3065 OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3066 "ms to wait before starting measurement after program "
3067 "start"),
3068 OPT_END()
/* Assumed set by the user until the sentinel checks below say otherwise. */
3070 bool __maybe_unused max_stack_user_set = true;
3071 bool mmap_pages_user_set = true;
3072 const char * const trace_subcommands[] = { "record", NULL };
3073 int err;
3074 char bf[BUFSIZ];
3076 signal(SIGSEGV, sighandler_dump_stack);
3077 signal(SIGFPE, sighandler_dump_stack);
3079 trace.evlist = perf_evlist__new();
3080 trace.sctbl = syscalltbl__new();
3082 if (trace.evlist == NULL || trace.sctbl == NULL) {
3083 pr_err("Not enough memory to run!\n");
3084 err = -ENOMEM;
3085 goto out;
3088 argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3089 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3091 err = bpf__setup_stdout(trace.evlist);
3092 if (err) {
3093 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3094 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3095 goto out;
/* From here on, a bare 'goto out' reports failure. */
3098 err = -1;
3100 if (trace.trace_pgfaults) {
3101 trace.opts.sample_address = true;
3102 trace.opts.sample_time = true;
3105 if (trace.opts.mmap_pages == UINT_MAX)
3106 mmap_pages_user_set = false;
3108 if (trace.max_stack == UINT_MAX) {
3109 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
3110 max_stack_user_set = false;
/* Asking for a min/max stack depth implies callchains; use "dwarf" mode. */
3113 #ifdef HAVE_DWARF_UNWIND_SUPPORT
3114 if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
3115 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3117 #endif
/* With callchains, root gets a bigger default mmap unless -m was given. */
3119 if (callchain_param.enabled) {
3120 if (!mmap_pages_user_set && geteuid() == 0)
3121 trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3123 symbol_conf.use_callchain = true;
/* Events added while parsing the options all go to trace__event_handler. */
3126 if (trace.evlist->nr_entries > 0)
3127 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3129 if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3130 return trace__record(&trace, argc-1, &argv[1]);
3132 /* summary_only implies summary option, but don't overwrite summary if set */
3133 if (trace.summary_only)
3134 trace.summary = trace.summary_only;
3136 if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3137 trace.evlist->nr_entries == 0 /* Was --events used? */) {
3138 pr_err("Please specify something to trace.\n");
3139 return -1;
3142 if (!trace.trace_syscalls && trace.ev_qualifier) {
3143 pr_err("The -e option can't be used with --no-syscalls.\n");
3144 goto out;
3147 if (output_name != NULL) {
3148 err = trace__open_output(&trace, output_name);
3149 if (err < 0) {
3150 perror("failed to create output file");
3151 goto out;
3155 trace.open_id = syscalltbl__id(trace.sctbl, "open");
3157 err = target__validate(&trace.opts.target);
3158 if (err) {
3159 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3160 fprintf(trace.output, "%s", bf);
3161 goto out_close;
3164 err = target__parse_uid(&trace.opts.target);
3165 if (err) {
3166 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3167 fprintf(trace.output, "%s", bf);
3168 goto out_close;
/* No workload and no explicit target: trace the whole system. */
3171 if (!argc && target__none(&trace.opts.target))
3172 trace.opts.target.system_wide = true;
3174 if (input_name)
3175 err = trace__replay(&trace);
3176 else
3177 err = trace__run(&trace, argc, argv);
/* trace.output is only closed when it is a file opened here via -o. */
3179 out_close:
3180 if (output_name != NULL)
3181 fclose(trace.output);
3182 out:
3183 return err;