xfs: remove bli from AIL before release on transaction abort
[linux/fpc-iii.git] / tools / perf / builtin-trace.c
blob21f8a81797a04e32b9fc496851e9332f58e5e6d9
/*
 * builtin-trace.c
 *
 * Builtin 'trace' command:
 *
 * Display a continuously updated trace of any workload, CPU, specific PID,
 * system wide, etc.  Default format is loosely strace like, but any other
 * event may be specified using --event.
 *
 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Initially based on the 'trace' prototype by Thomas Gleixner:
 *
 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include <subcmd/parse-options.h>
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace-event.h"
35 #include "util/parse-events.h"
36 #include "util/bpf-loader.h"
37 #include "callchain.h"
38 #include "syscalltbl.h"
39 #include "rb_resort.h"
41 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
42 #include <stdlib.h>
43 #include <linux/err.h>
44 #include <linux/filter.h>
45 #include <linux/audit.h>
46 #include <linux/random.h>
47 #include <linux/stringify.h>
48 #include <linux/time64.h>
50 #ifndef O_CLOEXEC
51 # define O_CLOEXEC 02000000
52 #endif
54 struct trace {
55 struct perf_tool tool;
56 struct syscalltbl *sctbl;
57 struct {
58 int max;
59 struct syscall *table;
60 struct {
61 struct perf_evsel *sys_enter,
62 *sys_exit;
63 } events;
64 } syscalls;
65 struct record_opts opts;
66 struct perf_evlist *evlist;
67 struct machine *host;
68 struct thread *current;
69 u64 base_time;
70 FILE *output;
71 unsigned long nr_events;
72 struct strlist *ev_qualifier;
73 struct {
74 size_t nr;
75 int *entries;
76 } ev_qualifier_ids;
77 struct intlist *tid_list;
78 struct intlist *pid_list;
79 struct {
80 size_t nr;
81 pid_t *entries;
82 } filter_pids;
83 double duration_filter;
84 double runtime_ms;
85 struct {
86 u64 vfs_getname,
87 proc_getname;
88 } stats;
89 unsigned int max_stack;
90 unsigned int min_stack;
91 bool not_ev_qualifier;
92 bool live;
93 bool full_time;
94 bool sched;
95 bool multiple_threads;
96 bool summary;
97 bool summary_only;
98 bool show_comm;
99 bool show_tool_stats;
100 bool trace_syscalls;
101 bool kernel_syscallchains;
102 bool force;
103 bool vfs_getname;
104 int trace_pgfaults;
105 int open_id;
108 struct tp_field {
109 int offset;
110 union {
111 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
112 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
116 #define TP_UINT_FIELD(bits) \
117 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
119 u##bits value; \
120 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
121 return value; \
124 TP_UINT_FIELD(8);
125 TP_UINT_FIELD(16);
126 TP_UINT_FIELD(32);
127 TP_UINT_FIELD(64);
129 #define TP_UINT_FIELD__SWAPPED(bits) \
130 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
132 u##bits value; \
133 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
134 return bswap_##bits(value);\
137 TP_UINT_FIELD__SWAPPED(16);
138 TP_UINT_FIELD__SWAPPED(32);
139 TP_UINT_FIELD__SWAPPED(64);
141 static int tp_field__init_uint(struct tp_field *field,
142 struct format_field *format_field,
143 bool needs_swap)
145 field->offset = format_field->offset;
147 switch (format_field->size) {
148 case 1:
149 field->integer = tp_field__u8;
150 break;
151 case 2:
152 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
153 break;
154 case 4:
155 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
156 break;
157 case 8:
158 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
159 break;
160 default:
161 return -1;
164 return 0;
167 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
169 return sample->raw_data + field->offset;
172 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
174 field->offset = format_field->offset;
175 field->pointer = tp_field__ptr;
176 return 0;
179 struct syscall_tp {
180 struct tp_field id;
181 union {
182 struct tp_field args, ret;
186 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
187 struct tp_field *field,
188 const char *name)
190 struct format_field *format_field = perf_evsel__field(evsel, name);
192 if (format_field == NULL)
193 return -1;
195 return tp_field__init_uint(field, format_field, evsel->needs_swap);
/*
 * Initialise member @name of the struct syscall_tp hanging off evsel->priv
 * from the tracepoint field with the same name.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
/*
 * Look up the tracepoint field @name in @evsel and initialise @field as a
 * pointer accessor for it.  Returns -1 if the field does not exist.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_ptr(field, format_field);
}
/*
 * Pointer flavour of perf_evsel__init_sc_tp_uint_field(): initialise member
 * @name of evsel->priv as a pointer accessor.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
218 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
220 zfree(&evsel->priv);
221 perf_evsel__delete(evsel);
224 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
226 evsel->priv = malloc(sizeof(struct syscall_tp));
227 if (evsel->priv != NULL) {
228 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
229 goto out_delete;
231 evsel->handler = handler;
232 return 0;
235 return -ENOMEM;
237 out_delete:
238 zfree(&evsel->priv);
239 return -ENOENT;
/*
 * Create the evsel for the raw_syscalls:<direction> tracepoint, falling back
 * to syscalls:<direction>, and initialise it with @handler.
 *
 * Returns the new evsel, or NULL if neither tracepoint could be created or
 * the syscall_tp setup failed (the evsel is deleted in that case).
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	if (perf_evsel__init_syscall_tp(evsel, handler))
		goto out_delete;

	return evsel;

out_delete:
	perf_evsel__delete_priv(evsel);
	return NULL;
}
/* Read member @name of evsel->priv (a struct syscall_tp) from @sample as an integer. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* Same as above, but return a pointer into the sample's raw data. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
271 struct syscall_arg {
272 unsigned long val;
273 struct thread *thread;
274 struct trace *trace;
275 void *parm;
276 u8 idx;
277 u8 mask;
/*
 * Table mapping integer values to names: entries[v - offset] is the name of
 * value v, for 0 <= v - offset < nr_entries.
 */
struct strarray {
	int	    offset;
	int	    nr_entries;
	const char **entries;
};
/* Define strarray__<array> describing the string table <array> (offset 0). */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

/* Same, for a table whose first entry names the value @off. */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset	    = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
297 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
298 const char *intfmt,
299 struct syscall_arg *arg)
301 struct strarray *sa = arg->parm;
302 int idx = arg->val - sa->offset;
304 if (idx < 0 || idx >= sa->nr_entries)
305 return scnprintf(bf, size, intfmt, arg->val);
307 return scnprintf(bf, size, "%s", sa->entries[idx]);
/* strarray beautifier that prints unknown values in decimal. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 * 	  gets rewritten to support all arches.
 */
/* strarray beautifier that prints unknown values in hexadecimal. */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
332 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
333 struct syscall_arg *arg);
335 #define SCA_FD syscall_arg__scnprintf_fd
337 #ifndef AT_FDCWD
338 #define AT_FDCWD -100
339 #endif
341 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
342 struct syscall_arg *arg)
344 int fd = arg->val;
346 if (fd == AT_FDCWD)
347 return scnprintf(bf, size, "CWD");
349 return syscall_arg__scnprintf_fd(bf, size, arg);
352 #define SCA_FDAT syscall_arg__scnprintf_fd_at
354 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
355 struct syscall_arg *arg);
357 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
359 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
360 struct syscall_arg *arg)
362 return scnprintf(bf, size, "%#lx", arg->val);
365 #define SCA_HEX syscall_arg__scnprintf_hex
367 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
368 struct syscall_arg *arg)
370 return scnprintf(bf, size, "%d", arg->val);
373 #define SCA_INT syscall_arg__scnprintf_int
375 static const char *bpf_cmd[] = {
376 "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
377 "MAP_GET_NEXT_KEY", "PROG_LOAD",
379 static DEFINE_STRARRAY(bpf_cmd);
381 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
382 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
384 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
385 static DEFINE_STRARRAY(itimers);
387 static const char *keyctl_options[] = {
388 "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
389 "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
390 "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
391 "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
392 "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
394 static DEFINE_STRARRAY(keyctl_options);
396 static const char *whences[] = { "SET", "CUR", "END",
397 #ifdef SEEK_DATA
398 "DATA",
399 #endif
400 #ifdef SEEK_HOLE
401 "HOLE",
402 #endif
404 static DEFINE_STRARRAY(whences);
406 static const char *fcntl_cmds[] = {
407 "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
408 "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
409 "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
410 "F_GETOWNER_UIDS",
412 static DEFINE_STRARRAY(fcntl_cmds);
414 static const char *rlimit_resources[] = {
415 "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
416 "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
417 "RTTIME",
419 static DEFINE_STRARRAY(rlimit_resources);
421 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
422 static DEFINE_STRARRAY(sighow);
424 static const char *clockid[] = {
425 "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
426 "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
427 "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
429 static DEFINE_STRARRAY(clockid);
431 static const char *socket_families[] = {
432 "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
433 "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
434 "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
435 "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
436 "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
437 "ALG", "NFC", "VSOCK",
439 static DEFINE_STRARRAY(socket_families);
441 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
442 struct syscall_arg *arg)
444 size_t printed = 0;
445 int mode = arg->val;
447 if (mode == F_OK) /* 0 */
448 return scnprintf(bf, size, "F");
449 #define P_MODE(n) \
450 if (mode & n##_OK) { \
451 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
452 mode &= ~n##_OK; \
455 P_MODE(R);
456 P_MODE(W);
457 P_MODE(X);
458 #undef P_MODE
460 if (mode)
461 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
463 return printed;
466 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
468 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
469 struct syscall_arg *arg);
471 #define SCA_FILENAME syscall_arg__scnprintf_filename
473 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
474 struct syscall_arg *arg)
476 int printed = 0, flags = arg->val;
478 #define P_FLAG(n) \
479 if (flags & O_##n) { \
480 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
481 flags &= ~O_##n; \
484 P_FLAG(CLOEXEC);
485 P_FLAG(NONBLOCK);
486 #undef P_FLAG
488 if (flags)
489 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
491 return printed;
494 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
496 #if defined(__i386__) || defined(__x86_64__)
498 * FIXME: Make this available to all arches.
500 #define TCGETS 0x5401
502 static const char *tioctls[] = {
503 "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
504 "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
505 "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
506 "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
507 "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
508 "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
509 "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
510 "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
511 "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
512 "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
513 "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
514 [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
515 "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
516 "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
517 "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
520 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
521 #endif /* defined(__i386__) || defined(__x86_64__) */
523 #ifndef GRND_NONBLOCK
524 #define GRND_NONBLOCK 0x0001
525 #endif
526 #ifndef GRND_RANDOM
527 #define GRND_RANDOM 0x0002
528 #endif
530 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
531 struct syscall_arg *arg)
533 int printed = 0, flags = arg->val;
535 #define P_FLAG(n) \
536 if (flags & GRND_##n) { \
537 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
538 flags &= ~GRND_##n; \
541 P_FLAG(RANDOM);
542 P_FLAG(NONBLOCK);
543 #undef P_FLAG
545 if (flags)
546 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
548 return printed;
551 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
/*
 * Shorthand for a syscall_fmts[] entry: format argument number @arg with the
 * strarray beautifier backed by strarray__<array>.  @name is documentation
 * only (the argument's name) and is not used by the expansion.
 */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm	 = { [arg] = &strarray__##array, }
557 #include "trace/beauty/eventfd.c"
558 #include "trace/beauty/flock.c"
559 #include "trace/beauty/futex_op.c"
560 #include "trace/beauty/mmap.c"
561 #include "trace/beauty/mode_t.c"
562 #include "trace/beauty/msg_flags.c"
563 #include "trace/beauty/open_flags.c"
564 #include "trace/beauty/perf_event_open.c"
565 #include "trace/beauty/pid.c"
566 #include "trace/beauty/sched_policy.c"
567 #include "trace/beauty/seccomp.c"
568 #include "trace/beauty/signum.c"
569 #include "trace/beauty/socket_type.c"
570 #include "trace/beauty/waitid_options.c"
572 static struct syscall_fmt {
573 const char *name;
574 const char *alias;
575 size_t (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
576 void *arg_parm[6];
577 bool errmsg;
578 bool errpid;
579 bool timeout;
580 bool hexret;
581 } syscall_fmts[] = {
582 { .name = "access", .errmsg = true,
583 .arg_scnprintf = { [1] = SCA_ACCMODE, /* mode */ }, },
584 { .name = "arch_prctl", .errmsg = true, .alias = "prctl", },
585 { .name = "bpf", .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
586 { .name = "brk", .hexret = true,
587 .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
588 { .name = "chdir", .errmsg = true, },
589 { .name = "chmod", .errmsg = true, },
590 { .name = "chroot", .errmsg = true, },
591 { .name = "clock_gettime", .errmsg = true, STRARRAY(0, clk_id, clockid), },
592 { .name = "clone", .errpid = true, },
593 { .name = "close", .errmsg = true,
594 .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
595 { .name = "connect", .errmsg = true, },
596 { .name = "creat", .errmsg = true, },
597 { .name = "dup", .errmsg = true, },
598 { .name = "dup2", .errmsg = true, },
599 { .name = "dup3", .errmsg = true, },
600 { .name = "epoll_ctl", .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
601 { .name = "eventfd2", .errmsg = true,
602 .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
603 { .name = "faccessat", .errmsg = true, },
604 { .name = "fadvise64", .errmsg = true, },
605 { .name = "fallocate", .errmsg = true, },
606 { .name = "fchdir", .errmsg = true, },
607 { .name = "fchmod", .errmsg = true, },
608 { .name = "fchmodat", .errmsg = true,
609 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
610 { .name = "fchown", .errmsg = true, },
611 { .name = "fchownat", .errmsg = true,
612 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
613 { .name = "fcntl", .errmsg = true,
614 .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
615 .arg_parm = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
616 { .name = "fdatasync", .errmsg = true, },
617 { .name = "flock", .errmsg = true,
618 .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
619 { .name = "fsetxattr", .errmsg = true, },
620 { .name = "fstat", .errmsg = true, .alias = "newfstat", },
621 { .name = "fstatat", .errmsg = true, .alias = "newfstatat", },
622 { .name = "fstatfs", .errmsg = true, },
623 { .name = "fsync", .errmsg = true, },
624 { .name = "ftruncate", .errmsg = true, },
625 { .name = "futex", .errmsg = true,
626 .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
627 { .name = "futimesat", .errmsg = true,
628 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
629 { .name = "getdents", .errmsg = true, },
630 { .name = "getdents64", .errmsg = true, },
631 { .name = "getitimer", .errmsg = true, STRARRAY(0, which, itimers), },
632 { .name = "getpid", .errpid = true, },
633 { .name = "getpgid", .errpid = true, },
634 { .name = "getppid", .errpid = true, },
635 { .name = "getrandom", .errmsg = true,
636 .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
637 { .name = "getrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
638 { .name = "getxattr", .errmsg = true, },
639 { .name = "inotify_add_watch", .errmsg = true, },
640 { .name = "ioctl", .errmsg = true,
641 .arg_scnprintf = {
642 #if defined(__i386__) || defined(__x86_64__)
644 * FIXME: Make this available to all arches.
646 [1] = SCA_STRHEXARRAY, /* cmd */
647 [2] = SCA_HEX, /* arg */ },
648 .arg_parm = { [1] = &strarray__tioctls, /* cmd */ }, },
649 #else
650 [2] = SCA_HEX, /* arg */ }, },
651 #endif
652 { .name = "keyctl", .errmsg = true, STRARRAY(0, option, keyctl_options), },
653 { .name = "kill", .errmsg = true,
654 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
655 { .name = "lchown", .errmsg = true, },
656 { .name = "lgetxattr", .errmsg = true, },
657 { .name = "linkat", .errmsg = true,
658 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
659 { .name = "listxattr", .errmsg = true, },
660 { .name = "llistxattr", .errmsg = true, },
661 { .name = "lremovexattr", .errmsg = true, },
662 { .name = "lseek", .errmsg = true,
663 .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
664 .arg_parm = { [2] = &strarray__whences, /* whence */ }, },
665 { .name = "lsetxattr", .errmsg = true, },
666 { .name = "lstat", .errmsg = true, .alias = "newlstat", },
667 { .name = "lsxattr", .errmsg = true, },
668 { .name = "madvise", .errmsg = true,
669 .arg_scnprintf = { [0] = SCA_HEX, /* start */
670 [2] = SCA_MADV_BHV, /* behavior */ }, },
671 { .name = "mkdir", .errmsg = true, },
672 { .name = "mkdirat", .errmsg = true,
673 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
674 { .name = "mknod", .errmsg = true, },
675 { .name = "mknodat", .errmsg = true,
676 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
677 { .name = "mlock", .errmsg = true,
678 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
679 { .name = "mlockall", .errmsg = true,
680 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
681 { .name = "mmap", .hexret = true,
682 .arg_scnprintf = { [0] = SCA_HEX, /* addr */
683 [2] = SCA_MMAP_PROT, /* prot */
684 [3] = SCA_MMAP_FLAGS, /* flags */ }, },
685 { .name = "mprotect", .errmsg = true,
686 .arg_scnprintf = { [0] = SCA_HEX, /* start */
687 [2] = SCA_MMAP_PROT, /* prot */ }, },
688 { .name = "mq_unlink", .errmsg = true,
689 .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
690 { .name = "mremap", .hexret = true,
691 .arg_scnprintf = { [0] = SCA_HEX, /* addr */
692 [3] = SCA_MREMAP_FLAGS, /* flags */
693 [4] = SCA_HEX, /* new_addr */ }, },
694 { .name = "munlock", .errmsg = true,
695 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
696 { .name = "munmap", .errmsg = true,
697 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
698 { .name = "name_to_handle_at", .errmsg = true,
699 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
700 { .name = "newfstatat", .errmsg = true,
701 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
702 { .name = "open", .errmsg = true,
703 .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
704 { .name = "open_by_handle_at", .errmsg = true,
705 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
706 [2] = SCA_OPEN_FLAGS, /* flags */ }, },
707 { .name = "openat", .errmsg = true,
708 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
709 [2] = SCA_OPEN_FLAGS, /* flags */ }, },
710 { .name = "perf_event_open", .errmsg = true,
711 .arg_scnprintf = { [2] = SCA_INT, /* cpu */
712 [3] = SCA_FD, /* group_fd */
713 [4] = SCA_PERF_FLAGS, /* flags */ }, },
714 { .name = "pipe2", .errmsg = true,
715 .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
716 { .name = "poll", .errmsg = true, .timeout = true, },
717 { .name = "ppoll", .errmsg = true, .timeout = true, },
718 { .name = "pread", .errmsg = true, .alias = "pread64", },
719 { .name = "preadv", .errmsg = true, .alias = "pread", },
720 { .name = "prlimit64", .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
721 { .name = "pwrite", .errmsg = true, .alias = "pwrite64", },
722 { .name = "pwritev", .errmsg = true, },
723 { .name = "read", .errmsg = true, },
724 { .name = "readlink", .errmsg = true, },
725 { .name = "readlinkat", .errmsg = true,
726 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
727 { .name = "readv", .errmsg = true, },
728 { .name = "recvfrom", .errmsg = true,
729 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
730 { .name = "recvmmsg", .errmsg = true,
731 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
732 { .name = "recvmsg", .errmsg = true,
733 .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
734 { .name = "removexattr", .errmsg = true, },
735 { .name = "renameat", .errmsg = true,
736 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
737 { .name = "rmdir", .errmsg = true, },
738 { .name = "rt_sigaction", .errmsg = true,
739 .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
740 { .name = "rt_sigprocmask", .errmsg = true, STRARRAY(0, how, sighow), },
741 { .name = "rt_sigqueueinfo", .errmsg = true,
742 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
743 { .name = "rt_tgsigqueueinfo", .errmsg = true,
744 .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
745 { .name = "sched_getattr", .errmsg = true, },
746 { .name = "sched_setattr", .errmsg = true, },
747 { .name = "sched_setscheduler", .errmsg = true,
748 .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
749 { .name = "seccomp", .errmsg = true,
750 .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
751 [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
752 { .name = "select", .errmsg = true, .timeout = true, },
753 { .name = "sendmmsg", .errmsg = true,
754 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
755 { .name = "sendmsg", .errmsg = true,
756 .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
757 { .name = "sendto", .errmsg = true,
758 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
759 { .name = "set_tid_address", .errpid = true, },
760 { .name = "setitimer", .errmsg = true, STRARRAY(0, which, itimers), },
761 { .name = "setpgid", .errmsg = true, },
762 { .name = "setrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
763 { .name = "setxattr", .errmsg = true, },
764 { .name = "shutdown", .errmsg = true, },
765 { .name = "socket", .errmsg = true,
766 .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
767 [1] = SCA_SK_TYPE, /* type */ },
768 .arg_parm = { [0] = &strarray__socket_families, /* family */ }, },
769 { .name = "socketpair", .errmsg = true,
770 .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
771 [1] = SCA_SK_TYPE, /* type */ },
772 .arg_parm = { [0] = &strarray__socket_families, /* family */ }, },
773 { .name = "stat", .errmsg = true, .alias = "newstat", },
774 { .name = "statfs", .errmsg = true, },
775 { .name = "swapoff", .errmsg = true,
776 .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
777 { .name = "swapon", .errmsg = true,
778 .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
779 { .name = "symlinkat", .errmsg = true,
780 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
781 { .name = "tgkill", .errmsg = true,
782 .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
783 { .name = "tkill", .errmsg = true,
784 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
785 { .name = "truncate", .errmsg = true, },
786 { .name = "uname", .errmsg = true, .alias = "newuname", },
787 { .name = "unlinkat", .errmsg = true,
788 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
789 { .name = "utime", .errmsg = true, },
790 { .name = "utimensat", .errmsg = true,
791 .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
792 { .name = "utimes", .errmsg = true, },
793 { .name = "vmsplice", .errmsg = true, },
794 { .name = "wait4", .errpid = true,
795 .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
796 { .name = "waitid", .errpid = true,
797 .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
798 { .name = "write", .errmsg = true, },
799 { .name = "writev", .errmsg = true, },
802 static int syscall_fmt__cmp(const void *name, const void *fmtp)
804 const struct syscall_fmt *fmt = fmtp;
805 return strcmp(name, fmt->name);
808 static struct syscall_fmt *syscall_fmt__find(const char *name)
810 const int nmemb = ARRAY_SIZE(syscall_fmts);
811 return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
/*
 * Description of one syscall: its tracepoint format and argument fields, its
 * name, the optional syscall_fmts[] override and the per-argument beautifier
 * and parameter arrays.
 */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;
	const char	    *name;
	bool		    is_exit;
	struct syscall_fmt  *fmt;
	size_t		    (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void		    **arg_parm;
};
825 static size_t fprintf_duration(unsigned long t, FILE *fp)
827 double duration = (double)t / NSEC_PER_MSEC;
828 size_t printed = fprintf(fp, "(");
830 if (duration >= 1.0)
831 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
832 else if (duration >= 0.01)
833 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
834 else
835 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
836 return printed + fprintf(fp, "): ");
840 * filename.ptr: The filename char pointer that will be vfs_getname'd
841 * filename.entry_str_pos: Where to insert the string translated from
842 * filename.ptr by the vfs_getname tracepoint/kprobe.
844 struct thread_trace {
845 u64 entry_time;
846 u64 exit_time;
847 bool entry_pending;
848 unsigned long nr_events;
849 unsigned long pfmaj, pfmin;
850 char *entry_str;
851 double runtime_ms;
852 struct {
853 unsigned long ptr;
854 short int entry_str_pos;
855 bool pending_open;
856 unsigned int namelen;
857 char *name;
858 } filename;
859 struct {
860 int max;
861 char **table;
862 } paths;
864 struct intlist *syscall_stats;
867 static struct thread_trace *thread_trace__new(void)
869 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
871 if (ttrace)
872 ttrace->paths.max = -1;
874 ttrace->syscall_stats = intlist__new(NULL);
876 return ttrace;
879 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
881 struct thread_trace *ttrace;
883 if (thread == NULL)
884 goto fail;
886 if (thread__priv(thread) == NULL)
887 thread__set_priv(thread, thread_trace__new());
889 if (thread__priv(thread) == NULL)
890 goto fail;
892 ttrace = thread__priv(thread);
893 ++ttrace->nr_events;
895 return ttrace;
896 fail:
897 color_fprintf(fp, PERF_COLOR_RED,
898 "WARNING: not enough memory, dropping samples!\n");
899 return NULL;
/* Bit flags: major and minor page faults. */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

static const size_t trace__entry_str_size = 2048;
907 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
909 struct thread_trace *ttrace = thread__priv(thread);
911 if (fd > ttrace->paths.max) {
912 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
914 if (npath == NULL)
915 return -1;
917 if (ttrace->paths.max != -1) {
918 memset(npath + ttrace->paths.max + 1, 0,
919 (fd - ttrace->paths.max) * sizeof(char *));
920 } else {
921 memset(npath, 0, (fd + 1) * sizeof(char *));
924 ttrace->paths.table = npath;
925 ttrace->paths.max = fd;
928 ttrace->paths.table[fd] = strdup(pathname);
930 return ttrace->paths.table[fd] != NULL ? 0 : -1;
933 static int thread__read_fd_path(struct thread *thread, int fd)
935 char linkname[PATH_MAX], pathname[PATH_MAX];
936 struct stat st;
937 int ret;
939 if (thread->pid_ == thread->tid) {
940 scnprintf(linkname, sizeof(linkname),
941 "/proc/%d/fd/%d", thread->pid_, fd);
942 } else {
943 scnprintf(linkname, sizeof(linkname),
944 "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
947 if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
948 return -1;
950 ret = readlink(linkname, pathname, sizeof(pathname));
952 if (ret < 0 || ret > st.st_size)
953 return -1;
955 pathname[ret] = '\0';
956 return trace__set_fd_pathname(thread, fd, pathname);
959 static const char *thread__fd_path(struct thread *thread, int fd,
960 struct trace *trace)
962 struct thread_trace *ttrace = thread__priv(thread);
964 if (ttrace == NULL)
965 return NULL;
967 if (fd < 0)
968 return NULL;
970 if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
971 if (!trace->live)
972 return NULL;
973 ++trace->stats.proc_getname;
974 if (thread__read_fd_path(thread, fd))
975 return NULL;
978 return ttrace->paths.table[fd];
981 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
982 struct syscall_arg *arg)
984 int fd = arg->val;
985 size_t printed = scnprintf(bf, size, "%d", fd);
986 const char *path = thread__fd_path(arg->thread, fd, arg->trace);
988 if (path)
989 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
991 return printed;
994 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
995 struct syscall_arg *arg)
997 int fd = arg->val;
998 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
999 struct thread_trace *ttrace = thread__priv(arg->thread);
1001 if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1002 zfree(&ttrace->paths.table[fd]);
1004 return printed;
1007 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1008 unsigned long ptr)
1010 struct thread_trace *ttrace = thread__priv(thread);
1012 ttrace->filename.ptr = ptr;
1013 ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1016 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1017 struct syscall_arg *arg)
1019 unsigned long ptr = arg->val;
1021 if (!arg->trace->vfs_getname)
1022 return scnprintf(bf, size, "%#x", ptr);
1024 thread__set_filename_pos(arg->thread, bf, ptr);
1025 return 0;
1028 static bool trace__filter_duration(struct trace *trace, double t)
1030 return t < (trace->duration_filter * NSEC_PER_MSEC);
1033 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1035 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1037 return fprintf(fp, "%10.3f ", ts);
/*
 * Flags written from the signal handler and read elsewhere in the file.
 * They are 'volatile sig_atomic_t' so that stores made in the handler are
 * well-defined and the compiler cannot cache reads of them.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

/* Signal handler: request termination and record whether it was SIGINT. */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1049 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1050 u64 duration, u64 tstamp, FILE *fp)
1052 size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1053 printed += fprintf_duration(duration, fp);
1055 if (trace->multiple_threads) {
1056 if (trace->show_comm)
1057 printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1058 printed += fprintf(fp, "%d ", thread->tid);
1061 return printed;
1064 static int trace__process_event(struct trace *trace, struct machine *machine,
1065 union perf_event *event, struct perf_sample *sample)
1067 int ret = 0;
1069 switch (event->header.type) {
1070 case PERF_RECORD_LOST:
1071 color_fprintf(trace->output, PERF_COLOR_RED,
1072 "LOST %" PRIu64 " events!\n", event->lost.lost);
1073 ret = machine__process_lost_event(machine, event, sample);
1074 break;
1075 default:
1076 ret = machine__process_event(machine, event, sample);
1077 break;
1080 return ret;
1083 static int trace__tool_process(struct perf_tool *tool,
1084 union perf_event *event,
1085 struct perf_sample *sample,
1086 struct machine *machine)
1088 struct trace *trace = container_of(tool, struct trace, tool);
1089 return trace__process_event(trace, machine, event, sample);
1092 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1094 struct machine *machine = vmachine;
1096 if (machine->kptr_restrict_warned)
1097 return NULL;
1099 if (symbol_conf.kptr_restrict) {
1100 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1101 "Check /proc/sys/kernel/kptr_restrict.\n\n"
1102 "Kernel samples will not be resolved.\n");
1103 machine->kptr_restrict_warned = true;
1104 return NULL;
1107 return machine__resolve_kernel_addr(vmachine, addrp, modp);
1110 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1112 int err = symbol__init(NULL);
1114 if (err)
1115 return err;
1117 trace->host = machine__new_host();
1118 if (trace->host == NULL)
1119 return -ENOMEM;
1121 if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
1122 return -errno;
1124 err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1125 evlist->threads, trace__tool_process, false,
1126 trace->opts.proc_map_timeout);
1127 if (err)
1128 symbol__exit();
1130 return err;
1133 static int syscall__set_arg_fmts(struct syscall *sc)
1135 struct format_field *field;
1136 int idx = 0, len;
1138 sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1139 if (sc->arg_scnprintf == NULL)
1140 return -1;
1142 if (sc->fmt)
1143 sc->arg_parm = sc->fmt->arg_parm;
1145 for (field = sc->args; field; field = field->next) {
1146 if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1147 sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1148 else if (strcmp(field->type, "const char *") == 0 &&
1149 (strcmp(field->name, "filename") == 0 ||
1150 strcmp(field->name, "path") == 0 ||
1151 strcmp(field->name, "pathname") == 0))
1152 sc->arg_scnprintf[idx] = SCA_FILENAME;
1153 else if (field->flags & FIELD_IS_POINTER)
1154 sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1155 else if (strcmp(field->type, "pid_t") == 0)
1156 sc->arg_scnprintf[idx] = SCA_PID;
1157 else if (strcmp(field->type, "umode_t") == 0)
1158 sc->arg_scnprintf[idx] = SCA_MODE_T;
1159 else if ((strcmp(field->type, "int") == 0 ||
1160 strcmp(field->type, "unsigned int") == 0 ||
1161 strcmp(field->type, "long") == 0) &&
1162 (len = strlen(field->name)) >= 2 &&
1163 strcmp(field->name + len - 2, "fd") == 0) {
1165 * /sys/kernel/tracing/events/syscalls/sys_enter*
1166 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1167 * 65 int
1168 * 23 unsigned int
1169 * 7 unsigned long
1171 sc->arg_scnprintf[idx] = SCA_FD;
1173 ++idx;
1176 return 0;
1179 static int trace__read_syscall_info(struct trace *trace, int id)
1181 char tp_name[128];
1182 struct syscall *sc;
1183 const char *name = syscalltbl__name(trace->sctbl, id);
1185 if (name == NULL)
1186 return -1;
1188 if (id > trace->syscalls.max) {
1189 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1191 if (nsyscalls == NULL)
1192 return -1;
1194 if (trace->syscalls.max != -1) {
1195 memset(nsyscalls + trace->syscalls.max + 1, 0,
1196 (id - trace->syscalls.max) * sizeof(*sc));
1197 } else {
1198 memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1201 trace->syscalls.table = nsyscalls;
1202 trace->syscalls.max = id;
1205 sc = trace->syscalls.table + id;
1206 sc->name = name;
1208 sc->fmt = syscall_fmt__find(sc->name);
1210 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1211 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1213 if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1214 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1215 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1218 if (IS_ERR(sc->tp_format))
1219 return -1;
1221 sc->args = sc->tp_format->format.fields;
1222 sc->nr_args = sc->tp_format->format.nr_fields;
1224 * We need to check and discard the first variable '__syscall_nr'
1225 * or 'nr' that mean the syscall number. It is needless here.
1226 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
1228 if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1229 sc->args = sc->args->next;
1230 --sc->nr_args;
1233 sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1235 return syscall__set_arg_fmts(sc);
1238 static int trace__validate_ev_qualifier(struct trace *trace)
1240 int err = 0, i;
1241 struct str_node *pos;
1243 trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1244 trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1245 sizeof(trace->ev_qualifier_ids.entries[0]));
1247 if (trace->ev_qualifier_ids.entries == NULL) {
1248 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1249 trace->output);
1250 err = -EINVAL;
1251 goto out;
1254 i = 0;
1256 strlist__for_each_entry(pos, trace->ev_qualifier) {
1257 const char *sc = pos->s;
1258 int id = syscalltbl__id(trace->sctbl, sc);
1260 if (id < 0) {
1261 if (err == 0) {
1262 fputs("Error:\tInvalid syscall ", trace->output);
1263 err = -EINVAL;
1264 } else {
1265 fputs(", ", trace->output);
1268 fputs(sc, trace->output);
1271 trace->ev_qualifier_ids.entries[i++] = id;
1274 if (err < 0) {
1275 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1276 "\nHint:\tand: 'man syscalls'\n", trace->output);
1277 zfree(&trace->ev_qualifier_ids.entries);
1278 trace->ev_qualifier_ids.nr = 0;
1280 out:
1281 return err;
1285 * args is to be interpreted as a series of longs but we need to handle
1286 * 8-byte unaligned accesses. args points to raw_data within the event
1287 * and raw_data is guaranteed to be 8-byte unaligned because it is
1288 * preceded by raw_size which is a u32. So we need to copy args to a temp
1289 * variable to read it. Most notably this avoids extended load instructions
1290 * on unaligned addresses
1293 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1294 unsigned char *args, struct trace *trace,
1295 struct thread *thread)
1297 size_t printed = 0;
1298 unsigned char *p;
1299 unsigned long val;
1301 if (sc->args != NULL) {
1302 struct format_field *field;
1303 u8 bit = 1;
1304 struct syscall_arg arg = {
1305 .idx = 0,
1306 .mask = 0,
1307 .trace = trace,
1308 .thread = thread,
1311 for (field = sc->args; field;
1312 field = field->next, ++arg.idx, bit <<= 1) {
1313 if (arg.mask & bit)
1314 continue;
1316 /* special care for unaligned accesses */
1317 p = args + sizeof(unsigned long) * arg.idx;
1318 memcpy(&val, p, sizeof(val));
1321 * Suppress this argument if its value is zero and
1322 * and we don't have a string associated in an
1323 * strarray for it.
1325 if (val == 0 &&
1326 !(sc->arg_scnprintf &&
1327 sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1328 sc->arg_parm[arg.idx]))
1329 continue;
1331 printed += scnprintf(bf + printed, size - printed,
1332 "%s%s: ", printed ? ", " : "", field->name);
1333 if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1334 arg.val = val;
1335 if (sc->arg_parm)
1336 arg.parm = sc->arg_parm[arg.idx];
1337 printed += sc->arg_scnprintf[arg.idx](bf + printed,
1338 size - printed, &arg);
1339 } else {
1340 printed += scnprintf(bf + printed, size - printed,
1341 "%ld", val);
1344 } else if (IS_ERR(sc->tp_format)) {
1346 * If we managed to read the tracepoint /format file, then we
1347 * may end up not having any args, like with gettid(), so only
1348 * print the raw args when we didn't manage to read it.
1350 int i = 0;
1352 while (i < 6) {
1353 /* special care for unaligned accesses */
1354 p = args + sizeof(unsigned long) * i;
1355 memcpy(&val, p, sizeof(val));
1356 printed += scnprintf(bf + printed, size - printed,
1357 "%sarg%d: %ld",
1358 printed ? ", " : "", i, val);
1359 ++i;
1363 return printed;
/*
 * Signature of the per-event sample handlers in this file; such a handler
 * is read from evsel->handler and called for each matching sample.
 */
typedef int (*tracepoint_handler)(struct trace *trace,
				  struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1370 static struct syscall *trace__syscall_info(struct trace *trace,
1371 struct perf_evsel *evsel, int id)
1374 if (id < 0) {
1377 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1378 * before that, leaving at a higher verbosity level till that is
1379 * explained. Reproduced with plain ftrace with:
1381 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1382 * grep "NR -1 " /t/trace_pipe
1384 * After generating some load on the machine.
1386 if (verbose > 1) {
1387 static u64 n;
1388 fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1389 id, perf_evsel__name(evsel), ++n);
1391 return NULL;
1394 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1395 trace__read_syscall_info(trace, id))
1396 goto out_cant_read;
1398 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1399 goto out_cant_read;
1401 return &trace->syscalls.table[id];
1403 out_cant_read:
1404 if (verbose) {
1405 fprintf(trace->output, "Problems reading syscall %d", id);
1406 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1407 fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1408 fputs(" information\n", trace->output);
1410 return NULL;
1413 static void thread__update_stats(struct thread_trace *ttrace,
1414 int id, struct perf_sample *sample)
1416 struct int_node *inode;
1417 struct stats *stats;
1418 u64 duration = 0;
1420 inode = intlist__findnew(ttrace->syscall_stats, id);
1421 if (inode == NULL)
1422 return;
1424 stats = inode->priv;
1425 if (stats == NULL) {
1426 stats = malloc(sizeof(struct stats));
1427 if (stats == NULL)
1428 return;
1429 init_stats(stats);
1430 inode->priv = stats;
1433 if (ttrace->entry_time && sample->time > ttrace->entry_time)
1434 duration = sample->time - ttrace->entry_time;
1436 update_stats(stats, duration);
1439 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1441 struct thread_trace *ttrace;
1442 u64 duration;
1443 size_t printed;
1445 if (trace->current == NULL)
1446 return 0;
1448 ttrace = thread__priv(trace->current);
1450 if (!ttrace->entry_pending)
1451 return 0;
1453 duration = sample->time - ttrace->entry_time;
1455 printed = trace__fprintf_entry_head(trace, trace->current, duration, ttrace->entry_time, trace->output);
1456 printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1457 ttrace->entry_pending = false;
1459 return printed;
1462 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1463 union perf_event *event __maybe_unused,
1464 struct perf_sample *sample)
1466 char *msg;
1467 void *args;
1468 size_t printed = 0;
1469 struct thread *thread;
1470 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1471 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1472 struct thread_trace *ttrace;
1474 if (sc == NULL)
1475 return -1;
1477 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1478 ttrace = thread__trace(thread, trace->output);
1479 if (ttrace == NULL)
1480 goto out_put;
1482 args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1484 if (ttrace->entry_str == NULL) {
1485 ttrace->entry_str = malloc(trace__entry_str_size);
1486 if (!ttrace->entry_str)
1487 goto out_put;
1490 if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1491 trace__printf_interrupted_entry(trace, sample);
1493 ttrace->entry_time = sample->time;
1494 msg = ttrace->entry_str;
1495 printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1497 printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1498 args, trace, thread);
1500 if (sc->is_exit) {
1501 if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
1502 trace__fprintf_entry_head(trace, thread, 1, ttrace->entry_time, trace->output);
1503 fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1505 } else {
1506 ttrace->entry_pending = true;
1507 /* See trace__vfs_getname & trace__sys_exit */
1508 ttrace->filename.pending_open = false;
1511 if (trace->current != thread) {
1512 thread__put(trace->current);
1513 trace->current = thread__get(thread);
1515 err = 0;
1516 out_put:
1517 thread__put(thread);
1518 return err;
1521 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1522 struct perf_sample *sample,
1523 struct callchain_cursor *cursor)
1525 struct addr_location al;
1527 if (machine__resolve(trace->host, &al, sample) < 0 ||
1528 thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1529 return -1;
1531 return 0;
1534 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1536 /* TODO: user-configurable print_opts */
1537 const unsigned int print_opts = EVSEL__PRINT_SYM |
1538 EVSEL__PRINT_DSO |
1539 EVSEL__PRINT_UNKNOWN_AS_ADDR;
1541 return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1544 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1545 union perf_event *event __maybe_unused,
1546 struct perf_sample *sample)
1548 long ret;
1549 u64 duration = 0;
1550 struct thread *thread;
1551 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1552 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1553 struct thread_trace *ttrace;
1555 if (sc == NULL)
1556 return -1;
1558 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1559 ttrace = thread__trace(thread, trace->output);
1560 if (ttrace == NULL)
1561 goto out_put;
1563 if (trace->summary)
1564 thread__update_stats(ttrace, id, sample);
1566 ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1568 if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1569 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1570 ttrace->filename.pending_open = false;
1571 ++trace->stats.vfs_getname;
1574 ttrace->exit_time = sample->time;
1576 if (ttrace->entry_time) {
1577 duration = sample->time - ttrace->entry_time;
1578 if (trace__filter_duration(trace, duration))
1579 goto out;
1580 } else if (trace->duration_filter)
1581 goto out;
1583 if (sample->callchain) {
1584 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1585 if (callchain_ret == 0) {
1586 if (callchain_cursor.nr < trace->min_stack)
1587 goto out;
1588 callchain_ret = 1;
1592 if (trace->summary_only)
1593 goto out;
1595 trace__fprintf_entry_head(trace, thread, duration, ttrace->entry_time, trace->output);
1597 if (ttrace->entry_pending) {
1598 fprintf(trace->output, "%-70s", ttrace->entry_str);
1599 } else {
1600 fprintf(trace->output, " ... [");
1601 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1602 fprintf(trace->output, "]: %s()", sc->name);
1605 if (sc->fmt == NULL) {
1606 signed_print:
1607 fprintf(trace->output, ") = %ld", ret);
1608 } else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
1609 char bf[STRERR_BUFSIZE];
1610 const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1611 *e = audit_errno_to_name(-ret);
1613 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1614 } else if (ret == 0 && sc->fmt->timeout)
1615 fprintf(trace->output, ") = 0 Timeout");
1616 else if (sc->fmt->hexret)
1617 fprintf(trace->output, ") = %#lx", ret);
1618 else if (sc->fmt->errpid) {
1619 struct thread *child = machine__find_thread(trace->host, ret, ret);
1621 if (child != NULL) {
1622 fprintf(trace->output, ") = %ld", ret);
1623 if (child->comm_set)
1624 fprintf(trace->output, " (%s)", thread__comm_str(child));
1625 thread__put(child);
1627 } else
1628 goto signed_print;
1630 fputc('\n', trace->output);
1632 if (callchain_ret > 0)
1633 trace__fprintf_callchain(trace, sample);
1634 else if (callchain_ret < 0)
1635 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1636 out:
1637 ttrace->entry_pending = false;
1638 err = 0;
1639 out_put:
1640 thread__put(thread);
1641 return err;
1644 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1645 union perf_event *event __maybe_unused,
1646 struct perf_sample *sample)
1648 struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1649 struct thread_trace *ttrace;
1650 size_t filename_len, entry_str_len, to_move;
1651 ssize_t remaining_space;
1652 char *pos;
1653 const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1655 if (!thread)
1656 goto out;
1658 ttrace = thread__priv(thread);
1659 if (!ttrace)
1660 goto out;
1662 filename_len = strlen(filename);
1664 if (ttrace->filename.namelen < filename_len) {
1665 char *f = realloc(ttrace->filename.name, filename_len + 1);
1667 if (f == NULL)
1668 goto out;
1670 ttrace->filename.namelen = filename_len;
1671 ttrace->filename.name = f;
1674 strcpy(ttrace->filename.name, filename);
1675 ttrace->filename.pending_open = true;
1677 if (!ttrace->filename.ptr)
1678 goto out;
1680 entry_str_len = strlen(ttrace->entry_str);
1681 remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1682 if (remaining_space <= 0)
1683 goto out;
1685 if (filename_len > (size_t)remaining_space) {
1686 filename += filename_len - remaining_space;
1687 filename_len = remaining_space;
1690 to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1691 pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1692 memmove(pos + filename_len, pos, to_move);
1693 memcpy(pos, filename, filename_len);
1695 ttrace->filename.ptr = 0;
1696 ttrace->filename.entry_str_pos = 0;
1697 out:
1698 return 0;
1701 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1702 union perf_event *event __maybe_unused,
1703 struct perf_sample *sample)
1705 u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1706 double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1707 struct thread *thread = machine__findnew_thread(trace->host,
1708 sample->pid,
1709 sample->tid);
1710 struct thread_trace *ttrace = thread__trace(thread, trace->output);
1712 if (ttrace == NULL)
1713 goto out_dump;
1715 ttrace->runtime_ms += runtime_ms;
1716 trace->runtime_ms += runtime_ms;
1717 thread__put(thread);
1718 return 0;
1720 out_dump:
1721 fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1722 evsel->name,
1723 perf_evsel__strval(evsel, sample, "comm"),
1724 (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1725 runtime,
1726 perf_evsel__intval(evsel, sample, "vruntime"));
1727 thread__put(thread);
1728 return 0;
1731 static void bpf_output__printer(enum binary_printer_ops op,
1732 unsigned int val, void *extra)
1734 FILE *output = extra;
1735 unsigned char ch = (unsigned char)val;
1737 switch (op) {
1738 case BINARY_PRINT_CHAR_DATA:
1739 fprintf(output, "%c", isprint(ch) ? ch : '.');
1740 break;
1741 case BINARY_PRINT_DATA_BEGIN:
1742 case BINARY_PRINT_LINE_BEGIN:
1743 case BINARY_PRINT_ADDR:
1744 case BINARY_PRINT_NUM_DATA:
1745 case BINARY_PRINT_NUM_PAD:
1746 case BINARY_PRINT_SEP:
1747 case BINARY_PRINT_CHAR_PAD:
1748 case BINARY_PRINT_LINE_END:
1749 case BINARY_PRINT_DATA_END:
1750 default:
1751 break;
1755 static void bpf_output__fprintf(struct trace *trace,
1756 struct perf_sample *sample)
1758 print_binary(sample->raw_data, sample->raw_size, 8,
1759 bpf_output__printer, trace->output);
1762 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1763 union perf_event *event __maybe_unused,
1764 struct perf_sample *sample)
1766 int callchain_ret = 0;
1768 if (sample->callchain) {
1769 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1770 if (callchain_ret == 0) {
1771 if (callchain_cursor.nr < trace->min_stack)
1772 goto out;
1773 callchain_ret = 1;
1777 trace__printf_interrupted_entry(trace, sample);
1778 trace__fprintf_tstamp(trace, sample->time, trace->output);
1780 if (trace->trace_syscalls)
1781 fprintf(trace->output, "( ): ");
1783 fprintf(trace->output, "%s:", evsel->name);
1785 if (perf_evsel__is_bpf_output(evsel)) {
1786 bpf_output__fprintf(trace, sample);
1787 } else if (evsel->tp_format) {
1788 event_format__fprintf(evsel->tp_format, sample->cpu,
1789 sample->raw_data, sample->raw_size,
1790 trace->output);
1793 fprintf(trace->output, ")\n");
1795 if (callchain_ret > 0)
1796 trace__fprintf_callchain(trace, sample);
1797 else if (callchain_ret < 0)
1798 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1799 out:
1800 return 0;
1803 static void print_location(FILE *f, struct perf_sample *sample,
1804 struct addr_location *al,
1805 bool print_dso, bool print_sym)
1808 if ((verbose || print_dso) && al->map)
1809 fprintf(f, "%s@", al->map->dso->long_name);
1811 if ((verbose || print_sym) && al->sym)
1812 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1813 al->addr - al->sym->start);
1814 else if (al->map)
1815 fprintf(f, "0x%" PRIx64, al->addr);
1816 else
1817 fprintf(f, "0x%" PRIx64, sample->addr);
1820 static int trace__pgfault(struct trace *trace,
1821 struct perf_evsel *evsel,
1822 union perf_event *event __maybe_unused,
1823 struct perf_sample *sample)
1825 struct thread *thread;
1826 struct addr_location al;
1827 char map_type = 'd';
1828 struct thread_trace *ttrace;
1829 int err = -1;
1830 int callchain_ret = 0;
1832 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1834 if (sample->callchain) {
1835 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1836 if (callchain_ret == 0) {
1837 if (callchain_cursor.nr < trace->min_stack)
1838 goto out_put;
1839 callchain_ret = 1;
1843 ttrace = thread__trace(thread, trace->output);
1844 if (ttrace == NULL)
1845 goto out_put;
1847 if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
1848 ttrace->pfmaj++;
1849 else
1850 ttrace->pfmin++;
1852 if (trace->summary_only)
1853 goto out;
1855 thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
1856 sample->ip, &al);
1858 trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
1860 fprintf(trace->output, "%sfault [",
1861 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
1862 "maj" : "min");
1864 print_location(trace->output, sample, &al, false, true);
1866 fprintf(trace->output, "] => ");
1868 thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
1869 sample->addr, &al);
1871 if (!al.map) {
1872 thread__find_addr_location(thread, sample->cpumode,
1873 MAP__FUNCTION, sample->addr, &al);
1875 if (al.map)
1876 map_type = 'x';
1877 else
1878 map_type = '?';
1881 print_location(trace->output, sample, &al, true, false);
1883 fprintf(trace->output, " (%c%c)\n", map_type, al.level);
1885 if (callchain_ret > 0)
1886 trace__fprintf_callchain(trace, sample);
1887 else if (callchain_ret < 0)
1888 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1889 out:
1890 err = 0;
1891 out_put:
1892 thread__put(thread);
1893 return err;
1896 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
1898 if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
1899 (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
1900 return false;
1902 if (trace->pid_list || trace->tid_list)
1903 return true;
1905 return false;
1908 static void trace__set_base_time(struct trace *trace,
1909 struct perf_evsel *evsel,
1910 struct perf_sample *sample)
1913 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1914 * and don't use sample->time unconditionally, we may end up having
1915 * some other event in the future without PERF_SAMPLE_TIME for good
1916 * reason, i.e. we may not be interested in its timestamps, just in
1917 * it taking place, picking some piece of information when it
1918 * appears in our event stream (vfs_getname comes to mind).
1920 if (trace->base_time == 0 && !trace->full_time &&
1921 (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1922 trace->base_time = sample->time;
1925 static int trace__process_sample(struct perf_tool *tool,
1926 union perf_event *event,
1927 struct perf_sample *sample,
1928 struct perf_evsel *evsel,
1929 struct machine *machine __maybe_unused)
1931 struct trace *trace = container_of(tool, struct trace, tool);
1932 int err = 0;
1934 tracepoint_handler handler = evsel->handler;
1936 if (skip_sample(trace, sample))
1937 return 0;
1939 trace__set_base_time(trace, evsel, sample);
1941 if (handler) {
1942 ++trace->nr_events;
1943 handler(trace, evsel, event, sample);
1946 return err;
1949 static int parse_target_str(struct trace *trace)
1951 if (trace->opts.target.pid) {
1952 trace->pid_list = intlist__new(trace->opts.target.pid);
1953 if (trace->pid_list == NULL) {
1954 pr_err("Error parsing process id string\n");
1955 return -EINVAL;
1959 if (trace->opts.target.tid) {
1960 trace->tid_list = intlist__new(trace->opts.target.tid);
1961 if (trace->tid_list == NULL) {
1962 pr_err("Error parsing thread id string\n");
1963 return -EINVAL;
1967 return 0;
1970 static int trace__record(struct trace *trace, int argc, const char **argv)
1972 unsigned int rec_argc, i, j;
1973 const char **rec_argv;
1974 const char * const record_args[] = {
1975 "record",
1976 "-R",
1977 "-m", "1024",
1978 "-c", "1",
1981 const char * const sc_args[] = { "-e", };
1982 unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1983 const char * const majpf_args[] = { "-e", "major-faults" };
1984 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1985 const char * const minpf_args[] = { "-e", "minor-faults" };
1986 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1988 /* +1 is for the event string below */
1989 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1990 majpf_args_nr + minpf_args_nr + argc;
1991 rec_argv = calloc(rec_argc + 1, sizeof(char *));
1993 if (rec_argv == NULL)
1994 return -ENOMEM;
1996 j = 0;
1997 for (i = 0; i < ARRAY_SIZE(record_args); i++)
1998 rec_argv[j++] = record_args[i];
2000 if (trace->trace_syscalls) {
2001 for (i = 0; i < sc_args_nr; i++)
2002 rec_argv[j++] = sc_args[i];
2004 /* event string may be different for older kernels - e.g., RHEL6 */
2005 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2006 rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2007 else if (is_valid_tracepoint("syscalls:sys_enter"))
2008 rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2009 else {
2010 pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2011 return -1;
2015 if (trace->trace_pgfaults & TRACE_PFMAJ)
2016 for (i = 0; i < majpf_args_nr; i++)
2017 rec_argv[j++] = majpf_args[i];
2019 if (trace->trace_pgfaults & TRACE_PFMIN)
2020 for (i = 0; i < minpf_args_nr; i++)
2021 rec_argv[j++] = minpf_args[i];
2023 for (i = 0; i < (unsigned int)argc; i++)
2024 rec_argv[j++] = argv[i];
2026 return cmd_record(j, rec_argv, NULL);
/* Forward declaration; the definition is not in this part of the file. */
static size_t trace__fprintf_thread_summary(struct trace *trace,
					    FILE *fp);
2031 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2033 struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2035 if (IS_ERR(evsel))
2036 return false;
2038 if (perf_evsel__field(evsel, "pathname") == NULL) {
2039 perf_evsel__delete(evsel);
2040 return false;
2043 evsel->handler = trace__vfs_getname;
2044 perf_evlist__add(evlist, evsel);
2045 return true;
2048 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2050 struct perf_evsel *evsel;
2051 struct perf_event_attr attr = {
2052 .type = PERF_TYPE_SOFTWARE,
2053 .mmap_data = 1,
2056 attr.config = config;
2057 attr.sample_period = 1;
2059 event_attr_init(&attr);
2061 evsel = perf_evsel__new(&attr);
2062 if (evsel)
2063 evsel->handler = trace__pgfault;
2065 return evsel;
2068 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2070 const u32 type = event->header.type;
2071 struct perf_evsel *evsel;
2073 if (type != PERF_RECORD_SAMPLE) {
2074 trace__process_event(trace, trace->host, event, sample);
2075 return;
2078 evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2079 if (evsel == NULL) {
2080 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2081 return;
2084 trace__set_base_time(trace, evsel, sample);
2086 if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2087 sample->raw_data == NULL) {
2088 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2089 perf_evsel__name(evsel), sample->tid,
2090 sample->cpu, sample->raw_size);
2091 } else {
2092 tracepoint_handler handler = evsel->handler;
2093 handler(trace, evsel, event, sample);
2097 static int trace__add_syscall_newtp(struct trace *trace)
2099 int ret = -1;
2100 struct perf_evlist *evlist = trace->evlist;
2101 struct perf_evsel *sys_enter, *sys_exit;
2103 sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2104 if (sys_enter == NULL)
2105 goto out;
2107 if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2108 goto out_delete_sys_enter;
2110 sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2111 if (sys_exit == NULL)
2112 goto out_delete_sys_enter;
2114 if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2115 goto out_delete_sys_exit;
2117 perf_evlist__add(evlist, sys_enter);
2118 perf_evlist__add(evlist, sys_exit);
2120 if (callchain_param.enabled && !trace->kernel_syscallchains) {
2122 * We're interested only in the user space callchain
2123 * leading to the syscall, allow overriding that for
2124 * debugging reasons using --kernel_syscall_callchains
2126 sys_exit->attr.exclude_callchain_kernel = 1;
2129 trace->syscalls.events.sys_enter = sys_enter;
2130 trace->syscalls.events.sys_exit = sys_exit;
2132 ret = 0;
2133 out:
2134 return ret;
2136 out_delete_sys_exit:
2137 perf_evsel__delete_priv(sys_exit);
2138 out_delete_sys_enter:
2139 perf_evsel__delete_priv(sys_enter);
2140 goto out;
2143 static int trace__set_ev_qualifier_filter(struct trace *trace)
2145 int err = -1;
2146 struct perf_evsel *sys_exit;
2147 char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2148 trace->ev_qualifier_ids.nr,
2149 trace->ev_qualifier_ids.entries);
2151 if (filter == NULL)
2152 goto out_enomem;
2154 if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2155 filter)) {
2156 sys_exit = trace->syscalls.events.sys_exit;
2157 err = perf_evsel__append_tp_filter(sys_exit, filter);
2160 free(filter);
2161 out:
2162 return err;
2163 out_enomem:
2164 errno = ENOMEM;
2165 goto out;
/*
 * trace__run - trace live: set up the requested events on trace->evlist,
 * optionally prepare and start the workload given in argv (when argc > 0),
 * then read the mmap'ed ring buffers and hand each parsed sample to
 * trace__handle_event() until the loop is told to stop.
 *
 * trace->live is true for the duration of the call.  On every exit path the
 * evlist is deleted and trace->evlist is set to NULL.  Returns err: 0 when
 * the run completed, otherwise the value left by the step that failed.
 */
2168 static int trace__run(struct trace *trace, int argc, const char **argv)
2170 struct perf_evlist *evlist = trace->evlist;
2171 struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2172 int err = -1, i;
2173 unsigned long before;
2174 const bool forks = argc > 0;
2175 bool draining = false;
2177 trace->live = true;
/* Add the events selected by the options: syscalls, vfs_getname, page faults, sched_stat_runtime. */
2179 if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2180 goto out_error_raw_syscalls;
2182 if (trace->trace_syscalls)
2183 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2185 if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2186 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2187 if (pgfault_maj == NULL)
2188 goto out_error_mem;
2189 perf_evlist__add(evlist, pgfault_maj);
2192 if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2193 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2194 if (pgfault_min == NULL)
2195 goto out_error_mem;
2196 perf_evlist__add(evlist, pgfault_min);
2199 if (trace->sched &&
2200 perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2201 trace__sched_stat_runtime))
2202 goto out_error_sched_stat_runtime;
/* Create the evlist maps from the target options, then set up symbol handling. */
2204 err = perf_evlist__create_maps(evlist, &trace->opts.target);
2205 if (err < 0) {
2206 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2207 goto out_delete_evlist;
2210 err = trace__symbols_init(trace, evlist);
2211 if (err < 0) {
2212 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2213 goto out_delete_evlist;
2216 perf_evlist__config(evlist, &trace->opts, NULL);
/* With callchains enabled, configure them on sys_exit and on the page fault evsels created above. */
2218 if (callchain_param.enabled) {
2219 bool use_identifier = false;
2221 if (trace->syscalls.events.sys_exit) {
2222 perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
2223 &trace->opts, &callchain_param);
2224 use_identifier = true;
2227 if (pgfault_maj) {
2228 perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2229 use_identifier = true;
2232 if (pgfault_min) {
2233 perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2234 use_identifier = true;
2237 if (use_identifier) {
2239 * Now we have evsels with different sample_ids, use
2240 * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
2241 * from a fixed position in each ring buffer record.
2243 * As of this the changeset introducing this comment, this
2244 * isn't strictly needed, as the fields that can come before
2245 * PERF_SAMPLE_ID are all used, but we'll probably disable
2246 * some of those for things like copying the payload of
2247 * pointer syscall arguments, and for vfs_getname we don't
2248 * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
2249 * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
2251 perf_evlist__set_sample_bit(evlist, IDENTIFIER);
2252 perf_evlist__reset_sample_bit(evlist, ID);
/* Route SIGCHLD and SIGINT to sig_handler (presumably what sets 'done'/'interrupted' -- confirm at its definition). */
2256 signal(SIGCHLD, sig_handler);
2257 signal(SIGINT, sig_handler);
/* A command was given (argc > 0): prepare it as the workload; it is started further down. */
2259 if (forks) {
2260 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2261 argv, false, NULL);
2262 if (err < 0) {
2263 fprintf(trace->output, "Couldn't run the workload!\n");
2264 goto out_delete_evlist;
2268 err = perf_evlist__open(evlist);
2269 if (err < 0)
2270 goto out_error_open;
2272 err = bpf__apply_obj_config();
2273 if (err) {
2274 char errbuf[BUFSIZ];
2276 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2277 pr_err("ERROR: Apply config to BPF failed: %s\n",
2278 errbuf);
2279 goto out_error_open;
2283 * Better not use !target__has_task() here because we need to cover the
2284 * case where no threads were specified in the command line, but a
2285 * workload was, and in that case we will fill in the thread_map when
2286 * we fork the workload in perf_evlist__prepare_workload.
2288 if (trace->filter_pids.nr > 0)
2289 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2290 else if (thread_map__pid(evlist->threads, 0) == -1)
2291 err = perf_evlist__set_filter_pid(evlist, getpid());
2293 if (err < 0)
2294 goto out_error_mem;
/* A syscall qualifier list was resolved to ids: install it as a tracepoint filter on sys_enter/sys_exit. */
2296 if (trace->ev_qualifier_ids.nr > 0) {
2297 err = trace__set_ev_qualifier_filter(trace);
2298 if (err < 0)
2299 goto out_errno;
2301 pr_debug("event qualifier tracepoint filter: %s\n",
2302 trace->syscalls.events.sys_exit->filter);
2305 err = perf_evlist__apply_filters(evlist, &evsel);
2306 if (err < 0)
2307 goto out_error_apply_filters;
2309 err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2310 if (err < 0)
2311 goto out_error_mmap;
2313 if (!target__none(&trace->opts.target))
2314 perf_evlist__enable(evlist);
2316 if (forks)
2317 perf_evlist__start_workload(evlist);
2319 trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2320 evlist->threads->nr > 1 ||
2321 perf_evlist__first(evlist)->attr.inherit;
/* Main loop: walk every mmap, parse each event and dispatch it; 'before' detects a pass that read nothing. */
2322 again:
2323 before = trace->nr_events;
2325 for (i = 0; i < evlist->nr_mmaps; i++) {
2326 union perf_event *event;
2328 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2329 struct perf_sample sample;
2331 ++trace->nr_events;
2333 err = perf_evlist__parse_sample(evlist, event, &sample);
2334 if (err) {
2335 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2336 goto next_event;
2339 trace__handle_event(trace, event, &sample);
2340 next_event:
2341 perf_evlist__mmap_consume(evlist, i);
2343 if (interrupted)
2344 goto out_disable;
2346 if (done && !draining) {
2347 perf_evlist__disable(evlist);
2348 draining = true;
/* Nothing was read in this pass: unless already draining, poll (timeout 100 once 'done' is set, -1 otherwise) and retry if the poll reports something; otherwise fall through to the exit path. */
2353 if (trace->nr_events == before) {
2354 int timeout = done ? 100 : -1;
2356 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2357 if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2358 draining = true;
2360 goto again;
2362 } else {
2363 goto again;
/* Normal exit: drop the current thread reference, disable the events and, when err is 0, print the optional summary and tool stats. */
2366 out_disable:
2367 thread__zput(trace->current);
2369 perf_evlist__disable(evlist);
2371 if (!err) {
2372 if (trace->summary)
2373 trace__fprintf_thread_summary(trace, trace->output);
2375 if (trace->show_tool_stats) {
2376 fprintf(trace->output, "Stats:\n "
2377 " vfs_getname : %" PRIu64 "\n"
2378 " proc_getname: %" PRIu64 "\n",
2379 trace->stats.vfs_getname,
2380 trace->stats.proc_getname);
/* Common exit for success and failure: the evlist is deleted here, so trace->evlist must not be used afterwards. */
2384 out_delete_evlist:
2385 perf_evlist__delete(evlist);
2386 trace->evlist = NULL;
2387 trace->live = false;
2388 return err;
/* Error exits: each label below prints a message to trace->output and then jumps to out_delete_evlist. */
2390 char errbuf[BUFSIZ];
2392 out_error_sched_stat_runtime:
2393 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2394 goto out_error;
2396 out_error_raw_syscalls:
2397 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2398 goto out_error;
2400 out_error_mmap:
2401 perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2402 goto out_error;
2404 out_error_open:
2405 perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2407 out_error:
2408 fprintf(trace->output, "%s\n", errbuf);
2409 goto out_delete_evlist;
2411 out_error_apply_filters:
2412 fprintf(trace->output,
2413 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2414 evsel->filter, perf_evsel__name(evsel), errno,
2415 str_error_r(errno, errbuf, sizeof(errbuf)));
2416 goto out_delete_evlist;
2418 out_error_mem:
2419 fprintf(trace->output, "Not enough memory to run!\n");
2420 goto out_delete_evlist;
2422 out_errno:
2423 fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2424 goto out_delete_evlist;
2427 static int trace__replay(struct trace *trace)
2429 const struct perf_evsel_str_handler handlers[] = {
2430 { "probe:vfs_getname", trace__vfs_getname, },
2432 struct perf_data_file file = {
2433 .path = input_name,
2434 .mode = PERF_DATA_MODE_READ,
2435 .force = trace->force,
2437 struct perf_session *session;
2438 struct perf_evsel *evsel;
2439 int err = -1;
2441 trace->tool.sample = trace__process_sample;
2442 trace->tool.mmap = perf_event__process_mmap;
2443 trace->tool.mmap2 = perf_event__process_mmap2;
2444 trace->tool.comm = perf_event__process_comm;
2445 trace->tool.exit = perf_event__process_exit;
2446 trace->tool.fork = perf_event__process_fork;
2447 trace->tool.attr = perf_event__process_attr;
2448 trace->tool.tracing_data = perf_event__process_tracing_data;
2449 trace->tool.build_id = perf_event__process_build_id;
2451 trace->tool.ordered_events = true;
2452 trace->tool.ordering_requires_timestamps = true;
2454 /* add tid to output */
2455 trace->multiple_threads = true;
2457 session = perf_session__new(&file, false, &trace->tool);
2458 if (session == NULL)
2459 return -1;
2461 if (symbol__init(&session->header.env) < 0)
2462 goto out;
2464 trace->host = &session->machines.host;
2466 err = perf_session__set_tracepoints_handlers(session, handlers);
2467 if (err)
2468 goto out;
2470 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2471 "raw_syscalls:sys_enter");
2472 /* older kernels have syscalls tp versus raw_syscalls */
2473 if (evsel == NULL)
2474 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2475 "syscalls:sys_enter");
2477 if (evsel &&
2478 (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2479 perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2480 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2481 goto out;
2484 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2485 "raw_syscalls:sys_exit");
2486 if (evsel == NULL)
2487 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2488 "syscalls:sys_exit");
2489 if (evsel &&
2490 (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2491 perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2492 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2493 goto out;
2496 evlist__for_each_entry(session->evlist, evsel) {
2497 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2498 (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2499 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2500 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2501 evsel->handler = trace__pgfault;
2504 err = parse_target_str(trace);
2505 if (err != 0)
2506 goto out;
2508 setup_pager();
2510 err = perf_session__process_events(session);
2511 if (err)
2512 pr_err("Failed to process events, error %d", err);
2514 else if (trace->summary)
2515 trace__fprintf_thread_summary(trace, trace->output);
2517 out:
2518 perf_session__delete(session);
2520 return err;
/*
 * Write the heading that precedes the per-thread summary to fp and return
 * what fprintf() reported for it.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2532 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2533 struct stats *stats;
2534 double msecs;
2535 int syscall;
2538 struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2539 struct stats *stats = source->priv;
2541 entry->syscall = source->i;
2542 entry->stats = stats;
2543 entry->msecs = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
2546 static size_t thread__dump_stats(struct thread_trace *ttrace,
2547 struct trace *trace, FILE *fp)
2549 size_t printed = 0;
2550 struct syscall *sc;
2551 struct rb_node *nd;
2552 DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2554 if (syscall_stats == NULL)
2555 return 0;
2557 printed += fprintf(fp, "\n");
2559 printed += fprintf(fp, " syscall calls total min avg max stddev\n");
2560 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
2561 printed += fprintf(fp, " --------------- -------- --------- --------- --------- --------- ------\n");
2563 resort_rb__for_each_entry(nd, syscall_stats) {
2564 struct stats *stats = syscall_stats_entry->stats;
2565 if (stats) {
2566 double min = (double)(stats->min) / NSEC_PER_MSEC;
2567 double max = (double)(stats->max) / NSEC_PER_MSEC;
2568 double avg = avg_stats(stats);
2569 double pct;
2570 u64 n = (u64) stats->n;
2572 pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2573 avg /= NSEC_PER_MSEC;
2575 sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2576 printed += fprintf(fp, " %-15s", sc->name);
2577 printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2578 n, syscall_stats_entry->msecs, min, avg);
2579 printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2583 resort_rb__delete(syscall_stats);
2584 printed += fprintf(fp, "\n\n");
2586 return printed;
2589 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2591 size_t printed = 0;
2592 struct thread_trace *ttrace = thread__priv(thread);
2593 double ratio;
2595 if (ttrace == NULL)
2596 return 0;
2598 ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2600 printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2601 printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2602 printed += fprintf(fp, "%.1f%%", ratio);
2603 if (ttrace->pfmaj)
2604 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2605 if (ttrace->pfmin)
2606 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2607 if (trace->sched)
2608 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2609 else if (fputc('\n', fp) != EOF)
2610 ++printed;
2612 printed += thread__dump_stats(ttrace, trace, fp);
2614 return printed;
2617 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2619 return ttrace ? ttrace->nr_events : 0;
/*
 * Resort description named "threads": entries are compared by the number of
 * events of each entry's thread, obtained with thread__nr_events() on
 * thread->priv (which yields 0 when priv is NULL).  Each entry only stores
 * the struct thread recovered from the rb node being visited.
 * NOTE(review): the resulting sort direction depends on how DEFINE_RESORT_RB
 * uses the comparison expression -- confirm in rb_resort.h.
 */
2622 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2623 struct thread *thread;
2626 entry->thread = rb_entry(nd, struct thread, rb_node);
2629 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2631 DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
2632 size_t printed = trace__fprintf_threads_header(fp);
2633 struct rb_node *nd;
2635 if (threads == NULL) {
2636 fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2637 return 0;
2640 resort_rb__for_each_entry(nd, threads)
2641 printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2643 resort_rb__delete(threads);
2645 return printed;
2648 static int trace__set_duration(const struct option *opt, const char *str,
2649 int unset __maybe_unused)
2651 struct trace *trace = opt->value;
2653 trace->duration_filter = atof(str);
2654 return 0;
2657 static int trace__set_filter_pids(const struct option *opt, const char *str,
2658 int unset __maybe_unused)
2660 int ret = -1;
2661 size_t i;
2662 struct trace *trace = opt->value;
2664 * FIXME: introduce a intarray class, plain parse csv and create a
2665 * { int nr, int entries[] } struct...
2667 struct intlist *list = intlist__new(str);
2669 if (list == NULL)
2670 return -1;
2672 i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2673 trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2675 if (trace->filter_pids.entries == NULL)
2676 goto out;
2678 trace->filter_pids.entries[0] = getpid();
2680 for (i = 1; i < trace->filter_pids.nr; ++i)
2681 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2683 intlist__delete(list);
2684 ret = 0;
2685 out:
2686 return ret;
2689 static int trace__open_output(struct trace *trace, const char *filename)
2691 struct stat st;
2693 if (!stat(filename, &st) && st.st_size) {
2694 char oldname[PATH_MAX];
2696 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2697 unlink(oldname);
2698 rename(filename, oldname);
2701 trace->output = fopen(filename, "w");
2703 return trace->output == NULL ? -errno : 0;
2706 static int parse_pagefaults(const struct option *opt, const char *str,
2707 int unset __maybe_unused)
2709 int *trace_pgfaults = opt->value;
2711 if (strcmp(str, "all") == 0)
2712 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2713 else if (strcmp(str, "maj") == 0)
2714 *trace_pgfaults |= TRACE_PFMAJ;
2715 else if (strcmp(str, "min") == 0)
2716 *trace_pgfaults |= TRACE_PFMIN;
2717 else
2718 return -1;
2720 return 0;
2723 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2725 struct perf_evsel *evsel;
2727 evlist__for_each_entry(evlist, evsel)
2728 evsel->handler = handler;
/*
 * cmd_trace - entry point of 'perf trace'.
 *
 * Builds a struct trace with its defaults, parses the command line into it,
 * then dispatches: "record" as first remaining argument goes to
 * trace__record(), an input file (-i) goes to trace__replay(), anything
 * else is traced live by trace__run() with the remaining arguments as the
 * optional workload.
 *
 * Returns the error code of the step that failed or of the selected mode.
 * Note that the two early 'return' statements below bypass the out_close/out
 * labels.
 */
2731 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
2733 const char *trace_usage[] = {
2734 "perf trace [<options>] [<command>]",
2735 "perf trace [<options>] -- <command> [<options>]",
2736 "perf trace record [<options>] [<command>]",
2737 "perf trace record [<options>] -- <command> [<options>]",
2738 NULL
/* Defaults.  The UINT_MAX values of opts.mmap_pages and max_stack are tested after option parsing to tell whether the user set them. */
2740 struct trace trace = {
2741 .syscalls = {
2742 . max = -1,
2744 .opts = {
2745 .target = {
2746 .uid = UINT_MAX,
2747 .uses_mmap = true,
2749 .user_freq = UINT_MAX,
2750 .user_interval = ULLONG_MAX,
2751 .no_buffering = true,
2752 .mmap_pages = UINT_MAX,
2753 .proc_map_timeout = 500,
2755 .output = stderr,
2756 .show_comm = true,
2757 .trace_syscalls = true,
2758 .kernel_syscallchains = false,
2759 .max_stack = UINT_MAX,
2761 const char *output_name = NULL;
2762 const char *ev_qualifier_str = NULL;
2763 const struct option trace_options[] = {
2764 OPT_CALLBACK(0, "event", &trace.evlist, "event",
2765 "event selector. use 'perf list' to list available events",
2766 parse_events_option),
2767 OPT_BOOLEAN(0, "comm", &trace.show_comm,
2768 "show the thread COMM next to its id"),
2769 OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2770 OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
2771 OPT_STRING('o', "output", &output_name, "file", "output file name"),
2772 OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2773 OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2774 "trace events on existing process id"),
2775 OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2776 "trace events on existing thread id"),
2777 OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2778 "pids to filter (by the kernel)", trace__set_filter_pids),
2779 OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2780 "system-wide collection from all CPUs"),
2781 OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2782 "list of cpus to monitor"),
2783 OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2784 "child tasks do not inherit counters"),
2785 OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2786 "number of mmap data pages",
2787 perf_evlist__parse_mmap_pages),
2788 OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2789 "user to profile"),
2790 OPT_CALLBACK(0, "duration", &trace, "float",
2791 "show only events with duration > N.M ms",
2792 trace__set_duration),
2793 OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2794 OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2795 OPT_BOOLEAN('T', "time", &trace.full_time,
2796 "Show full timestamp, not time relative to first start"),
2797 OPT_BOOLEAN('s', "summary", &trace.summary_only,
2798 "Show only syscall summary with statistics"),
2799 OPT_BOOLEAN('S', "with-summary", &trace.summary,
2800 "Show all syscalls and summary with statistics"),
2801 OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2802 "Trace pagefaults", parse_pagefaults, "maj"),
2803 OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2804 OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2805 OPT_CALLBACK(0, "call-graph", &trace.opts,
2806 "record_mode[,record_size]", record_callchain_help,
2807 &record_parse_callchain_opt),
2808 OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
2809 "Show the kernel callchains on the syscall exit path"),
2810 OPT_UINTEGER(0, "min-stack", &trace.min_stack,
2811 "Set the minimum stack depth when parsing the callchain, "
2812 "anything below the specified depth will be ignored."),
2813 OPT_UINTEGER(0, "max-stack", &trace.max_stack,
2814 "Set the maximum stack depth when parsing the callchain, "
2815 "anything beyond the specified depth will be ignored. "
2816 "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
2817 OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2818 "per thread proc mmap processing timeout in ms"),
2819 OPT_END()
2821 bool __maybe_unused max_stack_user_set = true;
2822 bool mmap_pages_user_set = true;
2823 const char * const trace_subcommands[] = { "record", NULL };
2824 int err;
2825 char bf[BUFSIZ];
/* Route SIGSEGV and SIGFPE to sighandler_dump_stack. */
2827 signal(SIGSEGV, sighandler_dump_stack);
2828 signal(SIGFPE, sighandler_dump_stack);
2830 trace.evlist = perf_evlist__new();
2831 trace.sctbl = syscalltbl__new();
2833 if (trace.evlist == NULL || trace.sctbl == NULL) {
2834 pr_err("Not enough memory to run!\n");
2835 err = -ENOMEM;
2836 goto out;
/* Parse the options; argc/argv are left holding what follows them (the "record" subcommand and/or the workload). */
2839 argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2840 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2842 err = bpf__setup_stdout(trace.evlist);
2843 if (err) {
2844 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
2845 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
2846 goto out;
2849 err = -1;
2851 if (trace.trace_pgfaults) {
2852 trace.opts.sample_address = true;
2853 trace.opts.sample_time = true;
/* Detect whether -m/--max-stack were given by comparing against the UINT_MAX defaults. */
2856 if (trace.opts.mmap_pages == UINT_MAX)
2857 mmap_pages_user_set = false;
2859 if (trace.max_stack == UINT_MAX) {
2860 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
2861 max_stack_user_set = false;
/* Asking for a stack depth limit while tracing syscalls without --call-graph turns on "dwarf" callchains when unwinding support is built in. */
2864 #ifdef HAVE_DWARF_UNWIND_SUPPORT
2865 if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
2866 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
2867 #endif
2869 if (callchain_param.enabled) {
2870 if (!mmap_pages_user_set && geteuid() == 0)
2871 trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
2873 symbol_conf.use_callchain = true;
/* Events added through --event all get trace__event_handler. */
2876 if (trace.evlist->nr_entries > 0)
2877 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
/* "perf trace record ..." is handed over to trace__record() with the subcommand word stripped. */
2879 if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2880 return trace__record(&trace, argc-1, &argv[1]);
2882 /* summary_only implies summary option, but don't overwrite summary if set */
2883 if (trace.summary_only)
2884 trace.summary = trace.summary_only;
2886 if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2887 trace.evlist->nr_entries == 0 /* Was --events used? */) {
2888 pr_err("Please specify something to trace.\n");
2889 return -1;
2892 if (!trace.trace_syscalls && ev_qualifier_str) {
2893 pr_err("The -e option can't be used with --no-syscalls.\n");
2894 goto out;
2897 if (output_name != NULL) {
2898 err = trace__open_output(&trace, output_name);
2899 if (err < 0) {
2900 perror("failed to create output file");
2901 goto out;
2905 trace.open_id = syscalltbl__id(trace.sctbl, "open");
/* -e: a leading '!' sets not_ev_qualifier and is skipped; the rest becomes the qualifier string list, which is then validated. */
2907 if (ev_qualifier_str != NULL) {
2908 const char *s = ev_qualifier_str;
2909 struct strlist_config slist_config = {
2910 .dirname = system_path(STRACE_GROUPS_DIR),
2913 trace.not_ev_qualifier = *s == '!';
2914 if (trace.not_ev_qualifier)
2915 ++s;
2916 trace.ev_qualifier = strlist__new(s, &slist_config);
2917 if (trace.ev_qualifier == NULL) {
2918 fputs("Not enough memory to parse event qualifier",
2919 trace.output);
2920 err = -ENOMEM;
2921 goto out_close;
2924 err = trace__validate_ev_qualifier(&trace);
2925 if (err)
2926 goto out_close;
2929 err = target__validate(&trace.opts.target);
2930 if (err) {
2931 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2932 fprintf(trace.output, "%s", bf);
2933 goto out_close;
2936 err = target__parse_uid(&trace.opts.target);
2937 if (err) {
2938 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2939 fprintf(trace.output, "%s", bf);
2940 goto out_close;
/* Neither a workload nor an explicit target was given: trace system wide. */
2943 if (!argc && target__none(&trace.opts.target))
2944 trace.opts.target.system_wide = true;
/* -i given: replay that file; otherwise trace live. */
2946 if (input_name)
2947 err = trace__replay(&trace);
2948 else
2949 err = trace__run(&trace, argc, argv);
/* Only close the stream when it was opened here for -o; the default output is stderr. */
2951 out_close:
2952 if (output_name != NULL)
2953 fclose(trace.output);
2954 out:
2955 return err;