 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
8 #define _FILE_OFFSET_BITS 64
14 #include "util/build-id.h"
15 #include "util/util.h"
16 #include "util/parse-options.h"
17 #include "util/parse-events.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/session.h"
25 #include "util/symbol.h"
26 #include "util/cpumap.h"
27 #include "util/thread_map.h"
38 static u64 user_interval
= ULLONG_MAX
;
39 static u64 default_interval
= 0;
41 static unsigned int page_size
;
42 static unsigned int mmap_pages
= UINT_MAX
;
43 static unsigned int user_freq
= UINT_MAX
;
44 static int freq
= 1000;
46 static int pipe_output
= 0;
47 static const char *output_name
= NULL
;
48 static bool group
= false;
49 static int realtime_prio
= 0;
50 static bool nodelay
= false;
51 static bool raw_samples
= false;
52 static bool sample_id_all_avail
= true;
53 static bool system_wide
= false;
54 static pid_t target_pid
= -1;
55 static pid_t target_tid
= -1;
56 static pid_t child_pid
= -1;
57 static bool no_inherit
= false;
58 static enum write_mode_t write_mode
= WRITE_FORCE
;
59 static bool call_graph
= false;
60 static bool inherit_stat
= false;
61 static bool no_samples
= false;
62 static bool sample_address
= false;
63 static bool sample_time
= false;
64 static bool no_buildid
= false;
65 static bool no_buildid_cache
= false;
66 static struct perf_evlist
*evsel_list
;
68 static long samples
= 0;
69 static u64 bytes_written
= 0;
71 static int file_new
= 1;
72 static off_t post_processing_offset
;
74 static struct perf_session
*session
;
75 static const char *cpu_list
;
77 static void advance_output(size_t size
)
79 bytes_written
+= size
;
82 static void write_output(void *buf
, size_t size
)
85 int ret
= write(output
, buf
, size
);
88 die("failed to write");
97 static int process_synthesized_event(union perf_event
*event
,
98 struct perf_sample
*sample __used
,
99 struct perf_session
*self __used
)
101 write_output(event
, event
->header
.size
);
105 static void mmap_read(struct perf_mmap
*md
)
107 unsigned int head
= perf_mmap__read_head(md
);
108 unsigned int old
= md
->prev
;
109 unsigned char *data
= md
->base
+ page_size
;
120 if ((old
& md
->mask
) + size
!= (head
& md
->mask
)) {
121 buf
= &data
[old
& md
->mask
];
122 size
= md
->mask
+ 1 - (old
& md
->mask
);
125 write_output(buf
, size
);
128 buf
= &data
[old
& md
->mask
];
132 write_output(buf
, size
);
135 perf_mmap__write_tail(md
, old
);
/* Set from signal context; polled by the main record loop. */
static volatile int done = 0;
/* Last signal received, or -1; consulted by sig_atexit() for re-raise. */
static volatile int signr = -1;

/*
 * SIGCHLD/SIGINT/SIGUSR1 handler: remember which signal fired and ask the
 * record loop to finish.
 */
static void sig_handler(int sig)
{
	done = 1;
	signr = sig;
}
147 static void sig_atexit(void)
150 kill(child_pid
, SIGTERM
);
152 if (signr
== -1 || signr
== SIGUSR1
)
155 signal(signr
, SIG_DFL
);
156 kill(getpid(), signr
);
159 static void config_attr(struct perf_evsel
*evsel
, struct perf_evlist
*evlist
)
161 struct perf_event_attr
*attr
= &evsel
->attr
;
162 int track
= !evsel
->idx
; /* only the first counter needs these */
165 attr
->inherit
= !no_inherit
;
166 attr
->read_format
= PERF_FORMAT_TOTAL_TIME_ENABLED
|
167 PERF_FORMAT_TOTAL_TIME_RUNNING
|
170 attr
->sample_type
|= PERF_SAMPLE_IP
| PERF_SAMPLE_TID
;
172 if (evlist
->nr_entries
> 1)
173 attr
->sample_type
|= PERF_SAMPLE_ID
;
176 * We default some events to a 1 default interval. But keep
177 * it a weak assumption overridable by the user.
179 if (!attr
->sample_period
|| (user_freq
!= UINT_MAX
&&
180 user_interval
!= ULLONG_MAX
)) {
182 attr
->sample_type
|= PERF_SAMPLE_PERIOD
;
184 attr
->sample_freq
= freq
;
186 attr
->sample_period
= default_interval
;
191 attr
->sample_freq
= 0;
194 attr
->inherit_stat
= 1;
196 if (sample_address
) {
197 attr
->sample_type
|= PERF_SAMPLE_ADDR
;
198 attr
->mmap_data
= track
;
202 attr
->sample_type
|= PERF_SAMPLE_CALLCHAIN
;
205 attr
->sample_type
|= PERF_SAMPLE_CPU
;
207 if (sample_id_all_avail
&&
208 (sample_time
|| system_wide
|| !no_inherit
|| cpu_list
))
209 attr
->sample_type
|= PERF_SAMPLE_TIME
;
212 attr
->sample_type
|= PERF_SAMPLE_TIME
;
213 attr
->sample_type
|= PERF_SAMPLE_RAW
;
214 attr
->sample_type
|= PERF_SAMPLE_CPU
;
219 attr
->wakeup_events
= 1;
225 if (target_pid
== -1 && target_tid
== -1 && !system_wide
) {
227 attr
->enable_on_exec
= 1;
231 static bool perf_evlist__equal(struct perf_evlist
*evlist
,
232 struct perf_evlist
*other
)
234 struct perf_evsel
*pos
, *pair
;
236 if (evlist
->nr_entries
!= other
->nr_entries
)
239 pair
= list_entry(other
->entries
.next
, struct perf_evsel
, node
);
241 list_for_each_entry(pos
, &evlist
->entries
, node
) {
242 if (memcmp(&pos
->attr
, &pair
->attr
, sizeof(pos
->attr
) != 0))
244 pair
= list_entry(pair
->node
.next
, struct perf_evsel
, node
);
250 static void open_counters(struct perf_evlist
*evlist
)
252 struct perf_evsel
*pos
;
254 if (evlist
->cpus
->map
[0] < 0)
257 list_for_each_entry(pos
, &evlist
->entries
, node
) {
258 struct perf_event_attr
*attr
= &pos
->attr
;
260 * Check if parse_single_tracepoint_event has already asked for
263 * XXX this is kludgy but short term fix for problems introduced by
264 * eac23d1c that broke 'perf script' by having different sample_types
265 * when using multiple tracepoint events when we use a perf binary
266 * that tries to use sample_id_all on an older kernel.
268 * We need to move counter creation to perf_session, support
269 * different sample_types, etc.
271 bool time_needed
= attr
->sample_type
& PERF_SAMPLE_TIME
;
273 config_attr(pos
, evlist
);
275 attr
->sample_id_all
= sample_id_all_avail
? 1 : 0;
277 if (perf_evsel__open(pos
, evlist
->cpus
, evlist
->threads
, group
) < 0) {
280 if (err
== EPERM
|| err
== EACCES
) {
281 ui__warning_paranoid();
283 } else if (err
== ENODEV
&& cpu_list
) {
284 die("No such device - did you specify"
285 " an out-of-range profile CPU?\n");
286 } else if (err
== EINVAL
&& sample_id_all_avail
) {
288 * Old kernel, no attr->sample_id_type_all field
290 sample_id_all_avail
= false;
291 if (!sample_time
&& !raw_samples
&& !time_needed
)
292 attr
->sample_type
&= ~PERF_SAMPLE_TIME
;
294 goto retry_sample_id
;
298 * If it's cycles then fall back to hrtimer
299 * based cpu-clock-tick sw counter, which
300 * is always available even if no PMU support:
302 if (attr
->type
== PERF_TYPE_HARDWARE
303 && attr
->config
== PERF_COUNT_HW_CPU_CYCLES
) {
306 ui__warning("The cycles event is not supported, "
307 "trying to fall back to cpu-clock-ticks\n");
308 attr
->type
= PERF_TYPE_SOFTWARE
;
309 attr
->config
= PERF_COUNT_SW_CPU_CLOCK
;
314 ui__warning("The %s event is not supported.\n",
320 error("sys_perf_event_open() syscall returned with %d (%s). /bin/dmesg may provide additional information.\n",
323 #if defined(__i386__) || defined(__x86_64__)
324 if (attr
->type
== PERF_TYPE_HARDWARE
&& err
== EOPNOTSUPP
)
325 die("No hardware sampling interrupt available."
326 " No APIC? If so then you can boot the kernel"
327 " with the \"lapic\" boot parameter to"
328 " force-enable it.\n");
331 die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
335 if (perf_evlist__set_filters(evlist
)) {
336 error("failed to set filter with %d (%s)\n", errno
,
341 if (perf_evlist__mmap(evlist
, mmap_pages
, false) < 0)
342 die("failed to mmap with %d (%s)\n", errno
, strerror(errno
));
345 session
->evlist
= evlist
;
347 if (!perf_evlist__equal(session
->evlist
, evlist
)) {
348 fprintf(stderr
, "incompatible append\n");
353 perf_session__update_sample_type(session
);
356 static int process_buildids(void)
358 u64 size
= lseek(output
, 0, SEEK_CUR
);
363 session
->fd
= output
;
364 return __perf_session__process_events(session
, post_processing_offset
,
365 size
- post_processing_offset
,
366 size
, &build_id__mark_dso_hit_ops
);
369 static void atexit_header(void)
372 session
->header
.data_size
+= bytes_written
;
376 perf_session__write_header(session
, evsel_list
, output
, true);
377 perf_session__delete(session
);
378 perf_evlist__delete(evsel_list
);
383 static void perf_event__synthesize_guest_os(struct machine
*machine
, void *data
)
386 struct perf_session
*psession
= data
;
388 if (machine__is_host(machine
))
392 *As for guest kernel when processing subcommand record&report,
393 *we arrange module mmap prior to guest kernel mmap and trigger
394 *a preload dso because default guest module symbols are loaded
395 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
396 *method is used to avoid symbol missing when the first addr is
397 *in module instead of in guest kernel.
399 err
= perf_event__synthesize_modules(process_synthesized_event
,
402 pr_err("Couldn't record guest kernel [%d]'s reference"
403 " relocation symbol.\n", machine
->pid
);
406 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
407 * have no _text sometimes.
409 err
= perf_event__synthesize_kernel_mmap(process_synthesized_event
,
410 psession
, machine
, "_text");
412 err
= perf_event__synthesize_kernel_mmap(process_synthesized_event
,
416 pr_err("Couldn't record guest kernel [%d]'s reference"
417 " relocation symbol.\n", machine
->pid
);
420 static struct perf_event_header finished_round_event
= {
421 .size
= sizeof(struct perf_event_header
),
422 .type
= PERF_RECORD_FINISHED_ROUND
,
425 static void mmap_read_all(void)
429 for (i
= 0; i
< evsel_list
->nr_mmaps
; i
++) {
430 if (evsel_list
->mmap
[i
].base
)
431 mmap_read(&evsel_list
->mmap
[i
]);
434 if (perf_header__has_feat(&session
->header
, HEADER_TRACE_INFO
))
435 write_output(&finished_round_event
, sizeof(finished_round_event
));
/*
 * Main body of 'perf record': set up the output file and session, fork the
 * workload (if any), open and mmap the counters, synthesize the initial
 * event stream (attrs, kernel/module mmaps, threads), then loop draining
 * the mmap buffers until the workload finishes or a signal arrives.
 *
 * NOTE(review): this block is a mangled extraction — many lines (braces,
 * if-headers, some strings) are missing and line-number prefixes leaked
 * into the text.  Code left byte-identical; only comments added.
 */
438 static int __cmd_record(int argc
, const char **argv
)
443 unsigned long waking
= 0;
444 int child_ready_pipe
[2], go_pipe
[2];
445 const bool forks
= argc
> 0;
447 struct machine
*machine
;
449 page_size
= sysconf(_SC_PAGE_SIZE
);
/* install signal handlers so we can stop cleanly and reap the child */
452 signal(SIGCHLD
, sig_handler
);
453 signal(SIGINT
, sig_handler
);
454 signal(SIGUSR1
, sig_handler
);
/* child_ready_pipe/go_pipe synchronize the fork+exec handshake below */
456 if (forks
&& (pipe(child_ready_pipe
) < 0 || pipe(go_pipe
) < 0)) {
457 perror("failed to create pipes");
/* pick the output: stdout pipe, "-", or a regular perf.data file */
462 if (!fstat(STDOUT_FILENO
, &st
) && S_ISFIFO(st
.st_mode
))
465 output_name
= "perf.data";
468 if (!strcmp(output_name
, "-"))
470 else if (!stat(output_name
, &st
) && st
.st_size
) {
471 if (write_mode
== WRITE_FORCE
) {
472 char oldname
[PATH_MAX
];
/* keep the previous data file around as <name>.old */
473 snprintf(oldname
, sizeof(oldname
), "%s.old",
476 rename(output_name
, oldname
);
478 } else if (write_mode
== WRITE_APPEND
) {
/* appending to a nonexistent/empty file degenerates to force mode */
479 write_mode
= WRITE_FORCE
;
483 flags
= O_CREAT
|O_RDWR
;
484 if (write_mode
== WRITE_APPEND
)
490 output
= STDOUT_FILENO
;
492 output
= open(output_name
, flags
, S_IRUSR
| S_IWUSR
);
494 perror("failed to create output file");
498 session
= perf_session__new(output_name
, O_WRONLY
,
499 write_mode
== WRITE_FORCE
, false, NULL
);
500 if (session
== NULL
) {
501 pr_err("Not enough memory for reading perf file header\n");
506 perf_header__set_feat(&session
->header
, HEADER_BUILD_ID
);
/* append mode: read back the existing header first */
509 err
= perf_session__read_header(session
, output
);
511 goto out_delete_session
;
514 if (have_tracepoints(&evsel_list
->entries
))
515 perf_header__set_feat(&session
->header
, HEADER_TRACE_INFO
);
517 /* 512 kiB: default amount of unprivileged mlocked memory */
518 if (mmap_pages
== UINT_MAX
)
519 mmap_pages
= (512 * 1024) / page_size
;
/* fork the workload; the child stalls on go_pipe until counters are up */
524 perror("failed to fork");
531 close(child_ready_pipe
[0]);
533 fcntl(go_pipe
[0], F_SETFD
, FD_CLOEXEC
);
536 * Do a dummy execvp to get the PLT entry resolved,
537 * so we avoid the resolver overhead on the real
540 execvp("", (char **)argv
);
543 * Tell the parent we're ready to go
545 close(child_ready_pipe
[1]);
548 * Wait until the parent tells us to go.
550 if (read(go_pipe
[0], &buf
, 1) == -1)
551 perror("unable to read pipe");
553 execvp(argv
[0], (char **)argv
);
/* exec failed: notify the parent via SIGUSR1 */
556 kill(getppid(), SIGUSR1
);
/* parent: profile only the forked child when no explicit target given */
560 if (!system_wide
&& target_tid
== -1 && target_pid
== -1)
561 evsel_list
->threads
->map
[0] = child_pid
;
563 close(child_ready_pipe
[1]);
566 * wait for child to settle
568 if (read(child_ready_pipe
[0], &buf
, 1) == -1) {
569 perror("unable to read pipe");
572 close(child_ready_pipe
[0]);
575 open_counters(evsel_list
);
578 * perf_session__delete(session) will be called at atexit_header()
580 atexit(atexit_header
);
583 err
= perf_header__write_pipe(output
);
586 } else if (file_new
) {
587 err
= perf_session__write_header(session
, evsel_list
,
/* remember where synthesized events end; build-id pass starts here */
593 post_processing_offset
= lseek(output
, 0, SEEK_CUR
);
596 err
= perf_session__synthesize_attrs(session
,
597 process_synthesized_event
);
599 pr_err("Couldn't synthesize attrs.\n");
603 err
= perf_event__synthesize_event_types(process_synthesized_event
,
606 pr_err("Couldn't synthesize event_types.\n");
610 if (have_tracepoints(&evsel_list
->entries
)) {
612 * FIXME err <= 0 here actually means that
613 * there were no tracepoints so its not really
614 * an error, just that we don't need to
615 * synthesize anything. We really have to
616 * return this more properly and also
617 * propagate errors that now are calling die()
619 err
= perf_event__synthesize_tracing_data(output
, evsel_list
,
620 process_synthesized_event
,
623 pr_err("Couldn't record tracing data.\n");
630 machine
= perf_session__find_host_machine(session
);
632 pr_err("Couldn't find native kernel information.\n");
/* kernel mmap: try _text, fall back to _stext */
636 err
= perf_event__synthesize_kernel_mmap(process_synthesized_event
,
637 session
, machine
, "_text");
639 err
= perf_event__synthesize_kernel_mmap(process_synthesized_event
,
640 session
, machine
, "_stext");
642 pr_err("Couldn't record kernel reference relocation symbol\n"
643 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
644 "Check /proc/kallsyms permission or run as root.\n");
646 err
= perf_event__synthesize_modules(process_synthesized_event
,
649 pr_err("Couldn't record kernel module information.\n"
650 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
651 "Check /proc/modules permission or run as root.\n");
654 perf_session__process_machines(session
,
655 perf_event__synthesize_guest_os
);
658 perf_event__synthesize_thread_map(evsel_list
->threads
,
659 process_synthesized_event
,
662 perf_event__synthesize_threads(process_synthesized_event
,
666 struct sched_param param
;
668 param
.sched_priority
= realtime_prio
;
/* NOTE(review): "¶m" below is mojibake for "&param" (HTML entity
 * &para; leaked into the text) — fix the encoding, not the logic. */
669 if (sched_setscheduler(0, SCHED_FIFO
, ¶m
)) {
670 pr_err("Could not set realtime priority.\n");
675 perf_evlist__enable(evsel_list
);
/* main loop: sleep in poll() only when no new samples arrived */
688 if (hits
== samples
) {
691 err
= poll(evsel_list
->pollfd
, evsel_list
->nr_fds
, -1);
696 perf_evlist__disable(evsel_list
);
699 if (quiet
|| signr
== SIGUSR1
)
702 fprintf(stderr
, "[ perf record: Woken up %ld times to write data ]\n", waking
);
705 * Approximate RIP event size: 24 bytes.
708 "[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64
" samples) ]\n",
709 (double)bytes_written
/ 1024.0 / 1024.0,
716 perf_session__delete(session
);
720 static const char * const record_usage
[] = {
721 "perf record [<options>] [<command>]",
722 "perf record [<options>] -- <command> [<options>]",
726 static bool force
, append_file
;
728 const struct option record_options
[] = {
729 OPT_CALLBACK('e', "event", &evsel_list
, "event",
730 "event selector. use 'perf list' to list available events",
731 parse_events_option
),
732 OPT_CALLBACK(0, "filter", &evsel_list
, "filter",
733 "event filter", parse_filter
),
734 OPT_INTEGER('p', "pid", &target_pid
,
735 "record events on existing process id"),
736 OPT_INTEGER('t', "tid", &target_tid
,
737 "record events on existing thread id"),
738 OPT_INTEGER('r', "realtime", &realtime_prio
,
739 "collect data with this RT SCHED_FIFO priority"),
740 OPT_BOOLEAN('D', "no-delay", &nodelay
,
741 "collect data without buffering"),
742 OPT_BOOLEAN('R', "raw-samples", &raw_samples
,
743 "collect raw sample records from all opened counters"),
744 OPT_BOOLEAN('a', "all-cpus", &system_wide
,
745 "system-wide collection from all CPUs"),
746 OPT_BOOLEAN('A', "append", &append_file
,
747 "append to the output file to do incremental profiling"),
748 OPT_STRING('C', "cpu", &cpu_list
, "cpu",
749 "list of cpus to monitor"),
750 OPT_BOOLEAN('f', "force", &force
,
751 "overwrite existing data file (deprecated)"),
752 OPT_U64('c', "count", &user_interval
, "event period to sample"),
753 OPT_STRING('o', "output", &output_name
, "file",
755 OPT_BOOLEAN('i', "no-inherit", &no_inherit
,
756 "child tasks do not inherit counters"),
757 OPT_UINTEGER('F', "freq", &user_freq
, "profile at this frequency"),
758 OPT_UINTEGER('m', "mmap-pages", &mmap_pages
, "number of mmap data pages"),
759 OPT_BOOLEAN(0, "group", &group
,
760 "put the counters into a counter group"),
761 OPT_BOOLEAN('g', "call-graph", &call_graph
,
762 "do call-graph (stack chain/backtrace) recording"),
763 OPT_INCR('v', "verbose", &verbose
,
764 "be more verbose (show counter open errors, etc)"),
765 OPT_BOOLEAN('q', "quiet", &quiet
, "don't print any message"),
766 OPT_BOOLEAN('s', "stat", &inherit_stat
,
767 "per thread counts"),
768 OPT_BOOLEAN('d', "data", &sample_address
,
770 OPT_BOOLEAN('T', "timestamp", &sample_time
, "Sample timestamps"),
771 OPT_BOOLEAN('n', "no-samples", &no_samples
,
773 OPT_BOOLEAN('N', "no-buildid-cache", &no_buildid_cache
,
774 "do not update the buildid cache"),
775 OPT_BOOLEAN('B', "no-buildid", &no_buildid
,
776 "do not collect buildids in perf.data"),
777 OPT_CALLBACK('G', "cgroup", &evsel_list
, "name",
778 "monitor event in cgroup name only",
783 int cmd_record(int argc
, const char **argv
, const char *prefix __used
)
786 struct perf_evsel
*pos
;
788 evsel_list
= perf_evlist__new(NULL
, NULL
);
789 if (evsel_list
== NULL
)
792 argc
= parse_options(argc
, argv
, record_options
, record_usage
,
793 PARSE_OPT_STOP_AT_NON_OPTION
);
794 if (!argc
&& target_pid
== -1 && target_tid
== -1 &&
795 !system_wide
&& !cpu_list
)
796 usage_with_options(record_usage
, record_options
);
798 if (force
&& append_file
) {
799 fprintf(stderr
, "Can't overwrite and append at the same time."
800 " You need to choose between -f and -A");
801 usage_with_options(record_usage
, record_options
);
802 } else if (append_file
) {
803 write_mode
= WRITE_APPEND
;
805 write_mode
= WRITE_FORCE
;
808 if (nr_cgroups
&& !system_wide
) {
809 fprintf(stderr
, "cgroup monitoring only available in"
810 " system-wide mode\n");
811 usage_with_options(record_usage
, record_options
);
816 if (symbol_conf
.kptr_restrict
)
818 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
819 "check /proc/sys/kernel/kptr_restrict.\n\n"
820 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
821 "file is not found in the buildid cache or in the vmlinux path.\n\n"
822 "Samples in kernel modules won't be resolved at all.\n\n"
823 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
824 "even with a suitable vmlinux or kallsyms file.\n\n");
826 if (no_buildid_cache
|| no_buildid
)
827 disable_buildid_cache();
829 if (evsel_list
->nr_entries
== 0 &&
830 perf_evlist__add_default(evsel_list
) < 0) {
831 pr_err("Not enough memory for event selector list\n");
832 goto out_symbol_exit
;
835 if (target_pid
!= -1)
836 target_tid
= target_pid
;
838 if (perf_evlist__create_maps(evsel_list
, target_pid
,
839 target_tid
, cpu_list
) < 0)
840 usage_with_options(record_usage
, record_options
);
842 list_for_each_entry(pos
, &evsel_list
->entries
, node
) {
843 if (perf_evsel__alloc_fd(pos
, evsel_list
->cpus
->nr
,
844 evsel_list
->threads
->nr
) < 0)
846 if (perf_header__push_event(pos
->attr
.config
, event_name(pos
)))
850 if (perf_evlist__alloc_pollfd(evsel_list
) < 0)
853 if (user_interval
!= ULLONG_MAX
)
854 default_interval
= user_interval
;
855 if (user_freq
!= UINT_MAX
)
859 * User specified count overrides default frequency.
861 if (default_interval
)
864 default_interval
= freq
;
866 fprintf(stderr
, "frequency and count are zero, aborting\n");
871 err
= __cmd_record(argc
, argv
);
873 perf_evlist__delete_maps(evsel_list
);