/*
 * builtin-record.c
 *
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
8 #define _FILE_OFFSET_BITS 64
14 #include "util/build-id.h"
15 #include "util/util.h"
16 #include "util/parse-options.h"
17 #include "util/parse-events.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/session.h"
25 #include "util/symbol.h"
26 #include "util/cpumap.h"
27 #include "util/thread_map.h"
38 static u64 user_interval
= ULLONG_MAX
;
39 static u64 default_interval
= 0;
41 static unsigned int page_size
;
42 static unsigned int mmap_pages
= UINT_MAX
;
43 static unsigned int user_freq
= UINT_MAX
;
44 static int freq
= 1000;
46 static int pipe_output
= 0;
47 static const char *output_name
= NULL
;
48 static bool group
= false;
49 static int realtime_prio
= 0;
50 static bool nodelay
= false;
51 static bool raw_samples
= false;
52 static bool sample_id_all_avail
= true;
53 static bool system_wide
= false;
54 static pid_t target_pid
= -1;
55 static pid_t target_tid
= -1;
56 static pid_t child_pid
= -1;
57 static bool no_inherit
= false;
58 static enum write_mode_t write_mode
= WRITE_FORCE
;
59 static bool call_graph
= false;
60 static bool inherit_stat
= false;
61 static bool no_samples
= false;
62 static bool sample_address
= false;
63 static bool sample_time
= false;
64 static bool no_buildid
= false;
65 static bool no_buildid_cache
= false;
66 static struct perf_evlist
*evsel_list
;
68 static long samples
= 0;
69 static u64 bytes_written
= 0;
71 static int file_new
= 1;
72 static off_t post_processing_offset
;
74 static struct perf_session
*session
;
75 static const char *cpu_list
;
77 static void advance_output(size_t size
)
79 bytes_written
+= size
;
82 static void write_output(void *buf
, size_t size
)
85 int ret
= write(output
, buf
, size
);
88 die("failed to write");
97 static int process_synthesized_event(union perf_event
*event
,
98 struct perf_sample
*sample __used
,
99 struct perf_session
*self __used
)
101 write_output(event
, event
->header
.size
);
105 static void mmap_read(struct perf_mmap
*md
)
107 unsigned int head
= perf_mmap__read_head(md
);
108 unsigned int old
= md
->prev
;
109 unsigned char *data
= md
->base
+ page_size
;
120 if ((old
& md
->mask
) + size
!= (head
& md
->mask
)) {
121 buf
= &data
[old
& md
->mask
];
122 size
= md
->mask
+ 1 - (old
& md
->mask
);
125 write_output(buf
, size
);
128 buf
= &data
[old
& md
->mask
];
132 write_output(buf
, size
);
135 perf_mmap__write_tail(md
, old
);
/* Set from the signal handler; polled by the main record loop. */
static volatile int done = 0;	/* request loop termination */
static volatile int signr = -1;	/* which signal caused termination, -1 if none */
141 static void sig_handler(int sig
)
147 static void sig_atexit(void)
150 kill(child_pid
, SIGTERM
);
152 if (signr
== -1 || signr
== SIGUSR1
)
155 signal(signr
, SIG_DFL
);
156 kill(getpid(), signr
);
159 static void config_attr(struct perf_evsel
*evsel
, struct perf_evlist
*evlist
)
161 struct perf_event_attr
*attr
= &evsel
->attr
;
162 int track
= !evsel
->idx
; /* only the first counter needs these */
164 attr
->inherit
= !no_inherit
;
165 attr
->read_format
= PERF_FORMAT_TOTAL_TIME_ENABLED
|
166 PERF_FORMAT_TOTAL_TIME_RUNNING
|
169 attr
->sample_type
|= PERF_SAMPLE_IP
| PERF_SAMPLE_TID
;
171 if (evlist
->nr_entries
> 1)
172 attr
->sample_type
|= PERF_SAMPLE_ID
;
175 * We default some events to a 1 default interval. But keep
176 * it a weak assumption overridable by the user.
178 if (!attr
->sample_period
|| (user_freq
!= UINT_MAX
&&
179 user_interval
!= ULLONG_MAX
)) {
181 attr
->sample_type
|= PERF_SAMPLE_PERIOD
;
183 attr
->sample_freq
= freq
;
185 attr
->sample_period
= default_interval
;
190 attr
->sample_freq
= 0;
193 attr
->inherit_stat
= 1;
195 if (sample_address
) {
196 attr
->sample_type
|= PERF_SAMPLE_ADDR
;
197 attr
->mmap_data
= track
;
201 attr
->sample_type
|= PERF_SAMPLE_CALLCHAIN
;
204 attr
->sample_type
|= PERF_SAMPLE_CPU
;
206 if (sample_id_all_avail
&&
207 (sample_time
|| system_wide
|| !no_inherit
|| cpu_list
))
208 attr
->sample_type
|= PERF_SAMPLE_TIME
;
211 attr
->sample_type
|= PERF_SAMPLE_TIME
;
212 attr
->sample_type
|= PERF_SAMPLE_RAW
;
213 attr
->sample_type
|= PERF_SAMPLE_CPU
;
218 attr
->wakeup_events
= 1;
224 if (target_pid
== -1 && target_tid
== -1 && !system_wide
) {
226 attr
->enable_on_exec
= 1;
230 static bool perf_evlist__equal(struct perf_evlist
*evlist
,
231 struct perf_evlist
*other
)
233 struct perf_evsel
*pos
, *pair
;
235 if (evlist
->nr_entries
!= other
->nr_entries
)
238 pair
= list_entry(other
->entries
.next
, struct perf_evsel
, node
);
240 list_for_each_entry(pos
, &evlist
->entries
, node
) {
241 if (memcmp(&pos
->attr
, &pair
->attr
, sizeof(pos
->attr
) != 0))
243 pair
= list_entry(pair
->node
.next
, struct perf_evsel
, node
);
249 static void open_counters(struct perf_evlist
*evlist
)
251 struct perf_evsel
*pos
;
253 if (evlist
->cpus
->map
[0] < 0)
256 list_for_each_entry(pos
, &evlist
->entries
, node
) {
257 struct perf_event_attr
*attr
= &pos
->attr
;
259 * Check if parse_single_tracepoint_event has already asked for
262 * XXX this is kludgy but short term fix for problems introduced by
263 * eac23d1c that broke 'perf script' by having different sample_types
264 * when using multiple tracepoint events when we use a perf binary
265 * that tries to use sample_id_all on an older kernel.
267 * We need to move counter creation to perf_session, support
268 * different sample_types, etc.
270 bool time_needed
= attr
->sample_type
& PERF_SAMPLE_TIME
;
272 config_attr(pos
, evlist
);
274 attr
->sample_id_all
= sample_id_all_avail
? 1 : 0;
276 if (perf_evsel__open(pos
, evlist
->cpus
, evlist
->threads
, group
) < 0) {
279 if (err
== EPERM
|| err
== EACCES
) {
280 ui__warning_paranoid();
282 } else if (err
== ENODEV
&& cpu_list
) {
283 die("No such device - did you specify"
284 " an out-of-range profile CPU?\n");
285 } else if (err
== EINVAL
&& sample_id_all_avail
) {
287 * Old kernel, no attr->sample_id_type_all field
289 sample_id_all_avail
= false;
290 if (!sample_time
&& !raw_samples
&& !time_needed
)
291 attr
->sample_type
&= ~PERF_SAMPLE_TIME
;
293 goto retry_sample_id
;
297 * If it's cycles then fall back to hrtimer
298 * based cpu-clock-tick sw counter, which
299 * is always available even if no PMU support:
301 if (attr
->type
== PERF_TYPE_HARDWARE
302 && attr
->config
== PERF_COUNT_HW_CPU_CYCLES
) {
305 ui__warning("The cycles event is not supported, "
306 "trying to fall back to cpu-clock-ticks\n");
307 attr
->type
= PERF_TYPE_SOFTWARE
;
308 attr
->config
= PERF_COUNT_SW_CPU_CLOCK
;
313 ui__warning("The %s event is not supported.\n",
319 error("sys_perf_event_open() syscall returned with %d (%s). /bin/dmesg may provide additional information.\n",
322 #if defined(__i386__) || defined(__x86_64__)
323 if (attr
->type
== PERF_TYPE_HARDWARE
&& err
== EOPNOTSUPP
)
324 die("No hardware sampling interrupt available."
325 " No APIC? If so then you can boot the kernel"
326 " with the \"lapic\" boot parameter to"
327 " force-enable it.\n");
330 die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
334 if (perf_evlist__set_filters(evlist
)) {
335 error("failed to set filter with %d (%s)\n", errno
,
340 if (perf_evlist__mmap(evlist
, mmap_pages
, false) < 0)
341 die("failed to mmap with %d (%s)\n", errno
, strerror(errno
));
344 session
->evlist
= evlist
;
346 if (!perf_evlist__equal(session
->evlist
, evlist
)) {
347 fprintf(stderr
, "incompatible append\n");
352 perf_session__update_sample_type(session
);
355 static int process_buildids(void)
357 u64 size
= lseek(output
, 0, SEEK_CUR
);
362 session
->fd
= output
;
363 return __perf_session__process_events(session
, post_processing_offset
,
364 size
- post_processing_offset
,
365 size
, &build_id__mark_dso_hit_ops
);
368 static void atexit_header(void)
371 session
->header
.data_size
+= bytes_written
;
375 perf_session__write_header(session
, evsel_list
, output
, true);
376 perf_session__delete(session
);
377 perf_evlist__delete(evsel_list
);
382 static void perf_event__synthesize_guest_os(struct machine
*machine
, void *data
)
385 struct perf_session
*psession
= data
;
387 if (machine__is_host(machine
))
391 *As for guest kernel when processing subcommand record&report,
392 *we arrange module mmap prior to guest kernel mmap and trigger
393 *a preload dso because default guest module symbols are loaded
394 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
395 *method is used to avoid symbol missing when the first addr is
396 *in module instead of in guest kernel.
398 err
= perf_event__synthesize_modules(process_synthesized_event
,
401 pr_err("Couldn't record guest kernel [%d]'s reference"
402 " relocation symbol.\n", machine
->pid
);
405 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
406 * have no _text sometimes.
408 err
= perf_event__synthesize_kernel_mmap(process_synthesized_event
,
409 psession
, machine
, "_text");
411 err
= perf_event__synthesize_kernel_mmap(process_synthesized_event
,
415 pr_err("Couldn't record guest kernel [%d]'s reference"
416 " relocation symbol.\n", machine
->pid
);
419 static struct perf_event_header finished_round_event
= {
420 .size
= sizeof(struct perf_event_header
),
421 .type
= PERF_RECORD_FINISHED_ROUND
,
424 static void mmap_read_all(void)
428 for (i
= 0; i
< evsel_list
->nr_mmaps
; i
++) {
429 if (evsel_list
->mmap
[i
].base
)
430 mmap_read(&evsel_list
->mmap
[i
]);
433 if (perf_header__has_feat(&session
->header
, HEADER_TRACE_INFO
))
434 write_output(&finished_round_event
, sizeof(finished_round_event
));
437 static int __cmd_record(int argc
, const char **argv
)
442 unsigned long waking
= 0;
443 int child_ready_pipe
[2], go_pipe
[2];
444 const bool forks
= argc
> 0;
446 struct machine
*machine
;
448 page_size
= sysconf(_SC_PAGE_SIZE
);
451 signal(SIGCHLD
, sig_handler
);
452 signal(SIGINT
, sig_handler
);
453 signal(SIGUSR1
, sig_handler
);
455 if (forks
&& (pipe(child_ready_pipe
) < 0 || pipe(go_pipe
) < 0)) {
456 perror("failed to create pipes");
461 if (!fstat(STDOUT_FILENO
, &st
) && S_ISFIFO(st
.st_mode
))
464 output_name
= "perf.data";
467 if (!strcmp(output_name
, "-"))
469 else if (!stat(output_name
, &st
) && st
.st_size
) {
470 if (write_mode
== WRITE_FORCE
) {
471 char oldname
[PATH_MAX
];
472 snprintf(oldname
, sizeof(oldname
), "%s.old",
475 rename(output_name
, oldname
);
477 } else if (write_mode
== WRITE_APPEND
) {
478 write_mode
= WRITE_FORCE
;
482 flags
= O_CREAT
|O_RDWR
;
483 if (write_mode
== WRITE_APPEND
)
489 output
= STDOUT_FILENO
;
491 output
= open(output_name
, flags
, S_IRUSR
| S_IWUSR
);
493 perror("failed to create output file");
497 session
= perf_session__new(output_name
, O_WRONLY
,
498 write_mode
== WRITE_FORCE
, false, NULL
);
499 if (session
== NULL
) {
500 pr_err("Not enough memory for reading perf file header\n");
505 perf_header__set_feat(&session
->header
, HEADER_BUILD_ID
);
508 err
= perf_session__read_header(session
, output
);
510 goto out_delete_session
;
513 if (have_tracepoints(&evsel_list
->entries
))
514 perf_header__set_feat(&session
->header
, HEADER_TRACE_INFO
);
516 /* 512 kiB: default amount of unprivileged mlocked memory */
517 if (mmap_pages
== UINT_MAX
)
518 mmap_pages
= (512 * 1024) / page_size
;
523 perror("failed to fork");
530 close(child_ready_pipe
[0]);
532 fcntl(go_pipe
[0], F_SETFD
, FD_CLOEXEC
);
535 * Do a dummy execvp to get the PLT entry resolved,
536 * so we avoid the resolver overhead on the real
539 execvp("", (char **)argv
);
542 * Tell the parent we're ready to go
544 close(child_ready_pipe
[1]);
547 * Wait until the parent tells us to go.
549 if (read(go_pipe
[0], &buf
, 1) == -1)
550 perror("unable to read pipe");
552 execvp(argv
[0], (char **)argv
);
555 kill(getppid(), SIGUSR1
);
559 if (!system_wide
&& target_tid
== -1 && target_pid
== -1)
560 evsel_list
->threads
->map
[0] = child_pid
;
562 close(child_ready_pipe
[1]);
565 * wait for child to settle
567 if (read(child_ready_pipe
[0], &buf
, 1) == -1) {
568 perror("unable to read pipe");
571 close(child_ready_pipe
[0]);
574 open_counters(evsel_list
);
577 * perf_session__delete(session) will be called at atexit_header()
579 atexit(atexit_header
);
582 err
= perf_header__write_pipe(output
);
585 } else if (file_new
) {
586 err
= perf_session__write_header(session
, evsel_list
,
592 post_processing_offset
= lseek(output
, 0, SEEK_CUR
);
595 err
= perf_session__synthesize_attrs(session
,
596 process_synthesized_event
);
598 pr_err("Couldn't synthesize attrs.\n");
602 err
= perf_event__synthesize_event_types(process_synthesized_event
,
605 pr_err("Couldn't synthesize event_types.\n");
609 if (have_tracepoints(&evsel_list
->entries
)) {
611 * FIXME err <= 0 here actually means that
612 * there were no tracepoints so its not really
613 * an error, just that we don't need to
614 * synthesize anything. We really have to
615 * return this more properly and also
616 * propagate errors that now are calling die()
618 err
= perf_event__synthesize_tracing_data(output
, evsel_list
,
619 process_synthesized_event
,
622 pr_err("Couldn't record tracing data.\n");
629 machine
= perf_session__find_host_machine(session
);
631 pr_err("Couldn't find native kernel information.\n");
635 err
= perf_event__synthesize_kernel_mmap(process_synthesized_event
,
636 session
, machine
, "_text");
638 err
= perf_event__synthesize_kernel_mmap(process_synthesized_event
,
639 session
, machine
, "_stext");
641 pr_err("Couldn't record kernel reference relocation symbol\n"
642 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
643 "Check /proc/kallsyms permission or run as root.\n");
645 err
= perf_event__synthesize_modules(process_synthesized_event
,
648 pr_err("Couldn't record kernel module information.\n"
649 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
650 "Check /proc/modules permission or run as root.\n");
653 perf_session__process_machines(session
,
654 perf_event__synthesize_guest_os
);
657 perf_event__synthesize_thread_map(evsel_list
->threads
,
658 process_synthesized_event
,
661 perf_event__synthesize_threads(process_synthesized_event
,
665 struct sched_param param
;
667 param
.sched_priority
= realtime_prio
;
668 if (sched_setscheduler(0, SCHED_FIFO
, ¶m
)) {
669 pr_err("Could not set realtime priority.\n");
685 if (hits
== samples
) {
688 err
= poll(evsel_list
->pollfd
, evsel_list
->nr_fds
, -1);
693 perf_evlist__disable(evsel_list
);
696 if (quiet
|| signr
== SIGUSR1
)
699 fprintf(stderr
, "[ perf record: Woken up %ld times to write data ]\n", waking
);
702 * Approximate RIP event size: 24 bytes.
705 "[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64
" samples) ]\n",
706 (double)bytes_written
/ 1024.0 / 1024.0,
713 perf_session__delete(session
);
717 static const char * const record_usage
[] = {
718 "perf record [<options>] [<command>]",
719 "perf record [<options>] -- <command> [<options>]",
723 static bool force
, append_file
;
725 const struct option record_options
[] = {
726 OPT_CALLBACK('e', "event", &evsel_list
, "event",
727 "event selector. use 'perf list' to list available events",
728 parse_events_option
),
729 OPT_CALLBACK(0, "filter", &evsel_list
, "filter",
730 "event filter", parse_filter
),
731 OPT_INTEGER('p', "pid", &target_pid
,
732 "record events on existing process id"),
733 OPT_INTEGER('t', "tid", &target_tid
,
734 "record events on existing thread id"),
735 OPT_INTEGER('r', "realtime", &realtime_prio
,
736 "collect data with this RT SCHED_FIFO priority"),
737 OPT_BOOLEAN('D', "no-delay", &nodelay
,
738 "collect data without buffering"),
739 OPT_BOOLEAN('R', "raw-samples", &raw_samples
,
740 "collect raw sample records from all opened counters"),
741 OPT_BOOLEAN('a', "all-cpus", &system_wide
,
742 "system-wide collection from all CPUs"),
743 OPT_BOOLEAN('A', "append", &append_file
,
744 "append to the output file to do incremental profiling"),
745 OPT_STRING('C', "cpu", &cpu_list
, "cpu",
746 "list of cpus to monitor"),
747 OPT_BOOLEAN('f', "force", &force
,
748 "overwrite existing data file (deprecated)"),
749 OPT_U64('c', "count", &user_interval
, "event period to sample"),
750 OPT_STRING('o', "output", &output_name
, "file",
752 OPT_BOOLEAN('i', "no-inherit", &no_inherit
,
753 "child tasks do not inherit counters"),
754 OPT_UINTEGER('F', "freq", &user_freq
, "profile at this frequency"),
755 OPT_UINTEGER('m', "mmap-pages", &mmap_pages
, "number of mmap data pages"),
756 OPT_BOOLEAN(0, "group", &group
,
757 "put the counters into a counter group"),
758 OPT_BOOLEAN('g', "call-graph", &call_graph
,
759 "do call-graph (stack chain/backtrace) recording"),
760 OPT_INCR('v', "verbose", &verbose
,
761 "be more verbose (show counter open errors, etc)"),
762 OPT_BOOLEAN('q', "quiet", &quiet
, "don't print any message"),
763 OPT_BOOLEAN('s', "stat", &inherit_stat
,
764 "per thread counts"),
765 OPT_BOOLEAN('d', "data", &sample_address
,
767 OPT_BOOLEAN('T', "timestamp", &sample_time
, "Sample timestamps"),
768 OPT_BOOLEAN('n', "no-samples", &no_samples
,
770 OPT_BOOLEAN('N', "no-buildid-cache", &no_buildid_cache
,
771 "do not update the buildid cache"),
772 OPT_BOOLEAN('B', "no-buildid", &no_buildid
,
773 "do not collect buildids in perf.data"),
774 OPT_CALLBACK('G', "cgroup", &evsel_list
, "name",
775 "monitor event in cgroup name only",
780 int cmd_record(int argc
, const char **argv
, const char *prefix __used
)
783 struct perf_evsel
*pos
;
785 evsel_list
= perf_evlist__new(NULL
, NULL
);
786 if (evsel_list
== NULL
)
789 argc
= parse_options(argc
, argv
, record_options
, record_usage
,
790 PARSE_OPT_STOP_AT_NON_OPTION
);
791 if (!argc
&& target_pid
== -1 && target_tid
== -1 &&
792 !system_wide
&& !cpu_list
)
793 usage_with_options(record_usage
, record_options
);
795 if (force
&& append_file
) {
796 fprintf(stderr
, "Can't overwrite and append at the same time."
797 " You need to choose between -f and -A");
798 usage_with_options(record_usage
, record_options
);
799 } else if (append_file
) {
800 write_mode
= WRITE_APPEND
;
802 write_mode
= WRITE_FORCE
;
805 if (nr_cgroups
&& !system_wide
) {
806 fprintf(stderr
, "cgroup monitoring only available in"
807 " system-wide mode\n");
808 usage_with_options(record_usage
, record_options
);
813 if (symbol_conf
.kptr_restrict
)
815 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
816 "check /proc/sys/kernel/kptr_restrict.\n\n"
817 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
818 "file is not found in the buildid cache or in the vmlinux path.\n\n"
819 "Samples in kernel modules won't be resolved at all.\n\n"
820 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
821 "even with a suitable vmlinux or kallsyms file.\n\n");
823 if (no_buildid_cache
|| no_buildid
)
824 disable_buildid_cache();
826 if (evsel_list
->nr_entries
== 0 &&
827 perf_evlist__add_default(evsel_list
) < 0) {
828 pr_err("Not enough memory for event selector list\n");
829 goto out_symbol_exit
;
832 if (target_pid
!= -1)
833 target_tid
= target_pid
;
835 if (perf_evlist__create_maps(evsel_list
, target_pid
,
836 target_tid
, cpu_list
) < 0)
837 usage_with_options(record_usage
, record_options
);
839 list_for_each_entry(pos
, &evsel_list
->entries
, node
) {
840 if (perf_evsel__alloc_fd(pos
, evsel_list
->cpus
->nr
,
841 evsel_list
->threads
->nr
) < 0)
843 if (perf_header__push_event(pos
->attr
.config
, event_name(pos
)))
847 if (perf_evlist__alloc_pollfd(evsel_list
) < 0)
850 if (user_interval
!= ULLONG_MAX
)
851 default_interval
= user_interval
;
852 if (user_freq
!= UINT_MAX
)
856 * User specified count overrides default frequency.
858 if (default_interval
)
861 default_interval
= freq
;
863 fprintf(stderr
, "frequency and count are zero, aborting\n");
868 err
= __cmd_record(argc
, argv
);
870 perf_evlist__delete_maps(evsel_list
);