 1// SPDX-License-Identifier: GPL-2.0
2/*
3 * builtin-record.c
4 *
5 * Builtin record command: Record the profile of a workload
6 * (or a CPU, or a PID) into the perf.data output file - for
7 * later analysis via perf report.
8 */
9#include "builtin.h"
10
11#include "util/build-id.h"
12#include <subcmd/parse-options.h>
13#include "util/parse-events.h"
14#include "util/config.h"
15
16#include "util/callchain.h"
17#include "util/cgroup.h"
18#include "util/header.h"
19#include "util/event.h"
20#include "util/evlist.h"
21#include "util/evsel.h"
22#include "util/debug.h"
23#include "util/mmap.h"
24#include "util/target.h"
25#include "util/session.h"
26#include "util/tool.h"
27#include "util/symbol.h"
28#include "util/record.h"
29#include "util/cpumap.h"
30#include "util/thread_map.h"
31#include "util/data.h"
32#include "util/perf_regs.h"
33#include "util/auxtrace.h"
34#include "util/tsc.h"
35#include "util/parse-branch-options.h"
36#include "util/parse-regs-options.h"
37#include "util/llvm-utils.h"
38#include "util/bpf-loader.h"
39#include "util/trigger.h"
40#include "util/perf-hooks.h"
41#include "util/cpu-set-sched.h"
42#include "util/synthetic-events.h"
43#include "util/time-utils.h"
44#include "util/units.h"
45#include "util/bpf-event.h"
46#include "asm/bug.h"
47#include "perf.h"
48
49#include <errno.h>
50#include <inttypes.h>
51#include <locale.h>
52#include <poll.h>
53#include <unistd.h>
54#include <sched.h>
55#include <signal.h>
56#include <sys/mman.h>
57#include <sys/wait.h>
58#include <linux/err.h>
59#include <linux/string.h>
60#include <linux/time64.h>
61#include <linux/zalloc.h>
62
63struct switch_output {
64 bool enabled;
65 bool signal;
66 unsigned long size;
67 unsigned long time;
68 const char *str;
69 bool set;
70 char **filenames;
71 int num_files;
72 int cur_file;
73};
74
75struct record {
76 struct perf_tool tool;
77 struct record_opts opts;
78 u64 bytes_written;
79 struct perf_data data;
80 struct auxtrace_record *itr;
81 struct evlist *evlist;
82 struct perf_session *session;
83 struct evlist *sb_evlist;
84 int realtime_prio;
85 bool no_buildid;
86 bool no_buildid_set;
87 bool no_buildid_cache;
88 bool no_buildid_cache_set;
89 bool buildid_all;
90 bool timestamp_filename;
91 bool timestamp_boundary;
92 struct switch_output switch_output;
93 unsigned long long samples;
94 cpu_set_t affinity_mask;
95};
96
97static volatile int auxtrace_record__snapshot_started;
98static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
99static DEFINE_TRIGGER(switch_output_trigger);
100
101static const char *affinity_tags[PERF_AFFINITY_MAX] = {
102 "SYS", "NODE", "CPU"
103};
104
105static bool switch_output_signal(struct record *rec)
106{
107 return rec->switch_output.signal &&
108 trigger_is_ready(&switch_output_trigger);
109}
110
111static bool switch_output_size(struct record *rec)
112{
113 return rec->switch_output.size &&
114 trigger_is_ready(&switch_output_trigger) &&
115 (rec->bytes_written >= rec->switch_output.size);
116}
117
118static bool switch_output_time(struct record *rec)
119{
120 return rec->switch_output.time &&
121 trigger_is_ready(&switch_output_trigger);
122}
123
124static int record__write(struct record *rec, struct mmap *map __maybe_unused,
125 void *bf, size_t size)
126{
127 struct perf_data_file *file = &rec->session->data->file;
128
129 if (perf_data_file__write(file, bf, size) < 0) {
130 pr_err("failed to write perf data, error: %m\n");
131 return -1;
132 }
133
134 rec->bytes_written += size;
135
136 if (switch_output_size(rec))
137 trigger_hit(&switch_output_trigger);
138
139 return 0;
140}
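/*
 * Every serial (non-aio) write funnels through record__write(): it appends
 * the buffer to the output file, accounts for it in rec->bytes_written and,
 * once a switch-output size threshold is crossed, arms switch_output_trigger
 * so the main loop in __cmd_record() rotates the output file.
 */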
141
142static int record__aio_enabled(struct record *rec);
143static int record__comp_enabled(struct record *rec);
144static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
145 void *src, size_t src_size);
146
147#ifdef HAVE_AIO_SUPPORT
148static int record__aio_write(struct aiocb *cblock, int trace_fd,
149 void *buf, size_t size, off_t off)
150{
151 int rc;
152
153 cblock->aio_fildes = trace_fd;
154 cblock->aio_buf = buf;
155 cblock->aio_nbytes = size;
156 cblock->aio_offset = off;
157 cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
158
159 do {
160 rc = aio_write(cblock);
161 if (rc == 0) {
162 break;
163 } else if (errno != EAGAIN) {
164 cblock->aio_fildes = -1;
165 pr_err("failed to queue perf data, error: %m\n");
166 break;
167 }
168 } while (1);
169
170 return rc;
171}
172
173static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
174{
175 void *rem_buf;
176 off_t rem_off;
177 size_t rem_size;
178 int rc, aio_errno;
179 ssize_t aio_ret, written;
180
181 aio_errno = aio_error(cblock);
182 if (aio_errno == EINPROGRESS)
183 return 0;
184
185 written = aio_ret = aio_return(cblock);
186 if (aio_ret < 0) {
187 if (aio_errno != EINTR)
188 pr_err("failed to write perf data, error: %m\n");
189 written = 0;
190 }
191
192 rem_size = cblock->aio_nbytes - written;
193
194 if (rem_size == 0) {
195 cblock->aio_fildes = -1;
196 /*
197 * md->refcount is incremented in record__aio_pushfn() for
198 * every aio write request started in record__aio_push() so
199 * decrement it because the request is now complete.
200 */
201 perf_mmap__put(md);
202 rc = 1;
203 } else {
204 /*
 205 * aio write request may require a restart with the
 206 * remainder if the kernel didn't write the whole
 207 * chunk at once.
208 */
209 rem_off = cblock->aio_offset + written;
210 rem_buf = (void *)(cblock->aio_buf + written);
211 record__aio_write(cblock, cblock->aio_fildes,
212 rem_buf, rem_size, rem_off);
213 rc = 0;
214 }
215
216 return rc;
217}
218
219static int record__aio_sync(struct mmap *md, bool sync_all)
220{
221 struct aiocb **aiocb = md->aio.aiocb;
222 struct aiocb *cblocks = md->aio.cblocks;
223 struct timespec timeout = { 0, 1000 * 1000 * 1 }; /* 1ms */
224 int i, do_suspend;
225
226 do {
227 do_suspend = 0;
228 for (i = 0; i < md->aio.nr_cblocks; ++i) {
229 if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
230 if (sync_all)
231 aiocb[i] = NULL;
232 else
233 return i;
234 } else {
235 /*
 236 * A started aio write is not complete yet,
 237 * so it has to be waited for before the
 238 * next allocation.
239 */
240 aiocb[i] = &cblocks[i];
241 do_suspend = 1;
242 }
243 }
244 if (!do_suspend)
245 return -1;
246
247 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
248 if (!(errno == EAGAIN || errno == EINTR))
249 pr_err("failed to sync perf data, error: %m\n");
250 }
251 } while (1);
252}
253
254struct record_aio {
255 struct record *rec;
256 void *data;
257 size_t size;
258};
259
260static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
261{
262 struct record_aio *aio = to;
263
264 /*
 265 * The map->core.base data pointed to by buf is copied into a free
 266 * map->aio.data[] buffer to release space in the kernel buffer as fast
 267 * as possible, calling perf_mmap__consume() from the perf_mmap__push() function.
 268 *
 269 * That lets the kernel proceed with storing more profiling data into
 270 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
 271 *
 272 * Copying can be done in two steps in case the chunk of profiling data
 273 * crosses the upper bound of the kernel buffer. In this case we first move
 274 * the part of the data from map->start up to the upper bound and then the
 275 * remainder from the beginning of the kernel buffer to the end of the data chunk.
276 */
277
278 if (record__comp_enabled(aio->rec)) {
279 size = zstd_compress(aio->rec->session, aio->data + aio->size,
280 perf_mmap__mmap_len(map) - aio->size,
281 buf, size);
282 } else {
283 memcpy(aio->data + aio->size, buf, size);
284 }
285
286 if (!aio->size) {
287 /*
288 * Increment map->refcount to guard map->aio.data[] buffer
 289 * from premature deallocation, because the map object can be
 290 * released before the aio write request started on the
 291 * map->aio.data[] buffer is complete.
 292 *
 293 * perf_mmap__put() is done at record__aio_complete()
 294 * after the started aio request completes, or at record__aio_push()
 295 * if the request failed to start.
296 */
297 perf_mmap__get(map);
298 }
299
300 aio->size += size;
301
302 return size;
303}
304
305static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
306{
307 int ret, idx;
308 int trace_fd = rec->session->data->file.fd;
309 struct record_aio aio = { .rec = rec, .size = 0 };
310
311 /*
 312 * Call record__aio_sync() to wait until a map->aio.data[] buffer
 313 * becomes available after the previous aio write operation.
314 */
315
316 idx = record__aio_sync(map, false);
317 aio.data = map->aio.data[idx];
318 ret = perf_mmap__push(map, &aio, record__aio_pushfn);
319 if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
320 return ret;
321
322 rec->samples++;
323 ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
324 if (!ret) {
325 *off += aio.size;
326 rec->bytes_written += aio.size;
327 if (switch_output_size(rec))
328 trigger_hit(&switch_output_trigger);
329 } else {
330 /*
 331 * Decrement the map->refcount taken in record__aio_pushfn()
 332 * if the record__aio_write() operation failed to start; otherwise
 333 * map->refcount is decremented in record__aio_complete() after the
 334 * aio write operation finishes successfully.
335 */
336 perf_mmap__put(map);
337 }
338
339 return ret;
340}
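/*
 * Summary of the aio data path above (descriptive only):
 *
 *   record__aio_push()
 *     -> record__aio_sync(map, false)              pick a free map->aio.data[idx]
 *     -> perf_mmap__push(..., record__aio_pushfn)  copy/compress into it and
 *                                                  take a map reference
 *     -> record__aio_write()                       queue the buffer at offset *off
 *
 * The reference taken in record__aio_pushfn() is dropped either here, if the
 * write failed to start, or in record__aio_complete() once the kernel reports
 * the request finished (possibly after restarting with the remainder of a
 * short write).
 */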
341
342static off_t record__aio_get_pos(int trace_fd)
343{
344 return lseek(trace_fd, 0, SEEK_CUR);
345}
346
347static void record__aio_set_pos(int trace_fd, off_t pos)
348{
349 lseek(trace_fd, pos, SEEK_SET);
350}
351
352static void record__aio_mmap_read_sync(struct record *rec)
353{
354 int i;
355 struct evlist *evlist = rec->evlist;
356 struct mmap *maps = evlist->mmap;
357
358 if (!record__aio_enabled(rec))
359 return;
360
361 for (i = 0; i < evlist->core.nr_mmaps; i++) {
362 struct mmap *map = &maps[i];
363
364 if (map->core.base)
365 record__aio_sync(map, true);
366 }
367}
368
369static int nr_cblocks_default = 1;
370static int nr_cblocks_max = 4;
371
372static int record__aio_parse(const struct option *opt,
373 const char *str,
374 int unset)
375{
376 struct record_opts *opts = (struct record_opts *)opt->value;
377
378 if (unset) {
379 opts->nr_cblocks = 0;
380 } else {
381 if (str)
382 opts->nr_cblocks = strtol(str, NULL, 0);
383 if (!opts->nr_cblocks)
384 opts->nr_cblocks = nr_cblocks_default;
385 }
386
387 return 0;
388}
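/*
 * Example (assuming this callback backs an "--aio[=n]" style option and the
 * "record.aio" config key handled in perf_record_config()): "--aio=2" sets
 * opts->nr_cblocks to two asynchronous control blocks per mmap, a bare
 * "--aio" falls back to nr_cblocks_default (1), and the value is presumably
 * clamped to nr_cblocks_max elsewhere before the mmaps are set up.
 */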
389#else /* HAVE_AIO_SUPPORT */
390static int nr_cblocks_max = 0;
391
392static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
393 off_t *off __maybe_unused)
394{
395 return -1;
396}
397
398static off_t record__aio_get_pos(int trace_fd __maybe_unused)
399{
400 return -1;
401}
402
403static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
404{
405}
406
407static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
408{
409}
410#endif
411
412static int record__aio_enabled(struct record *rec)
413{
414 return rec->opts.nr_cblocks > 0;
415}
416
417#define MMAP_FLUSH_DEFAULT 1
418static int record__mmap_flush_parse(const struct option *opt,
419 const char *str,
420 int unset)
421{
422 int flush_max;
423 struct record_opts *opts = (struct record_opts *)opt->value;
424 static struct parse_tag tags[] = {
425 { .tag = 'B', .mult = 1 },
426 { .tag = 'K', .mult = 1 << 10 },
427 { .tag = 'M', .mult = 1 << 20 },
428 { .tag = 'G', .mult = 1 << 30 },
429 { .tag = 0 },
430 };
431
432 if (unset)
433 return 0;
434
435 if (str) {
436 opts->mmap_flush = parse_tag_value(str, tags);
437 if (opts->mmap_flush == (int)-1)
438 opts->mmap_flush = strtol(str, NULL, 0);
439 }
440
441 if (!opts->mmap_flush)
442 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
443
444 flush_max = evlist__mmap_size(opts->mmap_pages);
445 flush_max /= 4;
446 if (opts->mmap_flush > flush_max)
447 opts->mmap_flush = flush_max;
448
449 return 0;
450}
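/*
 * Illustrative results, assuming this parser backs a "--mmap-flush" style
 * option: "1024" and "1K" both yield opts->mmap_flush == 1024 bytes, an
 * unset or zero value keeps MMAP_FLUSH_DEFAULT (1 byte, i.e. flush as soon
 * as any data is available), and anything above a quarter of the mmap
 * buffer size is clamped to that quarter so the ring buffer can still be
 * drained.
 */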
451
452#ifdef HAVE_ZSTD_SUPPORT
453static unsigned int comp_level_default = 1;
454
455static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
456{
457 struct record_opts *opts = opt->value;
458
459 if (unset) {
460 opts->comp_level = 0;
461 } else {
462 if (str)
463 opts->comp_level = strtol(str, NULL, 0);
464 if (!opts->comp_level)
465 opts->comp_level = comp_level_default;
466 }
467
468 return 0;
469}
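/*
 * For example, assuming this parser backs a "-z/--compression-level" style
 * option: giving it with no value selects comp_level_default (1, the
 * fastest zstd setting), an explicit "22" asks for the highest level
 * accepted here (see comp_level_max below), and the --no- form resets
 * comp_level to 0 so record__comp_enabled() reports compression as off.
 */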
470#endif
471static unsigned int comp_level_max = 22;
472
473static int record__comp_enabled(struct record *rec)
474{
475 return rec->opts.comp_level > 0;
476}
477
478static int process_synthesized_event(struct perf_tool *tool,
479 union perf_event *event,
480 struct perf_sample *sample __maybe_unused,
481 struct machine *machine __maybe_unused)
482{
483 struct record *rec = container_of(tool, struct record, tool);
484 return record__write(rec, NULL, event, event->header.size);
485}
486
487static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
488{
489 struct record *rec = to;
490
491 if (record__comp_enabled(rec)) {
492 size = zstd_compress(rec->session, map->data, perf_mmap__mmap_len(map), bf, size);
493 bf = map->data;
494 }
495
496 rec->samples++;
497 return record__write(rec, map, bf, size);
498}
499
500static volatile int done;
501static volatile int signr = -1;
502static volatile int child_finished;
503
504static void sig_handler(int sig)
505{
506 if (sig == SIGCHLD)
507 child_finished = 1;
508 else
509 signr = sig;
510
511 done = 1;
512}
513
514static void sigsegv_handler(int sig)
515{
516 perf_hooks__recover();
517 sighandler_dump_stack(sig);
518}
519
520static void record__sig_exit(void)
521{
522 if (signr == -1)
523 return;
524
525 signal(signr, SIG_DFL);
526 raise(signr);
527}
528
529#ifdef HAVE_AUXTRACE_SUPPORT
530
531static int record__process_auxtrace(struct perf_tool *tool,
532 struct mmap *map,
533 union perf_event *event, void *data1,
534 size_t len1, void *data2, size_t len2)
535{
536 struct record *rec = container_of(tool, struct record, tool);
537 struct perf_data *data = &rec->data;
538 size_t padding;
539 u8 pad[8] = {0};
540
541 if (!perf_data__is_pipe(data) && !perf_data__is_dir(data)) {
542 off_t file_offset;
543 int fd = perf_data__fd(data);
544 int err;
545
546 file_offset = lseek(fd, 0, SEEK_CUR);
547 if (file_offset == -1)
548 return -1;
549 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
550 event, file_offset);
551 if (err)
552 return err;
553 }
554
555 /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
556 padding = (len1 + len2) & 7;
557 if (padding)
558 padding = 8 - padding;
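	/*
	 * Example: len1 + len2 == 13 gives (13 & 7) == 5, so padding == 3 and
	 * the AUX data written below is rounded up to a multiple of 8 bytes.
	 */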
559
560 record__write(rec, map, event, event->header.size);
561 record__write(rec, map, data1, len1);
562 if (len2)
563 record__write(rec, map, data2, len2);
564 record__write(rec, map, &pad, padding);
565
566 return 0;
567}
568
569static int record__auxtrace_mmap_read(struct record *rec,
570 struct mmap *map)
571{
572 int ret;
573
574 ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
575 record__process_auxtrace);
576 if (ret < 0)
577 return ret;
578
579 if (ret)
580 rec->samples++;
581
582 return 0;
583}
584
585static int record__auxtrace_mmap_read_snapshot(struct record *rec,
586 struct mmap *map)
587{
588 int ret;
589
590 ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
591 record__process_auxtrace,
592 rec->opts.auxtrace_snapshot_size);
593 if (ret < 0)
594 return ret;
595
596 if (ret)
597 rec->samples++;
598
599 return 0;
600}
601
602static int record__auxtrace_read_snapshot_all(struct record *rec)
603{
604 int i;
605 int rc = 0;
606
607 for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
608 struct mmap *map = &rec->evlist->mmap[i];
609
610 if (!map->auxtrace_mmap.base)
611 continue;
612
613 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
614 rc = -1;
615 goto out;
616 }
617 }
618out:
619 return rc;
620}
621
622static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
623{
624 pr_debug("Recording AUX area tracing snapshot\n");
625 if (record__auxtrace_read_snapshot_all(rec) < 0) {
626 trigger_error(&auxtrace_snapshot_trigger);
627 } else {
628 if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
629 trigger_error(&auxtrace_snapshot_trigger);
630 else
631 trigger_ready(&auxtrace_snapshot_trigger);
632 }
633}
634
635static int record__auxtrace_snapshot_exit(struct record *rec)
636{
637 if (trigger_is_error(&auxtrace_snapshot_trigger))
638 return 0;
639
640 if (!auxtrace_record__snapshot_started &&
641 auxtrace_record__snapshot_start(rec->itr))
642 return -1;
643
644 record__read_auxtrace_snapshot(rec, true);
645 if (trigger_is_error(&auxtrace_snapshot_trigger))
646 return -1;
647
648 return 0;
649}
650
651static int record__auxtrace_init(struct record *rec)
652{
653 int err;
654
655 if (!rec->itr) {
656 rec->itr = auxtrace_record__init(rec->evlist, &err);
657 if (err)
658 return err;
659 }
660
661 err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
662 rec->opts.auxtrace_snapshot_opts);
663 if (err)
664 return err;
665
666 return auxtrace_parse_filters(rec->evlist);
667}
668
669#else
670
671static inline
672int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
673 struct mmap *map __maybe_unused)
674{
675 return 0;
676}
677
678static inline
679void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
680 bool on_exit __maybe_unused)
681{
682}
683
684static inline
685int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
686{
687 return 0;
688}
689
690static inline
691int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
692{
693 return 0;
694}
695
696static int record__auxtrace_init(struct record *rec __maybe_unused)
697{
698 return 0;
699}
700
701#endif
702
703static int record__mmap_evlist(struct record *rec,
704 struct evlist *evlist)
705{
706 struct record_opts *opts = &rec->opts;
707 char msg[512];
708
709 if (opts->affinity != PERF_AFFINITY_SYS)
710 cpu__setup_cpunode_map();
711
712 if (evlist__mmap_ex(evlist, opts->mmap_pages,
713 opts->auxtrace_mmap_pages,
714 opts->auxtrace_snapshot_mode,
715 opts->nr_cblocks, opts->affinity,
716 opts->mmap_flush, opts->comp_level) < 0) {
717 if (errno == EPERM) {
718 pr_err("Permission error mapping pages.\n"
719 "Consider increasing "
720 "/proc/sys/kernel/perf_event_mlock_kb,\n"
721 "or try again with a smaller value of -m/--mmap_pages.\n"
722 "(current value: %u,%u)\n",
723 opts->mmap_pages, opts->auxtrace_mmap_pages);
724 return -errno;
725 } else {
726 pr_err("failed to mmap with %d (%s)\n", errno,
727 str_error_r(errno, msg, sizeof(msg)));
728 if (errno)
729 return -errno;
730 else
731 return -EINVAL;
732 }
733 }
734 return 0;
735}
736
737static int record__mmap(struct record *rec)
738{
739 return record__mmap_evlist(rec, rec->evlist);
740}
741
742static int record__open(struct record *rec)
743{
744 char msg[BUFSIZ];
745 struct evsel *pos;
746 struct evlist *evlist = rec->evlist;
747 struct perf_session *session = rec->session;
748 struct record_opts *opts = &rec->opts;
749 int rc = 0;
750
751 /*
752 * For initial_delay we need to add a dummy event so that we can track
753 * PERF_RECORD_MMAP while we wait for the initial delay to enable the
 754 * real events, the ones asked for by the user.
755 */
756 if (opts->initial_delay) {
757 if (perf_evlist__add_dummy(evlist))
758 return -ENOMEM;
759
760 pos = evlist__first(evlist);
761 pos->tracking = 0;
762 pos = evlist__last(evlist);
763 pos->tracking = 1;
764 pos->core.attr.enable_on_exec = 1;
765 }
766
767 perf_evlist__config(evlist, opts, &callchain_param);
768
769 evlist__for_each_entry(evlist, pos) {
770try_again:
771 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
772 if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
773 if (verbose > 0)
774 ui__warning("%s\n", msg);
775 goto try_again;
776 }
777 if ((errno == EINVAL || errno == EBADF) &&
778 pos->leader != pos &&
779 pos->weak_group) {
780 pos = perf_evlist__reset_weak_group(evlist, pos);
781 goto try_again;
782 }
783 rc = -errno;
784 perf_evsel__open_strerror(pos, &opts->target,
785 errno, msg, sizeof(msg));
786 ui__error("%s\n", msg);
787 goto out;
788 }
789
790 pos->supported = true;
791 }
792
793 if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(evlist)) {
794 pr_warning(
795"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
796"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
797"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
798"file is not found in the buildid cache or in the vmlinux path.\n\n"
799"Samples in kernel modules won't be resolved at all.\n\n"
800"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
801"even with a suitable vmlinux or kallsyms file.\n\n");
802 }
803
804 if (perf_evlist__apply_filters(evlist, &pos)) {
805 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
806 pos->filter, perf_evsel__name(pos), errno,
807 str_error_r(errno, msg, sizeof(msg)));
808 rc = -1;
809 goto out;
810 }
811
812 rc = record__mmap(rec);
813 if (rc)
814 goto out;
815
816 session->evlist = evlist;
817 perf_session__set_id_hdr_size(session);
818out:
819 return rc;
820}
821
822static int process_sample_event(struct perf_tool *tool,
823 union perf_event *event,
824 struct perf_sample *sample,
825 struct evsel *evsel,
826 struct machine *machine)
827{
828 struct record *rec = container_of(tool, struct record, tool);
829
830 if (rec->evlist->first_sample_time == 0)
831 rec->evlist->first_sample_time = sample->time;
832
833 rec->evlist->last_sample_time = sample->time;
834
835 if (rec->buildid_all)
836 return 0;
837
838 rec->samples++;
839 return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
840}
841
842static int process_buildids(struct record *rec)
843{
844 struct perf_session *session = rec->session;
845
846 if (perf_data__size(&rec->data) == 0)
847 return 0;
848
849 /*
 850 * During this process, it'll load the kernel map and replace
 851 * dso->long_name with the real pathname it found. In this case
852 * we prefer the vmlinux path like
853 * /lib/modules/3.16.4/build/vmlinux
854 *
855 * rather than build-id path (in debug directory).
856 * $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
857 */
858 symbol_conf.ignore_vmlinux_buildid = true;
859
860 /*
 861 * If --buildid-all is given, it marks all DSOs regardless of hits,
 862 * so there is no need to process samples. But if timestamp_boundary is
 863 * enabled, it still needs to walk all samples to get the timestamps of
 864 * the first/last samples.
865 */
866 if (rec->buildid_all && !rec->timestamp_boundary)
867 rec->tool.sample = NULL;
868
869 return perf_session__process_events(session);
870}
871
872static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
873{
874 int err;
875 struct perf_tool *tool = data;
876 /*
 877 * As for the guest kernel, when processing the record & report subcommands
 878 * we arrange the module mmap prior to the guest kernel mmap and trigger
 879 * a dso preload, because by default guest module symbols are loaded
 880 * from guest kallsyms instead of /lib/modules/XXX/XXX. This
 881 * method is used to avoid missing symbols when the first address is
 882 * in a module instead of in the guest kernel.
883 */
884 err = perf_event__synthesize_modules(tool, process_synthesized_event,
885 machine);
886 if (err < 0)
887 pr_err("Couldn't record guest kernel [%d]'s reference"
888 " relocation symbol.\n", machine->pid);
889
890 /*
 891 * We use _stext for the guest kernel because the guest kernel's
 892 * /proc/kallsyms sometimes has no _text.
893 */
894 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
895 machine);
896 if (err < 0)
897 pr_err("Couldn't record guest kernel [%d]'s reference"
898 " relocation symbol.\n", machine->pid);
899}
900
901static struct perf_event_header finished_round_event = {
902 .size = sizeof(struct perf_event_header),
903 .type = PERF_RECORD_FINISHED_ROUND,
904};
905
906static void record__adjust_affinity(struct record *rec, struct mmap *map)
907{
908 if (rec->opts.affinity != PERF_AFFINITY_SYS &&
909 !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
910 CPU_ZERO(&rec->affinity_mask);
911 CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
912 sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
913 }
914}
915
916static size_t process_comp_header(void *record, size_t increment)
917{
918 struct perf_record_compressed *event = record;
919 size_t size = sizeof(*event);
920
921 if (increment) {
922 event->header.size += increment;
923 return increment;
924 }
925
926 event->header.type = PERF_RECORD_COMPRESSED;
927 event->header.size = size;
928
929 return size;
930}
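/*
 * process_comp_header() is the framing callback that
 * zstd_compress_stream_to_records() invokes from zstd_compress() below: with
 * increment == 0 it lays down a fresh PERF_RECORD_COMPRESSED header and
 * returns its size; with a non-zero increment it grows header.size as more
 * compressed bytes are appended, keeping each emitted record within the
 * max_record_size passed by zstd_compress().
 */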
931
932static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
933 void *src, size_t src_size)
934{
935 size_t compressed;
936 size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
937
938 compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
939 max_record_size, process_comp_header);
940
941 session->bytes_transferred += src_size;
942 session->bytes_compressed += compressed;
943
944 return compressed;
945}
946
947static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
948 bool overwrite, bool synch)
949{
950 u64 bytes_written = rec->bytes_written;
951 int i;
952 int rc = 0;
953 struct mmap *maps;
954 int trace_fd = rec->data.file.fd;
955 off_t off = 0;
956
957 if (!evlist)
958 return 0;
959
960 maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
961 if (!maps)
962 return 0;
963
964 if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
965 return 0;
966
967 if (record__aio_enabled(rec))
968 off = record__aio_get_pos(trace_fd);
969
970 for (i = 0; i < evlist->core.nr_mmaps; i++) {
971 u64 flush = 0;
972 struct mmap *map = &maps[i];
973
974 if (map->core.base) {
975 record__adjust_affinity(rec, map);
976 if (synch) {
977 flush = map->core.flush;
978 map->core.flush = 1;
979 }
980 if (!record__aio_enabled(rec)) {
981 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
982 if (synch)
983 map->core.flush = flush;
984 rc = -1;
985 goto out;
986 }
987 } else {
988 if (record__aio_push(rec, map, &off) < 0) {
989 record__aio_set_pos(trace_fd, off);
990 if (synch)
991 map->core.flush = flush;
992 rc = -1;
993 goto out;
994 }
995 }
996 if (synch)
997 map->core.flush = flush;
998 }
999
1000 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1001 record__auxtrace_mmap_read(rec, map) != 0) {
1002 rc = -1;
1003 goto out;
1004 }
1005 }
1006
1007 if (record__aio_enabled(rec))
1008 record__aio_set_pos(trace_fd, off);
1009
1010 /*
 1011 * Mark the round finished if we wrote
 1012 * at least one event.
1013 */
1014 if (bytes_written != rec->bytes_written)
1015 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1016
1017 if (overwrite)
1018 perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1019out:
1020 return rc;
1021}
1022
1023static int record__mmap_read_all(struct record *rec, bool synch)
1024{
1025 int err;
1026
1027 err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1028 if (err)
1029 return err;
1030
1031 return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1032}
1033
1034static void record__init_features(struct record *rec)
1035{
1036 struct perf_session *session = rec->session;
1037 int feat;
1038
1039 for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1040 perf_header__set_feat(&session->header, feat);
1041
1042 if (rec->no_buildid)
1043 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1044
1045 if (!have_tracepoints(&rec->evlist->core.entries))
1046 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1047
1048 if (!rec->opts.branch_stack)
1049 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1050
1051 if (!rec->opts.full_auxtrace)
1052 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1053
1054 if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1055 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1056
1057 perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1058 if (!record__comp_enabled(rec))
1059 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1060
1061 perf_header__clear_feat(&session->header, HEADER_STAT);
1062}
1063
1064static void
1065record__finish_output(struct record *rec)
1066{
1067 struct perf_data *data = &rec->data;
1068 int fd = perf_data__fd(data);
1069
1070 if (data->is_pipe)
1071 return;
1072
1073 rec->session->header.data_size += rec->bytes_written;
1074 data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1075
1076 if (!rec->no_buildid) {
1077 process_buildids(rec);
1078
1079 if (rec->buildid_all)
1080 dsos__hit_all(rec->session);
1081 }
1082 perf_session__write_header(rec->session, rec->evlist, fd, true);
1083
1084 return;
1085}
1086
1087static int record__synthesize_workload(struct record *rec, bool tail)
1088{
1089 int err;
1090 struct perf_thread_map *thread_map;
1091
1092 if (rec->opts.tail_synthesize != tail)
1093 return 0;
1094
1095 thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1096 if (thread_map == NULL)
1097 return -1;
1098
1099 err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1100 process_synthesized_event,
1101 &rec->session->machines.host,
1102 rec->opts.sample_address);
1103 perf_thread_map__put(thread_map);
1104 return err;
1105}
1106
1107static int record__synthesize(struct record *rec, bool tail);
1108
1109static int
1110record__switch_output(struct record *rec, bool at_exit)
1111{
1112 struct perf_data *data = &rec->data;
1113 char *new_filename = NULL;
1114 int fd, err;
1115
 1116 /* Same size as a real timestamp, e.g. "2015122520103046" */
1117 char timestamp[] = "InvalidTimestamp";
1118
1119 record__aio_mmap_read_sync(rec);
1120
1121 record__synthesize(rec, true);
1122 if (target__none(&rec->opts.target))
1123 record__synthesize_workload(rec, true);
1124
1125 rec->samples = 0;
1126 record__finish_output(rec);
1127 err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1128 if (err) {
1129 pr_err("Failed to get current timestamp\n");
1130 return -EINVAL;
1131 }
1132
1133 fd = perf_data__switch(data, timestamp,
1134 rec->session->header.data_offset,
1135 at_exit, &new_filename);
1136 if (fd >= 0 && !at_exit) {
1137 rec->bytes_written = 0;
1138 rec->session->header.data_size = 0;
1139 }
1140
1141 if (!quiet)
1142 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1143 data->path, timestamp);
1144
1145 if (rec->switch_output.num_files) {
1146 int n = rec->switch_output.cur_file + 1;
1147
1148 if (n >= rec->switch_output.num_files)
1149 n = 0;
1150 rec->switch_output.cur_file = n;
1151 if (rec->switch_output.filenames[n]) {
1152 remove(rec->switch_output.filenames[n]);
1153 zfree(&rec->switch_output.filenames[n]);
1154 }
1155 rec->switch_output.filenames[n] = new_filename;
1156 } else {
1157 free(new_filename);
1158 }
1159
1160 /* Output tracking events */
1161 if (!at_exit) {
1162 record__synthesize(rec, false);
1163
1164 /*
1165 * In 'perf record --switch-output' without -a,
1166 * record__synthesize() in record__switch_output() won't
 1167 * generate tracking events, because there's no thread_map
 1168 * in the evlist, which means the newly created perf.data doesn't
 1169 * contain map and comm information.
1170 * Create a fake thread_map and directly call
1171 * perf_event__synthesize_thread_map() for those events.
1172 */
1173 if (target__none(&rec->opts.target))
1174 record__synthesize_workload(rec, false);
1175 }
1176 return fd;
1177}
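/*
 * When switch_output.num_files is set, filenames[] acts as a ring: with
 * num_files == 3, for instance, the fourth rotation removes and replaces
 * the file recorded in slot 0, so at most three rotated-out data files are
 * kept on disk alongside the one currently being written.
 */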
1178
1179static volatile int workload_exec_errno;
1180
1181/*
1182 * perf_evlist__prepare_workload will send a SIGUSR1
1183 * if the fork fails, since we asked by setting its
1184 * want_signal to true.
1185 */
1186static void workload_exec_failed_signal(int signo __maybe_unused,
1187 siginfo_t *info,
1188 void *ucontext __maybe_unused)
1189{
1190 workload_exec_errno = info->si_value.sival_int;
1191 done = 1;
1192 child_finished = 1;
1193}
1194
1195static void snapshot_sig_handler(int sig);
1196static void alarm_sig_handler(int sig);
1197
1198static const struct perf_event_mmap_page *
1199perf_evlist__pick_pc(struct evlist *evlist)
1200{
1201 if (evlist) {
1202 if (evlist->mmap && evlist->mmap[0].core.base)
1203 return evlist->mmap[0].core.base;
1204 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1205 return evlist->overwrite_mmap[0].core.base;
1206 }
1207 return NULL;
1208}
1209
1210static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1211{
1212 const struct perf_event_mmap_page *pc;
1213
1214 pc = perf_evlist__pick_pc(rec->evlist);
1215 if (pc)
1216 return pc;
1217 return NULL;
1218}
1219
1220static int record__synthesize(struct record *rec, bool tail)
1221{
1222 struct perf_session *session = rec->session;
1223 struct machine *machine = &session->machines.host;
1224 struct perf_data *data = &rec->data;
1225 struct record_opts *opts = &rec->opts;
1226 struct perf_tool *tool = &rec->tool;
1227 int fd = perf_data__fd(data);
1228 int err = 0;
1229
1230 if (rec->opts.tail_synthesize != tail)
1231 return 0;
1232
1233 if (data->is_pipe) {
1234 /*
1235 * We need to synthesize events first, because some
 1236 * features work on top of them (on the report side).
1237 */
1238 err = perf_event__synthesize_attrs(tool, rec->evlist,
1239 process_synthesized_event);
1240 if (err < 0) {
1241 pr_err("Couldn't synthesize attrs.\n");
1242 goto out;
1243 }
1244
1245 err = perf_event__synthesize_features(tool, session, rec->evlist,
1246 process_synthesized_event);
1247 if (err < 0) {
1248 pr_err("Couldn't synthesize features.\n");
1249 return err;
1250 }
1251
1252 if (have_tracepoints(&rec->evlist->core.entries)) {
1253 /*
1254 * FIXME err <= 0 here actually means that
 1255 * there were no tracepoints, so it's not really
1256 * an error, just that we don't need to
1257 * synthesize anything. We really have to
1258 * return this more properly and also
1259 * propagate errors that now are calling die()
1260 */
1261 err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
1262 process_synthesized_event);
1263 if (err <= 0) {
1264 pr_err("Couldn't record tracing data.\n");
1265 goto out;
1266 }
1267 rec->bytes_written += err;
1268 }
1269 }
1270
1271 err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1272 process_synthesized_event, machine);
1273 if (err)
1274 goto out;
1275
1276 if (rec->opts.full_auxtrace) {
1277 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1278 session, process_synthesized_event);
1279 if (err)
1280 goto out;
1281 }
1282
1283 if (!perf_evlist__exclude_kernel(rec->evlist)) {
1284 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1285 machine);
1286 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1287 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1288 "Check /proc/kallsyms permission or run as root.\n");
1289
1290 err = perf_event__synthesize_modules(tool, process_synthesized_event,
1291 machine);
1292 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1293 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1294 "Check /proc/modules permission or run as root.\n");
1295 }
1296
1297 if (perf_guest) {
1298 machines__process_guests(&session->machines,
1299 perf_event__synthesize_guest_os, tool);
1300 }
1301
1302 err = perf_event__synthesize_extra_attr(&rec->tool,
1303 rec->evlist,
1304 process_synthesized_event,
1305 data->is_pipe);
1306 if (err)
1307 goto out;
1308
1309 err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1310 process_synthesized_event,
1311 NULL);
1312 if (err < 0) {
1313 pr_err("Couldn't synthesize thread map.\n");
1314 return err;
1315 }
1316
1317 err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1318 process_synthesized_event, NULL);
1319 if (err < 0) {
1320 pr_err("Couldn't synthesize cpu map.\n");
1321 return err;
1322 }
1323
1324 err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1325 machine, opts);
1326 if (err < 0)
1327 pr_warning("Couldn't synthesize bpf events.\n");
1328
1329 err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1330 process_synthesized_event, opts->sample_address,
1331 1);
1332out:
1333 return err;
1334}
1335
1336static int __cmd_record(struct record *rec, int argc, const char **argv)
1337{
1338 int err;
1339 int status = 0;
1340 unsigned long waking = 0;
1341 const bool forks = argc > 0;
1342 struct perf_tool *tool = &rec->tool;
1343 struct record_opts *opts = &rec->opts;
1344 struct perf_data *data = &rec->data;
1345 struct perf_session *session;
1346 bool disabled = false, draining = false;
1347 int fd;
1348 float ratio = 0;
1349
1350 atexit(record__sig_exit);
1351 signal(SIGCHLD, sig_handler);
1352 signal(SIGINT, sig_handler);
1353 signal(SIGTERM, sig_handler);
1354 signal(SIGSEGV, sigsegv_handler);
1355
1356 if (rec->opts.record_namespaces)
1357 tool->namespace_events = true;
1358
1359 if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1360 signal(SIGUSR2, snapshot_sig_handler);
1361 if (rec->opts.auxtrace_snapshot_mode)
1362 trigger_on(&auxtrace_snapshot_trigger);
1363 if (rec->switch_output.enabled)
1364 trigger_on(&switch_output_trigger);
1365 } else {
1366 signal(SIGUSR2, SIG_IGN);
1367 }
1368
1369 session = perf_session__new(data, false, tool);
1370 if (IS_ERR(session)) {
1371 pr_err("Perf session creation failed.\n");
1372 return PTR_ERR(session);
1373 }
1374
1375 fd = perf_data__fd(data);
1376 rec->session = session;
1377
1378 if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1379 pr_err("Compression initialization failed.\n");
1380 return -1;
1381 }
1382
1383 session->header.env.comp_type = PERF_COMP_ZSTD;
1384 session->header.env.comp_level = rec->opts.comp_level;
1385
1386 record__init_features(rec);
1387
1388 if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1389 session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
1390
1391 if (forks) {
1392 err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
1393 argv, data->is_pipe,
1394 workload_exec_failed_signal);
1395 if (err < 0) {
1396 pr_err("Couldn't run the workload!\n");
1397 status = err;
1398 goto out_delete_session;
1399 }
1400 }
1401
1402 /*
 1403 * If we have just a single event and are sending data
1404 * through pipe, we need to force the ids allocation,
1405 * because we synthesize event name through the pipe
1406 * and need the id for that.
1407 */
1408 if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1409 rec->opts.sample_id = true;
1410
1411 if (record__open(rec) != 0) {
1412 err = -1;
1413 goto out_child;
1414 }
1415 session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
1416
1417 err = bpf__apply_obj_config();
1418 if (err) {
1419 char errbuf[BUFSIZ];
1420
1421 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1422 pr_err("ERROR: Apply config to BPF failed: %s\n",
1423 errbuf);
1424 goto out_child;
1425 }
1426
1427 /*
1428 * Normally perf_session__new would do this, but it doesn't have the
1429 * evlist.
1430 */
1431 if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
1432 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1433 rec->tool.ordered_events = false;
1434 }
1435
1436 if (!rec->evlist->nr_groups)
1437 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1438
1439 if (data->is_pipe) {
1440 err = perf_header__write_pipe(fd);
1441 if (err < 0)
1442 goto out_child;
1443 } else {
1444 err = perf_session__write_header(session, rec->evlist, fd, false);
1445 if (err < 0)
1446 goto out_child;
1447 }
1448
1449 err = -1;
1450 if (!rec->no_buildid
1451 && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1452 pr_err("Couldn't generate buildids. "
1453 "Use --no-buildid to profile anyway.\n");
1454 goto out_child;
1455 }
1456
1457 if (!opts->no_bpf_event) {
1458 rec->sb_evlist = evlist__new();
1459
1460 if (rec->sb_evlist == NULL) {
1461 pr_err("Couldn't create side band evlist.\n.");
1462 goto out_child;
1463 }
1464
1465 if (evlist__add_bpf_sb_event(rec->sb_evlist, &session->header.env)) {
1466 pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
1467 goto out_child;
1468 }
1469 }
1470
1471 if (perf_evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
1472 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1473 opts->no_bpf_event = true;
1474 }
1475
1476 err = record__synthesize(rec, false);
1477 if (err < 0)
1478 goto out_child;
1479
1480 if (rec->realtime_prio) {
1481 struct sched_param param;
1482
1483 param.sched_priority = rec->realtime_prio;
1484 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1485 pr_err("Could not set realtime priority.\n");
1486 err = -1;
1487 goto out_child;
1488 }
1489 }
1490
1491 /*
1492 * When perf is starting the traced process, all the events
1493 * (apart from group members) have enable_on_exec=1 set,
1494 * so don't spoil it by prematurely enabling them.
1495 */
1496 if (!target__none(&opts->target) && !opts->initial_delay)
1497 evlist__enable(rec->evlist);
1498
1499 /*
1500 * Let the child rip
1501 */
1502 if (forks) {
1503 struct machine *machine = &session->machines.host;
1504 union perf_event *event;
1505 pid_t tgid;
1506
1507 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1508 if (event == NULL) {
1509 err = -ENOMEM;
1510 goto out_child;
1511 }
1512
1513 /*
 1514 * Some H/W events are generated before the COMM event,
 1515 * which is emitted during exec(), so perf script
 1516 * cannot see a correct process name for those events.
 1517 * Synthesize a COMM event to prevent it.
1518 */
1519 tgid = perf_event__synthesize_comm(tool, event,
1520 rec->evlist->workload.pid,
1521 process_synthesized_event,
1522 machine);
1523 free(event);
1524
1525 if (tgid == -1)
1526 goto out_child;
1527
1528 event = malloc(sizeof(event->namespaces) +
1529 (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1530 machine->id_hdr_size);
1531 if (event == NULL) {
1532 err = -ENOMEM;
1533 goto out_child;
1534 }
1535
1536 /*
1537 * Synthesize NAMESPACES event for the command specified.
1538 */
1539 perf_event__synthesize_namespaces(tool, event,
1540 rec->evlist->workload.pid,
1541 tgid, process_synthesized_event,
1542 machine);
1543 free(event);
1544
1545 perf_evlist__start_workload(rec->evlist);
1546 }
1547
1548 if (opts->initial_delay) {
1549 usleep(opts->initial_delay * USEC_PER_MSEC);
1550 evlist__enable(rec->evlist);
1551 }
1552
1553 trigger_ready(&auxtrace_snapshot_trigger);
1554 trigger_ready(&switch_output_trigger);
1555 perf_hooks__invoke_record_start();
1556 for (;;) {
1557 unsigned long long hits = rec->samples;
1558
1559 /*
 1560 * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY
 1561 * here: when done == true and hits != rec->samples
 1562 * in the previous round.
 1563 *
 1564 * perf_evlist__toggle_bkw_mmap() ensures we never
 1565 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1566 */
1567 if (trigger_is_hit(&switch_output_trigger) || done || draining)
1568 perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1569
1570 if (record__mmap_read_all(rec, false) < 0) {
1571 trigger_error(&auxtrace_snapshot_trigger);
1572 trigger_error(&switch_output_trigger);
1573 err = -1;
1574 goto out_child;
1575 }
1576
1577 if (auxtrace_record__snapshot_started) {
1578 auxtrace_record__snapshot_started = 0;
1579 if (!trigger_is_error(&auxtrace_snapshot_trigger))
1580 record__read_auxtrace_snapshot(rec, false);
1581 if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1582 pr_err("AUX area tracing snapshot failed\n");
1583 err = -1;
1584 goto out_child;
1585 }
1586 }
1587
1588 if (trigger_is_hit(&switch_output_trigger)) {
1589 /*
 1590 * If switch_output_trigger is hit, the data in the
 1591 * overwritable ring buffer should have been collected,
 1592 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
 1593 *
 1594 * If SIGUSR2 is raised after or during record__mmap_read_all(),
 1595 * record__mmap_read_all() didn't collect data from the
 1596 * overwritable ring buffer. Read again.
1597 */
1598 if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1599 continue;
1600 trigger_ready(&switch_output_trigger);
1601
1602 /*
 1603 * Re-enable events in the overwrite ring buffer after
1604 * record__mmap_read_all(): we should have collected
1605 * data from it.
1606 */
1607 perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1608
1609 if (!quiet)
1610 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1611 waking);
1612 waking = 0;
1613 fd = record__switch_output(rec, false);
1614 if (fd < 0) {
1615 pr_err("Failed to switch to new file\n");
1616 trigger_error(&switch_output_trigger);
1617 err = fd;
1618 goto out_child;
1619 }
1620
1621 /* re-arm the alarm */
1622 if (rec->switch_output.time)
1623 alarm(rec->switch_output.time);
1624 }
1625
1626 if (hits == rec->samples) {
1627 if (done || draining)
1628 break;
1629 err = evlist__poll(rec->evlist, -1);
1630 /*
1631 * Propagate error, only if there's any. Ignore positive
1632 * number of returned events and interrupt error.
1633 */
1634 if (err > 0 || (err < 0 && errno == EINTR))
1635 err = 0;
1636 waking++;
1637
1638 if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1639 draining = true;
1640 }
1641
1642 /*
1643 * When perf is starting the traced process, at the end events
1644 * die with the process and we wait for that. Thus no need to
1645 * disable events in this case.
1646 */
1647 if (done && !disabled && !target__none(&opts->target)) {
1648 trigger_off(&auxtrace_snapshot_trigger);
1649 evlist__disable(rec->evlist);
1650 disabled = true;
1651 }
1652 }
1653
1654 trigger_off(&auxtrace_snapshot_trigger);
1655 trigger_off(&switch_output_trigger);
1656
1657 if (opts->auxtrace_snapshot_on_exit)
1658 record__auxtrace_snapshot_exit(rec);
1659
1660 if (forks && workload_exec_errno) {
1661 char msg[STRERR_BUFSIZE];
1662 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1663 pr_err("Workload failed: %s\n", emsg);
1664 err = -1;
1665 goto out_child;
1666 }
1667
1668 if (!quiet)
1669 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1670
1671 if (target__none(&rec->opts.target))
1672 record__synthesize_workload(rec, true);
1673
1674out_child:
1675 record__mmap_read_all(rec, true);
1676 record__aio_mmap_read_sync(rec);
1677
1678 if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1679 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
1680 session->header.env.comp_ratio = ratio + 0.5;
1681 }
1682
1683 if (forks) {
1684 int exit_status;
1685
1686 if (!child_finished)
1687 kill(rec->evlist->workload.pid, SIGTERM);
1688
1689 wait(&exit_status);
1690
1691 if (err < 0)
1692 status = err;
1693 else if (WIFEXITED(exit_status))
1694 status = WEXITSTATUS(exit_status);
1695 else if (WIFSIGNALED(exit_status))
1696 signr = WTERMSIG(exit_status);
1697 } else
1698 status = err;
1699
1700 record__synthesize(rec, true);
1701 /* this will be recalculated during process_buildids() */
1702 rec->samples = 0;
1703
1704 if (!err) {
1705 if (!rec->timestamp_filename) {
1706 record__finish_output(rec);
1707 } else {
1708 fd = record__switch_output(rec, true);
1709 if (fd < 0) {
1710 status = fd;
1711 goto out_delete_session;
1712 }
1713 }
1714 }
1715
1716 perf_hooks__invoke_record_end();
1717
1718 if (!err && !quiet) {
1719 char samples[128];
1720 const char *postfix = rec->timestamp_filename ?
1721 ".<timestamp>" : "";
1722
1723 if (rec->samples && !rec->opts.full_auxtrace)
1724 scnprintf(samples, sizeof(samples),
1725 " (%" PRIu64 " samples)", rec->samples);
1726 else
1727 samples[0] = '\0';
1728
1729 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
1730 perf_data__size(data) / 1024.0 / 1024.0,
1731 data->path, postfix, samples);
1732 if (ratio) {
1733 fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
1734 rec->session->bytes_transferred / 1024.0 / 1024.0,
1735 ratio);
1736 }
1737 fprintf(stderr, " ]\n");
1738 }
1739
1740out_delete_session:
1741 zstd_fini(&session->zstd_data);
1742 perf_session__delete(session);
1743
1744 if (!opts->no_bpf_event)
1745 perf_evlist__stop_sb_thread(rec->sb_evlist);
1746 return status;
1747}
1748
1749static void callchain_debug(struct callchain_param *callchain)
1750{
1751 static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1752
1753 pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1754
1755 if (callchain->record_mode == CALLCHAIN_DWARF)
1756 pr_debug("callchain: stack dump size %d\n",
1757 callchain->dump_size);
1758}
1759
1760int record_opts__parse_callchain(struct record_opts *record,
1761 struct callchain_param *callchain,
1762 const char *arg, bool unset)
1763{
1764 int ret;
1765 callchain->enabled = !unset;
1766
1767 /* --no-call-graph */
1768 if (unset) {
1769 callchain->record_mode = CALLCHAIN_NONE;
1770 pr_debug("callchain: disabled\n");
1771 return 0;
1772 }
1773
1774 ret = parse_callchain_record_opt(arg, callchain);
1775 if (!ret) {
1776 /* Enable data address sampling for DWARF unwind. */
1777 if (callchain->record_mode == CALLCHAIN_DWARF)
1778 record->sample_address = true;
1779 callchain_debug(callchain);
1780 }
1781
1782 return ret;
1783}
1784
1785int record_parse_callchain_opt(const struct option *opt,
1786 const char *arg,
1787 int unset)
1788{
1789 return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1790}
1791
1792int record_callchain_opt(const struct option *opt,
1793 const char *arg __maybe_unused,
1794 int unset __maybe_unused)
1795{
1796 struct callchain_param *callchain = opt->value;
1797
1798 callchain->enabled = true;
1799
1800 if (callchain->record_mode == CALLCHAIN_NONE)
1801 callchain->record_mode = CALLCHAIN_FP;
1802
1803 callchain_debug(callchain);
1804 return 0;
1805}
1806
1807static int perf_record_config(const char *var, const char *value, void *cb)
1808{
1809 struct record *rec = cb;
1810
1811 if (!strcmp(var, "record.build-id")) {
1812 if (!strcmp(value, "cache"))
1813 rec->no_buildid_cache = false;
1814 else if (!strcmp(value, "no-cache"))
1815 rec->no_buildid_cache = true;
1816 else if (!strcmp(value, "skip"))
1817 rec->no_buildid = true;
1818 else
1819 return -1;
1820 return 0;
1821 }
1822 if (!strcmp(var, "record.call-graph")) {
1823 var = "call-graph.record-mode";
1824 return perf_default_config(var, value, cb);
1825 }
1826#ifdef HAVE_AIO_SUPPORT
1827 if (!strcmp(var, "record.aio")) {
1828 rec->opts.nr_cblocks = strtol(value, NULL, 0);
1829 if (!rec->opts.nr_cblocks)
1830 rec->opts.nr_cblocks = nr_cblocks_default;
1831 }
1832#endif
1833
1834 return 0;
1835}
1836
1837struct clockid_map {
1838 const char *name;
1839 int clockid;
1840};
1841
1842#define CLOCKID_MAP(n, c) \
1843 { .name = n, .clockid = (c), }
1844
1845#define CLOCKID_END { .name = NULL, }
1846
1847
1848/*
1849 * Add the missing ones, we need to build on many distros...
1850 */
1851#ifndef CLOCK_MONOTONIC_RAW
1852#define CLOCK_MONOTONIC_RAW 4
1853#endif
1854#ifndef CLOCK_BOOTTIME
1855#define CLOCK_BOOTTIME 7
1856#endif
1857#ifndef CLOCK_TAI
1858#define CLOCK_TAI 11
1859#endif
1860
1861static const struct clockid_map clockids[] = {
1862 /* available for all events, NMI safe */
1863 CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
1864 CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
1865
1866 /* available for some events */
1867 CLOCKID_MAP("realtime", CLOCK_REALTIME),
1868 CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
1869 CLOCKID_MAP("tai", CLOCK_TAI),
1870
1871 /* available for the lazy */
1872 CLOCKID_MAP("mono", CLOCK_MONOTONIC),
1873 CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
1874 CLOCKID_MAP("real", CLOCK_REALTIME),
1875 CLOCKID_MAP("boot", CLOCK_BOOTTIME),
1876
1877 CLOCKID_END,
1878};
1879
1880static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
1881{
1882 struct timespec res;
1883
1884 *res_ns = 0;
1885 if (!clock_getres(clk_id, &res))
1886 *res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
1887 else
1888 pr_warning("WARNING: Failed to determine specified clock resolution.\n");
1889
1890 return 0;
1891}
1892
1893static int parse_clockid(const struct option *opt, const char *str, int unset)
1894{
1895 struct record_opts *opts = (struct record_opts *)opt->value;
1896 const struct clockid_map *cm;
1897 const char *ostr = str;
1898
1899 if (unset) {
1900 opts->use_clockid = 0;
1901 return 0;
1902 }
1903
1904 /* no arg passed */
1905 if (!str)
1906 return 0;
1907
1908 /* no setting it twice */
1909 if (opts->use_clockid)
1910 return -1;
1911
1912 opts->use_clockid = true;
1913
 1914 /* if it's a number, we're done */
1915 if (sscanf(str, "%d", &opts->clockid) == 1)
1916 return get_clockid_res(opts->clockid, &opts->clockid_res_ns);
1917
1918 /* allow a "CLOCK_" prefix to the name */
1919 if (!strncasecmp(str, "CLOCK_", 6))
1920 str += 6;
1921
1922 for (cm = clockids; cm->name; cm++) {
1923 if (!strcasecmp(str, cm->name)) {
1924 opts->clockid = cm->clockid;
1925 return get_clockid_res(opts->clockid,
1926 &opts->clockid_res_ns);
1927 }
1928 }
1929
1930 opts->use_clockid = false;
1931 ui__warning("unknown clockid %s, check man page\n", ostr);
1932 return -1;
1933}
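/*
 * Examples, assuming this parser backs the -k/--clockid option: "mono",
 * "CLOCK_MONOTONIC" and the raw number "1" all select CLOCK_MONOTONIC,
 * after which get_clockid_res() records the clock's resolution in
 * opts->clockid_res_ns; an unknown name warns and makes the option fail.
 */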
1934
1935static int record__parse_affinity(const struct option *opt, const char *str, int unset)
1936{
1937 struct record_opts *opts = (struct record_opts *)opt->value;
1938
1939 if (unset || !str)
1940 return 0;
1941
1942 if (!strcasecmp(str, "node"))
1943 opts->affinity = PERF_AFFINITY_NODE;
1944 else if (!strcasecmp(str, "cpu"))
1945 opts->affinity = PERF_AFFINITY_CPU;
1946
1947 return 0;
1948}
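/*
 * E.g. an "--affinity=node" style argument selects PERF_AFFINITY_NODE and
 * "cpu" selects PERF_AFFINITY_CPU; any other value leaves the mode at its
 * default, PERF_AFFINITY_SYS, matching the affinity_tags[] table above.
 */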
1949
1950static int record__parse_mmap_pages(const struct option *opt,
1951 const char *str,
1952 int unset __maybe_unused)
1953{
1954 struct record_opts *opts = opt->value;
1955 char *s, *p;
1956 unsigned int mmap_pages;
1957 int ret;
1958
1959 if (!str)
1960 return -EINVAL;
1961
1962 s = strdup(str);
1963 if (!s)
1964 return -ENOMEM;
1965
1966 p = strchr(s, ',');
1967 if (p)
1968 *p = '\0';
1969
1970 if (*s) {
1971 ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
1972 if (ret)
1973 goto out_free;
1974 opts->mmap_pages = mmap_pages;
1975 }
1976
1977 if (!p) {
1978 ret = 0;
1979 goto out_free;
1980 }
1981
1982 ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
1983 if (ret)
1984 goto out_free;
1985
1986 opts->auxtrace_mmap_pages = mmap_pages;
1987
1988out_free:
1989 free(s);
1990 return ret;
1991}
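/*
 * The "pages[,pages]" argument parsed here covers both buffers: for example
 * "512,128" requests 512 pages per data mmap and 128 pages for the AUX area
 * tracing mmap, while a plain "512" only sets opts->mmap_pages and leaves
 * opts->auxtrace_mmap_pages untouched.
 */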
1992
1993static void switch_output_size_warn(struct record *rec)
1994{
1995 u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
1996 struct switch_output *s = &rec->switch_output;
1997
1998 wakeup_size /= 2;
1999
2000 if (s->size < wakeup_size) {
2001 char buf[100];
2002
2003 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
2004 pr_warning("WARNING: switch-output data size lower than "
2005 "wakeup kernel buffer size (%s) "
2006 "expect bigger perf.data sizes\n", buf);
2007 }
2008}
2009
2010static int switch_output_setup(struct record *rec)
2011{
2012 struct switch_output *s = &rec->switch_output;
2013 static struct parse_tag tags_size[] = {
2014 { .tag = 'B', .mult = 1 },
2015 { .tag = 'K', .mult = 1 << 10 },
2016 { .tag = 'M', .mult = 1 << 20 },
2017 { .tag = 'G', .mult = 1 << 30 },
2018 { .tag = 0 },
2019 };
2020 static struct parse_tag tags_time[] = {
2021 { .tag = 's', .mult = 1 },
2022 { .tag = 'm', .mult = 60 },
2023 { .tag = 'h', .mult = 60*60 },
2024 { .tag = 'd', .mult = 60*60*24 },
2025 { .tag = 0 },
2026 };
2027 unsigned long val;
2028
2029 if (!s->set)
2030 return 0;
2031
2032 if (!strcmp(s->str, "signal")) {
2033 s->signal = true;
2034 pr_debug("switch-output with SIGUSR2 signal\n");
2035 goto enabled;
2036 }
2037
2038 val = parse_tag_value(s->str, tags_size);
2039 if (val != (unsigned long) -1) {
2040 s->size = val;
2041 pr_debug("switch-output with %s size threshold\n", s->str);
2042 goto enabled;
2043 }
2044
2045 val = parse_tag_value(s->str, tags_time);
2046 if (val != (unsigned long) -1) {
2047 s->time = val;
2048 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2049 s->str, s->time);
2050 goto enabled;
2051 }
2052
2053 return -1;
2054
2055enabled:
2056 rec->timestamp_filename = true;
2057 s->enabled = true;
2058
2059 if (s->size && !rec->opts.no_buffering)
2060 switch_output_size_warn(rec);
2061
2062 return 0;
2063}
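/*
 * Accepted switch-output arguments, per the parsing above: "signal"
 * (rotate on SIGUSR2), a size such as "100M" (rotate once that much data
 * has been written), or a time such as "30s" or "5m" (rotate on an alarm).
 * Any of them also sets rec->timestamp_filename so every rotated file gets
 * a uniquely timestamped name.
 */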
2064
2065static const char * const __record_usage[] = {
2066 "perf record [<options>] [<command>]",
2067 "perf record [<options>] -- <command> [<options>]",
2068 NULL
2069};
2070const char * const *record_usage = __record_usage;
2071
2072/*
 2073 * XXX Ideally this would be local to cmd_record() and passed to a record__new,
 2074 * because we need to have access to it in record__exit, which is called
 2075 * after cmd_record() exits, but since record_options needs to be accessible to
 2076 * builtin-script, leave it here.
2077 *
2078 * At least we don't ouch it in all the other functions here directly.
2079 *
2080 * Just say no to tons of global variables, sigh.
2081 */
2082static struct record record = {
2083 .opts = {
2084 .sample_time = true,
2085 .mmap_pages = UINT_MAX,
2086 .user_freq = UINT_MAX,
2087 .user_interval = ULLONG_MAX,
2088 .freq = 4000,
2089 .target = {
2090 .uses_mmap = true,
2091 .default_per_cpu = true,
2092 },
2093 .mmap_flush = MMAP_FLUSH_DEFAULT,
2094 },
2095 .tool = {
2096 .sample = process_sample_event,
2097 .fork = perf_event__process_fork,
2098 .exit = perf_event__process_exit,
2099 .comm = perf_event__process_comm,
2100 .namespaces = perf_event__process_namespaces,
2101 .mmap = perf_event__process_mmap,
2102 .mmap2 = perf_event__process_mmap2,
2103 .ordered_events = true,
2104 },
2105};
2106
2107const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2108 "\n\t\t\t\tDefault: fp";
2109
2110static bool dry_run;
2111
2112/*
2113 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2114 * with it and switch to using the library functions in perf_evlist that came
2115 * from builtin-record.c, i.e. use record_opts,
2116 * perf_evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record'
2117 * using pipes, etc.
2118 */
2119static struct option __record_options[] = {
2120 OPT_CALLBACK('e', "event", &record.evlist, "event",
2121 "event selector. use 'perf list' to list available events",
2122 parse_events_option),
2123 OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2124 "event filter", parse_filter),
2125 OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2126 NULL, "don't record events from perf itself",
2127 exclude_perf),
2128 OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2129 "record events on existing process id"),
2130 OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2131 "record events on existing thread id"),
2132 OPT_INTEGER('r', "realtime", &record.realtime_prio,
2133 "collect data with this RT SCHED_FIFO priority"),
2134 OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2135 "collect data without buffering"),
2136 OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2137 "collect raw sample records from all opened counters"),
2138 OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2139 "system-wide collection from all CPUs"),
2140 OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2141 "list of cpus to monitor"),
2142 OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2143 OPT_STRING('o', "output", &record.data.path, "file",
2144 "output file name"),
2145 OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2146 &record.opts.no_inherit_set,
2147 "child tasks do not inherit counters"),
2148 OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2149 "synthesize non-sample events at the end of output"),
2150 OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2151 OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2152 OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2153 "Fail if the specified frequency can't be used"),
2154 OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2155 "profile at this frequency",
2156 record__parse_freq),
2157 OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2158 "number of mmap data pages and AUX area tracing mmap pages",
2159 record__parse_mmap_pages),
2160 OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2161		     "Minimal number of bytes that are extracted from mmap data pages (default: 1)",
2162 record__mmap_flush_parse),
2163 OPT_BOOLEAN(0, "group", &record.opts.group,
2164 "put the counters into a counter group"),
2165 OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2166			   NULL, "enables call-graph recording",
2167 &record_callchain_opt),
2168 OPT_CALLBACK(0, "call-graph", &record.opts,
2169 "record_mode[,record_size]", record_callchain_help,
2170 &record_parse_callchain_opt),
2171 OPT_INCR('v', "verbose", &verbose,
2172 "be more verbose (show counter open errors, etc)"),
2173 OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2174 OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2175 "per thread counts"),
2176 OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2177 OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2178 "Record the sample physical addresses"),
2179 OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2180 OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2181 &record.opts.sample_time_set,
2182 "Record the sample timestamps"),
2183 OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2184 "Record the sample period"),
2185 OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2186 "don't sample"),
2187 OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2188 &record.no_buildid_cache_set,
2189 "do not update the buildid cache"),
2190 OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2191 &record.no_buildid_set,
2192 "do not collect buildids in perf.data"),
2193 OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2194 "monitor event in cgroup name only",
2195 parse_cgroups),
2196 OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
2197 "ms to wait before starting measurement after program start"),
2198 OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2199 "user to profile"),
2200
2201 OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2202 "branch any", "sample any taken branches",
2203 parse_branch_stack),
2204
2205 OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2206 "branch filter mask", "branch stack filter modes",
2207 parse_branch_stack),
2208 OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2209 "sample by weight (on special events only)"),
2210 OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2211 "sample transaction flags (special events only)"),
2212 OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2213 "use per-thread mmaps"),
2214 OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2215 "sample selected machine registers on interrupt,"
2216 " use '-I?' to list register names", parse_intr_regs),
2217 OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2218		    "sample selected machine registers in user space,"
2219		    " use '--user-regs=?' to list register names", parse_user_regs),
2220 OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2221 "Record running/enabled time of read (:S) events"),
2222 OPT_CALLBACK('k', "clockid", &record.opts,
2223 "clockid", "clockid to use for events, see clock_gettime()",
2224 parse_clockid),
2225 OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2226 "opts", "AUX area tracing Snapshot Mode", ""),
2227 OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2228 "per thread proc mmap processing timeout in ms"),
2229 OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2230 "Record namespaces events"),
2231 OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
2232 "Record context switch events"),
2233 OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2234 "Configure all used events to run in kernel space.",
2235 PARSE_OPT_EXCLUSIVE),
2236 OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2237 "Configure all used events to run in user space.",
2238 PARSE_OPT_EXCLUSIVE),
2239 OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2240 "collect kernel callchains"),
2241 OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2242 "collect user callchains"),
2243 OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2244 "clang binary to use for compiling BPF scriptlets"),
2245 OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2246 "options passed to clang when compiling BPF scriptlets"),
2247 OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2248 "file", "vmlinux pathname"),
2249 OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2250 "Record build-id of all DSOs regardless of hits"),
2251 OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2252 "append timestamp to output filename"),
2253 OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2254 "Record timestamp boundary (time of first/last samples)"),
2255 OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2256 &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2257 "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
2258 "signal"),
2259 OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2260 "Limit number of switch output generated files"),
2261 OPT_BOOLEAN(0, "dry-run", &dry_run,
2262 "Parse options then exit"),
2263#ifdef HAVE_AIO_SUPPORT
2264 OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2265 &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2266 record__aio_parse),
2267#endif
2268 OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2269 "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2270 record__parse_affinity),
2271#ifdef HAVE_ZSTD_SUPPORT
2272 OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2273			    "n", "Compress records using the specified level (default: 1 - fastest compression, 22 - greatest compression)",
2274 record__parse_comp_level),
2275#endif
2276 OPT_END()
2277};
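
/*
 * Illustrative combinations of the options above (not an exhaustive list):
 *
 *   perf record -F 4000 -g -- ./workload       # sample at 4 kHz with call graphs
 *   perf record -a --switch-output=1G          # system wide, rotate perf.data at ~1G
 *   perf record -e cycles -p 1234 -o out.data  # attach to an existing pid
 */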
2278
2279struct option *record_options = __record_options;
2280
2281int cmd_record(int argc, const char **argv)
2282{
2283 int err;
2284 struct record *rec = &record;
2285 char errbuf[BUFSIZ];
2286
2287 setlocale(LC_ALL, "");
2288
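	/*
	 * When perf was built without the corresponding support, mark the
	 * affected options as unavailable so that using them reports which
	 * build flag is missing.
	 */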
2289#ifndef HAVE_LIBBPF_SUPPORT
2290# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2291 set_nobuild('\0', "clang-path", true);
2292 set_nobuild('\0', "clang-opt", true);
2293# undef set_nobuild
2294#endif
2295
2296#ifndef HAVE_BPF_PROLOGUE
2297# if !defined (HAVE_DWARF_SUPPORT)
2298# define REASON "NO_DWARF=1"
2299# elif !defined (HAVE_LIBBPF_SUPPORT)
2300# define REASON "NO_LIBBPF=1"
2301# else
2302# define REASON "this architecture doesn't support BPF prologue"
2303# endif
2304# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2305 set_nobuild('\0', "vmlinux", true);
2306# undef set_nobuild
2307# undef REASON
2308#endif
2309
2310 CPU_ZERO(&rec->affinity_mask);
2311 rec->opts.affinity = PERF_AFFINITY_SYS;
2312
2313 rec->evlist = evlist__new();
2314 if (rec->evlist == NULL)
2315 return -ENOMEM;
2316
2317 err = perf_config(perf_record_config, rec);
2318 if (err)
2319 return err;
2320
2321 argc = parse_options(argc, argv, record_options, record_usage,
2322 PARSE_OPT_STOP_AT_NON_OPTION);
2323 if (quiet)
2324 perf_quiet_option();
2325
2326 /* Make system wide (-a) the default target. */
2327 if (!argc && target__none(&rec->opts.target))
2328 rec->opts.target.system_wide = true;
2329
2330 if (nr_cgroups && !rec->opts.target.system_wide) {
2331 usage_with_options_msg(record_usage, record_options,
2332 "cgroup monitoring only available in system-wide mode");
2333
2334 }
2335
2336 if (rec->opts.comp_level != 0) {
2337 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2338 rec->no_buildid = true;
2339 }
2340
2341 if (rec->opts.record_switch_events &&
2342 !perf_can_record_switch_events()) {
2343 ui__error("kernel does not support recording context switch events\n");
2344 parse_options_usage(record_usage, record_options, "switch-events", 0);
2345 return -EINVAL;
2346 }
2347
2348 if (switch_output_setup(rec)) {
2349 parse_options_usage(record_usage, record_options, "switch-output", 0);
2350 return -EINVAL;
2351 }
2352
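	/* Time based --switch-output is driven by SIGALRM, armed here. */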
2353 if (rec->switch_output.time) {
2354 signal(SIGALRM, alarm_sig_handler);
2355 alarm(rec->switch_output.time);
2356 }
2357
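	/* Pre-allocate the table of rotated output file names (--switch-max-files). */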
2358 if (rec->switch_output.num_files) {
2359		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2360						      sizeof(char *));
2361		if (!rec->switch_output.filenames)
2362			return -ENOMEM;
2363 }
2364
2365 /*
2366 * Allow aliases to facilitate the lookup of symbols for address
2367 * filters. Refer to auxtrace_parse_filters().
2368 */
2369 symbol_conf.allow_aliases = true;
2370
2371 symbol__init(NULL);
2372
2373 err = record__auxtrace_init(rec);
2374 if (err)
2375 goto out;
2376
2377 if (dry_run)
2378 goto out;
2379
2380 err = bpf__setup_stdout(rec->evlist);
2381 if (err) {
2382 bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2383 pr_err("ERROR: Setup BPF stdout failed: %s\n",
2384 errbuf);
2385 goto out;
2386 }
2387
2388 err = -ENOMEM;
2389
2390 if (rec->no_buildid_cache || rec->no_buildid) {
2391 disable_buildid_cache();
2392 } else if (rec->switch_output.enabled) {
2393 /*
2394 * In 'perf record --switch-output', disable buildid
2395 * generation by default to reduce data file switching
2396 * overhead. Still generate buildid if they are required
2397		 * overhead. Still generate buildids if they are required
2398 *
2399 * perf record --switch-output --no-no-buildid \
2400 * --no-no-buildid-cache
2401 *
2402 * Following code equals to:
2403		 * The following code is equivalent to:
2404 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2405 * (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2406 * disable_buildid_cache();
2407 */
2408 bool disable = true;
2409
2410 if (rec->no_buildid_set && !rec->no_buildid)
2411 disable = false;
2412 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2413 disable = false;
2414 if (disable) {
2415 rec->no_buildid = true;
2416 rec->no_buildid_cache = true;
2417 disable_buildid_cache();
2418 }
2419 }
2420
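	/*
	 * In overwrite mode only the most recent data ends up in perf.data,
	 * so synthesize the non-sample (side-band) events at the end of the
	 * output as well (--tail-synthesize).
	 */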
2421 if (record.opts.overwrite)
2422 record.opts.tail_synthesize = true;
2423
2424 if (rec->evlist->core.nr_entries == 0 &&
2425 __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2426 pr_err("Not enough memory for event selector list\n");
2427 goto out;
2428 }
2429
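	/* Attaching to a single thread (-t) defaults to --no-inherit unless set explicitly. */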
2430 if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2431 rec->opts.no_inherit = true;
2432
2433 err = target__validate(&rec->opts.target);
2434 if (err) {
2435 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2436 ui__warning("%s\n", errbuf);
2437 }
2438
2439 err = target__parse_uid(&rec->opts.target);
2440 if (err) {
2441 int saved_errno = errno;
2442
2443 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2444 ui__error("%s", errbuf);
2445
2446 err = -saved_errno;
2447 goto out;
2448 }
2449
2450 /* Enable ignoring missing threads when -u/-p option is defined. */
2451 rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2452
2453 err = -ENOMEM;
2454 if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2455 usage_with_options(record_usage, record_options);
2456
2457 err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2458 if (err)
2459 goto out;
2460
2461 /*
2462	 * We take all buildids when the file contains
2463	 * AUX area tracing data because we do not decode the
2464	 * trace, as that would take too long.
2465 */
2466 if (rec->opts.full_auxtrace)
2467 rec->buildid_all = true;
2468
2469 if (record_opts__config(&rec->opts)) {
2470 err = -EINVAL;
2471 goto out;
2472 }
2473
2474 if (rec->opts.nr_cblocks > nr_cblocks_max)
2475 rec->opts.nr_cblocks = nr_cblocks_max;
2476 pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2477
2478 pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2479 pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2480
2481 if (rec->opts.comp_level > comp_level_max)
2482 rec->opts.comp_level = comp_level_max;
2483 pr_debug("comp level: %d\n", rec->opts.comp_level);
2484
2485 err = __cmd_record(&record, argc, argv);
2486out:
2487 evlist__delete(rec->evlist);
2488 symbol__exit();
2489 auxtrace_record__free(rec->itr);
2490 return err;
2491}
2492
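/*
 * Signal handler for on-demand actions: take an AUX area trace snapshot when
 * the snapshot trigger is armed, and/or hit the switch-output trigger when
 * --switch-output=signal is in effect.
 */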
2493static void snapshot_sig_handler(int sig __maybe_unused)
2494{
2495 struct record *rec = &record;
2496
2497 if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2498 trigger_hit(&auxtrace_snapshot_trigger);
2499 auxtrace_record__snapshot_started = 1;
2500 if (auxtrace_record__snapshot_start(record.itr))
2501 trigger_error(&auxtrace_snapshot_trigger);
2502 }
2503
2504 if (switch_output_signal(rec))
2505 trigger_hit(&switch_output_trigger);
2506}
2507
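/*
 * SIGALRM handler for time based --switch-output: hit the switch-output
 * trigger when the time threshold is active.
 */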
2508static void alarm_sig_handler(int sig __maybe_unused)
2509{
2510 struct record *rec = &record;
2511
2512 if (switch_output_time(rec))
2513 trigger_hit(&switch_output_trigger);
2514}