blob: 3f43aedb384d4d78e8c2416131dd33644baed871 [file] [log] [blame]
xjb04a4022021-11-25 15:01:52 +08001/*
2 * builtin-trace.c
3 *
4 * Builtin 'trace' command:
5 *
6 * Display a continuously updated trace of any workload, CPU, specific PID,
7 * system wide, etc. Default format is loosely strace like, but any other
8 * event may be specified using --event.
9 *
10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11 *
12 * Initially based on the 'trace' prototype by Thomas Gleixner:
13 *
14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15 *
16 * Released under the GPL v2. (and only v2, not any later version)
17 */
18
19#include <traceevent/event-parse.h>
20#include <api/fs/tracing_path.h>
21#include "builtin.h"
22#include "util/cgroup.h"
23#include "util/color.h"
24#include "util/debug.h"
25#include "util/env.h"
26#include "util/event.h"
27#include "util/evlist.h"
28#include <subcmd/exec-cmd.h>
29#include "util/machine.h"
30#include "util/path.h"
31#include "util/session.h"
32#include "util/thread.h"
33#include <subcmd/parse-options.h>
34#include "util/strlist.h"
35#include "util/intlist.h"
36#include "util/thread_map.h"
37#include "util/stat.h"
38#include "trace/beauty/beauty.h"
39#include "trace-event.h"
40#include "util/parse-events.h"
41#include "util/bpf-loader.h"
42#include "callchain.h"
43#include "print_binary.h"
44#include "string2.h"
45#include "syscalltbl.h"
46#include "rb_resort.h"
47
48#include <errno.h>
49#include <inttypes.h>
50#include <poll.h>
51#include <signal.h>
52#include <stdlib.h>
53#include <string.h>
54#include <linux/err.h>
55#include <linux/filter.h>
56#include <linux/kernel.h>
57#include <linux/random.h>
58#include <linux/stringify.h>
59#include <linux/time64.h>
60#include <fcntl.h>
61
62#include "sane_ctype.h"
63
64#ifndef O_CLOEXEC
65# define O_CLOEXEC 02000000
66#endif
67
68#ifndef F_LINUX_SPECIFIC_BASE
69# define F_LINUX_SPECIFIC_BASE 1024
70#endif
71
72struct trace {
73 struct perf_tool tool;
74 struct syscalltbl *sctbl;
75 struct {
76 int max;
77 struct syscall *table;
78 struct {
79 struct perf_evsel *sys_enter,
80 *sys_exit,
81 *augmented;
82 } events;
83 } syscalls;
84 struct record_opts opts;
85 struct perf_evlist *evlist;
86 struct machine *host;
87 struct thread *current;
88 struct cgroup *cgroup;
89 u64 base_time;
90 FILE *output;
91 unsigned long nr_events;
92 struct strlist *ev_qualifier;
93 struct {
94 size_t nr;
95 int *entries;
96 } ev_qualifier_ids;
97 struct {
98 size_t nr;
99 pid_t *entries;
100 } filter_pids;
101 double duration_filter;
102 double runtime_ms;
103 struct {
104 u64 vfs_getname,
105 proc_getname;
106 } stats;
107 unsigned int max_stack;
108 unsigned int min_stack;
109 bool not_ev_qualifier;
110 bool live;
111 bool full_time;
112 bool sched;
113 bool multiple_threads;
114 bool summary;
115 bool summary_only;
116 bool failure_only;
117 bool show_comm;
118 bool print_sample;
119 bool show_tool_stats;
120 bool trace_syscalls;
121 bool kernel_syscallchains;
122 bool force;
123 bool vfs_getname;
124 int trace_pgfaults;
125};
126
127struct tp_field {
128 int offset;
129 union {
130 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
131 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
132 };
133};
134
135#define TP_UINT_FIELD(bits) \
136static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
137{ \
138 u##bits value; \
139 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
140 return value; \
141}
142
143TP_UINT_FIELD(8);
144TP_UINT_FIELD(16);
145TP_UINT_FIELD(32);
146TP_UINT_FIELD(64);
147
148#define TP_UINT_FIELD__SWAPPED(bits) \
149static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
150{ \
151 u##bits value; \
152 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
153 return bswap_##bits(value);\
154}
155
156TP_UINT_FIELD__SWAPPED(16);
157TP_UINT_FIELD__SWAPPED(32);
158TP_UINT_FIELD__SWAPPED(64);
159
160static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap)
161{
162 field->offset = offset;
163
164 switch (size) {
165 case 1:
166 field->integer = tp_field__u8;
167 break;
168 case 2:
169 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
170 break;
171 case 4:
172 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
173 break;
174 case 8:
175 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
176 break;
177 default:
178 return -1;
179 }
180
181 return 0;
182}
183
184static int tp_field__init_uint(struct tp_field *field, struct format_field *format_field, bool needs_swap)
185{
186 return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap);
187}
188
189static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
190{
191 return sample->raw_data + field->offset;
192}
193
194static int __tp_field__init_ptr(struct tp_field *field, int offset)
195{
196 field->offset = offset;
197 field->pointer = tp_field__ptr;
198 return 0;
199}
200
201static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
202{
203 return __tp_field__init_ptr(field, format_field->offset);
204}
205
206struct syscall_tp {
207 struct tp_field id;
208 union {
209 struct tp_field args, ret;
210 };
211};
212
213static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
214 struct tp_field *field,
215 const char *name)
216{
217 struct format_field *format_field = perf_evsel__field(evsel, name);
218
219 if (format_field == NULL)
220 return -1;
221
222 return tp_field__init_uint(field, format_field, evsel->needs_swap);
223}
224
225#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
226 ({ struct syscall_tp *sc = evsel->priv;\
227 perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
228
/*
 * Look up the tracepoint field called @name in @evsel and set @field up to
 * return a pointer into the sample raw data at that field's offset.
 *
 * Returns -1 if the field does not exist, otherwise whatever
 * tp_field__init_ptr() returns.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *ff = perf_evsel__field(evsel, name);

	return ff != NULL ? tp_field__init_ptr(field, ff) : -1;
}
240
241#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
242 ({ struct syscall_tp *sc = evsel->priv;\
243 perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
244
245static void perf_evsel__delete_priv(struct perf_evsel *evsel)
246{
247 zfree(&evsel->priv);
248 perf_evsel__delete(evsel);
249}
250
251static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel)
252{
253 struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));
254
255 if (evsel->priv != NULL) {
256 if (perf_evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr"))
257 goto out_delete;
258 return 0;
259 }
260
261 return -ENOMEM;
262out_delete:
263 zfree(&evsel->priv);
264 return -ENOENT;
265}
266
267static int perf_evsel__init_augmented_syscall_tp(struct perf_evsel *evsel)
268{
269 struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));
270
271 if (evsel->priv != NULL) { /* field, sizeof_field, offsetof_field */
272 if (__tp_field__init_uint(&sc->id, sizeof(long), sizeof(long long), evsel->needs_swap))
273 goto out_delete;
274
275 return 0;
276 }
277
278 return -ENOMEM;
279out_delete:
280 zfree(&evsel->priv);
281 return -EINVAL;
282}
283
284static int perf_evsel__init_augmented_syscall_tp_args(struct perf_evsel *evsel)
285{
286 struct syscall_tp *sc = evsel->priv;
287
288 return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64));
289}
290
291static int perf_evsel__init_raw_syscall_tp(struct perf_evsel *evsel, void *handler)
292{
293 evsel->priv = malloc(sizeof(struct syscall_tp));
294 if (evsel->priv != NULL) {
295 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
296 goto out_delete;
297
298 evsel->handler = handler;
299 return 0;
300 }
301
302 return -ENOMEM;
303
304out_delete:
305 zfree(&evsel->priv);
306 return -ENOENT;
307}
308
/*
 * Create the evsel for the raw syscall tracepoint in the given @direction
 * and initialise it with perf_evsel__init_raw_syscall_tp().
 *
 * Returns the new evsel, or NULL if the tracepoint cannot be created or
 * initialised (in the latter case the evsel is deleted).
 */
static struct perf_evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	if (IS_ERR(evsel)) {
		/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
		evsel = perf_evsel__newtp("syscalls", direction);
		if (IS_ERR(evsel))
			return NULL;
	}

	if (perf_evsel__init_raw_syscall_tp(evsel, handler) != 0) {
		perf_evsel__delete_priv(evsel);
		return NULL;
	}

	return evsel;
}
329
330#define perf_evsel__sc_tp_uint(evsel, name, sample) \
331 ({ struct syscall_tp *fields = evsel->priv; \
332 fields->name.integer(&fields->name, sample); })
333
334#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
335 ({ struct syscall_tp *fields = evsel->priv; \
336 fields->name.pointer(&fields->name, sample); })
337
338size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
339{
340 int idx = val - sa->offset;
341
342 if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL)
343 return scnprintf(bf, size, intfmt, val);
344
345 return scnprintf(bf, size, "%s", sa->entries[idx]);
346}
347
348static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
349 const char *intfmt,
350 struct syscall_arg *arg)
351{
352 return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
353}
354
355static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
356 struct syscall_arg *arg)
357{
358 return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
359}
360
361#define SCA_STRARRAY syscall_arg__scnprintf_strarray
362
363struct strarrays {
364 int nr_entries;
365 struct strarray **entries;
366};
367
368#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
369 .nr_entries = ARRAY_SIZE(array), \
370 .entries = array, \
371}
372
373size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
374 struct syscall_arg *arg)
375{
376 struct strarrays *sas = arg->parm;
377 int i;
378
379 for (i = 0; i < sas->nr_entries; ++i) {
380 struct strarray *sa = sas->entries[i];
381 int idx = arg->val - sa->offset;
382
383 if (idx >= 0 && idx < sa->nr_entries) {
384 if (sa->entries[idx] == NULL)
385 break;
386 return scnprintf(bf, size, "%s", sa->entries[idx]);
387 }
388 }
389
390 return scnprintf(bf, size, "%d", arg->val);
391}
392
393#ifndef AT_FDCWD
394#define AT_FDCWD -100
395#endif
396
397static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
398 struct syscall_arg *arg)
399{
400 int fd = arg->val;
401
402 if (fd == AT_FDCWD)
403 return scnprintf(bf, size, "CWD");
404
405 return syscall_arg__scnprintf_fd(bf, size, arg);
406}
407
408#define SCA_FDAT syscall_arg__scnprintf_fd_at
409
410static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
411 struct syscall_arg *arg);
412
413#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
414
415size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
416{
417 return scnprintf(bf, size, "%#lx", arg->val);
418}
419
420size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
421{
422 return scnprintf(bf, size, "%d", arg->val);
423}
424
425size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
426{
427 return scnprintf(bf, size, "%ld", arg->val);
428}
429
430static const char *bpf_cmd[] = {
431 "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
432 "MAP_GET_NEXT_KEY", "PROG_LOAD",
433};
434static DEFINE_STRARRAY(bpf_cmd);
435
436static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
437static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
438
439static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
440static DEFINE_STRARRAY(itimers);
441
442static const char *keyctl_options[] = {
443 "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
444 "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
445 "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
446 "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
447 "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
448};
449static DEFINE_STRARRAY(keyctl_options);
450
451static const char *whences[] = { "SET", "CUR", "END",
452#ifdef SEEK_DATA
453"DATA",
454#endif
455#ifdef SEEK_HOLE
456"HOLE",
457#endif
458};
459static DEFINE_STRARRAY(whences);
460
461static const char *fcntl_cmds[] = {
462 "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
463 "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
464 "SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
465 "GETOWNER_UIDS",
466};
467static DEFINE_STRARRAY(fcntl_cmds);
468
469static const char *fcntl_linux_specific_cmds[] = {
470 "SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
471 "SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
472 "GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
473};
474
475static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);
476
477static struct strarray *fcntl_cmds_arrays[] = {
478 &strarray__fcntl_cmds,
479 &strarray__fcntl_linux_specific_cmds,
480};
481
482static DEFINE_STRARRAYS(fcntl_cmds_arrays);
483
484static const char *rlimit_resources[] = {
485 "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
486 "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
487 "RTTIME",
488};
489static DEFINE_STRARRAY(rlimit_resources);
490
491static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
492static DEFINE_STRARRAY(sighow);
493
494static const char *clockid[] = {
495 "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
496 "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
497 "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
498};
499static DEFINE_STRARRAY(clockid);
500
501static const char *socket_families[] = {
502 "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
503 "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
504 "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
505 "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
506 "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
507 "ALG", "NFC", "VSOCK",
508};
509static DEFINE_STRARRAY(socket_families);
510
511static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
512 struct syscall_arg *arg)
513{
514 size_t printed = 0;
515 int mode = arg->val;
516
517 if (mode == F_OK) /* 0 */
518 return scnprintf(bf, size, "F");
519#define P_MODE(n) \
520 if (mode & n##_OK) { \
521 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
522 mode &= ~n##_OK; \
523 }
524
525 P_MODE(R);
526 P_MODE(W);
527 P_MODE(X);
528#undef P_MODE
529
530 if (mode)
531 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
532
533 return printed;
534}
535
536#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
537
538static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
539 struct syscall_arg *arg);
540
541#define SCA_FILENAME syscall_arg__scnprintf_filename
542
543static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
544 struct syscall_arg *arg)
545{
546 int printed = 0, flags = arg->val;
547
548#define P_FLAG(n) \
549 if (flags & O_##n) { \
550 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
551 flags &= ~O_##n; \
552 }
553
554 P_FLAG(CLOEXEC);
555 P_FLAG(NONBLOCK);
556#undef P_FLAG
557
558 if (flags)
559 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
560
561 return printed;
562}
563
564#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
565
566#ifndef GRND_NONBLOCK
567#define GRND_NONBLOCK 0x0001
568#endif
569#ifndef GRND_RANDOM
570#define GRND_RANDOM 0x0002
571#endif
572
573static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
574 struct syscall_arg *arg)
575{
576 int printed = 0, flags = arg->val;
577
578#define P_FLAG(n) \
579 if (flags & GRND_##n) { \
580 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
581 flags &= ~GRND_##n; \
582 }
583
584 P_FLAG(RANDOM);
585 P_FLAG(NONBLOCK);
586#undef P_FLAG
587
588 if (flags)
589 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
590
591 return printed;
592}
593
594#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
595
596#define STRARRAY(name, array) \
597 { .scnprintf = SCA_STRARRAY, \
598 .parm = &strarray__##array, }
599
600#include "trace/beauty/arch_errno_names.c"
601#include "trace/beauty/eventfd.c"
602#include "trace/beauty/futex_op.c"
603#include "trace/beauty/futex_val3.c"
604#include "trace/beauty/mmap.c"
605#include "trace/beauty/mode_t.c"
606#include "trace/beauty/msg_flags.c"
607#include "trace/beauty/open_flags.c"
608#include "trace/beauty/perf_event_open.c"
609#include "trace/beauty/pid.c"
610#include "trace/beauty/sched_policy.c"
611#include "trace/beauty/seccomp.c"
612#include "trace/beauty/signum.c"
613#include "trace/beauty/socket_type.c"
614#include "trace/beauty/waitid_options.c"
615
616struct syscall_arg_fmt {
617 size_t (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
618 void *parm;
619 const char *name;
620 bool show_zero;
621};
622
623static struct syscall_fmt {
624 const char *name;
625 const char *alias;
626 struct syscall_arg_fmt arg[6];
627 u8 nr_args;
628 bool errpid;
629 bool timeout;
630 bool hexret;
631} syscall_fmts[] = {
632 { .name = "access",
633 .arg = { [1] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, },
634 { .name = "bpf",
635 .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
636 { .name = "brk", .hexret = true,
637 .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
638 { .name = "clock_gettime",
639 .arg = { [0] = STRARRAY(clk_id, clockid), }, },
640 { .name = "clone", .errpid = true, .nr_args = 5,
641 .arg = { [0] = { .name = "flags", .scnprintf = SCA_CLONE_FLAGS, },
642 [1] = { .name = "child_stack", .scnprintf = SCA_HEX, },
643 [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
644 [3] = { .name = "child_tidptr", .scnprintf = SCA_HEX, },
645 [4] = { .name = "tls", .scnprintf = SCA_HEX, }, }, },
646 { .name = "close",
647 .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
648 { .name = "epoll_ctl",
649 .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
650 { .name = "eventfd2",
651 .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
652 { .name = "fchmodat",
653 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
654 { .name = "fchownat",
655 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
656 { .name = "fcntl",
657 .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
658 .parm = &strarrays__fcntl_cmds_arrays,
659 .show_zero = true, },
660 [2] = { .scnprintf = SCA_FCNTL_ARG, /* arg */ }, }, },
661 { .name = "flock",
662 .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
663 { .name = "fstat", .alias = "newfstat", },
664 { .name = "fstatat", .alias = "newfstatat", },
665 { .name = "futex",
666 .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
667 [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
668 { .name = "futimesat",
669 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
670 { .name = "getitimer",
671 .arg = { [0] = STRARRAY(which, itimers), }, },
672 { .name = "getpid", .errpid = true, },
673 { .name = "getpgid", .errpid = true, },
674 { .name = "getppid", .errpid = true, },
675 { .name = "getrandom",
676 .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
677 { .name = "getrlimit",
678 .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
679 { .name = "gettid", .errpid = true, },
680 { .name = "ioctl",
681 .arg = {
682#if defined(__i386__) || defined(__x86_64__)
683/*
684 * FIXME: Make this available to all arches.
685 */
686 [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
687 [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
688#else
689 [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
690#endif
691 { .name = "kcmp", .nr_args = 5,
692 .arg = { [0] = { .name = "pid1", .scnprintf = SCA_PID, },
693 [1] = { .name = "pid2", .scnprintf = SCA_PID, },
694 [2] = { .name = "type", .scnprintf = SCA_KCMP_TYPE, },
695 [3] = { .name = "idx1", .scnprintf = SCA_KCMP_IDX, },
696 [4] = { .name = "idx2", .scnprintf = SCA_KCMP_IDX, }, }, },
697 { .name = "keyctl",
698 .arg = { [0] = STRARRAY(option, keyctl_options), }, },
699 { .name = "kill",
700 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
701 { .name = "linkat",
702 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
703 { .name = "lseek",
704 .arg = { [2] = STRARRAY(whence, whences), }, },
705 { .name = "lstat", .alias = "newlstat", },
706 { .name = "madvise",
707 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
708 [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
709 { .name = "mkdirat",
710 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
711 { .name = "mknodat",
712 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
713 { .name = "mlock",
714 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
715 { .name = "mlockall",
716 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
717 { .name = "mmap", .hexret = true,
718/* The standard mmap maps to old_mmap on s390x */
719#if defined(__s390x__)
720 .alias = "old_mmap",
721#endif
722 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ },
723 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ },
724 [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, },
725 { .name = "mprotect",
726 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
727 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, }, },
728 { .name = "mq_unlink",
729 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
730 { .name = "mremap", .hexret = true,
731 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ },
732 [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
733 [4] = { .scnprintf = SCA_HEX, /* new_addr */ }, }, },
734 { .name = "munlock",
735 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
736 { .name = "munmap",
737 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
738 { .name = "name_to_handle_at",
739 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
740 { .name = "newfstatat",
741 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
742 { .name = "open",
743 .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
744 { .name = "open_by_handle_at",
745 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
746 [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
747 { .name = "openat",
748 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
749 [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
750 { .name = "perf_event_open",
751 .arg = { [2] = { .scnprintf = SCA_INT, /* cpu */ },
752 [3] = { .scnprintf = SCA_FD, /* group_fd */ },
753 [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
754 { .name = "pipe2",
755 .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
756 { .name = "pkey_alloc",
757 .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS, /* access_rights */ }, }, },
758 { .name = "pkey_free",
759 .arg = { [0] = { .scnprintf = SCA_INT, /* key */ }, }, },
760 { .name = "pkey_mprotect",
761 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
762 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ },
763 [3] = { .scnprintf = SCA_INT, /* pkey */ }, }, },
764 { .name = "poll", .timeout = true, },
765 { .name = "ppoll", .timeout = true, },
766 { .name = "prctl", .alias = "arch_prctl",
767 .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
768 [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
769 [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
770 { .name = "pread", .alias = "pread64", },
771 { .name = "preadv", .alias = "pread", },
772 { .name = "prlimit64",
773 .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
774 { .name = "pwrite", .alias = "pwrite64", },
775 { .name = "readlinkat",
776 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
777 { .name = "recvfrom",
778 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
779 { .name = "recvmmsg",
780 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
781 { .name = "recvmsg",
782 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
783 { .name = "renameat",
784 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
785 { .name = "rt_sigaction",
786 .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
787 { .name = "rt_sigprocmask",
788 .arg = { [0] = STRARRAY(how, sighow), }, },
789 { .name = "rt_sigqueueinfo",
790 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
791 { .name = "rt_tgsigqueueinfo",
792 .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
793 { .name = "sched_setscheduler",
794 .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
795 { .name = "seccomp",
796 .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP, /* op */ },
797 [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
798 { .name = "select", .timeout = true, },
799 { .name = "sendmmsg",
800 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
801 { .name = "sendmsg",
802 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
803 { .name = "sendto",
804 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
805 { .name = "set_tid_address", .errpid = true, },
806 { .name = "setitimer",
807 .arg = { [0] = STRARRAY(which, itimers), }, },
808 { .name = "setrlimit",
809 .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
810 { .name = "socket",
811 .arg = { [0] = STRARRAY(family, socket_families),
812 [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
813 [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
814 { .name = "socketpair",
815 .arg = { [0] = STRARRAY(family, socket_families),
816 [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
817 [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
818 { .name = "stat", .alias = "newstat", },
819 { .name = "statx",
820 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fdat */ },
821 [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
822 [3] = { .scnprintf = SCA_STATX_MASK, /* mask */ }, }, },
823 { .name = "swapoff",
824 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
825 { .name = "swapon",
826 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
827 { .name = "symlinkat",
828 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
829 { .name = "tgkill",
830 .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
831 { .name = "tkill",
832 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
833 { .name = "uname", .alias = "newuname", },
834 { .name = "unlinkat",
835 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
836 { .name = "utimensat",
837 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
838 { .name = "wait4", .errpid = true,
839 .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
840 { .name = "waitid", .errpid = true,
841 .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
842};
843
844static int syscall_fmt__cmp(const void *name, const void *fmtp)
845{
846 const struct syscall_fmt *fmt = fmtp;
847 return strcmp(name, fmt->name);
848}
849
850static struct syscall_fmt *syscall_fmt__find(const char *name)
851{
852 const int nmemb = ARRAY_SIZE(syscall_fmts);
853 return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
854}
855
856/*
857 * is_exit: is this "exit" or "exit_group"?
858 * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
859 */
860struct syscall {
861 struct event_format *tp_format;
862 int nr_args;
863 bool is_exit;
864 bool is_open;
865 struct format_field *args;
866 const char *name;
867 struct syscall_fmt *fmt;
868 struct syscall_arg_fmt *arg_fmt;
869};
870
871/*
872 * We need to have this 'calculated' boolean because in some cases we really
873 * don't know what is the duration of a syscall, for instance, when we start
874 * a session and some threads are waiting for a syscall to finish, say 'poll',
875 * in which case all we can do is to print "( ? ) for duration and for the
876 * start timestamp.
877 */
878static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
879{
880 double duration = (double)t / NSEC_PER_MSEC;
881 size_t printed = fprintf(fp, "(");
882
883 if (!calculated)
884 printed += fprintf(fp, " ");
885 else if (duration >= 1.0)
886 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
887 else if (duration >= 0.01)
888 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
889 else
890 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
891 return printed + fprintf(fp, "): ");
892}
893
894/**
895 * filename.ptr: The filename char pointer that will be vfs_getname'd
896 * filename.entry_str_pos: Where to insert the string translated from
897 * filename.ptr by the vfs_getname tracepoint/kprobe.
898 * ret_scnprintf: syscall args may set this to a different syscall return
899 * formatter, for instance, fcntl may return fds, file flags, etc.
900 */
901struct thread_trace {
902 u64 entry_time;
903 bool entry_pending;
904 unsigned long nr_events;
905 unsigned long pfmaj, pfmin;
906 char *entry_str;
907 double runtime_ms;
908 size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
909 struct {
910 unsigned long ptr;
911 short int entry_str_pos;
912 bool pending_open;
913 unsigned int namelen;
914 char *name;
915 } filename;
916 struct {
917 int max;
918 char **table;
919 } paths;
920
921 struct intlist *syscall_stats;
922};
923
924static struct thread_trace *thread_trace__new(void)
925{
926 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
927
928 if (ttrace)
929 ttrace->paths.max = -1;
930
931 ttrace->syscall_stats = intlist__new(NULL);
932
933 return ttrace;
934}
935
936static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
937{
938 struct thread_trace *ttrace;
939
940 if (thread == NULL)
941 goto fail;
942
943 if (thread__priv(thread) == NULL)
944 thread__set_priv(thread, thread_trace__new());
945
946 if (thread__priv(thread) == NULL)
947 goto fail;
948
949 ttrace = thread__priv(thread);
950 ++ttrace->nr_events;
951
952 return ttrace;
953fail:
954 color_fprintf(fp, PERF_COLOR_RED,
955 "WARNING: not enough memory, dropping samples!\n");
956 return NULL;
957}
958
959
960void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
961 size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
962{
963 struct thread_trace *ttrace = thread__priv(arg->thread);
964
965 ttrace->ret_scnprintf = ret_scnprintf;
966}
967
968#define TRACE_PFMAJ (1 << 0)
969#define TRACE_PFMIN (1 << 1)
970
971static const size_t trace__entry_str_size = 2048;
972
973static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
974{
975 struct thread_trace *ttrace = thread__priv(thread);
976
977 if (fd > ttrace->paths.max) {
978 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
979
980 if (npath == NULL)
981 return -1;
982
983 if (ttrace->paths.max != -1) {
984 memset(npath + ttrace->paths.max + 1, 0,
985 (fd - ttrace->paths.max) * sizeof(char *));
986 } else {
987 memset(npath, 0, (fd + 1) * sizeof(char *));
988 }
989
990 ttrace->paths.table = npath;
991 ttrace->paths.max = fd;
992 }
993
994 ttrace->paths.table[fd] = strdup(pathname);
995
996 return ttrace->paths.table[fd] != NULL ? 0 : -1;
997}
998
999static int thread__read_fd_path(struct thread *thread, int fd)
1000{
1001 char linkname[PATH_MAX], pathname[PATH_MAX];
1002 struct stat st;
1003 int ret;
1004
1005 if (thread->pid_ == thread->tid) {
1006 scnprintf(linkname, sizeof(linkname),
1007 "/proc/%d/fd/%d", thread->pid_, fd);
1008 } else {
1009 scnprintf(linkname, sizeof(linkname),
1010 "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1011 }
1012
1013 if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1014 return -1;
1015
1016 ret = readlink(linkname, pathname, sizeof(pathname));
1017
1018 if (ret < 0 || ret > st.st_size)
1019 return -1;
1020
1021 pathname[ret] = '\0';
1022 return trace__set_fd_pathname(thread, fd, pathname);
1023}
1024
1025static const char *thread__fd_path(struct thread *thread, int fd,
1026 struct trace *trace)
1027{
1028 struct thread_trace *ttrace = thread__priv(thread);
1029
1030 if (ttrace == NULL)
1031 return NULL;
1032
1033 if (fd < 0)
1034 return NULL;
1035
1036 if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1037 if (!trace->live)
1038 return NULL;
1039 ++trace->stats.proc_getname;
1040 if (thread__read_fd_path(thread, fd))
1041 return NULL;
1042 }
1043
1044 return ttrace->paths.table[fd];
1045}
1046
1047size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
1048{
1049 int fd = arg->val;
1050 size_t printed = scnprintf(bf, size, "%d", fd);
1051 const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1052
1053 if (path)
1054 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1055
1056 return printed;
1057}
1058
1059size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1060{
1061 size_t printed = scnprintf(bf, size, "%d", fd);
1062 struct thread *thread = machine__find_thread(trace->host, pid, pid);
1063
1064 if (thread) {
1065 const char *path = thread__fd_path(thread, fd, trace);
1066
1067 if (path)
1068 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1069
1070 thread__put(thread);
1071 }
1072
1073 return printed;
1074}
1075
1076static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1077 struct syscall_arg *arg)
1078{
1079 int fd = arg->val;
1080 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1081 struct thread_trace *ttrace = thread__priv(arg->thread);
1082
1083 if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1084 zfree(&ttrace->paths.table[fd]);
1085
1086 return printed;
1087}
1088
1089static void thread__set_filename_pos(struct thread *thread, const char *bf,
1090 unsigned long ptr)
1091{
1092 struct thread_trace *ttrace = thread__priv(thread);
1093
1094 ttrace->filename.ptr = ptr;
1095 ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1096}
1097
1098static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1099 struct syscall_arg *arg)
1100{
1101 unsigned long ptr = arg->val;
1102
1103 if (!arg->trace->vfs_getname)
1104 return scnprintf(bf, size, "%#x", ptr);
1105
1106 thread__set_filename_pos(arg->thread, bf, ptr);
1107 return 0;
1108}
1109
1110static bool trace__filter_duration(struct trace *trace, double t)
1111{
1112 return t < (trace->duration_filter * NSEC_PER_MSEC);
1113}
1114
1115static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1116{
1117 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1118
1119 return fprintf(fp, "%10.3f ", ts);
1120}
1121
1122/*
1123 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1124 * using ttrace->entry_time for a thread that receives a sys_exit without
1125 * first having received a sys_enter ("poll" issued before tracing session
1126 * starts, lost sys_enter exit due to ring buffer overflow).
1127 */
1128static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1129{
1130 if (tstamp > 0)
1131 return __trace__fprintf_tstamp(trace, tstamp, fp);
1132
1133 return fprintf(fp, " ? ");
1134}
1135
/*
 * Flags written from sig_handler().  They are volatile sig_atomic_t because
 * that is the only object type the C standard lets a signal handler store to
 * and have code outside the handler read reliably.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

/*
 * Signal handler: request termination and remember whether the request came
 * from SIGINT.
 */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1144
1145static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1146 u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1147{
1148 size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1149 printed += fprintf_duration(duration, duration_calculated, fp);
1150
1151 if (trace->multiple_threads) {
1152 if (trace->show_comm)
1153 printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1154 printed += fprintf(fp, "%d ", thread->tid);
1155 }
1156
1157 return printed;
1158}
1159
1160static int trace__process_event(struct trace *trace, struct machine *machine,
1161 union perf_event *event, struct perf_sample *sample)
1162{
1163 int ret = 0;
1164
1165 switch (event->header.type) {
1166 case PERF_RECORD_LOST:
1167 color_fprintf(trace->output, PERF_COLOR_RED,
1168 "LOST %" PRIu64 " events!\n", event->lost.lost);
1169 ret = machine__process_lost_event(machine, event, sample);
1170 break;
1171 default:
1172 ret = machine__process_event(machine, event, sample);
1173 break;
1174 }
1175
1176 return ret;
1177}
1178
1179static int trace__tool_process(struct perf_tool *tool,
1180 union perf_event *event,
1181 struct perf_sample *sample,
1182 struct machine *machine)
1183{
1184 struct trace *trace = container_of(tool, struct trace, tool);
1185 return trace__process_event(trace, machine, event, sample);
1186}
1187
1188static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1189{
1190 struct machine *machine = vmachine;
1191
1192 if (machine->kptr_restrict_warned)
1193 return NULL;
1194
1195 if (symbol_conf.kptr_restrict) {
1196 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1197 "Check /proc/sys/kernel/kptr_restrict.\n\n"
1198 "Kernel samples will not be resolved.\n");
1199 machine->kptr_restrict_warned = true;
1200 return NULL;
1201 }
1202
1203 return machine__resolve_kernel_addr(vmachine, addrp, modp);
1204}
1205
1206static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1207{
1208 int err = symbol__init(NULL);
1209
1210 if (err)
1211 return err;
1212
1213 trace->host = machine__new_host();
1214 if (trace->host == NULL)
1215 return -ENOMEM;
1216
1217 err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
1218 if (err < 0)
1219 goto out;
1220
1221 err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1222 evlist->threads, trace__tool_process, false,
1223 trace->opts.proc_map_timeout, 1);
1224out:
1225 if (err)
1226 symbol__exit();
1227
1228 return err;
1229}
1230
1231static void trace__symbols__exit(struct trace *trace)
1232{
1233 machine__exit(trace->host);
1234 trace->host = NULL;
1235
1236 symbol__exit();
1237}
1238
1239static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1240{
1241 int idx;
1242
1243 if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1244 nr_args = sc->fmt->nr_args;
1245
1246 sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1247 if (sc->arg_fmt == NULL)
1248 return -1;
1249
1250 for (idx = 0; idx < nr_args; ++idx) {
1251 if (sc->fmt)
1252 sc->arg_fmt[idx] = sc->fmt->arg[idx];
1253 }
1254
1255 sc->nr_args = nr_args;
1256 return 0;
1257}
1258
1259static int syscall__set_arg_fmts(struct syscall *sc)
1260{
1261 struct format_field *field;
1262 int idx = 0, len;
1263
1264 for (field = sc->args; field; field = field->next, ++idx) {
1265 if (sc->fmt && sc->fmt->arg[idx].scnprintf)
1266 continue;
1267
1268 if (strcmp(field->type, "const char *") == 0 &&
1269 (strcmp(field->name, "filename") == 0 ||
1270 strcmp(field->name, "path") == 0 ||
1271 strcmp(field->name, "pathname") == 0))
1272 sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
1273 else if (field->flags & FIELD_IS_POINTER)
1274 sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
1275 else if (strcmp(field->type, "pid_t") == 0)
1276 sc->arg_fmt[idx].scnprintf = SCA_PID;
1277 else if (strcmp(field->type, "umode_t") == 0)
1278 sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
1279 else if ((strcmp(field->type, "int") == 0 ||
1280 strcmp(field->type, "unsigned int") == 0 ||
1281 strcmp(field->type, "long") == 0) &&
1282 (len = strlen(field->name)) >= 2 &&
1283 strcmp(field->name + len - 2, "fd") == 0) {
1284 /*
1285 * /sys/kernel/tracing/events/syscalls/sys_enter*
1286 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1287 * 65 int
1288 * 23 unsigned int
1289 * 7 unsigned long
1290 */
1291 sc->arg_fmt[idx].scnprintf = SCA_FD;
1292 }
1293 }
1294
1295 return 0;
1296}
1297
1298static int trace__read_syscall_info(struct trace *trace, int id)
1299{
1300 char tp_name[128];
1301 struct syscall *sc;
1302 const char *name = syscalltbl__name(trace->sctbl, id);
1303
1304 if (name == NULL)
1305 return -1;
1306
1307 if (id > trace->syscalls.max) {
1308 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1309
1310 if (nsyscalls == NULL)
1311 return -1;
1312
1313 if (trace->syscalls.max != -1) {
1314 memset(nsyscalls + trace->syscalls.max + 1, 0,
1315 (id - trace->syscalls.max) * sizeof(*sc));
1316 } else {
1317 memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1318 }
1319
1320 trace->syscalls.table = nsyscalls;
1321 trace->syscalls.max = id;
1322 }
1323
1324 sc = trace->syscalls.table + id;
1325 sc->name = name;
1326
1327 sc->fmt = syscall_fmt__find(sc->name);
1328
1329 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1330 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1331
1332 if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1333 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1334 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1335 }
1336
1337 if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
1338 return -1;
1339
1340 if (IS_ERR(sc->tp_format))
1341 return -1;
1342
1343 sc->args = sc->tp_format->format.fields;
1344 /*
1345 * We need to check and discard the first variable '__syscall_nr'
1346 * or 'nr' that mean the syscall number. It is needless here.
1347 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
1348 */
1349 if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1350 sc->args = sc->args->next;
1351 --sc->nr_args;
1352 }
1353
1354 sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1355 sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");
1356
1357 return syscall__set_arg_fmts(sc);
1358}
1359
1360static int trace__validate_ev_qualifier(struct trace *trace)
1361{
1362 int err = 0, i;
1363 size_t nr_allocated;
1364 struct str_node *pos;
1365
1366 trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1367 trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1368 sizeof(trace->ev_qualifier_ids.entries[0]));
1369
1370 if (trace->ev_qualifier_ids.entries == NULL) {
1371 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1372 trace->output);
1373 err = -EINVAL;
1374 goto out;
1375 }
1376
1377 nr_allocated = trace->ev_qualifier_ids.nr;
1378 i = 0;
1379
1380 strlist__for_each_entry(pos, trace->ev_qualifier) {
1381 const char *sc = pos->s;
1382 int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;
1383
1384 if (id < 0) {
1385 id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
1386 if (id >= 0)
1387 goto matches;
1388
1389 if (err == 0) {
1390 fputs("Error:\tInvalid syscall ", trace->output);
1391 err = -EINVAL;
1392 } else {
1393 fputs(", ", trace->output);
1394 }
1395
1396 fputs(sc, trace->output);
1397 }
1398matches:
1399 trace->ev_qualifier_ids.entries[i++] = id;
1400 if (match_next == -1)
1401 continue;
1402
1403 while (1) {
1404 id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
1405 if (id < 0)
1406 break;
1407 if (nr_allocated == trace->ev_qualifier_ids.nr) {
1408 void *entries;
1409
1410 nr_allocated += 8;
1411 entries = realloc(trace->ev_qualifier_ids.entries,
1412 nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
1413 if (entries == NULL) {
1414 err = -ENOMEM;
1415 fputs("\nError:\t Not enough memory for parsing\n", trace->output);
1416 goto out_free;
1417 }
1418 trace->ev_qualifier_ids.entries = entries;
1419 }
1420 trace->ev_qualifier_ids.nr++;
1421 trace->ev_qualifier_ids.entries[i++] = id;
1422 }
1423 }
1424
1425 if (err < 0) {
1426 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1427 "\nHint:\tand: 'man syscalls'\n", trace->output);
1428out_free:
1429 zfree(&trace->ev_qualifier_ids.entries);
1430 trace->ev_qualifier_ids.nr = 0;
1431 }
1432out:
1433 return err;
1434}
1435
1436/*
1437 * args is to be interpreted as a series of longs but we need to handle
1438 * 8-byte unaligned accesses. args points to raw_data within the event
1439 * and raw_data is guaranteed to be 8-byte unaligned because it is
1440 * preceded by raw_size which is a u32. So we need to copy args to a temp
1441 * variable to read it. Most notably this avoids extended load instructions
1442 * on unaligned addresses
1443 */
1444unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1445{
1446 unsigned long val;
1447 unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1448
1449 memcpy(&val, p, sizeof(val));
1450 return val;
1451}
1452
1453static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1454 struct syscall_arg *arg)
1455{
1456 if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1457 return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1458
1459 return scnprintf(bf, size, "arg%d: ", arg->idx);
1460}
1461
1462static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1463 struct syscall_arg *arg, unsigned long val)
1464{
1465 if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1466 arg->val = val;
1467 if (sc->arg_fmt[arg->idx].parm)
1468 arg->parm = sc->arg_fmt[arg->idx].parm;
1469 return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1470 }
1471 return scnprintf(bf, size, "%ld", val);
1472}
1473
1474static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1475 unsigned char *args, struct trace *trace,
1476 struct thread *thread)
1477{
1478 size_t printed = 0;
1479 unsigned long val;
1480 u8 bit = 1;
1481 struct syscall_arg arg = {
1482 .args = args,
1483 .idx = 0,
1484 .mask = 0,
1485 .trace = trace,
1486 .thread = thread,
1487 };
1488 struct thread_trace *ttrace = thread__priv(thread);
1489
1490 /*
1491 * Things like fcntl will set this in its 'cmd' formatter to pick the
1492 * right formatter for the return value (an fd? file flags?), which is
1493 * not needed for syscalls that always return a given type, say an fd.
1494 */
1495 ttrace->ret_scnprintf = NULL;
1496
1497 if (sc->args != NULL) {
1498 struct format_field *field;
1499
1500 for (field = sc->args; field;
1501 field = field->next, ++arg.idx, bit <<= 1) {
1502 if (arg.mask & bit)
1503 continue;
1504
1505 val = syscall_arg__val(&arg, arg.idx);
1506
1507 /*
1508 * Suppress this argument if its value is zero and
1509 * and we don't have a string associated in an
1510 * strarray for it.
1511 */
1512 if (val == 0 &&
1513 !(sc->arg_fmt &&
1514 (sc->arg_fmt[arg.idx].show_zero ||
1515 sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
1516 sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
1517 sc->arg_fmt[arg.idx].parm))
1518 continue;
1519
1520 printed += scnprintf(bf + printed, size - printed,
1521 "%s%s: ", printed ? ", " : "", field->name);
1522 printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1523 }
1524 } else if (IS_ERR(sc->tp_format)) {
1525 /*
1526 * If we managed to read the tracepoint /format file, then we
1527 * may end up not having any args, like with gettid(), so only
1528 * print the raw args when we didn't manage to read it.
1529 */
1530 while (arg.idx < sc->nr_args) {
1531 if (arg.mask & bit)
1532 goto next_arg;
1533 val = syscall_arg__val(&arg, arg.idx);
1534 if (printed)
1535 printed += scnprintf(bf + printed, size - printed, ", ");
1536 printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
1537 printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1538next_arg:
1539 ++arg.idx;
1540 bit <<= 1;
1541 }
1542 }
1543
1544 return printed;
1545}
1546
/*
 * Signature shared by the per-event handlers in this file, e.g.
 * trace__sys_enter(), trace__sys_exit() and trace__event_handler(): they
 * receive the session, the evsel the sample belongs to, the raw event and the
 * parsed sample, and return an int status.
 */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1550
1551static struct syscall *trace__syscall_info(struct trace *trace,
1552 struct perf_evsel *evsel, int id)
1553{
1554
1555 if (id < 0) {
1556
1557 /*
1558 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1559 * before that, leaving at a higher verbosity level till that is
1560 * explained. Reproduced with plain ftrace with:
1561 *
1562 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1563 * grep "NR -1 " /t/trace_pipe
1564 *
1565 * After generating some load on the machine.
1566 */
1567 if (verbose > 1) {
1568 static u64 n;
1569 fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1570 id, perf_evsel__name(evsel), ++n);
1571 }
1572 return NULL;
1573 }
1574
1575 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1576 trace__read_syscall_info(trace, id))
1577 goto out_cant_read;
1578
1579 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1580 goto out_cant_read;
1581
1582 return &trace->syscalls.table[id];
1583
1584out_cant_read:
1585 if (verbose > 0) {
1586 fprintf(trace->output, "Problems reading syscall %d", id);
1587 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1588 fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1589 fputs(" information\n", trace->output);
1590 }
1591 return NULL;
1592}
1593
1594static void thread__update_stats(struct thread_trace *ttrace,
1595 int id, struct perf_sample *sample)
1596{
1597 struct int_node *inode;
1598 struct stats *stats;
1599 u64 duration = 0;
1600
1601 inode = intlist__findnew(ttrace->syscall_stats, id);
1602 if (inode == NULL)
1603 return;
1604
1605 stats = inode->priv;
1606 if (stats == NULL) {
1607 stats = malloc(sizeof(struct stats));
1608 if (stats == NULL)
1609 return;
1610 init_stats(stats);
1611 inode->priv = stats;
1612 }
1613
1614 if (ttrace->entry_time && sample->time > ttrace->entry_time)
1615 duration = sample->time - ttrace->entry_time;
1616
1617 update_stats(stats, duration);
1618}
1619
1620static int trace__printf_interrupted_entry(struct trace *trace)
1621{
1622 struct thread_trace *ttrace;
1623 size_t printed;
1624
1625 if (trace->failure_only || trace->current == NULL)
1626 return 0;
1627
1628 ttrace = thread__priv(trace->current);
1629
1630 if (!ttrace->entry_pending)
1631 return 0;
1632
1633 printed = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1634 printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1635 ttrace->entry_pending = false;
1636
1637 return printed;
1638}
1639
1640static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1641 struct perf_sample *sample, struct thread *thread)
1642{
1643 int printed = 0;
1644
1645 if (trace->print_sample) {
1646 double ts = (double)sample->time / NSEC_PER_MSEC;
1647
1648 printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1649 perf_evsel__name(evsel), ts,
1650 thread__comm_str(thread),
1651 sample->pid, sample->tid, sample->cpu);
1652 }
1653
1654 return printed;
1655}
1656
1657static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1658 union perf_event *event __maybe_unused,
1659 struct perf_sample *sample)
1660{
1661 char *msg;
1662 void *args;
1663 size_t printed = 0;
1664 struct thread *thread;
1665 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1666 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1667 struct thread_trace *ttrace;
1668
1669 if (sc == NULL)
1670 return -1;
1671
1672 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1673 ttrace = thread__trace(thread, trace->output);
1674 if (ttrace == NULL)
1675 goto out_put;
1676
1677 trace__fprintf_sample(trace, evsel, sample, thread);
1678
1679 args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1680
1681 if (ttrace->entry_str == NULL) {
1682 ttrace->entry_str = malloc(trace__entry_str_size);
1683 if (!ttrace->entry_str)
1684 goto out_put;
1685 }
1686
1687 if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1688 trace__printf_interrupted_entry(trace);
1689
1690 ttrace->entry_time = sample->time;
1691 msg = ttrace->entry_str;
1692 printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1693
1694 printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1695 args, trace, thread);
1696
1697 if (sc->is_exit) {
1698 if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
1699 trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1700 fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1701 }
1702 } else {
1703 ttrace->entry_pending = true;
1704 /* See trace__vfs_getname & trace__sys_exit */
1705 ttrace->filename.pending_open = false;
1706 }
1707
1708 if (trace->current != thread) {
1709 thread__put(trace->current);
1710 trace->current = thread__get(thread);
1711 }
1712 err = 0;
1713out_put:
1714 thread__put(thread);
1715 return err;
1716}
1717
1718static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evsel,
1719 struct perf_sample *sample)
1720{
1721 struct thread_trace *ttrace;
1722 struct thread *thread;
1723 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1724 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1725 char msg[1024];
1726 void *args;
1727
1728 if (sc == NULL)
1729 return -1;
1730
1731 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1732 ttrace = thread__trace(thread, trace->output);
1733 /*
1734 * We need to get ttrace just to make sure it is there when syscall__scnprintf_args()
1735 * and the rest of the beautifiers accessing it via struct syscall_arg touches it.
1736 */
1737 if (ttrace == NULL)
1738 goto out_put;
1739
1740 args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1741 syscall__scnprintf_args(sc, msg, sizeof(msg), args, trace, thread);
1742 fprintf(trace->output, "%s", msg);
1743 err = 0;
1744out_put:
1745 thread__put(thread);
1746 return err;
1747}
1748
1749static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1750 struct perf_sample *sample,
1751 struct callchain_cursor *cursor)
1752{
1753 struct addr_location al;
1754 int max_stack = evsel->attr.sample_max_stack ?
1755 evsel->attr.sample_max_stack :
1756 trace->max_stack;
1757
1758 if (machine__resolve(trace->host, &al, sample) < 0 ||
1759 thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
1760 return -1;
1761
1762 return 0;
1763}
1764
1765static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1766{
1767 /* TODO: user-configurable print_opts */
1768 const unsigned int print_opts = EVSEL__PRINT_SYM |
1769 EVSEL__PRINT_DSO |
1770 EVSEL__PRINT_UNKNOWN_AS_ADDR;
1771
1772 return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1773}
1774
/*
 * Map errno value @err to its symbolic name, using the errno table of the
 * architecture recorded in @evsel's environment.
 */
static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
	const char *arch_name = perf_env__arch(perf_evsel__env(evsel));

	return arch_syscalls__strerrno(arch_name, err);
}
1782
1783static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1784 union perf_event *event __maybe_unused,
1785 struct perf_sample *sample)
1786{
1787 long ret;
1788 u64 duration = 0;
1789 bool duration_calculated = false;
1790 struct thread *thread;
1791 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1792 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1793 struct thread_trace *ttrace;
1794
1795 if (sc == NULL)
1796 return -1;
1797
1798 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1799 ttrace = thread__trace(thread, trace->output);
1800 if (ttrace == NULL)
1801 goto out_put;
1802
1803 trace__fprintf_sample(trace, evsel, sample, thread);
1804
1805 if (trace->summary)
1806 thread__update_stats(ttrace, id, sample);
1807
1808 ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1809
1810 if (sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
1811 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1812 ttrace->filename.pending_open = false;
1813 ++trace->stats.vfs_getname;
1814 }
1815
1816 if (ttrace->entry_time) {
1817 duration = sample->time - ttrace->entry_time;
1818 if (trace__filter_duration(trace, duration))
1819 goto out;
1820 duration_calculated = true;
1821 } else if (trace->duration_filter)
1822 goto out;
1823
1824 if (sample->callchain) {
1825 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1826 if (callchain_ret == 0) {
1827 if (callchain_cursor.nr < trace->min_stack)
1828 goto out;
1829 callchain_ret = 1;
1830 }
1831 }
1832
1833 if (trace->summary_only || (ret >= 0 && trace->failure_only))
1834 goto out;
1835
1836 trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
1837
1838 if (ttrace->entry_pending) {
1839 fprintf(trace->output, "%-70s", ttrace->entry_str);
1840 } else {
1841 fprintf(trace->output, " ... [");
1842 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1843 fprintf(trace->output, "]: %s()", sc->name);
1844 }
1845
1846 if (sc->fmt == NULL) {
1847 if (ret < 0)
1848 goto errno_print;
1849signed_print:
1850 fprintf(trace->output, ") = %ld", ret);
1851 } else if (ret < 0) {
1852errno_print: {
1853 char bf[STRERR_BUFSIZE];
1854 const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1855 *e = errno_to_name(evsel, -ret);
1856
1857 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1858 }
1859 } else if (ret == 0 && sc->fmt->timeout)
1860 fprintf(trace->output, ") = 0 Timeout");
1861 else if (ttrace->ret_scnprintf) {
1862 char bf[1024];
1863 struct syscall_arg arg = {
1864 .val = ret,
1865 .thread = thread,
1866 .trace = trace,
1867 };
1868 ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
1869 ttrace->ret_scnprintf = NULL;
1870 fprintf(trace->output, ") = %s", bf);
1871 } else if (sc->fmt->hexret)
1872 fprintf(trace->output, ") = %#lx", ret);
1873 else if (sc->fmt->errpid) {
1874 struct thread *child = machine__find_thread(trace->host, ret, ret);
1875
1876 if (child != NULL) {
1877 fprintf(trace->output, ") = %ld", ret);
1878 if (child->comm_set)
1879 fprintf(trace->output, " (%s)", thread__comm_str(child));
1880 thread__put(child);
1881 }
1882 } else
1883 goto signed_print;
1884
1885 fputc('\n', trace->output);
1886
1887 if (callchain_ret > 0)
1888 trace__fprintf_callchain(trace, sample);
1889 else if (callchain_ret < 0)
1890 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1891out:
1892 ttrace->entry_pending = false;
1893 err = 0;
1894out_put:
1895 thread__put(thread);
1896 return err;
1897}
1898
1899static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1900 union perf_event *event __maybe_unused,
1901 struct perf_sample *sample)
1902{
1903 struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1904 struct thread_trace *ttrace;
1905 size_t filename_len, entry_str_len, to_move;
1906 ssize_t remaining_space;
1907 char *pos;
1908 const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1909
1910 if (!thread)
1911 goto out;
1912
1913 ttrace = thread__priv(thread);
1914 if (!ttrace)
1915 goto out_put;
1916
1917 filename_len = strlen(filename);
1918 if (filename_len == 0)
1919 goto out_put;
1920
1921 if (ttrace->filename.namelen < filename_len) {
1922 char *f = realloc(ttrace->filename.name, filename_len + 1);
1923
1924 if (f == NULL)
1925 goto out_put;
1926
1927 ttrace->filename.namelen = filename_len;
1928 ttrace->filename.name = f;
1929 }
1930
1931 strcpy(ttrace->filename.name, filename);
1932 ttrace->filename.pending_open = true;
1933
1934 if (!ttrace->filename.ptr)
1935 goto out_put;
1936
1937 entry_str_len = strlen(ttrace->entry_str);
1938 remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1939 if (remaining_space <= 0)
1940 goto out_put;
1941
1942 if (filename_len > (size_t)remaining_space) {
1943 filename += filename_len - remaining_space;
1944 filename_len = remaining_space;
1945 }
1946
1947 to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1948 pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1949 memmove(pos + filename_len, pos, to_move);
1950 memcpy(pos, filename, filename_len);
1951
1952 ttrace->filename.ptr = 0;
1953 ttrace->filename.entry_str_pos = 0;
1954out_put:
1955 thread__put(thread);
1956out:
1957 return 0;
1958}
1959
1960static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1961 union perf_event *event __maybe_unused,
1962 struct perf_sample *sample)
1963{
1964 u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1965 double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1966 struct thread *thread = machine__findnew_thread(trace->host,
1967 sample->pid,
1968 sample->tid);
1969 struct thread_trace *ttrace = thread__trace(thread, trace->output);
1970
1971 if (ttrace == NULL)
1972 goto out_dump;
1973
1974 ttrace->runtime_ms += runtime_ms;
1975 trace->runtime_ms += runtime_ms;
1976out_put:
1977 thread__put(thread);
1978 return 0;
1979
1980out_dump:
1981 fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1982 evsel->name,
1983 perf_evsel__strval(evsel, sample, "comm"),
1984 (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1985 runtime,
1986 perf_evsel__intval(evsel, sample, "vruntime"));
1987 goto out_put;
1988}
1989
1990static int bpf_output__printer(enum binary_printer_ops op,
1991 unsigned int val, void *extra __maybe_unused, FILE *fp)
1992{
1993 unsigned char ch = (unsigned char)val;
1994
1995 switch (op) {
1996 case BINARY_PRINT_CHAR_DATA:
1997 return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1998 case BINARY_PRINT_DATA_BEGIN:
1999 case BINARY_PRINT_LINE_BEGIN:
2000 case BINARY_PRINT_ADDR:
2001 case BINARY_PRINT_NUM_DATA:
2002 case BINARY_PRINT_NUM_PAD:
2003 case BINARY_PRINT_SEP:
2004 case BINARY_PRINT_CHAR_PAD:
2005 case BINARY_PRINT_LINE_END:
2006 case BINARY_PRINT_DATA_END:
2007 default:
2008 break;
2009 }
2010
2011 return 0;
2012}
2013
2014static void bpf_output__fprintf(struct trace *trace,
2015 struct perf_sample *sample)
2016{
2017 binary__fprintf(sample->raw_data, sample->raw_size, 8,
2018 bpf_output__printer, NULL, trace->output);
2019}
2020
2021static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
2022 union perf_event *event __maybe_unused,
2023 struct perf_sample *sample)
2024{
2025 int callchain_ret = 0;
2026
2027 if (sample->callchain) {
2028 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2029 if (callchain_ret == 0) {
2030 if (callchain_cursor.nr < trace->min_stack)
2031 goto out;
2032 callchain_ret = 1;
2033 }
2034 }
2035
2036 trace__printf_interrupted_entry(trace);
2037 trace__fprintf_tstamp(trace, sample->time, trace->output);
2038
2039 if (trace->trace_syscalls)
2040 fprintf(trace->output, "( ): ");
2041
2042 fprintf(trace->output, "%s:", evsel->name);
2043
2044 if (perf_evsel__is_bpf_output(evsel)) {
2045 if (evsel == trace->syscalls.events.augmented)
2046 trace__fprintf_sys_enter(trace, evsel, sample);
2047 else
2048 bpf_output__fprintf(trace, sample);
2049 } else if (evsel->tp_format) {
2050 if (strncmp(evsel->tp_format->name, "sys_enter_", 10) ||
2051 trace__fprintf_sys_enter(trace, evsel, sample)) {
2052 event_format__fprintf(evsel->tp_format, sample->cpu,
2053 sample->raw_data, sample->raw_size,
2054 trace->output);
2055 }
2056 }
2057
2058 fprintf(trace->output, "\n");
2059
2060 if (callchain_ret > 0)
2061 trace__fprintf_callchain(trace, sample);
2062 else if (callchain_ret < 0)
2063 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2064out:
2065 return 0;
2066}
2067
2068static void print_location(FILE *f, struct perf_sample *sample,
2069 struct addr_location *al,
2070 bool print_dso, bool print_sym)
2071{
2072
2073 if ((verbose > 0 || print_dso) && al->map)
2074 fprintf(f, "%s@", al->map->dso->long_name);
2075
2076 if ((verbose > 0 || print_sym) && al->sym)
2077 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2078 al->addr - al->sym->start);
2079 else if (al->map)
2080 fprintf(f, "0x%" PRIx64, al->addr);
2081 else
2082 fprintf(f, "0x%" PRIx64, sample->addr);
2083}
2084
2085static int trace__pgfault(struct trace *trace,
2086 struct perf_evsel *evsel,
2087 union perf_event *event __maybe_unused,
2088 struct perf_sample *sample)
2089{
2090 struct thread *thread;
2091 struct addr_location al;
2092 char map_type = 'd';
2093 struct thread_trace *ttrace;
2094 int err = -1;
2095 int callchain_ret = 0;
2096
2097 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2098
2099 if (sample->callchain) {
2100 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2101 if (callchain_ret == 0) {
2102 if (callchain_cursor.nr < trace->min_stack)
2103 goto out_put;
2104 callchain_ret = 1;
2105 }
2106 }
2107
2108 ttrace = thread__trace(thread, trace->output);
2109 if (ttrace == NULL)
2110 goto out_put;
2111
2112 if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2113 ttrace->pfmaj++;
2114 else
2115 ttrace->pfmin++;
2116
2117 if (trace->summary_only)
2118 goto out;
2119
2120 thread__find_symbol(thread, sample->cpumode, sample->ip, &al);
2121
2122 trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2123
2124 fprintf(trace->output, "%sfault [",
2125 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2126 "maj" : "min");
2127
2128 print_location(trace->output, sample, &al, false, true);
2129
2130 fprintf(trace->output, "] => ");
2131
2132 thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2133
2134 if (!al.map) {
2135 thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2136
2137 if (al.map)
2138 map_type = 'x';
2139 else
2140 map_type = '?';
2141 }
2142
2143 print_location(trace->output, sample, &al, true, false);
2144
2145 fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2146
2147 if (callchain_ret > 0)
2148 trace__fprintf_callchain(trace, sample);
2149 else if (callchain_ret < 0)
2150 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2151out:
2152 err = 0;
2153out_put:
2154 thread__put(thread);
2155 return err;
2156}
2157
2158static void trace__set_base_time(struct trace *trace,
2159 struct perf_evsel *evsel,
2160 struct perf_sample *sample)
2161{
2162 /*
2163 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2164 * and don't use sample->time unconditionally, we may end up having
2165 * some other event in the future without PERF_SAMPLE_TIME for good
2166 * reason, i.e. we may not be interested in its timestamps, just in
2167 * it taking place, picking some piece of information when it
2168 * appears in our event stream (vfs_getname comes to mind).
2169 */
2170 if (trace->base_time == 0 && !trace->full_time &&
2171 (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2172 trace->base_time = sample->time;
2173}
2174
2175static int trace__process_sample(struct perf_tool *tool,
2176 union perf_event *event,
2177 struct perf_sample *sample,
2178 struct perf_evsel *evsel,
2179 struct machine *machine __maybe_unused)
2180{
2181 struct trace *trace = container_of(tool, struct trace, tool);
2182 struct thread *thread;
2183 int err = 0;
2184
2185 tracepoint_handler handler = evsel->handler;
2186
2187 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2188 if (thread && thread__is_filtered(thread))
2189 goto out;
2190
2191 trace__set_base_time(trace, evsel, sample);
2192
2193 if (handler) {
2194 ++trace->nr_events;
2195 handler(trace, evsel, event, sample);
2196 }
2197out:
2198 thread__put(thread);
2199 return err;
2200}
2201
2202static int trace__record(struct trace *trace, int argc, const char **argv)
2203{
2204 unsigned int rec_argc, i, j;
2205 const char **rec_argv;
2206 const char * const record_args[] = {
2207 "record",
2208 "-R",
2209 "-m", "1024",
2210 "-c", "1",
2211 };
2212
2213 const char * const sc_args[] = { "-e", };
2214 unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2215 const char * const majpf_args[] = { "-e", "major-faults" };
2216 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2217 const char * const minpf_args[] = { "-e", "minor-faults" };
2218 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2219
2220 /* +1 is for the event string below */
2221 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2222 majpf_args_nr + minpf_args_nr + argc;
2223 rec_argv = calloc(rec_argc + 1, sizeof(char *));
2224
2225 if (rec_argv == NULL)
2226 return -ENOMEM;
2227
2228 j = 0;
2229 for (i = 0; i < ARRAY_SIZE(record_args); i++)
2230 rec_argv[j++] = record_args[i];
2231
2232 if (trace->trace_syscalls) {
2233 for (i = 0; i < sc_args_nr; i++)
2234 rec_argv[j++] = sc_args[i];
2235
2236 /* event string may be different for older kernels - e.g., RHEL6 */
2237 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2238 rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2239 else if (is_valid_tracepoint("syscalls:sys_enter"))
2240 rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2241 else {
2242 pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2243 free(rec_argv);
2244 return -1;
2245 }
2246 }
2247
2248 if (trace->trace_pgfaults & TRACE_PFMAJ)
2249 for (i = 0; i < majpf_args_nr; i++)
2250 rec_argv[j++] = majpf_args[i];
2251
2252 if (trace->trace_pgfaults & TRACE_PFMIN)
2253 for (i = 0; i < minpf_args_nr; i++)
2254 rec_argv[j++] = minpf_args[i];
2255
2256 for (i = 0; i < (unsigned int)argc; i++)
2257 rec_argv[j++] = argv[i];
2258
2259 return cmd_record(j, rec_argv);
2260}
2261
2262static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2263
2264static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2265{
2266 bool found = false;
2267 struct perf_evsel *evsel, *tmp;
2268 struct parse_events_error err = { .idx = 0, };
2269 int ret = parse_events(evlist, "probe:vfs_getname*", &err);
2270
2271 if (ret)
2272 return false;
2273
2274 evlist__for_each_entry_safe(evlist, evsel, tmp) {
2275 if (!strstarts(perf_evsel__name(evsel), "probe:vfs_getname"))
2276 continue;
2277
2278 if (perf_evsel__field(evsel, "pathname")) {
2279 evsel->handler = trace__vfs_getname;
2280 found = true;
2281 continue;
2282 }
2283
2284 list_del_init(&evsel->node);
2285 evsel->evlist = NULL;
2286 perf_evsel__delete(evsel);
2287 }
2288
2289 return found;
2290}
2291
2292static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2293{
2294 struct perf_evsel *evsel;
2295 struct perf_event_attr attr = {
2296 .type = PERF_TYPE_SOFTWARE,
2297 .mmap_data = 1,
2298 };
2299
2300 attr.config = config;
2301 attr.sample_period = 1;
2302
2303 event_attr_init(&attr);
2304
2305 evsel = perf_evsel__new(&attr);
2306 if (evsel)
2307 evsel->handler = trace__pgfault;
2308
2309 return evsel;
2310}
2311
2312static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2313{
2314 const u32 type = event->header.type;
2315 struct perf_evsel *evsel;
2316
2317 if (type != PERF_RECORD_SAMPLE) {
2318 trace__process_event(trace, trace->host, event, sample);
2319 return;
2320 }
2321
2322 evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2323 if (evsel == NULL) {
2324 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2325 return;
2326 }
2327
2328 trace__set_base_time(trace, evsel, sample);
2329
2330 if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2331 sample->raw_data == NULL) {
2332 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2333 perf_evsel__name(evsel), sample->tid,
2334 sample->cpu, sample->raw_size);
2335 } else {
2336 tracepoint_handler handler = evsel->handler;
2337 handler(trace, evsel, event, sample);
2338 }
2339}
2340
2341static int trace__add_syscall_newtp(struct trace *trace)
2342{
2343 int ret = -1;
2344 struct perf_evlist *evlist = trace->evlist;
2345 struct perf_evsel *sys_enter, *sys_exit;
2346
2347 sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter);
2348 if (sys_enter == NULL)
2349 goto out;
2350
2351 if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2352 goto out_delete_sys_enter;
2353
2354 sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit);
2355 if (sys_exit == NULL)
2356 goto out_delete_sys_enter;
2357
2358 if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2359 goto out_delete_sys_exit;
2360
2361 perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
2362 perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
2363
2364 perf_evlist__add(evlist, sys_enter);
2365 perf_evlist__add(evlist, sys_exit);
2366
2367 if (callchain_param.enabled && !trace->kernel_syscallchains) {
2368 /*
2369 * We're interested only in the user space callchain
2370 * leading to the syscall, allow overriding that for
2371 * debugging reasons using --kernel_syscall_callchains
2372 */
2373 sys_exit->attr.exclude_callchain_kernel = 1;
2374 }
2375
2376 trace->syscalls.events.sys_enter = sys_enter;
2377 trace->syscalls.events.sys_exit = sys_exit;
2378
2379 ret = 0;
2380out:
2381 return ret;
2382
2383out_delete_sys_exit:
2384 perf_evsel__delete_priv(sys_exit);
2385out_delete_sys_enter:
2386 perf_evsel__delete_priv(sys_enter);
2387 goto out;
2388}
2389
2390static int trace__set_ev_qualifier_filter(struct trace *trace)
2391{
2392 int err = -1;
2393 struct perf_evsel *sys_exit;
2394 char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2395 trace->ev_qualifier_ids.nr,
2396 trace->ev_qualifier_ids.entries);
2397
2398 if (filter == NULL)
2399 goto out_enomem;
2400
2401 if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2402 filter)) {
2403 sys_exit = trace->syscalls.events.sys_exit;
2404 err = perf_evsel__append_tp_filter(sys_exit, filter);
2405 }
2406
2407 free(filter);
2408out:
2409 return err;
2410out_enomem:
2411 errno = ENOMEM;
2412 goto out;
2413}
2414
2415static int trace__set_filter_loop_pids(struct trace *trace)
2416{
2417 unsigned int nr = 1;
2418 pid_t pids[32] = {
2419 getpid(),
2420 };
2421 struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2422
2423 while (thread && nr < ARRAY_SIZE(pids)) {
2424 struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2425
2426 if (parent == NULL)
2427 break;
2428
2429 if (!strcmp(thread__comm_str(parent), "sshd")) {
2430 pids[nr++] = parent->tid;
2431 break;
2432 }
2433 thread = parent;
2434 }
2435
2436 return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
2437}
2438
/*
 * Live tracing: set up the requested events, open and mmap them, optionally
 * start the workload given in @argv (when @argc > 0), and then keep reading
 * and handling events from the ring buffers until there is nothing left to
 * do or tracing is interrupted.
 *
 * On every exit path the evlist is deleted, the cgroup reference dropped and
 * trace->evlist / trace->live reset.
 *
 * Returns the value left in 'err' by the last step that set it: negative for
 * setup failures (after a message was printed on trace->output or via
 * pr_err()), otherwise the result of the last sample parsed in the event
 * loop.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;
	bool draining = false;

	trace->live = true;

	/* Syscall tracing needs the raw sys_enter/sys_exit tracepoints. */
	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	/*
	 * If a global cgroup was set, apply it to all the events without an
	 * explicit cgroup. I.e.:
	 *
	 *	trace -G A -e sched:*switch
	 *
	 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
	 * _and_ sched:sched_switch to the 'A' cgroup, while:
	 *
	 *	trace -e sched:*switch -G A
	 *
	 * will only set the sched:sched_switch event to the 'A' cgroup, all the
	 * other events (raw_syscalls:sys_{enter,exit}, etc) are left "without"
	 * a cgroup (on the root cgroup, sys wide, etc).
	 *
	 * Multiple cgroups:
	 *
	 *	trace -G A -e sched:*switch -G B
	 *
	 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
	 * to the 'B' cgroup.
	 *
	 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
	 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
	 */
	if (trace->cgroup)
		evlist__set_default_cgroup(trace->evlist, trace->cgroup);

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, &callchain_param);

	/*
	 * NOTE(review): sig_handler is defined outside this function; it
	 * presumably is what sets the 'done' and 'interrupted' flags tested
	 * in the event loop below -- confirm.
	 */
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = trace__set_filter_loop_pids(trace);

	/*
	 * NOTE(review): any failure to set the pid filters is reported as
	 * "Not enough memory to run!", whatever the actual cause was.
	 */
	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	/* On failure 'evsel' is what out_error_apply_filters reports on. */
	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
	if (err < 0)
		goto out_error_mmap;

	/* With an initial delay the events are enabled only after sleeping below. */
	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	if (trace->opts.initial_delay) {
		/* initial_delay is multiplied by 1000 to get usleep()'s microseconds. */
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;

	/*
	 * Now that we already used evsel->attr to ask the kernel to setup the
	 * events, lets reuse evsel->attr.sample_max_stack as the limit in
	 * trace__resolve_callchain(), allowing per-event max-stack settings
	 * to override an explicitly set --max-stack global setting.
	 */
	evlist__for_each_entry(evlist, evsel) {
		if (evsel__has_callchain(evsel) &&
		    evsel->attr.sample_max_stack == 0)
			evsel->attr.sample_max_stack = trace->max_stack;
	}
again:
	/* Snapshot, to find out below whether this pass read any event at all. */
	before = trace->nr_events;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;
		struct perf_mmap *md;

		md = &evlist->mmap[i];
		if (perf_mmap__read_init(md) < 0)
			continue;

		while ((event = perf_mmap__read_event(md)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			/* The event is consumed whether or not it could be parsed. */
			perf_mmap__consume(md);

			if (interrupted)
				goto out_disable;

			/* Stop producing new events, but keep reading what is queued. */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
		perf_mmap__read_done(md);
	}

	if (trace->nr_events == before) {
		/* Nothing was read: block in poll, or wait at most 100 when done. */
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	/*
	 * NOTE(review): at this point 'err' holds the result of the last
	 * perf_evlist__parse_sample() call (or of perf_evlist__mmap() when no
	 * event was read), so a parse failure on the very last event read
	 * suppresses the summary and is returned to the caller -- confirm
	 * this is intended.
	 */
	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	trace__symbols__exit(trace);

	perf_evlist__delete(evlist);
	cgroup__put(trace->cgroup);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * The block below is never reached by falling through (the return above
 * precedes it), only by jumping to one of its labels.  The braces exist to
 * scope errbuf to the error paths that need it.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

	/* All of the paths above end up here to print the message built in errbuf. */
out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2715
2716static int trace__replay(struct trace *trace)
2717{
2718 const struct perf_evsel_str_handler handlers[] = {
2719 { "probe:vfs_getname", trace__vfs_getname, },
2720 };
2721 struct perf_data data = {
2722 .file = {
2723 .path = input_name,
2724 },
2725 .mode = PERF_DATA_MODE_READ,
2726 .force = trace->force,
2727 };
2728 struct perf_session *session;
2729 struct perf_evsel *evsel;
2730 int err = -1;
2731
2732 trace->tool.sample = trace__process_sample;
2733 trace->tool.mmap = perf_event__process_mmap;
2734 trace->tool.mmap2 = perf_event__process_mmap2;
2735 trace->tool.comm = perf_event__process_comm;
2736 trace->tool.exit = perf_event__process_exit;
2737 trace->tool.fork = perf_event__process_fork;
2738 trace->tool.attr = perf_event__process_attr;
2739 trace->tool.tracing_data = perf_event__process_tracing_data;
2740 trace->tool.build_id = perf_event__process_build_id;
2741 trace->tool.namespaces = perf_event__process_namespaces;
2742
2743 trace->tool.ordered_events = true;
2744 trace->tool.ordering_requires_timestamps = true;
2745
2746 /* add tid to output */
2747 trace->multiple_threads = true;
2748
2749 session = perf_session__new(&data, false, &trace->tool);
2750 if (session == NULL)
2751 return -1;
2752
2753 if (trace->opts.target.pid)
2754 symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2755
2756 if (trace->opts.target.tid)
2757 symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2758
2759 if (symbol__init(&session->header.env) < 0)
2760 goto out;
2761
2762 trace->host = &session->machines.host;
2763
2764 err = perf_session__set_tracepoints_handlers(session, handlers);
2765 if (err)
2766 goto out;
2767
2768 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2769 "raw_syscalls:sys_enter");
2770 /* older kernels have syscalls tp versus raw_syscalls */
2771 if (evsel == NULL)
2772 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2773 "syscalls:sys_enter");
2774
2775 if (evsel &&
2776 (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 ||
2777 perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2778 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2779 goto out;
2780 }
2781
2782 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2783 "raw_syscalls:sys_exit");
2784 if (evsel == NULL)
2785 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2786 "syscalls:sys_exit");
2787 if (evsel &&
2788 (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 ||
2789 perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2790 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2791 goto out;
2792 }
2793
2794 evlist__for_each_entry(session->evlist, evsel) {
2795 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2796 (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2797 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2798 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2799 evsel->handler = trace__pgfault;
2800 }
2801
2802 setup_pager();
2803
2804 err = perf_session__process_events(session);
2805 if (err)
2806 pr_err("Failed to process events, error %d", err);
2807
2808 else if (trace->summary)
2809 trace__fprintf_thread_summary(trace, trace->output);
2810
2811out:
2812 perf_session__delete(session);
2813
2814 return err;
2815}
2816
/*
 * Print the heading of the per-thread summary to @fp.
 *
 * Returns the value returned by fprintf(), converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2825
2826DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2827 struct stats *stats;
2828 double msecs;
2829 int syscall;
2830)
2831{
2832 struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2833 struct stats *stats = source->priv;
2834
2835 entry->syscall = source->i;
2836 entry->stats = stats;
2837 entry->msecs = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
2838}
2839
2840static size_t thread__dump_stats(struct thread_trace *ttrace,
2841 struct trace *trace, FILE *fp)
2842{
2843 size_t printed = 0;
2844 struct syscall *sc;
2845 struct rb_node *nd;
2846 DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2847
2848 if (syscall_stats == NULL)
2849 return 0;
2850
2851 printed += fprintf(fp, "\n");
2852
2853 printed += fprintf(fp, " syscall calls total min avg max stddev\n");
2854 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
2855 printed += fprintf(fp, " --------------- -------- --------- --------- --------- --------- ------\n");
2856
2857 resort_rb__for_each_entry(nd, syscall_stats) {
2858 struct stats *stats = syscall_stats_entry->stats;
2859 if (stats) {
2860 double min = (double)(stats->min) / NSEC_PER_MSEC;
2861 double max = (double)(stats->max) / NSEC_PER_MSEC;
2862 double avg = avg_stats(stats);
2863 double pct;
2864 u64 n = (u64) stats->n;
2865
2866 pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2867 avg /= NSEC_PER_MSEC;
2868
2869 sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2870 printed += fprintf(fp, " %-15s", sc->name);
2871 printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2872 n, syscall_stats_entry->msecs, min, avg);
2873 printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2874 }
2875 }
2876
2877 resort_rb__delete(syscall_stats);
2878 printed += fprintf(fp, "\n\n");
2879
2880 return printed;
2881}
2882
2883static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2884{
2885 size_t printed = 0;
2886 struct thread_trace *ttrace = thread__priv(thread);
2887 double ratio;
2888
2889 if (ttrace == NULL)
2890 return 0;
2891
2892 ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2893
2894 printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2895 printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2896 printed += fprintf(fp, "%.1f%%", ratio);
2897 if (ttrace->pfmaj)
2898 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2899 if (ttrace->pfmin)
2900 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2901 if (trace->sched)
2902 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2903 else if (fputc('\n', fp) != EOF)
2904 ++printed;
2905
2906 printed += thread__dump_stats(ttrace, trace, fp);
2907
2908 return printed;
2909}
2910
2911static unsigned long thread__nr_events(struct thread_trace *ttrace)
2912{
2913 return ttrace ? ttrace->nr_events : 0;
2914}
2915
2916DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2917 struct thread *thread;
2918)
2919{
2920 entry->thread = rb_entry(nd, struct thread, rb_node);
2921}
2922
2923static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2924{
2925 size_t printed = trace__fprintf_threads_header(fp);
2926 struct rb_node *nd;
2927 int i;
2928
2929 for (i = 0; i < THREADS__TABLE_SIZE; i++) {
2930 DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);
2931
2932 if (threads == NULL) {
2933 fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2934 return 0;
2935 }
2936
2937 resort_rb__for_each_entry(nd, threads)
2938 printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2939
2940 resort_rb__delete(threads);
2941 }
2942 return printed;
2943}
2944
2945static int trace__set_duration(const struct option *opt, const char *str,
2946 int unset __maybe_unused)
2947{
2948 struct trace *trace = opt->value;
2949
2950 trace->duration_filter = atof(str);
2951 return 0;
2952}
2953
2954static int trace__set_filter_pids(const struct option *opt, const char *str,
2955 int unset __maybe_unused)
2956{
2957 int ret = -1;
2958 size_t i;
2959 struct trace *trace = opt->value;
2960 /*
2961 * FIXME: introduce a intarray class, plain parse csv and create a
2962 * { int nr, int entries[] } struct...
2963 */
2964 struct intlist *list = intlist__new(str);
2965
2966 if (list == NULL)
2967 return -1;
2968
2969 i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2970 trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2971
2972 if (trace->filter_pids.entries == NULL)
2973 goto out;
2974
2975 trace->filter_pids.entries[0] = getpid();
2976
2977 for (i = 1; i < trace->filter_pids.nr; ++i)
2978 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2979
2980 intlist__delete(list);
2981 ret = 0;
2982out:
2983 return ret;
2984}
2985
2986static int trace__open_output(struct trace *trace, const char *filename)
2987{
2988 struct stat st;
2989
2990 if (!stat(filename, &st) && st.st_size) {
2991 char oldname[PATH_MAX];
2992
2993 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2994 unlink(oldname);
2995 rename(filename, oldname);
2996 }
2997
2998 trace->output = fopen(filename, "w");
2999
3000 return trace->output == NULL ? -errno : 0;
3001}
3002
3003static int parse_pagefaults(const struct option *opt, const char *str,
3004 int unset __maybe_unused)
3005{
3006 int *trace_pgfaults = opt->value;
3007
3008 if (strcmp(str, "all") == 0)
3009 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
3010 else if (strcmp(str, "maj") == 0)
3011 *trace_pgfaults |= TRACE_PFMAJ;
3012 else if (strcmp(str, "min") == 0)
3013 *trace_pgfaults |= TRACE_PFMIN;
3014 else
3015 return -1;
3016
3017 return 0;
3018}
3019
3020static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
3021{
3022 struct perf_evsel *evsel;
3023
3024 evlist__for_each_entry(evlist, evsel)
3025 evsel->handler = handler;
3026}
3027
3028static int evlist__set_syscall_tp_fields(struct perf_evlist *evlist)
3029{
3030 struct perf_evsel *evsel;
3031
3032 evlist__for_each_entry(evlist, evsel) {
3033 if (evsel->priv || !evsel->tp_format)
3034 continue;
3035
3036 if (strcmp(evsel->tp_format->system, "syscalls"))
3037 continue;
3038
3039 if (perf_evsel__init_syscall_tp(evsel))
3040 return -1;
3041
3042 if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) {
3043 struct syscall_tp *sc = evsel->priv;
3044
3045 if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)))
3046 return -1;
3047 } else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) {
3048 struct syscall_tp *sc = evsel->priv;
3049
3050 if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap))
3051 return -1;
3052 }
3053 }
3054
3055 return 0;
3056}
3057
3058/*
3059 * XXX: Hackish, just splitting the combined -e+--event (syscalls
3060 * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
3061 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
3062 *
3063 * It'd be better to introduce a parse_options() variant that would return a
3064 * list with the terms it didn't match to an event...
3065 */
3066static int trace__parse_events_option(const struct option *opt, const char *str,
3067 int unset __maybe_unused)
3068{
3069 struct trace *trace = (struct trace *)opt->value;
3070 const char *s = str;
3071 char *sep = NULL, *lists[2] = { NULL, NULL, };
3072 int len = strlen(str) + 1, err = -1, list, idx;
3073 char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
3074 char group_name[PATH_MAX];
3075
3076 if (strace_groups_dir == NULL)
3077 return -1;
3078
3079 if (*s == '!') {
3080 ++s;
3081 trace->not_ev_qualifier = true;
3082 }
3083
3084 while (1) {
3085 if ((sep = strchr(s, ',')) != NULL)
3086 *sep = '\0';
3087
3088 list = 0;
3089 if (syscalltbl__id(trace->sctbl, s) >= 0 ||
3090 syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
3091 list = 1;
3092 } else {
3093 path__join(group_name, sizeof(group_name), strace_groups_dir, s);
3094 if (access(group_name, R_OK) == 0)
3095 list = 1;
3096 }
3097
3098 if (lists[list]) {
3099 sprintf(lists[list] + strlen(lists[list]), ",%s", s);
3100 } else {
3101 lists[list] = malloc(len);
3102 if (lists[list] == NULL)
3103 goto out;
3104 strcpy(lists[list], s);
3105 }
3106
3107 if (!sep)
3108 break;
3109
3110 *sep = ',';
3111 s = sep + 1;
3112 }
3113
3114 if (lists[1] != NULL) {
3115 struct strlist_config slist_config = {
3116 .dirname = strace_groups_dir,
3117 };
3118
3119 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
3120 if (trace->ev_qualifier == NULL) {
3121 fputs("Not enough memory to parse event qualifier", trace->output);
3122 goto out;
3123 }
3124
3125 if (trace__validate_ev_qualifier(trace))
3126 goto out;
3127 trace->trace_syscalls = true;
3128 }
3129
3130 err = 0;
3131
3132 if (lists[0]) {
3133 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
3134 "event selector. use 'perf list' to list available events",
3135 parse_events_option);
3136 err = parse_events_option(&o, lists[0], 0);
3137 }
3138out:
3139 if (sep)
3140 *sep = ',';
3141
3142 return err;
3143}
3144
3145static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3146{
3147 struct trace *trace = opt->value;
3148
3149 if (!list_empty(&trace->evlist->entries))
3150 return parse_cgroups(opt, str, unset);
3151
3152 trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
3153
3154 return 0;
3155}
3156
3157int cmd_trace(int argc, const char **argv)
3158{
3159 const char *trace_usage[] = {
3160 "perf trace [<options>] [<command>]",
3161 "perf trace [<options>] -- <command> [<options>]",
3162 "perf trace record [<options>] [<command>]",
3163 "perf trace record [<options>] -- <command> [<options>]",
3164 NULL
3165 };
3166 struct trace trace = {
3167 .syscalls = {
3168 . max = -1,
3169 },
3170 .opts = {
3171 .target = {
3172 .uid = UINT_MAX,
3173 .uses_mmap = true,
3174 },
3175 .user_freq = UINT_MAX,
3176 .user_interval = ULLONG_MAX,
3177 .no_buffering = true,
3178 .mmap_pages = UINT_MAX,
3179 .proc_map_timeout = 500,
3180 },
3181 .output = stderr,
3182 .show_comm = true,
3183 .trace_syscalls = false,
3184 .kernel_syscallchains = false,
3185 .max_stack = UINT_MAX,
3186 };
3187 const char *output_name = NULL;
3188 const struct option trace_options[] = {
3189 OPT_CALLBACK('e', "event", &trace, "event",
3190 "event/syscall selector. use 'perf list' to list available events",
3191 trace__parse_events_option),
3192 OPT_BOOLEAN(0, "comm", &trace.show_comm,
3193 "show the thread COMM next to its id"),
3194 OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3195 OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
3196 trace__parse_events_option),
3197 OPT_STRING('o', "output", &output_name, "file", "output file name"),
3198 OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3199 OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3200 "trace events on existing process id"),
3201 OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3202 "trace events on existing thread id"),
3203 OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3204 "pids to filter (by the kernel)", trace__set_filter_pids),
3205 OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3206 "system-wide collection from all CPUs"),
3207 OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3208 "list of cpus to monitor"),
3209 OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3210 "child tasks do not inherit counters"),
3211 OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3212 "number of mmap data pages",
3213 perf_evlist__parse_mmap_pages),
3214 OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3215 "user to profile"),
3216 OPT_CALLBACK(0, "duration", &trace, "float",
3217 "show only events with duration > N.M ms",
3218 trace__set_duration),
3219 OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3220 OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3221 OPT_BOOLEAN('T', "time", &trace.full_time,
3222 "Show full timestamp, not time relative to first start"),
3223 OPT_BOOLEAN(0, "failure", &trace.failure_only,
3224 "Show only syscalls that failed"),
3225 OPT_BOOLEAN('s', "summary", &trace.summary_only,
3226 "Show only syscall summary with statistics"),
3227 OPT_BOOLEAN('S', "with-summary", &trace.summary,
3228 "Show all syscalls and summary with statistics"),
3229 OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3230 "Trace pagefaults", parse_pagefaults, "maj"),
3231 OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3232 OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3233 OPT_CALLBACK(0, "call-graph", &trace.opts,
3234 "record_mode[,record_size]", record_callchain_help,
3235 &record_parse_callchain_opt),
3236 OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3237 "Show the kernel callchains on the syscall exit path"),
3238 OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3239 "Set the minimum stack depth when parsing the callchain, "
3240 "anything below the specified depth will be ignored."),
3241 OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3242 "Set the maximum stack depth when parsing the callchain, "
3243 "anything beyond the specified depth will be ignored. "
3244 "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3245 OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
3246 "print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
3247 OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3248 "per thread proc mmap processing timeout in ms"),
3249 OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
3250 trace__parse_cgroups),
3251 OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3252 "ms to wait before starting measurement after program "
3253 "start"),
3254 OPT_END()
3255 };
3256 bool __maybe_unused max_stack_user_set = true;
3257 bool mmap_pages_user_set = true;
3258 struct perf_evsel *evsel;
3259 const char * const trace_subcommands[] = { "record", NULL };
3260 int err = -1;
3261 char bf[BUFSIZ];
3262
3263 signal(SIGSEGV, sighandler_dump_stack);
3264 signal(SIGFPE, sighandler_dump_stack);
3265
3266 trace.evlist = perf_evlist__new();
3267 trace.sctbl = syscalltbl__new();
3268
3269 if (trace.evlist == NULL || trace.sctbl == NULL) {
3270 pr_err("Not enough memory to run!\n");
3271 err = -ENOMEM;
3272 goto out;
3273 }
3274
3275 argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3276 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3277
3278 if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
3279 usage_with_options_msg(trace_usage, trace_options,
3280 "cgroup monitoring only available in system-wide mode");
3281 }
3282
3283 evsel = bpf__setup_output_event(trace.evlist, "__augmented_syscalls__");
3284 if (IS_ERR(evsel)) {
3285 bpf__strerror_setup_output_event(trace.evlist, PTR_ERR(evsel), bf, sizeof(bf));
3286 pr_err("ERROR: Setup trace syscalls enter failed: %s\n", bf);
3287 goto out;
3288 }
3289
3290 if (evsel) {
3291 if (perf_evsel__init_augmented_syscall_tp(evsel) ||
3292 perf_evsel__init_augmented_syscall_tp_args(evsel))
3293 goto out;
3294 trace.syscalls.events.augmented = evsel;
3295 }
3296
3297 err = bpf__setup_stdout(trace.evlist);
3298 if (err) {
3299 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3300 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3301 goto out;
3302 }
3303
3304 err = -1;
3305
3306 if (trace.trace_pgfaults) {
3307 trace.opts.sample_address = true;
3308 trace.opts.sample_time = true;
3309 }
3310
3311 if (trace.opts.mmap_pages == UINT_MAX)
3312 mmap_pages_user_set = false;
3313
3314 if (trace.max_stack == UINT_MAX) {
3315 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
3316 max_stack_user_set = false;
3317 }
3318
3319#ifdef HAVE_DWARF_UNWIND_SUPPORT
3320 if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
3321 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3322 }
3323#endif
3324
3325 if (callchain_param.enabled) {
3326 if (!mmap_pages_user_set && geteuid() == 0)
3327 trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3328
3329 symbol_conf.use_callchain = true;
3330 }
3331
3332 if (trace.evlist->nr_entries > 0) {
3333 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3334 if (evlist__set_syscall_tp_fields(trace.evlist)) {
3335 perror("failed to set syscalls:* tracepoint fields");
3336 goto out;
3337 }
3338 }
3339
3340 if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3341 return trace__record(&trace, argc-1, &argv[1]);
3342
3343 /* summary_only implies summary option, but don't overwrite summary if set */
3344 if (trace.summary_only)
3345 trace.summary = trace.summary_only;
3346
3347 if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3348 trace.evlist->nr_entries == 0 /* Was --events used? */) {
3349 trace.trace_syscalls = true;
3350 }
3351
3352 if (output_name != NULL) {
3353 err = trace__open_output(&trace, output_name);
3354 if (err < 0) {
3355 perror("failed to create output file");
3356 goto out;
3357 }
3358 }
3359
3360 err = target__validate(&trace.opts.target);
3361 if (err) {
3362 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3363 fprintf(trace.output, "%s", bf);
3364 goto out_close;
3365 }
3366
3367 err = target__parse_uid(&trace.opts.target);
3368 if (err) {
3369 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3370 fprintf(trace.output, "%s", bf);
3371 goto out_close;
3372 }
3373
3374 if (!argc && target__none(&trace.opts.target))
3375 trace.opts.target.system_wide = true;
3376
3377 if (input_name)
3378 err = trace__replay(&trace);
3379 else
3380 err = trace__run(&trace, argc, argv);
3381
3382out_close:
3383 if (output_name != NULL)
3384 fclose(trace.output);
3385out:
3386 return err;
3387}