Blame - src/kernel/linux/v4.19/tools/perf/builtin-sched.c - T800

blob: cbf39dab19c1a570fa4f988aeec1aab64f918a9d [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	#include "builtin.h"
				3	#include "perf.h"
				4
				5	#include "util/util.h"
				6	#include "util/evlist.h"
				7	#include "util/cache.h"
				8	#include "util/evsel.h"
				9	#include "util/symbol.h"
				10	#include "util/thread.h"
				11	#include "util/header.h"
				12	#include "util/session.h"
				13	#include "util/tool.h"
				14	#include "util/cloexec.h"
				15	#include "util/thread_map.h"
				16	#include "util/color.h"
				17	#include "util/stat.h"
				18	#include "util/callchain.h"
				19	#include "util/time-utils.h"
				20
				21	#include <subcmd/parse-options.h>
				22	#include "util/trace-event.h"
				23
				24	#include "util/debug.h"
				25
				26	#include <linux/kernel.h>
				27	#include <linux/log2.h>
				28	#include <sys/prctl.h>
				29	#include <sys/resource.h>
				30	#include <inttypes.h>
				31
				32	#include <errno.h>
				33	#include <semaphore.h>
				34	#include <pthread.h>
				35	#include <math.h>
				36	#include <api/fs/fs.h>
				37	#include <linux/time64.h>
				38
				39	#include "sane_ctype.h"
				40
				41	#define PR_SET_NAME 15 /* Set process name */
				42	#define MAX_CPUS 4096
				43	#define COMM_LEN 20
				44	#define SYM_LEN 129
				45	#define MAX_PID 1024000
				46
				47	struct sched_atom;
				48
				49	struct task_desc {
				50	unsigned long nr;
				51	unsigned long pid;
				52	char comm[COMM_LEN];
				53
				54	unsigned long nr_events;
				55	unsigned long curr_event;
				56	struct sched_atom **atoms;
				57
				58	pthread_t thread;
				59	sem_t sleep_sem;
				60
				61	sem_t ready_for_work;
				62	sem_t work_done_sem;
				63
				64	u64 cpu_usage;
				65	};
				66
				67	enum sched_event_type {
				68	SCHED_EVENT_RUN,
				69	SCHED_EVENT_SLEEP,
				70	SCHED_EVENT_WAKEUP,
				71	SCHED_EVENT_MIGRATION,
				72	};
				73
				74	struct sched_atom {
				75	enum sched_event_type type;
				76	int specific_wait;
				77	u64 timestamp;
				78	u64 duration;
				79	unsigned long nr;
				80	sem_t *wait_sem;
				81	struct task_desc *wakee;
				82	};
				83
				84	#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
				85
				86	/* task state bitmask, copied from include/linux/sched.h */
				87	#define TASK_RUNNING 0
				88	#define TASK_INTERRUPTIBLE 1
				89	#define TASK_UNINTERRUPTIBLE 2
				90	#define __TASK_STOPPED 4
				91	#define __TASK_TRACED 8
				92	/* in tsk->exit_state */
				93	#define EXIT_DEAD 16
				94	#define EXIT_ZOMBIE 32
				95	#define EXIT_TRACE (EXIT_ZOMBIE \| EXIT_DEAD)
				96	/* in tsk->state again */
				97	#define TASK_DEAD 64
				98	#define TASK_WAKEKILL 128
				99	#define TASK_WAKING 256
				100	#define TASK_PARKED 512
				101
				102	enum thread_state {
				103	THREAD_SLEEPING = 0,
				104	THREAD_WAIT_CPU,
				105	THREAD_SCHED_IN,
				106	THREAD_IGNORE
				107	};
				108
				109	struct work_atom {
				110	struct list_head list;
				111	enum thread_state state;
				112	u64 sched_out_time;
				113	u64 wake_up_time;
				114	u64 sched_in_time;
				115	u64 runtime;
				116	};
				117
				118	struct work_atoms {
				119	struct list_head work_list;
				120	struct thread *thread;
				121	struct rb_node node;
				122	u64 max_lat;
				123	u64 max_lat_at;
				124	u64 total_lat;
				125	u64 nb_atoms;
				126	u64 total_runtime;
				127	int num_merged;
				128	};
				129
				130	typedef int (sort_fn_t)(struct work_atoms , struct work_atoms *);
				131
				132	struct perf_sched;
				133
				134	struct trace_sched_handler {
				135	int (switch_event)(struct perf_sched sched, struct perf_evsel *evsel,
				136	struct perf_sample sample, struct machine machine);
				137
				138	int (runtime_event)(struct perf_sched sched, struct perf_evsel *evsel,
				139	struct perf_sample sample, struct machine machine);
				140
				141	int (wakeup_event)(struct perf_sched sched, struct perf_evsel *evsel,
				142	struct perf_sample sample, struct machine machine);
				143
				144	/* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
				145	int (fork_event)(struct perf_sched sched, union perf_event *event,
				146	struct machine *machine);
				147
				148	int (migrate_task_event)(struct perf_sched sched,
				149	struct perf_evsel *evsel,
				150	struct perf_sample *sample,
				151	struct machine *machine);
				152	};
				153
				154	#define COLOR_PIDS PERF_COLOR_BLUE
				155	#define COLOR_CPUS PERF_COLOR_BG_RED
				156
				157	struct perf_sched_map {
				158	DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS);
				159	int *comp_cpus;
				160	bool comp;
				161	struct thread_map *color_pids;
				162	const char *color_pids_str;
				163	struct cpu_map *color_cpus;
				164	const char *color_cpus_str;
				165	struct cpu_map *cpus;
				166	const char *cpus_str;
				167	};
				168
				169	struct perf_sched {
				170	struct perf_tool tool;
				171	const char *sort_order;
				172	unsigned long nr_tasks;
				173	struct task_desc **pid_to_task;
				174	struct task_desc **tasks;
				175	const struct trace_sched_handler *tp_handler;
				176	pthread_mutex_t start_work_mutex;
				177	pthread_mutex_t work_done_wait_mutex;
				178	int profile_cpu;
				179	/*
				180	* Track the current task - that way we can know whether there's any
				181	* weird events, such as a task being switched away that is not current.
				182	*/
				183	int max_cpu;
				184	u32 curr_pid[MAX_CPUS];
				185	struct thread *curr_thread[MAX_CPUS];
				186	char next_shortname1;
				187	char next_shortname2;
				188	unsigned int replay_repeat;
				189	unsigned long nr_run_events;
				190	unsigned long nr_sleep_events;
				191	unsigned long nr_wakeup_events;
				192	unsigned long nr_sleep_corrections;
				193	unsigned long nr_run_events_optimized;
				194	unsigned long targetless_wakeups;
				195	unsigned long multitarget_wakeups;
				196	unsigned long nr_runs;
				197	unsigned long nr_timestamps;
				198	unsigned long nr_unordered_timestamps;
				199	unsigned long nr_context_switch_bugs;
				200	unsigned long nr_events;
				201	unsigned long nr_lost_chunks;
				202	unsigned long nr_lost_events;
				203	u64 run_measurement_overhead;
				204	u64 sleep_measurement_overhead;
				205	u64 start_time;
				206	u64 cpu_usage;
				207	u64 runavg_cpu_usage;
				208	u64 parent_cpu_usage;
				209	u64 runavg_parent_cpu_usage;
				210	u64 sum_runtime;
				211	u64 sum_fluct;
				212	u64 run_avg;
				213	u64 all_runtime;
				214	u64 all_count;
				215	u64 cpu_last_switched[MAX_CPUS];
				216	struct rb_root atom_root, sorted_atom_root, merged_atom_root;
				217	struct list_head sort_list, cmp_pid;
				218	bool force;
				219	bool skip_merge;
				220	struct perf_sched_map map;
				221
				222	/* options for timehist command */
				223	bool summary;
				224	bool summary_only;
				225	bool idle_hist;
				226	bool show_callchain;
				227	unsigned int max_stack;
				228	bool show_cpu_visual;
				229	bool show_wakeups;
				230	bool show_next;
				231	bool show_migrations;
				232	bool show_state;
				233	u64 skipped_samples;
				234	const char *time_str;
				235	struct perf_time_interval ptime;
				236	struct perf_time_interval hist_time;
				237	};
				238
				239	/* per thread run time data */
				240	struct thread_runtime {
				241	u64 last_time; /* time of previous sched in/out event */
				242	u64 dt_run; /* run time */
				243	u64 dt_sleep; /* time between CPU access by sleep (off cpu) */
				244	u64 dt_iowait; /* time between CPU access by iowait (off cpu) */
				245	u64 dt_preempt; /* time between CPU access by preempt (off cpu) */
				246	u64 dt_delay; /* time between wakeup and sched-in */
				247	u64 ready_to_run; /* time of wakeup */
				248
				249	struct stats run_stats;
				250	u64 total_run_time;
				251	u64 total_sleep_time;
				252	u64 total_iowait_time;
				253	u64 total_preempt_time;
				254	u64 total_delay_time;
				255
				256	int last_state;
				257
				258	char shortname[3];
				259	bool comm_changed;
				260
				261	u64 migrations;
				262	};
				263
				264	/* per event run time data */
				265	struct evsel_runtime {
				266	u64 last_time; / time this event was last seen per cpu */
				267	u32 ncpu; /* highest cpu slot allocated */
				268	};
				269
				270	/* per cpu idle time data */
				271	struct idle_thread_runtime {
				272	struct thread_runtime tr;
				273	struct thread *last_thread;
				274	struct rb_root sorted_root;
				275	struct callchain_root callchain;
				276	struct callchain_cursor cursor;
				277	};
				278
				279	/* track idle times per cpu */
				280	static struct thread **idle_threads;
				281	static int idle_max_cpu;
				282	static char idle_comm[] = "<idle>";
				283
				284	static u64 get_nsecs(void)
				285	{
				286	struct timespec ts;
				287
				288	clock_gettime(CLOCK_MONOTONIC, &ts);
				289
				290	return ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
				291	}
				292
				293	static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
				294	{
				295	u64 T0 = get_nsecs(), T1;
				296
				297	do {
				298	T1 = get_nsecs();
				299	} while (T1 + sched->run_measurement_overhead < T0 + nsecs);
				300	}
				301
				302	static void sleep_nsecs(u64 nsecs)
				303	{
				304	struct timespec ts;
				305
				306	ts.tv_nsec = nsecs % 999999999;
				307	ts.tv_sec = nsecs / 999999999;
				308
				309	nanosleep(&ts, NULL);
				310	}
				311
				312	static void calibrate_run_measurement_overhead(struct perf_sched *sched)
				313	{
				314	u64 T0, T1, delta, min_delta = NSEC_PER_SEC;
				315	int i;
				316
				317	for (i = 0; i < 10; i++) {
				318	T0 = get_nsecs();
				319	burn_nsecs(sched, 0);
				320	T1 = get_nsecs();
				321	delta = T1-T0;
				322	min_delta = min(min_delta, delta);
				323	}
				324	sched->run_measurement_overhead = min_delta;
				325
				326	printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
				327	}
				328
				329	static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
				330	{
				331	u64 T0, T1, delta, min_delta = NSEC_PER_SEC;
				332	int i;
				333
				334	for (i = 0; i < 10; i++) {
				335	T0 = get_nsecs();
				336	sleep_nsecs(10000);
				337	T1 = get_nsecs();
				338	delta = T1-T0;
				339	min_delta = min(min_delta, delta);
				340	}
				341	min_delta -= 10000;
				342	sched->sleep_measurement_overhead = min_delta;
				343
				344	printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
				345	}
				346
				347	static struct sched_atom *
				348	get_new_event(struct task_desc *task, u64 timestamp)
				349	{
				350	struct sched_atom event = zalloc(sizeof(event));
				351	unsigned long idx = task->nr_events;
				352	size_t size;
				353
				354	event->timestamp = timestamp;
				355	event->nr = idx;
				356
				357	task->nr_events++;
				358	size = sizeof(struct sched_atom ) task->nr_events;
				359	task->atoms = realloc(task->atoms, size);
				360	BUG_ON(!task->atoms);
				361
				362	task->atoms[idx] = event;
				363
				364	return event;
				365	}
				366
				367	static struct sched_atom last_event(struct task_desc task)
				368	{
				369	if (!task->nr_events)
				370	return NULL;
				371
				372	return task->atoms[task->nr_events - 1];
				373	}
				374
				375	static void add_sched_event_run(struct perf_sched sched, struct task_desc task,
				376	u64 timestamp, u64 duration)
				377	{
				378	struct sched_atom event, curr_event = last_event(task);
				379
				380	/*
				381	* optimize an existing RUN event by merging this one
				382	* to it:
				383	*/
				384	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
				385	sched->nr_run_events_optimized++;
				386	curr_event->duration += duration;
				387	return;
				388	}
				389
				390	event = get_new_event(task, timestamp);
				391
				392	event->type = SCHED_EVENT_RUN;
				393	event->duration = duration;
				394
				395	sched->nr_run_events++;
				396	}
				397
				398	static void add_sched_event_wakeup(struct perf_sched sched, struct task_desc task,
				399	u64 timestamp, struct task_desc *wakee)
				400	{
				401	struct sched_atom event, wakee_event;
				402
				403	event = get_new_event(task, timestamp);
				404	event->type = SCHED_EVENT_WAKEUP;
				405	event->wakee = wakee;
				406
				407	wakee_event = last_event(wakee);
				408	if (!wakee_event \|\| wakee_event->type != SCHED_EVENT_SLEEP) {
				409	sched->targetless_wakeups++;
				410	return;
				411	}
				412	if (wakee_event->wait_sem) {
				413	sched->multitarget_wakeups++;
				414	return;
				415	}
				416
				417	wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
				418	sem_init(wakee_event->wait_sem, 0, 0);
				419	wakee_event->specific_wait = 1;
				420	event->wait_sem = wakee_event->wait_sem;
				421
				422	sched->nr_wakeup_events++;
				423	}
				424
				425	static void add_sched_event_sleep(struct perf_sched sched, struct task_desc task,
				426	u64 timestamp, u64 task_state __maybe_unused)
				427	{
				428	struct sched_atom *event = get_new_event(task, timestamp);
				429
				430	event->type = SCHED_EVENT_SLEEP;
				431
				432	sched->nr_sleep_events++;
				433	}
				434
				435	static struct task_desc register_pid(struct perf_sched sched,
				436	unsigned long pid, const char *comm)
				437	{
				438	struct task_desc *task;
				439	static int pid_max;
				440
				441	if (sched->pid_to_task == NULL) {
				442	if (sysctl__read_int("kernel/pid_max", &pid_max) < 0)
				443	pid_max = MAX_PID;
				444	BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL);
				445	}
				446	if (pid >= (unsigned long)pid_max) {
				447	BUG_ON((sched->pid_to_task = realloc(sched->pid_to_task, (pid + 1) *
				448	sizeof(struct task_desc *))) == NULL);
				449	while (pid >= (unsigned long)pid_max)
				450	sched->pid_to_task[pid_max++] = NULL;
				451	}
				452
				453	task = sched->pid_to_task[pid];
				454
				455	if (task)
				456	return task;
				457
				458	task = zalloc(sizeof(*task));
				459	task->pid = pid;
				460	task->nr = sched->nr_tasks;
				461	strcpy(task->comm, comm);
				462	/*
				463	* every task starts in sleeping state - this gets ignored
				464	* if there's no wakeup pointing to this sleep state:
				465	*/
				466	add_sched_event_sleep(sched, task, 0, 0);
				467
				468	sched->pid_to_task[pid] = task;
				469	sched->nr_tasks++;
				470	sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
				471	BUG_ON(!sched->tasks);
				472	sched->tasks[task->nr] = task;
				473
				474	if (verbose > 0)
				475	printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);
				476
				477	return task;
				478	}
				479
				480
				481	static void print_task_traces(struct perf_sched *sched)
				482	{
				483	struct task_desc *task;
				484	unsigned long i;
				485
				486	for (i = 0; i < sched->nr_tasks; i++) {
				487	task = sched->tasks[i];
				488	printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
				489	task->nr, task->comm, task->pid, task->nr_events);
				490	}
				491	}
				492
				493	static void add_cross_task_wakeups(struct perf_sched *sched)
				494	{
				495	struct task_desc task1, task2;
				496	unsigned long i, j;
				497
				498	for (i = 0; i < sched->nr_tasks; i++) {
				499	task1 = sched->tasks[i];
				500	j = i + 1;
				501	if (j == sched->nr_tasks)
				502	j = 0;
				503	task2 = sched->tasks[j];
				504	add_sched_event_wakeup(sched, task1, 0, task2);
				505	}
				506	}
				507
				508	static void perf_sched__process_event(struct perf_sched *sched,
				509	struct sched_atom *atom)
				510	{
				511	int ret = 0;
				512
				513	switch (atom->type) {
				514	case SCHED_EVENT_RUN:
				515	burn_nsecs(sched, atom->duration);
				516	break;
				517	case SCHED_EVENT_SLEEP:
				518	if (atom->wait_sem)
				519	ret = sem_wait(atom->wait_sem);
				520	BUG_ON(ret);
				521	break;
				522	case SCHED_EVENT_WAKEUP:
				523	if (atom->wait_sem)
				524	ret = sem_post(atom->wait_sem);
				525	BUG_ON(ret);
				526	break;
				527	case SCHED_EVENT_MIGRATION:
				528	break;
				529	default:
				530	BUG_ON(1);
				531	}
				532	}
				533
				534	static u64 get_cpu_usage_nsec_parent(void)
				535	{
				536	struct rusage ru;
				537	u64 sum;
				538	int err;
				539
				540	err = getrusage(RUSAGE_SELF, &ru);
				541	BUG_ON(err);
				542
				543	sum = ru.ru_utime.tv_sec * NSEC_PER_SEC + ru.ru_utime.tv_usec * NSEC_PER_USEC;
				544	sum += ru.ru_stime.tv_sec * NSEC_PER_SEC + ru.ru_stime.tv_usec * NSEC_PER_USEC;
				545
				546	return sum;
				547	}
				548
				549	static int self_open_counters(struct perf_sched *sched, unsigned long cur_task)
				550	{
				551	struct perf_event_attr attr;
				552	char sbuf[STRERR_BUFSIZE], info[STRERR_BUFSIZE];
				553	int fd;
				554	struct rlimit limit;
				555	bool need_privilege = false;
				556
				557	memset(&attr, 0, sizeof(attr));
				558
				559	attr.type = PERF_TYPE_SOFTWARE;
				560	attr.config = PERF_COUNT_SW_TASK_CLOCK;
				561
				562	force_again:
				563	fd = sys_perf_event_open(&attr, 0, -1, -1,
				564	perf_event_open_cloexec_flag());
				565
				566	if (fd < 0) {
				567	if (errno == EMFILE) {
				568	if (sched->force) {
				569	BUG_ON(getrlimit(RLIMIT_NOFILE, &limit) == -1);
				570	limit.rlim_cur += sched->nr_tasks - cur_task;
				571	if (limit.rlim_cur > limit.rlim_max) {
				572	limit.rlim_max = limit.rlim_cur;
				573	need_privilege = true;
				574	}
				575	if (setrlimit(RLIMIT_NOFILE, &limit) == -1) {
				576	if (need_privilege && errno == EPERM)
				577	strcpy(info, "Need privilege\n");
				578	} else
				579	goto force_again;
				580	} else
				581	strcpy(info, "Have a try with -f option\n");
				582	}
				583	pr_err("Error: sys_perf_event_open() syscall returned "
				584	"with %d (%s)\n%s", fd,
				585	str_error_r(errno, sbuf, sizeof(sbuf)), info);
				586	exit(EXIT_FAILURE);
				587	}
				588	return fd;
				589	}
				590
				591	static u64 get_cpu_usage_nsec_self(int fd)
				592	{
				593	u64 runtime;
				594	int ret;
				595
				596	ret = read(fd, &runtime, sizeof(runtime));
				597	BUG_ON(ret != sizeof(runtime));
				598
				599	return runtime;
				600	}
				601
				602	struct sched_thread_parms {
				603	struct task_desc *task;
				604	struct perf_sched *sched;
				605	int fd;
				606	};
				607
				608	static void thread_func(void ctx)
				609	{
				610	struct sched_thread_parms *parms = ctx;
				611	struct task_desc *this_task = parms->task;
				612	struct perf_sched *sched = parms->sched;
				613	u64 cpu_usage_0, cpu_usage_1;
				614	unsigned long i, ret;
				615	char comm2[22];
				616	int fd = parms->fd;
				617
				618	zfree(&parms);
				619
				620	sprintf(comm2, ":%s", this_task->comm);
				621	prctl(PR_SET_NAME, comm2);
				622	if (fd < 0)
				623	return NULL;
				624	again:
				625	ret = sem_post(&this_task->ready_for_work);
				626	BUG_ON(ret);
				627	ret = pthread_mutex_lock(&sched->start_work_mutex);
				628	BUG_ON(ret);
				629	ret = pthread_mutex_unlock(&sched->start_work_mutex);
				630	BUG_ON(ret);
				631
				632	cpu_usage_0 = get_cpu_usage_nsec_self(fd);
				633
				634	for (i = 0; i < this_task->nr_events; i++) {
				635	this_task->curr_event = i;
				636	perf_sched__process_event(sched, this_task->atoms[i]);
				637	}
				638
				639	cpu_usage_1 = get_cpu_usage_nsec_self(fd);
				640	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
				641	ret = sem_post(&this_task->work_done_sem);
				642	BUG_ON(ret);
				643
				644	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
				645	BUG_ON(ret);
				646	ret = pthread_mutex_unlock(&sched->work_done_wait_mutex);
				647	BUG_ON(ret);
				648
				649	goto again;
				650	}
				651
				652	static void create_tasks(struct perf_sched *sched)
				653	{
				654	struct task_desc *task;
				655	pthread_attr_t attr;
				656	unsigned long i;
				657	int err;
				658
				659	err = pthread_attr_init(&attr);
				660	BUG_ON(err);
				661	err = pthread_attr_setstacksize(&attr,
				662	(size_t) max(16 * 1024, PTHREAD_STACK_MIN));
				663	BUG_ON(err);
				664	err = pthread_mutex_lock(&sched->start_work_mutex);
				665	BUG_ON(err);
				666	err = pthread_mutex_lock(&sched->work_done_wait_mutex);
				667	BUG_ON(err);
				668	for (i = 0; i < sched->nr_tasks; i++) {
				669	struct sched_thread_parms parms = malloc(sizeof(parms));
				670	BUG_ON(parms == NULL);
				671	parms->task = task = sched->tasks[i];
				672	parms->sched = sched;
				673	parms->fd = self_open_counters(sched, i);
				674	sem_init(&task->sleep_sem, 0, 0);
				675	sem_init(&task->ready_for_work, 0, 0);
				676	sem_init(&task->work_done_sem, 0, 0);
				677	task->curr_event = 0;
				678	err = pthread_create(&task->thread, &attr, thread_func, parms);
				679	BUG_ON(err);
				680	}
				681	}
				682
				683	static void wait_for_tasks(struct perf_sched *sched)
				684	{
				685	u64 cpu_usage_0, cpu_usage_1;
				686	struct task_desc *task;
				687	unsigned long i, ret;
				688
				689	sched->start_time = get_nsecs();
				690	sched->cpu_usage = 0;
				691	pthread_mutex_unlock(&sched->work_done_wait_mutex);
				692
				693	for (i = 0; i < sched->nr_tasks; i++) {
				694	task = sched->tasks[i];
				695	ret = sem_wait(&task->ready_for_work);
				696	BUG_ON(ret);
				697	sem_init(&task->ready_for_work, 0, 0);
				698	}
				699	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
				700	BUG_ON(ret);
				701
				702	cpu_usage_0 = get_cpu_usage_nsec_parent();
				703
				704	pthread_mutex_unlock(&sched->start_work_mutex);
				705
				706	for (i = 0; i < sched->nr_tasks; i++) {
				707	task = sched->tasks[i];
				708	ret = sem_wait(&task->work_done_sem);
				709	BUG_ON(ret);
				710	sem_init(&task->work_done_sem, 0, 0);
				711	sched->cpu_usage += task->cpu_usage;
				712	task->cpu_usage = 0;
				713	}
				714
				715	cpu_usage_1 = get_cpu_usage_nsec_parent();
				716	if (!sched->runavg_cpu_usage)
				717	sched->runavg_cpu_usage = sched->cpu_usage;
				718	sched->runavg_cpu_usage = (sched->runavg_cpu_usage * (sched->replay_repeat - 1) + sched->cpu_usage) / sched->replay_repeat;
				719
				720	sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
				721	if (!sched->runavg_parent_cpu_usage)
				722	sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
				723	sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * (sched->replay_repeat - 1) +
				724	sched->parent_cpu_usage)/sched->replay_repeat;
				725
				726	ret = pthread_mutex_lock(&sched->start_work_mutex);
				727	BUG_ON(ret);
				728
				729	for (i = 0; i < sched->nr_tasks; i++) {
				730	task = sched->tasks[i];
				731	sem_init(&task->sleep_sem, 0, 0);
				732	task->curr_event = 0;
				733	}
				734	}
				735
				736	static void run_one_test(struct perf_sched *sched)
				737	{
				738	u64 T0, T1, delta, avg_delta, fluct;
				739
				740	T0 = get_nsecs();
				741	wait_for_tasks(sched);
				742	T1 = get_nsecs();
				743
				744	delta = T1 - T0;
				745	sched->sum_runtime += delta;
				746	sched->nr_runs++;
				747
				748	avg_delta = sched->sum_runtime / sched->nr_runs;
				749	if (delta < avg_delta)
				750	fluct = avg_delta - delta;
				751	else
				752	fluct = delta - avg_delta;
				753	sched->sum_fluct += fluct;
				754	if (!sched->run_avg)
				755	sched->run_avg = delta;
				756	sched->run_avg = (sched->run_avg * (sched->replay_repeat - 1) + delta) / sched->replay_repeat;
				757
				758	printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / NSEC_PER_MSEC);
				759
				760	printf("ravg: %0.2f, ", (double)sched->run_avg / NSEC_PER_MSEC);
				761
				762	printf("cpu: %0.2f / %0.2f",
				763	(double)sched->cpu_usage / NSEC_PER_MSEC, (double)sched->runavg_cpu_usage / NSEC_PER_MSEC);
				764
				765	#if 0
				766	/*
				767	* rusage statistics done by the parent, these are less
				768	* accurate than the sched->sum_exec_runtime based statistics:
				769	*/
				770	printf(" [%0.2f / %0.2f]",
				771	(double)sched->parent_cpu_usage / NSEC_PER_MSEC,
				772	(double)sched->runavg_parent_cpu_usage / NSEC_PER_MSEC);
				773	#endif
				774
				775	printf("\n");
				776
				777	if (sched->nr_sleep_corrections)
				778	printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
				779	sched->nr_sleep_corrections = 0;
				780	}
				781
				782	static void test_calibrations(struct perf_sched *sched)
				783	{
				784	u64 T0, T1;
				785
				786	T0 = get_nsecs();
				787	burn_nsecs(sched, NSEC_PER_MSEC);
				788	T1 = get_nsecs();
				789
				790	printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);
				791
				792	T0 = get_nsecs();
				793	sleep_nsecs(NSEC_PER_MSEC);
				794	T1 = get_nsecs();
				795
				796	printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
				797	}
				798
				799	static int
				800	replay_wakeup_event(struct perf_sched *sched,
				801	struct perf_evsel evsel, struct perf_sample sample,
				802	struct machine *machine __maybe_unused)
				803	{
				804	const char *comm = perf_evsel__strval(evsel, sample, "comm");
				805	const u32 pid = perf_evsel__intval(evsel, sample, "pid");
				806	struct task_desc waker, wakee;
				807
				808	if (verbose > 0) {
				809	printf("sched_wakeup event %p\n", evsel);
				810
				811	printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
				812	}
				813
				814	waker = register_pid(sched, sample->tid, "<unknown>");
				815	wakee = register_pid(sched, pid, comm);
				816
				817	add_sched_event_wakeup(sched, waker, sample->time, wakee);
				818	return 0;
				819	}
				820
				821	static int replay_switch_event(struct perf_sched *sched,
				822	struct perf_evsel *evsel,
				823	struct perf_sample *sample,
				824	struct machine *machine __maybe_unused)
				825	{
				826	const char *prev_comm = perf_evsel__strval(evsel, sample, "prev_comm"),
				827	*next_comm = perf_evsel__strval(evsel, sample, "next_comm");
				828	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
				829	next_pid = perf_evsel__intval(evsel, sample, "next_pid");
				830	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
				831	struct task_desc prev, __maybe_unused next;
				832	u64 timestamp0, timestamp = sample->time;
				833	int cpu = sample->cpu;
				834	s64 delta;
				835
				836	if (verbose > 0)
				837	printf("sched_switch event %p\n", evsel);
				838
				839	if (cpu >= MAX_CPUS \|\| cpu < 0)
				840	return 0;
				841
				842	timestamp0 = sched->cpu_last_switched[cpu];
				843	if (timestamp0)
				844	delta = timestamp - timestamp0;
				845	else
				846	delta = 0;
				847
				848	if (delta < 0) {
				849	pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
				850	return -1;
				851	}
				852
				853	pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
				854	prev_comm, prev_pid, next_comm, next_pid, delta);
				855
				856	prev = register_pid(sched, prev_pid, prev_comm);
				857	next = register_pid(sched, next_pid, next_comm);
				858
				859	sched->cpu_last_switched[cpu] = timestamp;
				860
				861	add_sched_event_run(sched, prev, timestamp, delta);
				862	add_sched_event_sleep(sched, prev, timestamp, prev_state);
				863
				864	return 0;
				865	}
				866
				867	static int replay_fork_event(struct perf_sched *sched,
				868	union perf_event *event,
				869	struct machine *machine)
				870	{
				871	struct thread child, parent;
				872
				873	child = machine__findnew_thread(machine, event->fork.pid,
				874	event->fork.tid);
				875	parent = machine__findnew_thread(machine, event->fork.ppid,
				876	event->fork.ptid);
				877
				878	if (child == NULL \|\| parent == NULL) {
				879	pr_debug("thread does not exist on fork event: child %p, parent %p\n",
				880	child, parent);
				881	goto out_put;
				882	}
				883
				884	if (verbose > 0) {
				885	printf("fork event\n");
				886	printf("... parent: %s/%d\n", thread__comm_str(parent), parent->tid);
				887	printf("... child: %s/%d\n", thread__comm_str(child), child->tid);
				888	}
				889
				890	register_pid(sched, parent->tid, thread__comm_str(parent));
				891	register_pid(sched, child->tid, thread__comm_str(child));
				892	out_put:
				893	thread__put(child);
				894	thread__put(parent);
				895	return 0;
				896	}
				897
				898	struct sort_dimension {
				899	const char *name;
				900	sort_fn_t cmp;
				901	struct list_head list;
				902	};
				903
				904	/*
				905	* handle runtime stats saved per thread
				906	*/
				907	static struct thread_runtime thread__init_runtime(struct thread thread)
				908	{
				909	struct thread_runtime *r;
				910
				911	r = zalloc(sizeof(struct thread_runtime));
				912	if (!r)
				913	return NULL;
				914
				915	init_stats(&r->run_stats);
				916	thread__set_priv(thread, r);
				917
				918	return r;
				919	}
				920
				921	static struct thread_runtime thread__get_runtime(struct thread thread)
				922	{
				923	struct thread_runtime *tr;
				924
				925	tr = thread__priv(thread);
				926	if (tr == NULL) {
				927	tr = thread__init_runtime(thread);
				928	if (tr == NULL)
				929	pr_debug("Failed to malloc memory for runtime data.\n");
				930	}
				931
				932	return tr;
				933	}
				934
				935	static int
				936	thread_lat_cmp(struct list_head list, struct work_atoms l, struct work_atoms *r)
				937	{
				938	struct sort_dimension *sort;
				939	int ret = 0;
				940
				941	BUG_ON(list_empty(list));
				942
				943	list_for_each_entry(sort, list, list) {
				944	ret = sort->cmp(l, r);
				945	if (ret)
				946	return ret;
				947	}
				948
				949	return ret;
				950	}
				951
				952	static struct work_atoms *
				953	thread_atoms_search(struct rb_root root, struct thread thread,
				954	struct list_head *sort_list)
				955	{
				956	struct rb_node *node = root->rb_node;
				957	struct work_atoms key = { .thread = thread };
				958
				959	while (node) {
				960	struct work_atoms *atoms;
				961	int cmp;
				962
				963	atoms = container_of(node, struct work_atoms, node);
				964
				965	cmp = thread_lat_cmp(sort_list, &key, atoms);
				966	if (cmp > 0)
				967	node = node->rb_left;
				968	else if (cmp < 0)
				969	node = node->rb_right;
				970	else {
				971	BUG_ON(thread != atoms->thread);
				972	return atoms;
				973	}
				974	}
				975	return NULL;
				976	}
				977
				978	static void
				979	__thread_latency_insert(struct rb_root root, struct work_atoms data,
				980	struct list_head *sort_list)
				981	{
				982	struct rb_node *new = &(root->rb_node), parent = NULL;
				983
				984	while (*new) {
				985	struct work_atoms *this;
				986	int cmp;
				987
				988	this = container_of(*new, struct work_atoms, node);
				989	parent = *new;
				990
				991	cmp = thread_lat_cmp(sort_list, data, this);
				992
				993	if (cmp > 0)
				994	new = &((*new)->rb_left);
				995	else
				996	new = &((*new)->rb_right);
				997	}
				998
				999	rb_link_node(&data->node, parent, new);
				1000	rb_insert_color(&data->node, root);
				1001	}
				1002
				1003	static int thread_atoms_insert(struct perf_sched sched, struct thread thread)
				1004	{
				1005	struct work_atoms atoms = zalloc(sizeof(atoms));
				1006	if (!atoms) {
				1007	pr_err("No memory at %s\n", __func__);
				1008	return -1;
				1009	}
				1010
				1011	atoms->thread = thread__get(thread);
				1012	INIT_LIST_HEAD(&atoms->work_list);
				1013	__thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
				1014	return 0;
				1015	}
				1016
				1017	static char sched_out_state(u64 prev_state)
				1018	{
				1019	const char *str = TASK_STATE_TO_CHAR_STR;
				1020
				1021	return str[prev_state];
				1022	}
				1023
				1024	static int
				1025	add_sched_out_event(struct work_atoms *atoms,
				1026	char run_state,
				1027	u64 timestamp)
				1028	{
				1029	struct work_atom atom = zalloc(sizeof(atom));
				1030	if (!atom) {
				1031	pr_err("Non memory at %s", __func__);
				1032	return -1;
				1033	}
				1034
				1035	atom->sched_out_time = timestamp;
				1036
				1037	if (run_state == 'R') {
				1038	atom->state = THREAD_WAIT_CPU;
				1039	atom->wake_up_time = atom->sched_out_time;
				1040	}
				1041
				1042	list_add_tail(&atom->list, &atoms->work_list);
				1043	return 0;
				1044	}
				1045
				1046	static void
				1047	add_runtime_event(struct work_atoms *atoms, u64 delta,
				1048	u64 timestamp __maybe_unused)
				1049	{
				1050	struct work_atom *atom;
				1051
				1052	BUG_ON(list_empty(&atoms->work_list));
				1053
				1054	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
				1055
				1056	atom->runtime += delta;
				1057	atoms->total_runtime += delta;
				1058	}
				1059
				1060	static void
				1061	add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
				1062	{
				1063	struct work_atom *atom;
				1064	u64 delta;
				1065
				1066	if (list_empty(&atoms->work_list))
				1067	return;
				1068
				1069	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
				1070
				1071	if (atom->state != THREAD_WAIT_CPU)
				1072	return;
				1073
				1074	if (timestamp < atom->wake_up_time) {
				1075	atom->state = THREAD_IGNORE;
				1076	return;
				1077	}
				1078
				1079	atom->state = THREAD_SCHED_IN;
				1080	atom->sched_in_time = timestamp;
				1081
				1082	delta = atom->sched_in_time - atom->wake_up_time;
				1083	atoms->total_lat += delta;
				1084	if (delta > atoms->max_lat) {
				1085	atoms->max_lat = delta;
				1086	atoms->max_lat_at = timestamp;
				1087	}
				1088	atoms->nb_atoms++;
				1089	}
				1090
				1091	static int latency_switch_event(struct perf_sched *sched,
				1092	struct perf_evsel *evsel,
				1093	struct perf_sample *sample,
				1094	struct machine *machine)
				1095	{
				1096	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
				1097	next_pid = perf_evsel__intval(evsel, sample, "next_pid");
				1098	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
				1099	struct work_atoms out_events, in_events;
				1100	struct thread sched_out, sched_in;
				1101	u64 timestamp0, timestamp = sample->time;
				1102	int cpu = sample->cpu, err = -1;
				1103	s64 delta;
				1104
				1105	BUG_ON(cpu >= MAX_CPUS \|\| cpu < 0);
				1106
				1107	timestamp0 = sched->cpu_last_switched[cpu];
				1108	sched->cpu_last_switched[cpu] = timestamp;
				1109	if (timestamp0)
				1110	delta = timestamp - timestamp0;
				1111	else
				1112	delta = 0;
				1113
				1114	if (delta < 0) {
				1115	pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
				1116	return -1;
				1117	}
				1118
				1119	sched_out = machine__findnew_thread(machine, -1, prev_pid);
				1120	sched_in = machine__findnew_thread(machine, -1, next_pid);
				1121	if (sched_out == NULL \|\| sched_in == NULL)
				1122	goto out_put;
				1123
				1124	out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
				1125	if (!out_events) {
				1126	if (thread_atoms_insert(sched, sched_out))
				1127	goto out_put;
				1128	out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
				1129	if (!out_events) {
				1130	pr_err("out-event: Internal tree error");
				1131	goto out_put;
				1132	}
				1133	}
				1134	if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
				1135	return -1;
				1136
				1137	in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
				1138	if (!in_events) {
				1139	if (thread_atoms_insert(sched, sched_in))
				1140	goto out_put;
				1141	in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
				1142	if (!in_events) {
				1143	pr_err("in-event: Internal tree error");
				1144	goto out_put;
				1145	}
				1146	/*
				1147	* Take came in we have not heard about yet,
				1148	* add in an initial atom in runnable state:
				1149	*/
				1150	if (add_sched_out_event(in_events, 'R', timestamp))
				1151	goto out_put;
				1152	}
				1153	add_sched_in_event(in_events, timestamp);
				1154	err = 0;
				1155	out_put:
				1156	thread__put(sched_out);
				1157	thread__put(sched_in);
				1158	return err;
				1159	}
				1160
				1161	static int latency_runtime_event(struct perf_sched *sched,
				1162	struct perf_evsel *evsel,
				1163	struct perf_sample *sample,
				1164	struct machine *machine)
				1165	{
				1166	const u32 pid = perf_evsel__intval(evsel, sample, "pid");
				1167	const u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
				1168	struct thread *thread = machine__findnew_thread(machine, -1, pid);
				1169	struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
				1170	u64 timestamp = sample->time;
				1171	int cpu = sample->cpu, err = -1;
				1172
				1173	if (thread == NULL)
				1174	return -1;
				1175
				1176	BUG_ON(cpu >= MAX_CPUS \|\| cpu < 0);
				1177	if (!atoms) {
				1178	if (thread_atoms_insert(sched, thread))
				1179	goto out_put;
				1180	atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
				1181	if (!atoms) {
				1182	pr_err("in-event: Internal tree error");
				1183	goto out_put;
				1184	}
				1185	if (add_sched_out_event(atoms, 'R', timestamp))
				1186	goto out_put;
				1187	}
				1188
				1189	add_runtime_event(atoms, runtime, timestamp);
				1190	err = 0;
				1191	out_put:
				1192	thread__put(thread);
				1193	return err;
				1194	}
				1195
				1196	static int latency_wakeup_event(struct perf_sched *sched,
				1197	struct perf_evsel *evsel,
				1198	struct perf_sample *sample,
				1199	struct machine *machine)
				1200	{
				1201	const u32 pid = perf_evsel__intval(evsel, sample, "pid");
				1202	struct work_atoms *atoms;
				1203	struct work_atom *atom;
				1204	struct thread *wakee;
				1205	u64 timestamp = sample->time;
				1206	int err = -1;
				1207
				1208	wakee = machine__findnew_thread(machine, -1, pid);
				1209	if (wakee == NULL)
				1210	return -1;
				1211	atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
				1212	if (!atoms) {
				1213	if (thread_atoms_insert(sched, wakee))
				1214	goto out_put;
				1215	atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
				1216	if (!atoms) {
				1217	pr_err("wakeup-event: Internal tree error");
				1218	goto out_put;
				1219	}
				1220	if (add_sched_out_event(atoms, 'S', timestamp))
				1221	goto out_put;
				1222	}
				1223
				1224	BUG_ON(list_empty(&atoms->work_list));
				1225
				1226	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
				1227
				1228	/*
				1229	* As we do not guarantee the wakeup event happens when
				1230	* task is out of run queue, also may happen when task is
				1231	* on run queue and wakeup only change ->state to TASK_RUNNING,
				1232	* then we should not set the ->wake_up_time when wake up a
				1233	* task which is on run queue.
				1234	*
				1235	* You WILL be missing events if you've recorded only
				1236	* one CPU, or are only looking at only one, so don't
				1237	* skip in this case.
				1238	*/
				1239	if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
				1240	goto out_ok;
				1241
				1242	sched->nr_timestamps++;
				1243	if (atom->sched_out_time > timestamp) {
				1244	sched->nr_unordered_timestamps++;
				1245	goto out_ok;
				1246	}
				1247
				1248	atom->state = THREAD_WAIT_CPU;
				1249	atom->wake_up_time = timestamp;
				1250	out_ok:
				1251	err = 0;
				1252	out_put:
				1253	thread__put(wakee);
				1254	return err;
				1255	}
				1256
				1257	static int latency_migrate_task_event(struct perf_sched *sched,
				1258	struct perf_evsel *evsel,
				1259	struct perf_sample *sample,
				1260	struct machine *machine)
				1261	{
				1262	const u32 pid = perf_evsel__intval(evsel, sample, "pid");
				1263	u64 timestamp = sample->time;
				1264	struct work_atoms *atoms;
				1265	struct work_atom *atom;
				1266	struct thread *migrant;
				1267	int err = -1;
				1268
				1269	/*
				1270	* Only need to worry about migration when profiling one CPU.
				1271	*/
				1272	if (sched->profile_cpu == -1)
				1273	return 0;
				1274
				1275	migrant = machine__findnew_thread(machine, -1, pid);
				1276	if (migrant == NULL)
				1277	return -1;
				1278	atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
				1279	if (!atoms) {
				1280	if (thread_atoms_insert(sched, migrant))
				1281	goto out_put;
				1282	register_pid(sched, migrant->tid, thread__comm_str(migrant));
				1283	atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
				1284	if (!atoms) {
				1285	pr_err("migration-event: Internal tree error");
				1286	goto out_put;
				1287	}
				1288	if (add_sched_out_event(atoms, 'R', timestamp))
				1289	goto out_put;
				1290	}
				1291
				1292	BUG_ON(list_empty(&atoms->work_list));
				1293
				1294	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
				1295	atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;
				1296
				1297	sched->nr_timestamps++;
				1298
				1299	if (atom->sched_out_time > timestamp)
				1300	sched->nr_unordered_timestamps++;
				1301	err = 0;
				1302	out_put:
				1303	thread__put(migrant);
				1304	return err;
				1305	}
				1306
				1307	static void output_lat_thread(struct perf_sched sched, struct work_atoms work_list)
				1308	{
				1309	int i;
				1310	int ret;
				1311	u64 avg;
				1312	char max_lat_at[32];
				1313
				1314	if (!work_list->nb_atoms)
				1315	return;
				1316	/*
				1317	* Ignore idle threads:
				1318	*/
				1319	if (!strcmp(thread__comm_str(work_list->thread), "swapper"))
				1320	return;
				1321
				1322	sched->all_runtime += work_list->total_runtime;
				1323	sched->all_count += work_list->nb_atoms;
				1324
				1325	if (work_list->num_merged > 1)
				1326	ret = printf(" %s:(%d) ", thread__comm_str(work_list->thread), work_list->num_merged);
				1327	else
				1328	ret = printf(" %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);
				1329
				1330	for (i = 0; i < 24 - ret; i++)
				1331	printf(" ");
				1332
				1333	avg = work_list->total_lat / work_list->nb_atoms;
				1334	timestamp__scnprintf_usec(work_list->max_lat_at, max_lat_at, sizeof(max_lat_at));
				1335
				1336	printf("\|%11.3f ms \|%9" PRIu64 " \| avg:%9.3f ms \| max:%9.3f ms \| max at: %13s s\n",
				1337	(double)work_list->total_runtime / NSEC_PER_MSEC,
				1338	work_list->nb_atoms, (double)avg / NSEC_PER_MSEC,
				1339	(double)work_list->max_lat / NSEC_PER_MSEC,
				1340	max_lat_at);
				1341	}
				1342
				1343	static int pid_cmp(struct work_atoms l, struct work_atoms r)
				1344	{
				1345	if (l->thread == r->thread)
				1346	return 0;
				1347	if (l->thread->tid < r->thread->tid)
				1348	return -1;
				1349	if (l->thread->tid > r->thread->tid)
				1350	return 1;
				1351	return (int)(l->thread - r->thread);
				1352	}
				1353
				1354	static int avg_cmp(struct work_atoms l, struct work_atoms r)
				1355	{
				1356	u64 avgl, avgr;
				1357
				1358	if (!l->nb_atoms)
				1359	return -1;
				1360
				1361	if (!r->nb_atoms)
				1362	return 1;
				1363
				1364	avgl = l->total_lat / l->nb_atoms;
				1365	avgr = r->total_lat / r->nb_atoms;
				1366
				1367	if (avgl < avgr)
				1368	return -1;
				1369	if (avgl > avgr)
				1370	return 1;
				1371
				1372	return 0;
				1373	}
				1374
				1375	static int max_cmp(struct work_atoms l, struct work_atoms r)
				1376	{
				1377	if (l->max_lat < r->max_lat)
				1378	return -1;
				1379	if (l->max_lat > r->max_lat)
				1380	return 1;
				1381
				1382	return 0;
				1383	}
				1384
				1385	static int switch_cmp(struct work_atoms l, struct work_atoms r)
				1386	{
				1387	if (l->nb_atoms < r->nb_atoms)
				1388	return -1;
				1389	if (l->nb_atoms > r->nb_atoms)
				1390	return 1;
				1391
				1392	return 0;
				1393	}
				1394
				1395	static int runtime_cmp(struct work_atoms l, struct work_atoms r)
				1396	{
				1397	if (l->total_runtime < r->total_runtime)
				1398	return -1;
				1399	if (l->total_runtime > r->total_runtime)
				1400	return 1;
				1401
				1402	return 0;
				1403	}
				1404
				1405	static int sort_dimension__add(const char tok, struct list_head list)
				1406	{
				1407	size_t i;
				1408	static struct sort_dimension avg_sort_dimension = {
				1409	.name = "avg",
				1410	.cmp = avg_cmp,
				1411	};
				1412	static struct sort_dimension max_sort_dimension = {
				1413	.name = "max",
				1414	.cmp = max_cmp,
				1415	};
				1416	static struct sort_dimension pid_sort_dimension = {
				1417	.name = "pid",
				1418	.cmp = pid_cmp,
				1419	};
				1420	static struct sort_dimension runtime_sort_dimension = {
				1421	.name = "runtime",
				1422	.cmp = runtime_cmp,
				1423	};
				1424	static struct sort_dimension switch_sort_dimension = {
				1425	.name = "switch",
				1426	.cmp = switch_cmp,
				1427	};
				1428	struct sort_dimension *available_sorts[] = {
				1429	&pid_sort_dimension,
				1430	&avg_sort_dimension,
				1431	&max_sort_dimension,
				1432	&switch_sort_dimension,
				1433	&runtime_sort_dimension,
				1434	};
				1435
				1436	for (i = 0; i < ARRAY_SIZE(available_sorts); i++) {
				1437	if (!strcmp(available_sorts[i]->name, tok)) {
				1438	list_add_tail(&available_sorts[i]->list, list);
				1439
				1440	return 0;
				1441	}
				1442	}
				1443
				1444	return -1;
				1445	}
				1446
				1447	static void perf_sched__sort_lat(struct perf_sched *sched)
				1448	{
				1449	struct rb_node *node;
				1450	struct rb_root *root = &sched->atom_root;
				1451	again:
				1452	for (;;) {
				1453	struct work_atoms *data;
				1454	node = rb_first(root);
				1455	if (!node)
				1456	break;
				1457
				1458	rb_erase(node, root);
				1459	data = rb_entry(node, struct work_atoms, node);
				1460	__thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
				1461	}
				1462	if (root == &sched->atom_root) {
				1463	root = &sched->merged_atom_root;
				1464	goto again;
				1465	}
				1466	}
				1467
				1468	static int process_sched_wakeup_event(struct perf_tool *tool,
				1469	struct perf_evsel *evsel,
				1470	struct perf_sample *sample,
				1471	struct machine *machine)
				1472	{
				1473	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
				1474
				1475	if (sched->tp_handler->wakeup_event)
				1476	return sched->tp_handler->wakeup_event(sched, evsel, sample, machine);
				1477
				1478	return 0;
				1479	}
				1480
				1481	union map_priv {
				1482	void *ptr;
				1483	bool color;
				1484	};
				1485
				1486	static bool thread__has_color(struct thread *thread)
				1487	{
				1488	union map_priv priv = {
				1489	.ptr = thread__priv(thread),
				1490	};
				1491
				1492	return priv.color;
				1493	}
				1494
				1495	static struct thread*
				1496	map__findnew_thread(struct perf_sched sched, struct machine machine, pid_t pid, pid_t tid)
				1497	{
				1498	struct thread *thread = machine__findnew_thread(machine, pid, tid);
				1499	union map_priv priv = {
				1500	.color = false,
				1501	};
				1502
				1503	if (!sched->map.color_pids \|\| !thread \|\| thread__priv(thread))
				1504	return thread;
				1505
				1506	if (thread_map__has(sched->map.color_pids, tid))
				1507	priv.color = true;
				1508
				1509	thread__set_priv(thread, priv.ptr);
				1510	return thread;
				1511	}
				1512
				1513	static int map_switch_event(struct perf_sched sched, struct perf_evsel evsel,
				1514	struct perf_sample sample, struct machine machine)
				1515	{
				1516	const u32 next_pid = perf_evsel__intval(evsel, sample, "next_pid");
				1517	struct thread *sched_in;
				1518	struct thread_runtime *tr;
				1519	int new_shortname;
				1520	u64 timestamp0, timestamp = sample->time;
				1521	s64 delta;
				1522	int i, this_cpu = sample->cpu;
				1523	int cpus_nr;
				1524	bool new_cpu = false;
				1525	const char *color = PERF_COLOR_NORMAL;
				1526	char stimestamp[32];
				1527
				1528	BUG_ON(this_cpu >= MAX_CPUS \|\| this_cpu < 0);
				1529
				1530	if (this_cpu > sched->max_cpu)
				1531	sched->max_cpu = this_cpu;
				1532
				1533	if (sched->map.comp) {
				1534	cpus_nr = bitmap_weight(sched->map.comp_cpus_mask, MAX_CPUS);
				1535	if (!test_and_set_bit(this_cpu, sched->map.comp_cpus_mask)) {
				1536	sched->map.comp_cpus[cpus_nr++] = this_cpu;
				1537	new_cpu = true;
				1538	}
				1539	} else
				1540	cpus_nr = sched->max_cpu;
				1541
				1542	timestamp0 = sched->cpu_last_switched[this_cpu];
				1543	sched->cpu_last_switched[this_cpu] = timestamp;
				1544	if (timestamp0)
				1545	delta = timestamp - timestamp0;
				1546	else
				1547	delta = 0;
				1548
				1549	if (delta < 0) {
				1550	pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
				1551	return -1;
				1552	}
				1553
				1554	sched_in = map__findnew_thread(sched, machine, -1, next_pid);
				1555	if (sched_in == NULL)
				1556	return -1;
				1557
				1558	tr = thread__get_runtime(sched_in);
				1559	if (tr == NULL) {
				1560	thread__put(sched_in);
				1561	return -1;
				1562	}
				1563
				1564	sched->curr_thread[this_cpu] = thread__get(sched_in);
				1565
				1566	printf(" ");
				1567
				1568	new_shortname = 0;
				1569	if (!tr->shortname[0]) {
				1570	if (!strcmp(thread__comm_str(sched_in), "swapper")) {
				1571	/*
				1572	* Don't allocate a letter-number for swapper:0
				1573	* as a shortname. Instead, we use '.' for it.
				1574	*/
				1575	tr->shortname[0] = '.';
				1576	tr->shortname[1] = ' ';
				1577	} else {
				1578	tr->shortname[0] = sched->next_shortname1;
				1579	tr->shortname[1] = sched->next_shortname2;
				1580
				1581	if (sched->next_shortname1 < 'Z') {
				1582	sched->next_shortname1++;
				1583	} else {
				1584	sched->next_shortname1 = 'A';
				1585	if (sched->next_shortname2 < '9')
				1586	sched->next_shortname2++;
				1587	else
				1588	sched->next_shortname2 = '0';
				1589	}
				1590	}
				1591	new_shortname = 1;
				1592	}
				1593
				1594	for (i = 0; i < cpus_nr; i++) {
				1595	int cpu = sched->map.comp ? sched->map.comp_cpus[i] : i;
				1596	struct thread *curr_thread = sched->curr_thread[cpu];
				1597	struct thread_runtime *curr_tr;
				1598	const char *pid_color = color;
				1599	const char *cpu_color = color;
				1600
				1601	if (curr_thread && thread__has_color(curr_thread))
				1602	pid_color = COLOR_PIDS;
				1603
				1604	if (sched->map.cpus && !cpu_map__has(sched->map.cpus, cpu))
				1605	continue;
				1606
				1607	if (sched->map.color_cpus && cpu_map__has(sched->map.color_cpus, cpu))
				1608	cpu_color = COLOR_CPUS;
				1609
				1610	if (cpu != this_cpu)
				1611	color_fprintf(stdout, color, " ");
				1612	else
				1613	color_fprintf(stdout, cpu_color, "*");
				1614
				1615	if (sched->curr_thread[cpu]) {
				1616	curr_tr = thread__get_runtime(sched->curr_thread[cpu]);
				1617	if (curr_tr == NULL) {
				1618	thread__put(sched_in);
				1619	return -1;
				1620	}
				1621	color_fprintf(stdout, pid_color, "%2s ", curr_tr->shortname);
				1622	} else
				1623	color_fprintf(stdout, color, " ");
				1624	}
				1625
				1626	if (sched->map.cpus && !cpu_map__has(sched->map.cpus, this_cpu))
				1627	goto out;
				1628
				1629	timestamp__scnprintf_usec(timestamp, stimestamp, sizeof(stimestamp));
				1630	color_fprintf(stdout, color, " %12s secs ", stimestamp);
				1631	if (new_shortname \|\| tr->comm_changed \|\| (verbose > 0 && sched_in->tid)) {
				1632	const char *pid_color = color;
				1633
				1634	if (thread__has_color(sched_in))
				1635	pid_color = COLOR_PIDS;
				1636
				1637	color_fprintf(stdout, pid_color, "%s => %s:%d",
				1638	tr->shortname, thread__comm_str(sched_in), sched_in->tid);
				1639	tr->comm_changed = false;
				1640	}
				1641
				1642	if (sched->map.comp && new_cpu)
				1643	color_fprintf(stdout, color, " (CPU %d)", this_cpu);
				1644
				1645	out:
				1646	color_fprintf(stdout, color, "\n");
				1647
				1648	thread__put(sched_in);
				1649
				1650	return 0;
				1651	}
				1652
				1653	static int process_sched_switch_event(struct perf_tool *tool,
				1654	struct perf_evsel *evsel,
				1655	struct perf_sample *sample,
				1656	struct machine *machine)
				1657	{
				1658	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
				1659	int this_cpu = sample->cpu, err = 0;
				1660	u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
				1661	next_pid = perf_evsel__intval(evsel, sample, "next_pid");
				1662
				1663	if (sched->curr_pid[this_cpu] != (u32)-1) {
				1664	/*
				1665	* Are we trying to switch away a PID that is
				1666	* not current?
				1667	*/
				1668	if (sched->curr_pid[this_cpu] != prev_pid)
				1669	sched->nr_context_switch_bugs++;
				1670	}
				1671
				1672	if (sched->tp_handler->switch_event)
				1673	err = sched->tp_handler->switch_event(sched, evsel, sample, machine);
				1674
				1675	sched->curr_pid[this_cpu] = next_pid;
				1676	return err;
				1677	}
				1678
				1679	static int process_sched_runtime_event(struct perf_tool *tool,
				1680	struct perf_evsel *evsel,
				1681	struct perf_sample *sample,
				1682	struct machine *machine)
				1683	{
				1684	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
				1685
				1686	if (sched->tp_handler->runtime_event)
				1687	return sched->tp_handler->runtime_event(sched, evsel, sample, machine);
				1688
				1689	return 0;
				1690	}
				1691
				1692	static int perf_sched__process_fork_event(struct perf_tool *tool,
				1693	union perf_event *event,
				1694	struct perf_sample *sample,
				1695	struct machine *machine)
				1696	{
				1697	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
				1698
				1699	/* run the fork event through the perf machineruy */
				1700	perf_event__process_fork(tool, event, sample, machine);
				1701
				1702	/* and then run additional processing needed for this command */
				1703	if (sched->tp_handler->fork_event)
				1704	return sched->tp_handler->fork_event(sched, event, machine);
				1705
				1706	return 0;
				1707	}
				1708
				1709	static int process_sched_migrate_task_event(struct perf_tool *tool,
				1710	struct perf_evsel *evsel,
				1711	struct perf_sample *sample,
				1712	struct machine *machine)
				1713	{
				1714	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
				1715
				1716	if (sched->tp_handler->migrate_task_event)
				1717	return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine);
				1718
				1719	return 0;
				1720	}
				1721
				1722	typedef int (tracepoint_handler)(struct perf_tool tool,
				1723	struct perf_evsel *evsel,
				1724	struct perf_sample *sample,
				1725	struct machine *machine);
				1726
				1727	static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused,
				1728	union perf_event *event __maybe_unused,
				1729	struct perf_sample *sample,
				1730	struct perf_evsel *evsel,
				1731	struct machine *machine)
				1732	{
				1733	int err = 0;
				1734
				1735	if (evsel->handler != NULL) {
				1736	tracepoint_handler f = evsel->handler;
				1737	err = f(tool, evsel, sample, machine);
				1738	}
				1739
				1740	return err;
				1741	}
				1742
				1743	static int perf_sched__process_comm(struct perf_tool *tool __maybe_unused,
				1744	union perf_event *event,
				1745	struct perf_sample *sample,
				1746	struct machine *machine)
				1747	{
				1748	struct thread *thread;
				1749	struct thread_runtime *tr;
				1750	int err;
				1751
				1752	err = perf_event__process_comm(tool, event, sample, machine);
				1753	if (err)
				1754	return err;
				1755
				1756	thread = machine__find_thread(machine, sample->pid, sample->tid);
				1757	if (!thread) {
				1758	pr_err("Internal error: can't find thread\n");
				1759	return -1;
				1760	}
				1761
				1762	tr = thread__get_runtime(thread);
				1763	if (tr == NULL) {
				1764	thread__put(thread);
				1765	return -1;
				1766	}
				1767
				1768	tr->comm_changed = true;
				1769	thread__put(thread);
				1770
				1771	return 0;
				1772	}
				1773
				1774	static int perf_sched__read_events(struct perf_sched *sched)
				1775	{
				1776	const struct perf_evsel_str_handler handlers[] = {
				1777	{ "sched:sched_switch", process_sched_switch_event, },
				1778	{ "sched:sched_stat_runtime", process_sched_runtime_event, },
				1779	{ "sched:sched_wakeup", process_sched_wakeup_event, },
				1780	{ "sched:sched_wakeup_new", process_sched_wakeup_event, },
				1781	{ "sched:sched_migrate_task", process_sched_migrate_task_event, },
				1782	};
				1783	struct perf_session *session;
				1784	struct perf_data data = {
				1785	.file = {
				1786	.path = input_name,
				1787	},
				1788	.mode = PERF_DATA_MODE_READ,
				1789	.force = sched->force,
				1790	};
				1791	int rc = -1;
				1792
				1793	session = perf_session__new(&data, false, &sched->tool);
				1794	if (session == NULL) {
				1795	pr_debug("No Memory for session\n");
				1796	return -1;
				1797	}
				1798
				1799	symbol__init(&session->header.env);
				1800
				1801	if (perf_session__set_tracepoints_handlers(session, handlers))
				1802	goto out_delete;
				1803
				1804	if (perf_session__has_traces(session, "record -R")) {
				1805	int err = perf_session__process_events(session);
				1806	if (err) {
				1807	pr_err("Failed to process events, error %d", err);
				1808	goto out_delete;
				1809	}
				1810
				1811	sched->nr_events = session->evlist->stats.nr_events[0];
				1812	sched->nr_lost_events = session->evlist->stats.total_lost;
				1813	sched->nr_lost_chunks = session->evlist->stats.nr_events[PERF_RECORD_LOST];
				1814	}
				1815
				1816	rc = 0;
				1817	out_delete:
				1818	perf_session__delete(session);
				1819	return rc;
				1820	}
				1821
				1822	/*
				1823	* scheduling times are printed as msec.usec
				1824	*/
				1825	static inline void print_sched_time(unsigned long long nsecs, int width)
				1826	{
				1827	unsigned long msecs;
				1828	unsigned long usecs;
				1829
				1830	msecs = nsecs / NSEC_PER_MSEC;
				1831	nsecs -= msecs * NSEC_PER_MSEC;
				1832	usecs = nsecs / NSEC_PER_USEC;
				1833	printf("%*lu.%03lu ", width, msecs, usecs);
				1834	}
				1835
				1836	/*
				1837	* returns runtime data for event, allocating memory for it the
				1838	* first time it is used.
				1839	*/
				1840	static struct evsel_runtime perf_evsel__get_runtime(struct perf_evsel evsel)
				1841	{
				1842	struct evsel_runtime *r = evsel->priv;
				1843
				1844	if (r == NULL) {
				1845	r = zalloc(sizeof(struct evsel_runtime));
				1846	evsel->priv = r;
				1847	}
				1848
				1849	return r;
				1850	}
				1851
				1852	/*
				1853	* save last time event was seen per cpu
				1854	*/
				1855	static void perf_evsel__save_time(struct perf_evsel *evsel,
				1856	u64 timestamp, u32 cpu)
				1857	{
				1858	struct evsel_runtime *r = perf_evsel__get_runtime(evsel);
				1859
				1860	if (r == NULL)
				1861	return;
				1862
				1863	if ((cpu >= r->ncpu) \|\| (r->last_time == NULL)) {
				1864	int i, n = __roundup_pow_of_two(cpu+1);
				1865	void *p = r->last_time;
				1866
				1867	p = realloc(r->last_time, n * sizeof(u64));
				1868	if (!p)
				1869	return;
				1870
				1871	r->last_time = p;
				1872	for (i = r->ncpu; i < n; ++i)
				1873	r->last_time[i] = (u64) 0;
				1874
				1875	r->ncpu = n;
				1876	}
				1877
				1878	r->last_time[cpu] = timestamp;
				1879	}
				1880
				1881	/* returns last time this event was seen on the given cpu */
				1882	static u64 perf_evsel__get_time(struct perf_evsel *evsel, u32 cpu)
				1883	{
				1884	struct evsel_runtime *r = perf_evsel__get_runtime(evsel);
				1885
				1886	if ((r == NULL) \|\| (r->last_time == NULL) \|\| (cpu >= r->ncpu))
				1887	return 0;
				1888
				1889	return r->last_time[cpu];
				1890	}
				1891
				1892	static int comm_width = 30;
				1893
				1894	static char timehist_get_commstr(struct thread thread)
				1895	{
				1896	static char str[32];
				1897	const char *comm = thread__comm_str(thread);
				1898	pid_t tid = thread->tid;
				1899	pid_t pid = thread->pid_;
				1900	int n;
				1901
				1902	if (pid == 0)
				1903	n = scnprintf(str, sizeof(str), "%s", comm);
				1904
				1905	else if (tid != pid)
				1906	n = scnprintf(str, sizeof(str), "%s[%d/%d]", comm, tid, pid);
				1907
				1908	else
				1909	n = scnprintf(str, sizeof(str), "%s[%d]", comm, tid);
				1910
				1911	if (n > comm_width)
				1912	comm_width = n;
				1913
				1914	return str;
				1915	}
				1916
				1917	static void timehist_header(struct perf_sched *sched)
				1918	{
				1919	u32 ncpus = sched->max_cpu + 1;
				1920	u32 i, j;
				1921
				1922	printf("%15s %6s ", "time", "cpu");
				1923
				1924	if (sched->show_cpu_visual) {
				1925	printf(" ");
				1926	for (i = 0, j = 0; i < ncpus; ++i) {
				1927	printf("%x", j++);
				1928	if (j > 15)
				1929	j = 0;
				1930	}
				1931	printf(" ");
				1932	}
				1933
				1934	printf(" %-*s %9s %9s %9s", comm_width,
				1935	"task name", "wait time", "sch delay", "run time");
				1936
				1937	if (sched->show_state)
				1938	printf(" %s", "state");
				1939
				1940	printf("\n");
				1941
				1942	/*
				1943	* units row
				1944	*/
				1945	printf("%15s %-6s ", "", "");
				1946
				1947	if (sched->show_cpu_visual)
				1948	printf(" %*s ", ncpus, "");
				1949
				1950	printf(" %-*s %9s %9s %9s", comm_width,
				1951	"[tid/pid]", "(msec)", "(msec)", "(msec)");
				1952
				1953	if (sched->show_state)
				1954	printf(" %5s", "");
				1955
				1956	printf("\n");
				1957
				1958	/*
				1959	* separator
				1960	*/
				1961	printf("%.15s %.6s ", graph_dotted_line, graph_dotted_line);
				1962
				1963	if (sched->show_cpu_visual)
				1964	printf(" %.*s ", ncpus, graph_dotted_line);
				1965
				1966	printf(" %.*s %.9s %.9s %.9s", comm_width,
				1967	graph_dotted_line, graph_dotted_line, graph_dotted_line,
				1968	graph_dotted_line);
				1969
				1970	if (sched->show_state)
				1971	printf(" %.5s", graph_dotted_line);
				1972
				1973	printf("\n");
				1974	}
				1975
				1976	static char task_state_char(struct thread *thread, int state)
				1977	{
				1978	static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
				1979	unsigned bit = state ? ffs(state) : 0;
				1980
				1981	/* 'I' for idle */
				1982	if (thread->tid == 0)
				1983	return 'I';
				1984
				1985	return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
				1986	}
				1987
				1988	static void timehist_print_sample(struct perf_sched *sched,
				1989	struct perf_evsel *evsel,
				1990	struct perf_sample *sample,
				1991	struct addr_location *al,
				1992	struct thread *thread,
				1993	u64 t, int state)
				1994	{
				1995	struct thread_runtime *tr = thread__priv(thread);
				1996	const char *next_comm = perf_evsel__strval(evsel, sample, "next_comm");
				1997	const u32 next_pid = perf_evsel__intval(evsel, sample, "next_pid");
				1998	u32 max_cpus = sched->max_cpu + 1;
				1999	char tstr[64];
				2000	char nstr[30];
				2001	u64 wait_time;
				2002
				2003	timestamp__scnprintf_usec(t, tstr, sizeof(tstr));
				2004	printf("%15s [%04d] ", tstr, sample->cpu);
				2005
				2006	if (sched->show_cpu_visual) {
				2007	u32 i;
				2008	char c;
				2009
				2010	printf(" ");
				2011	for (i = 0; i < max_cpus; ++i) {
				2012	/* flag idle times with 'i'; others are sched events */
				2013	if (i == sample->cpu)
				2014	c = (thread->tid == 0) ? 'i' : 's';
				2015	else
				2016	c = ' ';
				2017	printf("%c", c);
				2018	}
				2019	printf(" ");
				2020	}
				2021
				2022	printf(" %-*s ", comm_width, timehist_get_commstr(thread));
				2023
				2024	wait_time = tr->dt_sleep + tr->dt_iowait + tr->dt_preempt;
				2025	print_sched_time(wait_time, 6);
				2026
				2027	print_sched_time(tr->dt_delay, 6);
				2028	print_sched_time(tr->dt_run, 6);
				2029
				2030	if (sched->show_state)
				2031	printf(" %5c ", task_state_char(thread, state));
				2032
				2033	if (sched->show_next) {
				2034	snprintf(nstr, sizeof(nstr), "next: %s[%d]", next_comm, next_pid);
				2035	printf(" %-*s", comm_width, nstr);
				2036	}
				2037
				2038	if (sched->show_wakeups && !sched->show_next)
				2039	printf(" %-*s", comm_width, "");
				2040
				2041	if (thread->tid == 0)
				2042	goto out;
				2043
				2044	if (sched->show_callchain)
				2045	printf(" ");
				2046
				2047	sample__fprintf_sym(sample, al, 0,
				2048	EVSEL__PRINT_SYM \| EVSEL__PRINT_ONELINE \|
				2049	EVSEL__PRINT_CALLCHAIN_ARROW \|
				2050	EVSEL__PRINT_SKIP_IGNORED,
				2051	&callchain_cursor, stdout);
				2052
				2053	out:
				2054	printf("\n");
				2055	}
				2056
				2057	/*
				2058	* Explanation of delta-time stats:
				2059	*
				2060	* t = time of current schedule out event
				2061	* tprev = time of previous sched out event
				2062	* also time of schedule-in event for current task
				2063	* last_time = time of last sched change event for current task
				2064	* (i.e, time process was last scheduled out)
				2065	* ready_to_run = time of wakeup for current task
				2066	*
				2067	* -----\|------------\|------------\|------------\|------
				2068	* last ready tprev t
				2069	* time to run
				2070	*
				2071	* \|-------- dt_wait --------\|
				2072	* \|- dt_delay -\|-- dt_run --\|
				2073	*
				2074	* dt_run = run time of current task
				2075	* dt_wait = time between last schedule out event for task and tprev
				2076	* represents time spent off the cpu
				2077	* dt_delay = time between wakeup and schedule-in of task
				2078	*/
				2079
				2080	static void timehist_update_runtime_stats(struct thread_runtime *r,
				2081	u64 t, u64 tprev)
				2082	{
				2083	r->dt_delay = 0;
				2084	r->dt_sleep = 0;
				2085	r->dt_iowait = 0;
				2086	r->dt_preempt = 0;
				2087	r->dt_run = 0;
				2088
				2089	if (tprev) {
				2090	r->dt_run = t - tprev;
				2091	if (r->ready_to_run) {
				2092	if (r->ready_to_run > tprev)
				2093	pr_debug("time travel: wakeup time for task > previous sched_switch event\n");
				2094	else
				2095	r->dt_delay = tprev - r->ready_to_run;
				2096	}
				2097
				2098	if (r->last_time > tprev)
				2099	pr_debug("time travel: last sched out time for task > previous sched_switch event\n");
				2100	else if (r->last_time) {
				2101	u64 dt_wait = tprev - r->last_time;
				2102
				2103	if (r->last_state == TASK_RUNNING)
				2104	r->dt_preempt = dt_wait;
				2105	else if (r->last_state == TASK_UNINTERRUPTIBLE)
				2106	r->dt_iowait = dt_wait;
				2107	else
				2108	r->dt_sleep = dt_wait;
				2109	}
				2110	}
				2111
				2112	update_stats(&r->run_stats, r->dt_run);
				2113
				2114	r->total_run_time += r->dt_run;
				2115	r->total_delay_time += r->dt_delay;
				2116	r->total_sleep_time += r->dt_sleep;
				2117	r->total_iowait_time += r->dt_iowait;
				2118	r->total_preempt_time += r->dt_preempt;
				2119	}
				2120
				2121	static bool is_idle_sample(struct perf_sample *sample,
				2122	struct perf_evsel *evsel)
				2123	{
				2124	/* pid 0 == swapper == idle task */
				2125	if (strcmp(perf_evsel__name(evsel), "sched:sched_switch") == 0)
				2126	return perf_evsel__intval(evsel, sample, "prev_pid") == 0;
				2127
				2128	return sample->pid == 0;
				2129	}
				2130
				2131	static void save_task_callchain(struct perf_sched *sched,
				2132	struct perf_sample *sample,
				2133	struct perf_evsel *evsel,
				2134	struct machine *machine)
				2135	{
				2136	struct callchain_cursor *cursor = &callchain_cursor;
				2137	struct thread *thread;
				2138
				2139	/* want main thread for process - has maps */
				2140	thread = machine__findnew_thread(machine, sample->pid, sample->pid);
				2141	if (thread == NULL) {
				2142	pr_debug("Failed to get thread for pid %d.\n", sample->pid);
				2143	return;
				2144	}
				2145
				2146	if (!sched->show_callchain \|\| sample->callchain == NULL)
				2147	return;
				2148
				2149	if (thread__resolve_callchain(thread, cursor, evsel, sample,
				2150	NULL, NULL, sched->max_stack + 2) != 0) {
				2151	if (verbose > 0)
				2152	pr_err("Failed to resolve callchain. Skipping\n");
				2153
				2154	return;
				2155	}
				2156
				2157	callchain_cursor_commit(cursor);
				2158
				2159	while (true) {
				2160	struct callchain_cursor_node *node;
				2161	struct symbol *sym;
				2162
				2163	node = callchain_cursor_current(cursor);
				2164	if (node == NULL)
				2165	break;
				2166
				2167	sym = node->sym;
				2168	if (sym) {
				2169	if (!strcmp(sym->name, "schedule") \|\|
				2170	!strcmp(sym->name, "__schedule") \|\|
				2171	!strcmp(sym->name, "preempt_schedule"))
				2172	sym->ignore = 1;
				2173	}
				2174
				2175	callchain_cursor_advance(cursor);
				2176	}
				2177	}
				2178
				2179	static int init_idle_thread(struct thread *thread)
				2180	{
				2181	struct idle_thread_runtime *itr;
				2182
				2183	thread__set_comm(thread, idle_comm, 0);
				2184
				2185	itr = zalloc(sizeof(*itr));
				2186	if (itr == NULL)
				2187	return -ENOMEM;
				2188
				2189	init_stats(&itr->tr.run_stats);
				2190	callchain_init(&itr->callchain);
				2191	callchain_cursor_reset(&itr->cursor);
				2192	thread__set_priv(thread, itr);
				2193
				2194	return 0;
				2195	}
				2196
				2197	/*
				2198	* Track idle stats per cpu by maintaining a local thread
				2199	* struct for the idle task on each cpu.
				2200	*/
				2201	static int init_idle_threads(int ncpu)
				2202	{
				2203	int i, ret;
				2204
				2205	idle_threads = zalloc(ncpu * sizeof(struct thread *));
				2206	if (!idle_threads)
				2207	return -ENOMEM;
				2208
				2209	idle_max_cpu = ncpu;
				2210
				2211	/* allocate the actual thread struct if needed */
				2212	for (i = 0; i < ncpu; ++i) {
				2213	idle_threads[i] = thread__new(0, 0);
				2214	if (idle_threads[i] == NULL)
				2215	return -ENOMEM;
				2216
				2217	ret = init_idle_thread(idle_threads[i]);
				2218	if (ret < 0)
				2219	return ret;
				2220	}
				2221
				2222	return 0;
				2223	}
				2224
				2225	static void free_idle_threads(void)
				2226	{
				2227	int i;
				2228
				2229	if (idle_threads == NULL)
				2230	return;
				2231
				2232	for (i = 0; i < idle_max_cpu; ++i) {
				2233	if ((idle_threads[i]))
				2234	thread__delete(idle_threads[i]);
				2235	}
				2236
				2237	free(idle_threads);
				2238	}
				2239
				2240	static struct thread *get_idle_thread(int cpu)
				2241	{
				2242	/*
				2243	* expand/allocate array of pointers to local thread
				2244	* structs if needed
				2245	*/
				2246	if ((cpu >= idle_max_cpu) \|\| (idle_threads == NULL)) {
				2247	int i, j = __roundup_pow_of_two(cpu+1);
				2248	void *p;
				2249
				2250	p = realloc(idle_threads, j * sizeof(struct thread *));
				2251	if (!p)
				2252	return NULL;
				2253
				2254	idle_threads = (struct thread **) p;
				2255	for (i = idle_max_cpu; i < j; ++i)
				2256	idle_threads[i] = NULL;
				2257
				2258	idle_max_cpu = j;
				2259	}
				2260
				2261	/* allocate a new thread struct if needed */
				2262	if (idle_threads[cpu] == NULL) {
				2263	idle_threads[cpu] = thread__new(0, 0);
				2264
				2265	if (idle_threads[cpu]) {
				2266	if (init_idle_thread(idle_threads[cpu]) < 0)
				2267	return NULL;
				2268	}
				2269	}
				2270
				2271	return idle_threads[cpu];
				2272	}
				2273
				2274	static void save_idle_callchain(struct perf_sched *sched,
				2275	struct idle_thread_runtime *itr,
				2276	struct perf_sample *sample)
				2277	{
				2278	if (!sched->show_callchain \|\| sample->callchain == NULL)
				2279	return;
				2280
				2281	callchain_cursor__copy(&itr->cursor, &callchain_cursor);
				2282	}
				2283
				2284	static struct thread timehist_get_thread(struct perf_sched sched,
				2285	struct perf_sample *sample,
				2286	struct machine *machine,
				2287	struct perf_evsel *evsel)
				2288	{
				2289	struct thread *thread;
				2290
				2291	if (is_idle_sample(sample, evsel)) {
				2292	thread = get_idle_thread(sample->cpu);
				2293	if (thread == NULL)
				2294	pr_err("Failed to get idle thread for cpu %d.\n", sample->cpu);
				2295
				2296	} else {
				2297	/* there were samples with tid 0 but non-zero pid */
				2298	thread = machine__findnew_thread(machine, sample->pid,
				2299	sample->tid ?: sample->pid);
				2300	if (thread == NULL) {
				2301	pr_debug("Failed to get thread for tid %d. skipping sample.\n",
				2302	sample->tid);
				2303	}
				2304
				2305	save_task_callchain(sched, sample, evsel, machine);
				2306	if (sched->idle_hist) {
				2307	struct thread *idle;
				2308	struct idle_thread_runtime *itr;
				2309
				2310	idle = get_idle_thread(sample->cpu);
				2311	if (idle == NULL) {
				2312	pr_err("Failed to get idle thread for cpu %d.\n", sample->cpu);
				2313	return NULL;
				2314	}
				2315
				2316	itr = thread__priv(idle);
				2317	if (itr == NULL)
				2318	return NULL;
				2319
				2320	itr->last_thread = thread;
				2321
				2322	/* copy task callchain when entering to idle */
				2323	if (perf_evsel__intval(evsel, sample, "next_pid") == 0)
				2324	save_idle_callchain(sched, itr, sample);
				2325	}
				2326	}
				2327
				2328	return thread;
				2329	}
				2330
				2331	static bool timehist_skip_sample(struct perf_sched *sched,
				2332	struct thread *thread,
				2333	struct perf_evsel *evsel,
				2334	struct perf_sample *sample)
				2335	{
				2336	bool rc = false;
				2337
				2338	if (thread__is_filtered(thread)) {
				2339	rc = true;
				2340	sched->skipped_samples++;
				2341	}
				2342
				2343	if (sched->idle_hist) {
				2344	if (strcmp(perf_evsel__name(evsel), "sched:sched_switch"))
				2345	rc = true;
				2346	else if (perf_evsel__intval(evsel, sample, "prev_pid") != 0 &&
				2347	perf_evsel__intval(evsel, sample, "next_pid") != 0)
				2348	rc = true;
				2349	}
				2350
				2351	return rc;
				2352	}
				2353
				2354	static void timehist_print_wakeup_event(struct perf_sched *sched,
				2355	struct perf_evsel *evsel,
				2356	struct perf_sample *sample,
				2357	struct machine *machine,
				2358	struct thread *awakened)
				2359	{
				2360	struct thread *thread;
				2361	char tstr[64];
				2362
				2363	thread = machine__findnew_thread(machine, sample->pid, sample->tid);
				2364	if (thread == NULL)
				2365	return;
				2366
				2367	/* show wakeup unless both awakee and awaker are filtered */
				2368	if (timehist_skip_sample(sched, thread, evsel, sample) &&
				2369	timehist_skip_sample(sched, awakened, evsel, sample)) {
				2370	return;
				2371	}
				2372
				2373	timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
				2374	printf("%15s [%04d] ", tstr, sample->cpu);
				2375	if (sched->show_cpu_visual)
				2376	printf(" %*s ", sched->max_cpu + 1, "");
				2377
				2378	printf(" %-*s ", comm_width, timehist_get_commstr(thread));
				2379
				2380	/* dt spacer */
				2381	printf(" %9s %9s %9s ", "", "", "");
				2382
				2383	printf("awakened: %s", timehist_get_commstr(awakened));
				2384
				2385	printf("\n");
				2386	}
				2387
				2388	static int timehist_sched_wakeup_event(struct perf_tool *tool,
				2389	union perf_event *event __maybe_unused,
				2390	struct perf_evsel *evsel,
				2391	struct perf_sample *sample,
				2392	struct machine *machine)
				2393	{
				2394	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
				2395	struct thread *thread;
				2396	struct thread_runtime *tr = NULL;
				2397	/* want pid of awakened task not pid in sample */
				2398	const u32 pid = perf_evsel__intval(evsel, sample, "pid");
				2399
				2400	thread = machine__findnew_thread(machine, 0, pid);
				2401	if (thread == NULL)
				2402	return -1;
				2403
				2404	tr = thread__get_runtime(thread);
				2405	if (tr == NULL)
				2406	return -1;
				2407
				2408	if (tr->ready_to_run == 0)
				2409	tr->ready_to_run = sample->time;
				2410
				2411	/* show wakeups if requested */
				2412	if (sched->show_wakeups &&
				2413	!perf_time__skip_sample(&sched->ptime, sample->time))
				2414	timehist_print_wakeup_event(sched, evsel, sample, machine, thread);
				2415
				2416	return 0;
				2417	}
				2418
				2419	static void timehist_print_migration_event(struct perf_sched *sched,
				2420	struct perf_evsel *evsel,
				2421	struct perf_sample *sample,
				2422	struct machine *machine,
				2423	struct thread *migrated)
				2424	{
				2425	struct thread *thread;
				2426	char tstr[64];
				2427	u32 max_cpus = sched->max_cpu + 1;
				2428	u32 ocpu, dcpu;
				2429
				2430	if (sched->summary_only)
				2431	return;
				2432
				2433	max_cpus = sched->max_cpu + 1;
				2434	ocpu = perf_evsel__intval(evsel, sample, "orig_cpu");
				2435	dcpu = perf_evsel__intval(evsel, sample, "dest_cpu");
				2436
				2437	thread = machine__findnew_thread(machine, sample->pid, sample->tid);
				2438	if (thread == NULL)
				2439	return;
				2440
				2441	if (timehist_skip_sample(sched, thread, evsel, sample) &&
				2442	timehist_skip_sample(sched, migrated, evsel, sample)) {
				2443	return;
				2444	}
				2445
				2446	timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
				2447	printf("%15s [%04d] ", tstr, sample->cpu);
				2448
				2449	if (sched->show_cpu_visual) {
				2450	u32 i;
				2451	char c;
				2452
				2453	printf(" ");
				2454	for (i = 0; i < max_cpus; ++i) {
				2455	c = (i == sample->cpu) ? 'm' : ' ';
				2456	printf("%c", c);
				2457	}
				2458	printf(" ");
				2459	}
				2460
				2461	printf(" %-*s ", comm_width, timehist_get_commstr(thread));
				2462
				2463	/* dt spacer */
				2464	printf(" %9s %9s %9s ", "", "", "");
				2465
				2466	printf("migrated: %s", timehist_get_commstr(migrated));
				2467	printf(" cpu %d => %d", ocpu, dcpu);
				2468
				2469	printf("\n");
				2470	}
				2471
				2472	static int timehist_migrate_task_event(struct perf_tool *tool,
				2473	union perf_event *event __maybe_unused,
				2474	struct perf_evsel *evsel,
				2475	struct perf_sample *sample,
				2476	struct machine *machine)
				2477	{
				2478	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
				2479	struct thread *thread;
				2480	struct thread_runtime *tr = NULL;
				2481	/* want pid of migrated task not pid in sample */
				2482	const u32 pid = perf_evsel__intval(evsel, sample, "pid");
				2483
				2484	thread = machine__findnew_thread(machine, 0, pid);
				2485	if (thread == NULL)
				2486	return -1;
				2487
				2488	tr = thread__get_runtime(thread);
				2489	if (tr == NULL)
				2490	return -1;
				2491
				2492	tr->migrations++;
				2493
				2494	/* show migrations if requested */
				2495	timehist_print_migration_event(sched, evsel, sample, machine, thread);
				2496
				2497	return 0;
				2498	}
				2499
				2500	static int timehist_sched_change_event(struct perf_tool *tool,
				2501	union perf_event *event,
				2502	struct perf_evsel *evsel,
				2503	struct perf_sample *sample,
				2504	struct machine *machine)
				2505	{
				2506	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
				2507	struct perf_time_interval *ptime = &sched->ptime;
				2508	struct addr_location al;
				2509	struct thread *thread;
				2510	struct thread_runtime *tr = NULL;
				2511	u64 tprev, t = sample->time;
				2512	int rc = 0;
				2513	int state = perf_evsel__intval(evsel, sample, "prev_state");
				2514
				2515
				2516	if (machine__resolve(machine, &al, sample) < 0) {
				2517	pr_err("problem processing %d event. skipping it\n",
				2518	event->header.type);
				2519	rc = -1;
				2520	goto out;
				2521	}
				2522
				2523	thread = timehist_get_thread(sched, sample, machine, evsel);
				2524	if (thread == NULL) {
				2525	rc = -1;
				2526	goto out;
				2527	}
				2528
				2529	if (timehist_skip_sample(sched, thread, evsel, sample))
				2530	goto out;
				2531
				2532	tr = thread__get_runtime(thread);
				2533	if (tr == NULL) {
				2534	rc = -1;
				2535	goto out;
				2536	}
				2537
				2538	tprev = perf_evsel__get_time(evsel, sample->cpu);
				2539
				2540	/*
				2541	* If start time given:
				2542	* - sample time is under window user cares about - skip sample
				2543	* - tprev is under window user cares about - reset to start of window
				2544	*/
				2545	if (ptime->start && ptime->start > t)
				2546	goto out;
				2547
				2548	if (tprev && ptime->start > tprev)
				2549	tprev = ptime->start;
				2550
				2551	/*
				2552	* If end time given:
				2553	* - previous sched event is out of window - we are done
				2554	* - sample time is beyond window user cares about - reset it
				2555	* to close out stats for time window interest
				2556	*/
				2557	if (ptime->end) {
				2558	if (tprev > ptime->end)
				2559	goto out;
				2560
				2561	if (t > ptime->end)
				2562	t = ptime->end;
				2563	}
				2564
				2565	if (!sched->idle_hist \|\| thread->tid == 0) {
				2566	timehist_update_runtime_stats(tr, t, tprev);
				2567
				2568	if (sched->idle_hist) {
				2569	struct idle_thread_runtime itr = (void )tr;
				2570	struct thread_runtime *last_tr;
				2571
				2572	BUG_ON(thread->tid != 0);
				2573
				2574	if (itr->last_thread == NULL)
				2575	goto out;
				2576
				2577	/* add current idle time as last thread's runtime */
				2578	last_tr = thread__get_runtime(itr->last_thread);
				2579	if (last_tr == NULL)
				2580	goto out;
				2581
				2582	timehist_update_runtime_stats(last_tr, t, tprev);
				2583	/*
				2584	* remove delta time of last thread as it's not updated
				2585	* and otherwise it will show an invalid value next
				2586	* time. we only care total run time and run stat.
				2587	*/
				2588	last_tr->dt_run = 0;
				2589	last_tr->dt_delay = 0;
				2590	last_tr->dt_sleep = 0;
				2591	last_tr->dt_iowait = 0;
				2592	last_tr->dt_preempt = 0;
				2593
				2594	if (itr->cursor.nr)
				2595	callchain_append(&itr->callchain, &itr->cursor, t - tprev);
				2596
				2597	itr->last_thread = NULL;
				2598	}
				2599	}
				2600
				2601	if (!sched->summary_only)
				2602	timehist_print_sample(sched, evsel, sample, &al, thread, t, state);
				2603
				2604	out:
				2605	if (sched->hist_time.start == 0 && t >= ptime->start)
				2606	sched->hist_time.start = t;
				2607	if (ptime->end == 0 \|\| t <= ptime->end)
				2608	sched->hist_time.end = t;
				2609
				2610	if (tr) {
				2611	/* time of this sched_switch event becomes last time task seen */
				2612	tr->last_time = sample->time;
				2613
				2614	/* last state is used to determine where to account wait time */
				2615	tr->last_state = state;
				2616
				2617	/* sched out event for task so reset ready to run time */
				2618	tr->ready_to_run = 0;
				2619	}
				2620
				2621	perf_evsel__save_time(evsel, sample->time, sample->cpu);
				2622
				2623	return rc;
				2624	}
				2625
				2626	static int timehist_sched_switch_event(struct perf_tool *tool,
				2627	union perf_event *event,
				2628	struct perf_evsel *evsel,
				2629	struct perf_sample *sample,
				2630	struct machine *machine __maybe_unused)
				2631	{
				2632	return timehist_sched_change_event(tool, event, evsel, sample, machine);
				2633	}
				2634
				2635	static int process_lost(struct perf_tool *tool __maybe_unused,
				2636	union perf_event *event,
				2637	struct perf_sample *sample,
				2638	struct machine *machine __maybe_unused)
				2639	{
				2640	char tstr[64];
				2641
				2642	timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr));
				2643	printf("%15s ", tstr);
				2644	printf("lost %" PRIu64 " events on cpu %d\n", event->lost.lost, sample->cpu);
				2645
				2646	return 0;
				2647	}
				2648
				2649
				2650	static void print_thread_runtime(struct thread *t,
				2651	struct thread_runtime *r)
				2652	{
				2653	double mean = avg_stats(&r->run_stats);
				2654	float stddev;
				2655
				2656	printf("%*s %5d %9" PRIu64 " ",
				2657	comm_width, timehist_get_commstr(t), t->ppid,
				2658	(u64) r->run_stats.n);
				2659
				2660	print_sched_time(r->total_run_time, 8);
				2661	stddev = rel_stddev_stats(stddev_stats(&r->run_stats), mean);
				2662	print_sched_time(r->run_stats.min, 6);
				2663	printf(" ");
				2664	print_sched_time((u64) mean, 6);
				2665	printf(" ");
				2666	print_sched_time(r->run_stats.max, 6);
				2667	printf(" ");
				2668	printf("%5.2f", stddev);
				2669	printf(" %5" PRIu64, r->migrations);
				2670	printf("\n");
				2671	}
				2672
				2673	static void print_thread_waittime(struct thread *t,
				2674	struct thread_runtime *r)
				2675	{
				2676	printf("%*s %5d %9" PRIu64 " ",
				2677	comm_width, timehist_get_commstr(t), t->ppid,
				2678	(u64) r->run_stats.n);
				2679
				2680	print_sched_time(r->total_run_time, 8);
				2681	print_sched_time(r->total_sleep_time, 6);
				2682	printf(" ");
				2683	print_sched_time(r->total_iowait_time, 6);
				2684	printf(" ");
				2685	print_sched_time(r->total_preempt_time, 6);
				2686	printf(" ");
				2687	print_sched_time(r->total_delay_time, 6);
				2688	printf("\n");
				2689	}
				2690
				2691	struct total_run_stats {
				2692	struct perf_sched *sched;
				2693	u64 sched_count;
				2694	u64 task_count;
				2695	u64 total_run_time;
				2696	};
				2697
				2698	static int __show_thread_runtime(struct thread t, void priv)
				2699	{
				2700	struct total_run_stats *stats = priv;
				2701	struct thread_runtime *r;
				2702
				2703	if (thread__is_filtered(t))
				2704	return 0;
				2705
				2706	r = thread__priv(t);
				2707	if (r && r->run_stats.n) {
				2708	stats->task_count++;
				2709	stats->sched_count += r->run_stats.n;
				2710	stats->total_run_time += r->total_run_time;
				2711
				2712	if (stats->sched->show_state)
				2713	print_thread_waittime(t, r);
				2714	else
				2715	print_thread_runtime(t, r);
				2716	}
				2717
				2718	return 0;
				2719	}
				2720
				2721	static int show_thread_runtime(struct thread t, void priv)
				2722	{
				2723	if (t->dead)
				2724	return 0;
				2725
				2726	return __show_thread_runtime(t, priv);
				2727	}
				2728
				2729	static int show_deadthread_runtime(struct thread t, void priv)
				2730	{
				2731	if (!t->dead)
				2732	return 0;
				2733
				2734	return __show_thread_runtime(t, priv);
				2735	}
				2736
				2737	static size_t callchain__fprintf_folded(FILE fp, struct callchain_node node)
				2738	{
				2739	const char *sep = " <- ";
				2740	struct callchain_list *chain;
				2741	size_t ret = 0;
				2742	char bf[1024];
				2743	bool first;
				2744
				2745	if (node == NULL)
				2746	return 0;
				2747
				2748	ret = callchain__fprintf_folded(fp, node->parent);
				2749	first = (ret == 0);
				2750
				2751	list_for_each_entry(chain, &node->val, list) {
				2752	if (chain->ip >= PERF_CONTEXT_MAX)
				2753	continue;
				2754	if (chain->ms.sym && chain->ms.sym->ignore)
				2755	continue;
				2756	ret += fprintf(fp, "%s%s", first ? "" : sep,
				2757	callchain_list__sym_name(chain, bf, sizeof(bf),
				2758	false));
				2759	first = false;
				2760	}
				2761
				2762	return ret;
				2763	}
				2764
				2765	static size_t timehist_print_idlehist_callchain(struct rb_root *root)
				2766	{
				2767	size_t ret = 0;
				2768	FILE *fp = stdout;
				2769	struct callchain_node *chain;
				2770	struct rb_node *rb_node = rb_first(root);
				2771
				2772	printf(" %16s %8s %s\n", "Idle time (msec)", "Count", "Callchains");
				2773	printf(" %.16s %.8s %.50s\n", graph_dotted_line, graph_dotted_line,
				2774	graph_dotted_line);
				2775
				2776	while (rb_node) {
				2777	chain = rb_entry(rb_node, struct callchain_node, rb_node);
				2778	rb_node = rb_next(rb_node);
				2779
				2780	ret += fprintf(fp, " ");
				2781	print_sched_time(chain->hit, 12);
				2782	ret += 16; /* print_sched_time returns 2nd arg + 4 */
				2783	ret += fprintf(fp, " %8d ", chain->count);
				2784	ret += callchain__fprintf_folded(fp, chain);
				2785	ret += fprintf(fp, "\n");
				2786	}
				2787
				2788	return ret;
				2789	}
				2790
				2791	static void timehist_print_summary(struct perf_sched *sched,
				2792	struct perf_session *session)
				2793	{
				2794	struct machine *m = &session->machines.host;
				2795	struct total_run_stats totals;
				2796	u64 task_count;
				2797	struct thread *t;
				2798	struct thread_runtime *r;
				2799	int i;
				2800	u64 hist_time = sched->hist_time.end - sched->hist_time.start;
				2801
				2802	memset(&totals, 0, sizeof(totals));
				2803	totals.sched = sched;
				2804
				2805	if (sched->idle_hist) {
				2806	printf("\nIdle-time summary\n");
				2807	printf("%*s parent sched-out ", comm_width, "comm");
				2808	printf(" idle-time min-idle avg-idle max-idle stddev migrations\n");
				2809	} else if (sched->show_state) {
				2810	printf("\nWait-time summary\n");
				2811	printf("%*s parent sched-in ", comm_width, "comm");
				2812	printf(" run-time sleep iowait preempt delay\n");
				2813	} else {
				2814	printf("\nRuntime summary\n");
				2815	printf("%*s parent sched-in ", comm_width, "comm");
				2816	printf(" run-time min-run avg-run max-run stddev migrations\n");
				2817	}
				2818	printf("%*s (count) ", comm_width, "");
				2819	printf(" (msec) (msec) (msec) (msec) %s\n",
				2820	sched->show_state ? "(msec)" : "%");
				2821	printf("%.117s\n", graph_dotted_line);
				2822
				2823	machine__for_each_thread(m, show_thread_runtime, &totals);
				2824	task_count = totals.task_count;
				2825	if (!task_count)
				2826	printf("<no still running tasks>\n");
				2827
				2828	printf("\nTerminated tasks:\n");
				2829	machine__for_each_thread(m, show_deadthread_runtime, &totals);
				2830	if (task_count == totals.task_count)
				2831	printf("<no terminated tasks>\n");
				2832
				2833	/* CPU idle stats not tracked when samples were skipped */
				2834	if (sched->skipped_samples && !sched->idle_hist)
				2835	return;
				2836
				2837	printf("\nIdle stats:\n");
				2838	for (i = 0; i < idle_max_cpu; ++i) {
				2839	t = idle_threads[i];
				2840	if (!t)
				2841	continue;
				2842
				2843	r = thread__priv(t);
				2844	if (r && r->run_stats.n) {
				2845	totals.sched_count += r->run_stats.n;
				2846	printf(" CPU %2d idle for ", i);
				2847	print_sched_time(r->total_run_time, 6);
				2848	printf(" msec (%6.2f%%)\n", 100.0 * r->total_run_time / hist_time);
				2849	} else
				2850	printf(" CPU %2d idle entire time window\n", i);
				2851	}
				2852
				2853	if (sched->idle_hist && sched->show_callchain) {
				2854	callchain_param.mode = CHAIN_FOLDED;
				2855	callchain_param.value = CCVAL_PERIOD;
				2856
				2857	callchain_register_param(&callchain_param);
				2858
				2859	printf("\nIdle stats by callchain:\n");
				2860	for (i = 0; i < idle_max_cpu; ++i) {
				2861	struct idle_thread_runtime *itr;
				2862
				2863	t = idle_threads[i];
				2864	if (!t)
				2865	continue;
				2866
				2867	itr = thread__priv(t);
				2868	if (itr == NULL)
				2869	continue;
				2870
				2871	callchain_param.sort(&itr->sorted_root, &itr->callchain,
				2872	0, &callchain_param);
				2873
				2874	printf(" CPU %2d:", i);
				2875	print_sched_time(itr->tr.total_run_time, 6);
				2876	printf(" msec\n");
				2877	timehist_print_idlehist_callchain(&itr->sorted_root);
				2878	printf("\n");
				2879	}
				2880	}
				2881
				2882	printf("\n"
				2883	" Total number of unique tasks: %" PRIu64 "\n"
				2884	"Total number of context switches: %" PRIu64 "\n",
				2885	totals.task_count, totals.sched_count);
				2886
				2887	printf(" Total run time (msec): ");
				2888	print_sched_time(totals.total_run_time, 2);
				2889	printf("\n");
				2890
				2891	printf(" Total scheduling time (msec): ");
				2892	print_sched_time(hist_time, 2);
				2893	printf(" (x %d)\n", sched->max_cpu);
				2894	}
				2895
				2896	typedef int (sched_handler)(struct perf_tool tool,
				2897	union perf_event *event,
				2898	struct perf_evsel *evsel,
				2899	struct perf_sample *sample,
				2900	struct machine *machine);
				2901
				2902	static int perf_timehist__process_sample(struct perf_tool *tool,
				2903	union perf_event *event,
				2904	struct perf_sample *sample,
				2905	struct perf_evsel *evsel,
				2906	struct machine *machine)
				2907	{
				2908	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
				2909	int err = 0;
				2910	int this_cpu = sample->cpu;
				2911
				2912	if (this_cpu > sched->max_cpu)
				2913	sched->max_cpu = this_cpu;
				2914
				2915	if (evsel->handler != NULL) {
				2916	sched_handler f = evsel->handler;
				2917
				2918	err = f(tool, event, evsel, sample, machine);
				2919	}
				2920
				2921	return err;
				2922	}
				2923
				2924	static int timehist_check_attr(struct perf_sched *sched,
				2925	struct perf_evlist *evlist)
				2926	{
				2927	struct perf_evsel *evsel;
				2928	struct evsel_runtime *er;
				2929
				2930	list_for_each_entry(evsel, &evlist->entries, node) {
				2931	er = perf_evsel__get_runtime(evsel);
				2932	if (er == NULL) {
				2933	pr_err("Failed to allocate memory for evsel runtime data\n");
				2934	return -1;
				2935	}
				2936
				2937	if (sched->show_callchain && !evsel__has_callchain(evsel)) {
				2938	pr_info("Samples do not have callchains.\n");
				2939	sched->show_callchain = 0;
				2940	symbol_conf.use_callchain = 0;
				2941	}
				2942	}
				2943
				2944	return 0;
				2945	}
				2946
				2947	static int perf_sched__timehist(struct perf_sched *sched)
				2948	{
				2949	const struct perf_evsel_str_handler handlers[] = {
				2950	{ "sched:sched_switch", timehist_sched_switch_event, },
				2951	{ "sched:sched_wakeup", timehist_sched_wakeup_event, },
				2952	{ "sched:sched_wakeup_new", timehist_sched_wakeup_event, },
				2953	};
				2954	const struct perf_evsel_str_handler migrate_handlers[] = {
				2955	{ "sched:sched_migrate_task", timehist_migrate_task_event, },
				2956	};
				2957	struct perf_data data = {
				2958	.file = {
				2959	.path = input_name,
				2960	},
				2961	.mode = PERF_DATA_MODE_READ,
				2962	.force = sched->force,
				2963	};
				2964
				2965	struct perf_session *session;
				2966	struct perf_evlist *evlist;
				2967	int err = -1;
				2968
				2969	/*
				2970	* event handlers for timehist option
				2971	*/
				2972	sched->tool.sample = perf_timehist__process_sample;
				2973	sched->tool.mmap = perf_event__process_mmap;
				2974	sched->tool.comm = perf_event__process_comm;
				2975	sched->tool.exit = perf_event__process_exit;
				2976	sched->tool.fork = perf_event__process_fork;
				2977	sched->tool.lost = process_lost;
				2978	sched->tool.attr = perf_event__process_attr;
				2979	sched->tool.tracing_data = perf_event__process_tracing_data;
				2980	sched->tool.build_id = perf_event__process_build_id;
				2981
				2982	sched->tool.ordered_events = true;
				2983	sched->tool.ordering_requires_timestamps = true;
				2984
				2985	symbol_conf.use_callchain = sched->show_callchain;
				2986
				2987	session = perf_session__new(&data, false, &sched->tool);
				2988	if (session == NULL)
				2989	return -ENOMEM;
				2990
				2991	evlist = session->evlist;
				2992
				2993	symbol__init(&session->header.env);
				2994
				2995	if (perf_time__parse_str(&sched->ptime, sched->time_str) != 0) {
				2996	pr_err("Invalid time string\n");
				2997	return -EINVAL;
				2998	}
				2999
				3000	if (timehist_check_attr(sched, evlist) != 0)
				3001	goto out;
				3002
				3003	setup_pager();
				3004
				3005	/* setup per-evsel handlers */
				3006	if (perf_session__set_tracepoints_handlers(session, handlers))
				3007	goto out;
				3008
				3009	/* sched_switch event at a minimum needs to exist */
				3010	if (!perf_evlist__find_tracepoint_by_name(session->evlist,
				3011	"sched:sched_switch")) {
				3012	pr_err("No sched_switch events found. Have you run 'perf sched record'?\n");
				3013	goto out;
				3014	}
				3015
				3016	if (sched->show_migrations &&
				3017	perf_session__set_tracepoints_handlers(session, migrate_handlers))
				3018	goto out;
				3019
				3020	/* pre-allocate struct for per-CPU idle stats */
				3021	sched->max_cpu = session->header.env.nr_cpus_online;
				3022	if (sched->max_cpu == 0)
				3023	sched->max_cpu = 4;
				3024	if (init_idle_threads(sched->max_cpu))
				3025	goto out;
				3026
				3027	/* summary_only implies summary option, but don't overwrite summary if set */
				3028	if (sched->summary_only)
				3029	sched->summary = sched->summary_only;
				3030
				3031	if (!sched->summary_only)
				3032	timehist_header(sched);
				3033
				3034	err = perf_session__process_events(session);
				3035	if (err) {
				3036	pr_err("Failed to process events, error %d", err);
				3037	goto out;
				3038	}
				3039
				3040	sched->nr_events = evlist->stats.nr_events[0];
				3041	sched->nr_lost_events = evlist->stats.total_lost;
				3042	sched->nr_lost_chunks = evlist->stats.nr_events[PERF_RECORD_LOST];
				3043
				3044	if (sched->summary)
				3045	timehist_print_summary(sched, session);
				3046
				3047	out:
				3048	free_idle_threads();
				3049	perf_session__delete(session);
				3050
				3051	return err;
				3052	}
				3053
				3054
				3055	static void print_bad_events(struct perf_sched *sched)
				3056	{
				3057	if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
				3058	printf(" INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
				3059	(double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0,
				3060	sched->nr_unordered_timestamps, sched->nr_timestamps);
				3061	}
				3062	if (sched->nr_lost_events && sched->nr_events) {
				3063	printf(" INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
				3064	(double)sched->nr_lost_events/(double)sched->nr_events * 100.0,
				3065	sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks);
				3066	}
				3067	if (sched->nr_context_switch_bugs && sched->nr_timestamps) {
				3068	printf(" INFO: %.3f%% context switch bugs (%ld out of %ld)",
				3069	(double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0,
				3070	sched->nr_context_switch_bugs, sched->nr_timestamps);
				3071	if (sched->nr_lost_events)
				3072	printf(" (due to lost events?)");
				3073	printf("\n");
				3074	}
				3075	}
				3076
				3077	static void __merge_work_atoms(struct rb_root root, struct work_atoms data)
				3078	{
				3079	struct rb_node *new = &(root->rb_node), parent = NULL;
				3080	struct work_atoms *this;
				3081	const char comm = thread__comm_str(data->thread), this_comm;
				3082
				3083	while (*new) {
				3084	int cmp;
				3085
				3086	this = container_of(*new, struct work_atoms, node);
				3087	parent = *new;
				3088
				3089	this_comm = thread__comm_str(this->thread);
				3090	cmp = strcmp(comm, this_comm);
				3091	if (cmp > 0) {
				3092	new = &((*new)->rb_left);
				3093	} else if (cmp < 0) {
				3094	new = &((*new)->rb_right);
				3095	} else {
				3096	this->num_merged++;
				3097	this->total_runtime += data->total_runtime;
				3098	this->nb_atoms += data->nb_atoms;
				3099	this->total_lat += data->total_lat;
				3100	list_splice(&data->work_list, &this->work_list);
				3101	if (this->max_lat < data->max_lat) {
				3102	this->max_lat = data->max_lat;
				3103	this->max_lat_at = data->max_lat_at;
				3104	}
				3105	zfree(&data);
				3106	return;
				3107	}
				3108	}
				3109
				3110	data->num_merged++;
				3111	rb_link_node(&data->node, parent, new);
				3112	rb_insert_color(&data->node, root);
				3113	}
				3114
				3115	static void perf_sched__merge_lat(struct perf_sched *sched)
				3116	{
				3117	struct work_atoms *data;
				3118	struct rb_node *node;
				3119
				3120	if (sched->skip_merge)
				3121	return;
				3122
				3123	while ((node = rb_first(&sched->atom_root))) {
				3124	rb_erase(node, &sched->atom_root);
				3125	data = rb_entry(node, struct work_atoms, node);
				3126	__merge_work_atoms(&sched->merged_atom_root, data);
				3127	}
				3128	}
				3129
				3130	static int perf_sched__lat(struct perf_sched *sched)
				3131	{
				3132	struct rb_node *next;
				3133
				3134	setup_pager();
				3135
				3136	if (perf_sched__read_events(sched))
				3137	return -1;
				3138
				3139	perf_sched__merge_lat(sched);
				3140	perf_sched__sort_lat(sched);
				3141
				3142	printf("\n -----------------------------------------------------------------------------------------------------------------\n");
				3143	printf(" Task \| Runtime ms \| Switches \| Average delay ms \| Maximum delay ms \| Maximum delay at \|\n");
				3144	printf(" -----------------------------------------------------------------------------------------------------------------\n");
				3145
				3146	next = rb_first(&sched->sorted_atom_root);
				3147
				3148	while (next) {
				3149	struct work_atoms *work_list;
				3150
				3151	work_list = rb_entry(next, struct work_atoms, node);
				3152	output_lat_thread(sched, work_list);
				3153	next = rb_next(next);
				3154	thread__zput(work_list->thread);
				3155	}
				3156
				3157	printf(" -----------------------------------------------------------------------------------------------------------------\n");
				3158	printf(" TOTAL: \|%11.3f ms \|%9" PRIu64 " \|\n",
				3159	(double)sched->all_runtime / NSEC_PER_MSEC, sched->all_count);
				3160
				3161	printf(" ---------------------------------------------------\n");
				3162
				3163	print_bad_events(sched);
				3164	printf("\n");
				3165
				3166	return 0;
				3167	}
				3168
				3169	static int setup_map_cpus(struct perf_sched *sched)
				3170	{
				3171	struct cpu_map *map;
				3172
				3173	sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF);
				3174
				3175	if (sched->map.comp) {
				3176	sched->map.comp_cpus = zalloc(sched->max_cpu * sizeof(int));
				3177	if (!sched->map.comp_cpus)
				3178	return -1;
				3179	}
				3180
				3181	if (!sched->map.cpus_str)
				3182	return 0;
				3183
				3184	map = cpu_map__new(sched->map.cpus_str);
				3185	if (!map) {
				3186	pr_err("failed to get cpus map from %s\n", sched->map.cpus_str);
				3187	return -1;
				3188	}
				3189
				3190	sched->map.cpus = map;
				3191	return 0;
				3192	}
				3193
				3194	static int setup_color_pids(struct perf_sched *sched)
				3195	{
				3196	struct thread_map *map;
				3197
				3198	if (!sched->map.color_pids_str)
				3199	return 0;
				3200
				3201	map = thread_map__new_by_tid_str(sched->map.color_pids_str);
				3202	if (!map) {
				3203	pr_err("failed to get thread map from %s\n", sched->map.color_pids_str);
				3204	return -1;
				3205	}
				3206
				3207	sched->map.color_pids = map;
				3208	return 0;
				3209	}
				3210
				3211	static int setup_color_cpus(struct perf_sched *sched)
				3212	{
				3213	struct cpu_map *map;
				3214
				3215	if (!sched->map.color_cpus_str)
				3216	return 0;
				3217
				3218	map = cpu_map__new(sched->map.color_cpus_str);
				3219	if (!map) {
				3220	pr_err("failed to get thread map from %s\n", sched->map.color_cpus_str);
				3221	return -1;
				3222	}
				3223
				3224	sched->map.color_cpus = map;
				3225	return 0;
				3226	}
				3227
				3228	static int perf_sched__map(struct perf_sched *sched)
				3229	{
				3230	if (setup_map_cpus(sched))
				3231	return -1;
				3232
				3233	if (setup_color_pids(sched))
				3234	return -1;
				3235
				3236	if (setup_color_cpus(sched))
				3237	return -1;
				3238
				3239	setup_pager();
				3240	if (perf_sched__read_events(sched))
				3241	return -1;
				3242	print_bad_events(sched);
				3243	return 0;
				3244	}
				3245
				3246	static int perf_sched__replay(struct perf_sched *sched)
				3247	{
				3248	unsigned long i;
				3249
				3250	calibrate_run_measurement_overhead(sched);
				3251	calibrate_sleep_measurement_overhead(sched);
				3252
				3253	test_calibrations(sched);
				3254
				3255	if (perf_sched__read_events(sched))
				3256	return -1;
				3257
				3258	printf("nr_run_events: %ld\n", sched->nr_run_events);
				3259	printf("nr_sleep_events: %ld\n", sched->nr_sleep_events);
				3260	printf("nr_wakeup_events: %ld\n", sched->nr_wakeup_events);
				3261
				3262	if (sched->targetless_wakeups)
				3263	printf("target-less wakeups: %ld\n", sched->targetless_wakeups);
				3264	if (sched->multitarget_wakeups)
				3265	printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups);
				3266	if (sched->nr_run_events_optimized)
				3267	printf("run atoms optimized: %ld\n",
				3268	sched->nr_run_events_optimized);
				3269
				3270	print_task_traces(sched);
				3271	add_cross_task_wakeups(sched);
				3272
				3273	create_tasks(sched);
				3274	printf("------------------------------------------------------------\n");
				3275	for (i = 0; i < sched->replay_repeat; i++)
				3276	run_one_test(sched);
				3277
				3278	return 0;
				3279	}
				3280
				3281	static void setup_sorting(struct perf_sched sched, const struct option options,
				3282	const char * const usage_msg[])
				3283	{
				3284	char tmp, tok, *str = strdup(sched->sort_order);
				3285
				3286	for (tok = strtok_r(str, ", ", &tmp);
				3287	tok; tok = strtok_r(NULL, ", ", &tmp)) {
				3288	if (sort_dimension__add(tok, &sched->sort_list) < 0) {
				3289	usage_with_options_msg(usage_msg, options,
				3290	"Unknown --sort key: `%s'", tok);
				3291	}
				3292	}
				3293
				3294	free(str);
				3295
				3296	sort_dimension__add("pid", &sched->cmp_pid);
				3297	}
				3298
				3299	static int __cmd_record(int argc, const char **argv)
				3300	{
				3301	unsigned int rec_argc, i, j;
				3302	const char **rec_argv;
				3303	const char * const record_args[] = {
				3304	"record",
				3305	"-a",
				3306	"-R",
				3307	"-m", "1024",
				3308	"-c", "1",
				3309	"-e", "sched:sched_switch",
				3310	"-e", "sched:sched_stat_wait",
				3311	"-e", "sched:sched_stat_sleep",
				3312	"-e", "sched:sched_stat_iowait",
				3313	"-e", "sched:sched_stat_runtime",
				3314	"-e", "sched:sched_process_fork",
				3315	"-e", "sched:sched_wakeup",
				3316	"-e", "sched:sched_wakeup_new",
				3317	"-e", "sched:sched_migrate_task",
				3318	};
				3319
				3320	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
				3321	rec_argv = calloc(rec_argc + 1, sizeof(char *));
				3322
				3323	if (rec_argv == NULL)
				3324	return -ENOMEM;
				3325
				3326	for (i = 0; i < ARRAY_SIZE(record_args); i++)
				3327	rec_argv[i] = strdup(record_args[i]);
				3328
				3329	for (j = 1; j < (unsigned int)argc; j++, i++)
				3330	rec_argv[i] = argv[j];
				3331
				3332	BUG_ON(i != rec_argc);
				3333
				3334	return cmd_record(i, rec_argv);
				3335	}
				3336
				3337	int cmd_sched(int argc, const char **argv)
				3338	{
				3339	const char default_sort_order[] = "avg, max, switch, runtime";
				3340	struct perf_sched sched = {
				3341	.tool = {
				3342	.sample = perf_sched__process_tracepoint_sample,
				3343	.comm = perf_sched__process_comm,
				3344	.namespaces = perf_event__process_namespaces,
				3345	.lost = perf_event__process_lost,
				3346	.fork = perf_sched__process_fork_event,
				3347	.ordered_events = true,
				3348	},
				3349	.cmp_pid = LIST_HEAD_INIT(sched.cmp_pid),
				3350	.sort_list = LIST_HEAD_INIT(sched.sort_list),
				3351	.start_work_mutex = PTHREAD_MUTEX_INITIALIZER,
				3352	.work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER,
				3353	.sort_order = default_sort_order,
				3354	.replay_repeat = 10,
				3355	.profile_cpu = -1,
				3356	.next_shortname1 = 'A',
				3357	.next_shortname2 = '0',
				3358	.skip_merge = 0,
				3359	.show_callchain = 1,
				3360	.max_stack = 5,
				3361	};
				3362	const struct option sched_options[] = {
				3363	OPT_STRING('i', "input", &input_name, "file",
				3364	"input file name"),
				3365	OPT_INCR('v', "verbose", &verbose,
				3366	"be more verbose (show symbol address, etc)"),
				3367	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
				3368	"dump raw trace in ASCII"),
				3369	OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"),
				3370	OPT_END()
				3371	};
				3372	const struct option latency_options[] = {
				3373	OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
				3374	"sort by key(s): runtime, switch, avg, max"),
				3375	OPT_INTEGER('C', "CPU", &sched.profile_cpu,
				3376	"CPU to profile on"),
				3377	OPT_BOOLEAN('p', "pids", &sched.skip_merge,
				3378	"latency stats per pid instead of per comm"),
				3379	OPT_PARENT(sched_options)
				3380	};
				3381	const struct option replay_options[] = {
				3382	OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
				3383	"repeat the workload replay N times (-1: infinite)"),
				3384	OPT_PARENT(sched_options)
				3385	};
				3386	const struct option map_options[] = {
				3387	OPT_BOOLEAN(0, "compact", &sched.map.comp,
				3388	"map output in compact mode"),
				3389	OPT_STRING(0, "color-pids", &sched.map.color_pids_str, "pids",
				3390	"highlight given pids in map"),
				3391	OPT_STRING(0, "color-cpus", &sched.map.color_cpus_str, "cpus",
				3392	"highlight given CPUs in map"),
				3393	OPT_STRING(0, "cpus", &sched.map.cpus_str, "cpus",
				3394	"display given CPUs in map"),
				3395	OPT_PARENT(sched_options)
				3396	};
				3397	const struct option timehist_options[] = {
				3398	OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
				3399	"file", "vmlinux pathname"),
				3400	OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name,
				3401	"file", "kallsyms pathname"),
				3402	OPT_BOOLEAN('g', "call-graph", &sched.show_callchain,
				3403	"Display call chains if present (default on)"),
				3404	OPT_UINTEGER(0, "max-stack", &sched.max_stack,
				3405	"Maximum number of functions to display backtrace."),
				3406	OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
				3407	"Look for files with symbols relative to this directory"),
				3408	OPT_BOOLEAN('s', "summary", &sched.summary_only,
				3409	"Show only syscall summary with statistics"),
				3410	OPT_BOOLEAN('S', "with-summary", &sched.summary,
				3411	"Show all syscalls and summary with statistics"),
				3412	OPT_BOOLEAN('w', "wakeups", &sched.show_wakeups, "Show wakeup events"),
				3413	OPT_BOOLEAN('n', "next", &sched.show_next, "Show next task"),
				3414	OPT_BOOLEAN('M', "migrations", &sched.show_migrations, "Show migration events"),
				3415	OPT_BOOLEAN('V', "cpu-visual", &sched.show_cpu_visual, "Add CPU visual"),
				3416	OPT_BOOLEAN('I', "idle-hist", &sched.idle_hist, "Show idle events only"),
				3417	OPT_STRING(0, "time", &sched.time_str, "str",
				3418	"Time span for analysis (start,stop)"),
				3419	OPT_BOOLEAN(0, "state", &sched.show_state, "Show task state when sched-out"),
				3420	OPT_STRING('p', "pid", &symbol_conf.pid_list_str, "pid[,pid...]",
				3421	"analyze events only for given process id(s)"),
				3422	OPT_STRING('t', "tid", &symbol_conf.tid_list_str, "tid[,tid...]",
				3423	"analyze events only for given thread id(s)"),
				3424	OPT_PARENT(sched_options)
				3425	};
				3426
				3427	const char * const latency_usage[] = {
				3428	"perf sched latency [<options>]",
				3429	NULL
				3430	};
				3431	const char * const replay_usage[] = {
				3432	"perf sched replay [<options>]",
				3433	NULL
				3434	};
				3435	const char * const map_usage[] = {
				3436	"perf sched map [<options>]",
				3437	NULL
				3438	};
				3439	const char * const timehist_usage[] = {
				3440	"perf sched timehist [<options>]",
				3441	NULL
				3442	};
				3443	const char *const sched_subcommands[] = { "record", "latency", "map",
				3444	"replay", "script",
				3445	"timehist", NULL };
				3446	const char *sched_usage[] = {
				3447	NULL,
				3448	NULL
				3449	};
				3450	struct trace_sched_handler lat_ops = {
				3451	.wakeup_event = latency_wakeup_event,
				3452	.switch_event = latency_switch_event,
				3453	.runtime_event = latency_runtime_event,
				3454	.migrate_task_event = latency_migrate_task_event,
				3455	};
				3456	struct trace_sched_handler map_ops = {
				3457	.switch_event = map_switch_event,
				3458	};
				3459	struct trace_sched_handler replay_ops = {
				3460	.wakeup_event = replay_wakeup_event,
				3461	.switch_event = replay_switch_event,
				3462	.fork_event = replay_fork_event,
				3463	};
				3464	unsigned int i;
				3465
				3466	for (i = 0; i < ARRAY_SIZE(sched.curr_pid); i++)
				3467	sched.curr_pid[i] = -1;
				3468
				3469	argc = parse_options_subcommand(argc, argv, sched_options, sched_subcommands,
				3470	sched_usage, PARSE_OPT_STOP_AT_NON_OPTION);
				3471	if (!argc)
				3472	usage_with_options(sched_usage, sched_options);
				3473
				3474	/*
				3475	* Aliased to 'perf script' for now:
				3476	*/
				3477	if (!strcmp(argv[0], "script"))
				3478	return cmd_script(argc, argv);
				3479
				3480	if (!strncmp(argv[0], "rec", 3)) {
				3481	return __cmd_record(argc, argv);
				3482	} else if (!strncmp(argv[0], "lat", 3)) {
				3483	sched.tp_handler = &lat_ops;
				3484	if (argc > 1) {
				3485	argc = parse_options(argc, argv, latency_options, latency_usage, 0);
				3486	if (argc)
				3487	usage_with_options(latency_usage, latency_options);
				3488	}
				3489	setup_sorting(&sched, latency_options, latency_usage);
				3490	return perf_sched__lat(&sched);
				3491	} else if (!strcmp(argv[0], "map")) {
				3492	if (argc) {
				3493	argc = parse_options(argc, argv, map_options, map_usage, 0);
				3494	if (argc)
				3495	usage_with_options(map_usage, map_options);
				3496	}
				3497	sched.tp_handler = &map_ops;
				3498	setup_sorting(&sched, latency_options, latency_usage);
				3499	return perf_sched__map(&sched);
				3500	} else if (!strncmp(argv[0], "rep", 3)) {
				3501	sched.tp_handler = &replay_ops;
				3502	if (argc) {
				3503	argc = parse_options(argc, argv, replay_options, replay_usage, 0);
				3504	if (argc)
				3505	usage_with_options(replay_usage, replay_options);
				3506	}
				3507	return perf_sched__replay(&sched);
				3508	} else if (!strcmp(argv[0], "timehist")) {
				3509	if (argc) {
				3510	argc = parse_options(argc, argv, timehist_options,
				3511	timehist_usage, 0);
				3512	if (argc)
				3513	usage_with_options(timehist_usage, timehist_options);
				3514	}
				3515	if ((sched.show_wakeups \|\| sched.show_next) &&
				3516	sched.summary_only) {
				3517	pr_err(" Error: -s and -[n\|w] are mutually exclusive.\n");
				3518	parse_options_usage(timehist_usage, timehist_options, "s", true);
				3519	if (sched.show_wakeups)
				3520	parse_options_usage(NULL, timehist_options, "w", true);
				3521	if (sched.show_next)
				3522	parse_options_usage(NULL, timehist_options, "n", true);
				3523	return -EINVAL;
				3524	}
				3525
				3526	return perf_sched__timehist(&sched);
				3527	} else {
				3528	usage_with_options(sched_usage, sched_options);
				3529	}
				3530
				3531	return 0;
				3532	}