Blame - marvell/linux/kernel/events/core.c - T108

blob: 87f17b776593746606914b3d8128dcebe627a225 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Performance events core code:
				4	*
				5	* Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
				6	* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
				7	* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
				8	* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
				9	*/
				10
				11	#include <linux/fs.h>
				12	#include <linux/mm.h>
				13	#include <linux/cpu.h>
				14	#include <linux/smp.h>
				15	#include <linux/idr.h>
				16	#include <linux/file.h>
				17	#include <linux/poll.h>
				18	#include <linux/slab.h>
				19	#include <linux/hash.h>
				20	#include <linux/tick.h>
				21	#include <linux/sysfs.h>
				22	#include <linux/dcache.h>
				23	#include <linux/percpu.h>
				24	#include <linux/ptrace.h>
				25	#include <linux/reboot.h>
				26	#include <linux/vmstat.h>
				27	#include <linux/device.h>
				28	#include <linux/export.h>
				29	#include <linux/vmalloc.h>
				30	#include <linux/hardirq.h>
				31	#include <linux/rculist.h>
				32	#include <linux/uaccess.h>
				33	#include <linux/syscalls.h>
				34	#include <linux/anon_inodes.h>
				35	#include <linux/kernel_stat.h>
				36	#include <linux/cgroup.h>
				37	#include <linux/perf_event.h>
				38	#include <linux/trace_events.h>
				39	#include <linux/hw_breakpoint.h>
				40	#include <linux/mm_types.h>
				41	#include <linux/module.h>
				42	#include <linux/mman.h>
				43	#include <linux/compat.h>
				44	#include <linux/bpf.h>
				45	#include <linux/filter.h>
				46	#include <linux/namei.h>
				47	#include <linux/parser.h>
				48	#include <linux/sched/clock.h>
				49	#include <linux/sched/mm.h>
				50	#include <linux/proc_ns.h>
				51	#include <linux/mount.h>
				52
				53	#include "internal.h"
				54
				55	#include <asm/irq_regs.h>
				56
				57	typedef int (remote_function_f)(void );
				58
				59	struct remote_function_call {
				60	struct task_struct *p;
				61	remote_function_f func;
				62	void *info;
				63	int ret;
				64	};
				65
				66	static void remote_function(void *data)
				67	{
				68	struct remote_function_call *tfc = data;
				69	struct task_struct *p = tfc->p;
				70
				71	if (p) {
				72	/* -EAGAIN */
				73	if (task_cpu(p) != smp_processor_id())
				74	return;
				75
				76	/*
				77	* Now that we're on right CPU with IRQs disabled, we can test
				78	* if we hit the right task without races.
				79	*/
				80
				81	tfc->ret = -ESRCH; /* No such (running) process */
				82	if (p != current)
				83	return;
				84	}
				85
				86	tfc->ret = tfc->func(tfc->info);
				87	}
				88
				89	/**
				90	* task_function_call - call a function on the cpu on which a task runs
				91	* @p: the task to evaluate
				92	* @func: the function to be called
				93	* @info: the function call argument
				94	*
				95	* Calls the function @func when the task is currently running. This might
				96	* be on the current CPU, which just calls the function directly. This will
				97	* retry due to any failures in smp_call_function_single(), such as if the
				98	* task_cpu() goes offline concurrently.
				99	*
				100	* returns @func return value or -ESRCH or -ENXIO when the process isn't running
				101	*/
				102	static int
				103	task_function_call(struct task_struct p, remote_function_f func, void info)
				104	{
				105	struct remote_function_call data = {
				106	.p = p,
				107	.func = func,
				108	.info = info,
				109	.ret = -EAGAIN,
				110	};
				111	int ret;
				112
				113	for (;;) {
				114	ret = smp_call_function_single(task_cpu(p), remote_function,
				115	&data, 1);
				116	if (!ret)
				117	ret = data.ret;
				118
				119	if (ret != -EAGAIN)
				120	break;
				121
				122	cond_resched();
				123	}
				124
				125	return ret;
				126	}
				127
				128	/**
				129	* cpu_function_call - call a function on the cpu
				130	* @func: the function to be called
				131	* @info: the function call argument
				132	*
				133	* Calls the function @func on the remote cpu.
				134	*
				135	* returns: @func return value or -ENXIO when the cpu is offline
				136	*/
				137	static int cpu_function_call(int cpu, remote_function_f func, void *info)
				138	{
				139	struct remote_function_call data = {
				140	.p = NULL,
				141	.func = func,
				142	.info = info,
				143	.ret = -ENXIO, /* No such CPU */
				144	};
				145
				146	smp_call_function_single(cpu, remote_function, &data, 1);
				147
				148	return data.ret;
				149	}
				150
				151	static inline struct perf_cpu_context *
				152	__get_cpu_context(struct perf_event_context *ctx)
				153	{
				154	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
				155	}
				156
				157	static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
				158	struct perf_event_context *ctx)
				159	{
				160	raw_spin_lock(&cpuctx->ctx.lock);
				161	if (ctx)
				162	raw_spin_lock(&ctx->lock);
				163	}
				164
				165	static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
				166	struct perf_event_context *ctx)
				167	{
				168	if (ctx)
				169	raw_spin_unlock(&ctx->lock);
				170	raw_spin_unlock(&cpuctx->ctx.lock);
				171	}
				172
				173	#define TASK_TOMBSTONE ((void *)-1L)
				174
				175	static bool is_kernel_event(struct perf_event *event)
				176	{
				177	return READ_ONCE(event->owner) == TASK_TOMBSTONE;
				178	}
				179
				180	/*
				181	* On task ctx scheduling...
				182	*
				183	* When !ctx->nr_events a task context will not be scheduled. This means
				184	* we can disable the scheduler hooks (for performance) without leaving
				185	* pending task ctx state.
				186	*
				187	* This however results in two special cases:
				188	*
				189	* - removing the last event from a task ctx; this is relatively straight
				190	* forward and is done in __perf_remove_from_context.
				191	*
				192	* - adding the first event to a task ctx; this is tricky because we cannot
				193	* rely on ctx->is_active and therefore cannot use event_function_call().
				194	* See perf_install_in_context().
				195	*
				196	* If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
				197	*/
				198
				199	typedef void (event_f)(struct perf_event , struct perf_cpu_context *,
				200	struct perf_event_context , void );
				201
				202	struct event_function_struct {
				203	struct perf_event *event;
				204	event_f func;
				205	void *data;
				206	};
				207
				208	static int event_function(void *info)
				209	{
				210	struct event_function_struct *efs = info;
				211	struct perf_event *event = efs->event;
				212	struct perf_event_context *ctx = event->ctx;
				213	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
				214	struct perf_event_context *task_ctx = cpuctx->task_ctx;
				215	int ret = 0;
				216
				217	lockdep_assert_irqs_disabled();
				218
				219	perf_ctx_lock(cpuctx, task_ctx);
				220	/*
				221	* Since we do the IPI call without holding ctx->lock things can have
				222	* changed, double check we hit the task we set out to hit.
				223	*/
				224	if (ctx->task) {
				225	if (ctx->task != current) {
				226	ret = -ESRCH;
				227	goto unlock;
				228	}
				229
				230	/*
				231	* We only use event_function_call() on established contexts,
				232	* and event_function() is only ever called when active (or
				233	* rather, we'll have bailed in task_function_call() or the
				234	* above ctx->task != current test), therefore we must have
				235	* ctx->is_active here.
				236	*/
				237	WARN_ON_ONCE(!ctx->is_active);
				238	/*
				239	* And since we have ctx->is_active, cpuctx->task_ctx must
				240	* match.
				241	*/
				242	WARN_ON_ONCE(task_ctx != ctx);
				243	} else {
				244	WARN_ON_ONCE(&cpuctx->ctx != ctx);
				245	}
				246
				247	efs->func(event, cpuctx, ctx, efs->data);
				248	unlock:
				249	perf_ctx_unlock(cpuctx, task_ctx);
				250
				251	return ret;
				252	}
				253
				254	static void event_function_call(struct perf_event event, event_f func, void data)
				255	{
				256	struct perf_event_context *ctx = event->ctx;
				257	struct task_struct task = READ_ONCE(ctx->task); / verified in event_function */
				258	struct event_function_struct efs = {
				259	.event = event,
				260	.func = func,
				261	.data = data,
				262	};
				263
				264	if (!event->parent) {
				265	/*
				266	* If this is a !child event, we must hold ctx::mutex to
				267	* stabilize the the event->ctx relation. See
				268	* perf_event_ctx_lock().
				269	*/
				270	lockdep_assert_held(&ctx->mutex);
				271	}
				272
				273	if (!task) {
				274	cpu_function_call(event->cpu, event_function, &efs);
				275	return;
				276	}
				277
				278	if (task == TASK_TOMBSTONE)
				279	return;
				280
				281	again:
				282	if (!task_function_call(task, event_function, &efs))
				283	return;
				284
				285	raw_spin_lock_irq(&ctx->lock);
				286	/*
				287	* Reload the task pointer, it might have been changed by
				288	* a concurrent perf_event_context_sched_out().
				289	*/
				290	task = ctx->task;
				291	if (task == TASK_TOMBSTONE) {
				292	raw_spin_unlock_irq(&ctx->lock);
				293	return;
				294	}
				295	if (ctx->is_active) {
				296	raw_spin_unlock_irq(&ctx->lock);
				297	goto again;
				298	}
				299	func(event, NULL, ctx, data);
				300	raw_spin_unlock_irq(&ctx->lock);
				301	}
				302
				303	/*
				304	* Similar to event_function_call() + event_function(), but hard assumes IRQs
				305	* are already disabled and we're on the right CPU.
				306	*/
				307	static void event_function_local(struct perf_event event, event_f func, void data)
				308	{
				309	struct perf_event_context *ctx = event->ctx;
				310	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
				311	struct task_struct *task = READ_ONCE(ctx->task);
				312	struct perf_event_context *task_ctx = NULL;
				313
				314	lockdep_assert_irqs_disabled();
				315
				316	if (task) {
				317	if (task == TASK_TOMBSTONE)
				318	return;
				319
				320	task_ctx = ctx;
				321	}
				322
				323	perf_ctx_lock(cpuctx, task_ctx);
				324
				325	task = ctx->task;
				326	if (task == TASK_TOMBSTONE)
				327	goto unlock;
				328
				329	if (task) {
				330	/*
				331	* We must be either inactive or active and the right task,
				332	* otherwise we're screwed, since we cannot IPI to somewhere
				333	* else.
				334	*/
				335	if (ctx->is_active) {
				336	if (WARN_ON_ONCE(task != current))
				337	goto unlock;
				338
				339	if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
				340	goto unlock;
				341	}
				342	} else {
				343	WARN_ON_ONCE(&cpuctx->ctx != ctx);
				344	}
				345
				346	func(event, cpuctx, ctx, data);
				347	unlock:
				348	perf_ctx_unlock(cpuctx, task_ctx);
				349	}
				350
				351	#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP \|\
				352	PERF_FLAG_FD_OUTPUT \|\
				353	PERF_FLAG_PID_CGROUP \|\
				354	PERF_FLAG_FD_CLOEXEC)
				355
				356	/*
				357	* branch priv levels that need permission checks
				358	*/
				359	#define PERF_SAMPLE_BRANCH_PERM_PLM \
				360	(PERF_SAMPLE_BRANCH_KERNEL \|\
				361	PERF_SAMPLE_BRANCH_HV)
				362
				363	enum event_type_t {
				364	EVENT_FLEXIBLE = 0x1,
				365	EVENT_PINNED = 0x2,
				366	EVENT_TIME = 0x4,
				367	/* see ctx_resched() for details */
				368	EVENT_CPU = 0x8,
				369	EVENT_ALL = EVENT_FLEXIBLE \| EVENT_PINNED,
				370	};
				371
				372	/*
				373	* perf_sched_events : >0 events exist
				374	* perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
				375	*/
				376
				377	static void perf_sched_delayed(struct work_struct *work);
				378	DEFINE_STATIC_KEY_FALSE(perf_sched_events);
				379	static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
				380	static DEFINE_MUTEX(perf_sched_mutex);
				381	static atomic_t perf_sched_count;
				382
				383	static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
				384	static DEFINE_PER_CPU(int, perf_sched_cb_usages);
				385	static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
				386
				387	static atomic_t nr_mmap_events __read_mostly;
				388	static atomic_t nr_comm_events __read_mostly;
				389	static atomic_t nr_namespaces_events __read_mostly;
				390	static atomic_t nr_task_events __read_mostly;
				391	static atomic_t nr_freq_events __read_mostly;
				392	static atomic_t nr_switch_events __read_mostly;
				393	static atomic_t nr_ksymbol_events __read_mostly;
				394	static atomic_t nr_bpf_events __read_mostly;
				395
				396	static LIST_HEAD(pmus);
				397	static DEFINE_MUTEX(pmus_lock);
				398	static struct srcu_struct pmus_srcu;
				399	static cpumask_var_t perf_online_mask;
				400
				401	/*
				402	* perf event paranoia level:
				403	* -1 - not paranoid at all
				404	* 0 - disallow raw tracepoint access for unpriv
				405	* 1 - disallow cpu events for unpriv
				406	* 2 - disallow kernel profiling for unpriv
				407	*/
				408	int sysctl_perf_event_paranoid __read_mostly = 2;
				409
				410	/* Minimum for 512 kiB + 1 user control page */
				411	int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
				412
				413	/*
				414	* max perf event sample rate
				415	*/
				416	#define DEFAULT_MAX_SAMPLE_RATE 100000
				417	#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
				418	#define DEFAULT_CPU_TIME_MAX_PERCENT 25
				419
				420	int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
				421
				422	static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
				423	static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
				424
				425	static int perf_sample_allowed_ns __read_mostly =
				426	DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
				427
				428	static void update_perf_cpu_limits(void)
				429	{
				430	u64 tmp = perf_sample_period_ns;
				431
				432	tmp *= sysctl_perf_cpu_time_max_percent;
				433	tmp = div_u64(tmp, 100);
				434	if (!tmp)
				435	tmp = 1;
				436
				437	WRITE_ONCE(perf_sample_allowed_ns, tmp);
				438	}
				439
				440	static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
				441
				442	int perf_proc_update_handler(struct ctl_table *table, int write,
				443	void __user buffer, size_t lenp,
				444	loff_t *ppos)
				445	{
				446	int ret;
				447	int perf_cpu = sysctl_perf_cpu_time_max_percent;
				448	/*
				449	* If throttling is disabled don't allow the write:
				450	*/
				451	if (write && (perf_cpu == 100 \|\| perf_cpu == 0))
				452	return -EINVAL;
				453
				454	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
				455	if (ret \|\| !write)
				456	return ret;
				457
				458	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
				459	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
				460	update_perf_cpu_limits();
				461
				462	return 0;
				463	}
				464
				465	int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
				466
				467	int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
				468	void __user buffer, size_t lenp,
				469	loff_t *ppos)
				470	{
				471	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
				472
				473	if (ret \|\| !write)
				474	return ret;
				475
				476	if (sysctl_perf_cpu_time_max_percent == 100 \|\|
				477	sysctl_perf_cpu_time_max_percent == 0) {
				478	printk(KERN_WARNING
				479	"perf: Dynamic interrupt throttling disabled, can hang your system!\n");
				480	WRITE_ONCE(perf_sample_allowed_ns, 0);
				481	} else {
				482	update_perf_cpu_limits();
				483	}
				484
				485	return 0;
				486	}
				487
				488	/*
				489	* perf samples are done in some very critical code paths (NMIs).
				490	* If they take too much CPU time, the system can lock up and not
				491	* get any real work done. This will drop the sample rate when
				492	* we detect that events are taking too long.
				493	*/
				494	#define NR_ACCUMULATED_SAMPLES 128
				495	static DEFINE_PER_CPU(u64, running_sample_length);
				496
				497	static u64 __report_avg;
				498	static u64 __report_allowed;
				499
				500	static void perf_duration_warn(struct irq_work *w)
				501	{
				502	printk_ratelimited(KERN_INFO
				503	"perf: interrupt took too long (%lld > %lld), lowering "
				504	"kernel.perf_event_max_sample_rate to %d\n",
				505	__report_avg, __report_allowed,
				506	sysctl_perf_event_sample_rate);
				507	}
				508
				509	static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
				510
				511	void perf_sample_event_took(u64 sample_len_ns)
				512	{
				513	u64 max_len = READ_ONCE(perf_sample_allowed_ns);
				514	u64 running_len;
				515	u64 avg_len;
				516	u32 max;
				517
				518	if (max_len == 0)
				519	return;
				520
				521	/* Decay the counter by 1 average sample. */
				522	running_len = __this_cpu_read(running_sample_length);
				523	running_len -= running_len/NR_ACCUMULATED_SAMPLES;
				524	running_len += sample_len_ns;
				525	__this_cpu_write(running_sample_length, running_len);
				526
				527	/*
				528	* Note: this will be biased artifically low until we have
				529	* seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
				530	* from having to maintain a count.
				531	*/
				532	avg_len = running_len/NR_ACCUMULATED_SAMPLES;
				533	if (avg_len <= max_len)
				534	return;
				535
				536	__report_avg = avg_len;
				537	__report_allowed = max_len;
				538
				539	/*
				540	* Compute a throttle threshold 25% below the current duration.
				541	*/
				542	avg_len += avg_len / 4;
				543	max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
				544	if (avg_len < max)
				545	max /= (u32)avg_len;
				546	else
				547	max = 1;
				548
				549	WRITE_ONCE(perf_sample_allowed_ns, avg_len);
				550	WRITE_ONCE(max_samples_per_tick, max);
				551
				552	sysctl_perf_event_sample_rate = max * HZ;
				553	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
				554
				555	if (!irq_work_queue(&perf_duration_work)) {
				556	early_printk("perf: interrupt took too long (%lld > %lld), lowering "
				557	"kernel.perf_event_max_sample_rate to %d\n",
				558	__report_avg, __report_allowed,
				559	sysctl_perf_event_sample_rate);
				560	}
				561	}
				562
				563	static atomic64_t perf_event_id;
				564
				565	static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
				566	enum event_type_t event_type);
				567
				568	static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
				569	enum event_type_t event_type,
				570	struct task_struct *task);
				571
				572	static void update_context_time(struct perf_event_context *ctx);
				573	static u64 perf_event_time(struct perf_event *event);
				574
				575	void __weak perf_event_print_debug(void) { }
				576
				577	extern __weak const char *perf_pmu_name(void)
				578	{
				579	return "pmu";
				580	}
				581
				582	static inline u64 perf_clock(void)
				583	{
				584	return local_clock();
				585	}
				586
				587	static inline u64 perf_event_clock(struct perf_event *event)
				588	{
				589	return event->clock();
				590	}
				591
				592	/*
				593	* State based event timekeeping...
				594	*
				595	* The basic idea is to use event->state to determine which (if any) time
				596	* fields to increment with the current delta. This means we only need to
				597	* update timestamps when we change state or when they are explicitly requested
				598	* (read).
				599	*
				600	* Event groups make things a little more complicated, but not terribly so. The
				601	* rules for a group are that if the group leader is OFF the entire group is
				602	* OFF, irrespecive of what the group member states are. This results in
				603	* __perf_effective_state().
				604	*
				605	* A futher ramification is that when a group leader flips between OFF and
				606	* !OFF, we need to update all group member times.
				607	*
				608	*
				609	* NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
				610	* need to make sure the relevant context time is updated before we try and
				611	* update our timestamps.
				612	*/
				613
				614	static __always_inline enum perf_event_state
				615	__perf_effective_state(struct perf_event *event)
				616	{
				617	struct perf_event *leader = event->group_leader;
				618
				619	if (leader->state <= PERF_EVENT_STATE_OFF)
				620	return leader->state;
				621
				622	return event->state;
				623	}
				624
				625	static __always_inline void
				626	__perf_update_times(struct perf_event event, u64 now, u64 enabled, u64 *running)
				627	{
				628	enum perf_event_state state = __perf_effective_state(event);
				629	u64 delta = now - event->tstamp;
				630
				631	*enabled = event->total_time_enabled;
				632	if (state >= PERF_EVENT_STATE_INACTIVE)
				633	*enabled += delta;
				634
				635	*running = event->total_time_running;
				636	if (state >= PERF_EVENT_STATE_ACTIVE)
				637	*running += delta;
				638	}
				639
				640	static void perf_event_update_time(struct perf_event *event)
				641	{
				642	u64 now = perf_event_time(event);
				643
				644	__perf_update_times(event, now, &event->total_time_enabled,
				645	&event->total_time_running);
				646	event->tstamp = now;
				647	}
				648
				649	static void perf_event_update_sibling_time(struct perf_event *leader)
				650	{
				651	struct perf_event *sibling;
				652
				653	for_each_sibling_event(sibling, leader)
				654	perf_event_update_time(sibling);
				655	}
				656
				657	static void
				658	perf_event_set_state(struct perf_event *event, enum perf_event_state state)
				659	{
				660	if (event->state == state)
				661	return;
				662
				663	perf_event_update_time(event);
				664	/*
				665	* If a group leader gets enabled/disabled all its siblings
				666	* are affected too.
				667	*/
				668	if ((event->state < 0) ^ (state < 0))
				669	perf_event_update_sibling_time(event);
				670
				671	WRITE_ONCE(event->state, state);
				672	}
				673
				674	#ifdef CONFIG_CGROUP_PERF
				675
				676	static inline bool
				677	perf_cgroup_match(struct perf_event *event)
				678	{
				679	struct perf_event_context *ctx = event->ctx;
				680	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
				681
				682	/* @event doesn't care about cgroup */
				683	if (!event->cgrp)
				684	return true;
				685
				686	/* wants specific cgroup scope but @cpuctx isn't associated with any */
				687	if (!cpuctx->cgrp)
				688	return false;
				689
				690	/*
				691	* Cgroup scoping is recursive. An event enabled for a cgroup is
				692	* also enabled for all its descendant cgroups. If @cpuctx's
				693	* cgroup is a descendant of @event's (the test covers identity
				694	* case), it's a match.
				695	*/
				696	return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
				697	event->cgrp->css.cgroup);
				698	}
				699
				700	static inline void perf_detach_cgroup(struct perf_event *event)
				701	{
				702	css_put(&event->cgrp->css);
				703	event->cgrp = NULL;
				704	}
				705
				706	static inline int is_cgroup_event(struct perf_event *event)
				707	{
				708	return event->cgrp != NULL;
				709	}
				710
				711	static inline u64 perf_cgroup_event_time(struct perf_event *event)
				712	{
				713	struct perf_cgroup_info *t;
				714
				715	t = per_cpu_ptr(event->cgrp->info, event->cpu);
				716	return t->time;
				717	}
				718
				719	static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
				720	{
				721	struct perf_cgroup_info *info;
				722	u64 now;
				723
				724	now = perf_clock();
				725
				726	info = this_cpu_ptr(cgrp->info);
				727
				728	info->time += now - info->timestamp;
				729	info->timestamp = now;
				730	}
				731
				732	static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
				733	{
				734	struct perf_cgroup *cgrp = cpuctx->cgrp;
				735	struct cgroup_subsys_state *css;
				736
				737	if (cgrp) {
				738	for (css = &cgrp->css; css; css = css->parent) {
				739	cgrp = container_of(css, struct perf_cgroup, css);
				740	__update_cgrp_time(cgrp);
				741	}
				742	}
				743	}
				744
				745	static inline void update_cgrp_time_from_event(struct perf_event *event)
				746	{
				747	struct perf_cgroup *cgrp;
				748
				749	/*
				750	* ensure we access cgroup data only when needed and
				751	* when we know the cgroup is pinned (css_get)
				752	*/
				753	if (!is_cgroup_event(event))
				754	return;
				755
				756	cgrp = perf_cgroup_from_task(current, event->ctx);
				757	/*
				758	* Do not update time when cgroup is not active
				759	*/
				760	if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
				761	__update_cgrp_time(event->cgrp);
				762	}
				763
				764	static inline void
				765	perf_cgroup_set_timestamp(struct task_struct *task,
				766	struct perf_event_context *ctx)
				767	{
				768	struct perf_cgroup *cgrp;
				769	struct perf_cgroup_info *info;
				770	struct cgroup_subsys_state *css;
				771
				772	/*
				773	* ctx->lock held by caller
				774	* ensure we do not access cgroup data
				775	* unless we have the cgroup pinned (css_get)
				776	*/
				777	if (!task \|\| !ctx->nr_cgroups)
				778	return;
				779
				780	cgrp = perf_cgroup_from_task(task, ctx);
				781
				782	for (css = &cgrp->css; css; css = css->parent) {
				783	cgrp = container_of(css, struct perf_cgroup, css);
				784	info = this_cpu_ptr(cgrp->info);
				785	info->timestamp = ctx->timestamp;
				786	}
				787	}
				788
				789	static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
				790
				791	#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
				792	#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
				793
				794	/*
				795	* reschedule events based on the cgroup constraint of task.
				796	*
				797	* mode SWOUT : schedule out everything
				798	* mode SWIN : schedule in based on cgroup for next
				799	*/
				800	static void perf_cgroup_switch(struct task_struct *task, int mode)
				801	{
				802	struct perf_cpu_context cpuctx, tmp;
				803	struct list_head *list;
				804	unsigned long flags;
				805
				806	/*
				807	* Disable interrupts and preemption to avoid this CPU's
				808	* cgrp_cpuctx_entry to change under us.
				809	*/
				810	local_irq_save(flags);
				811
				812	list = this_cpu_ptr(&cgrp_cpuctx_list);
				813	list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
				814	WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
				815
				816	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
				817	perf_pmu_disable(cpuctx->ctx.pmu);
				818
				819	if (mode & PERF_CGROUP_SWOUT) {
				820	cpu_ctx_sched_out(cpuctx, EVENT_ALL);
				821	/*
				822	* must not be done before ctxswout due
				823	* to event_filter_match() in event_sched_out()
				824	*/
				825	cpuctx->cgrp = NULL;
				826	}
				827
				828	if (mode & PERF_CGROUP_SWIN) {
				829	WARN_ON_ONCE(cpuctx->cgrp);
				830	/*
				831	* set cgrp before ctxsw in to allow
				832	* event_filter_match() to not have to pass
				833	* task around
				834	* we pass the cpuctx->ctx to perf_cgroup_from_task()
				835	* because cgorup events are only per-cpu
				836	*/
				837	cpuctx->cgrp = perf_cgroup_from_task(task,
				838	&cpuctx->ctx);
				839	cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
				840	}
				841	perf_pmu_enable(cpuctx->ctx.pmu);
				842	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
				843	}
				844
				845	local_irq_restore(flags);
				846	}
				847
				848	static inline void perf_cgroup_sched_out(struct task_struct *task,
				849	struct task_struct *next)
				850	{
				851	struct perf_cgroup *cgrp1;
				852	struct perf_cgroup *cgrp2 = NULL;
				853
				854	rcu_read_lock();
				855	/*
				856	* we come here when we know perf_cgroup_events > 0
				857	* we do not need to pass the ctx here because we know
				858	* we are holding the rcu lock
				859	*/
				860	cgrp1 = perf_cgroup_from_task(task, NULL);
				861	cgrp2 = perf_cgroup_from_task(next, NULL);
				862
				863	/*
				864	* only schedule out current cgroup events if we know
				865	* that we are switching to a different cgroup. Otherwise,
				866	* do no touch the cgroup events.
				867	*/
				868	if (cgrp1 != cgrp2)
				869	perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
				870
				871	rcu_read_unlock();
				872	}
				873
				874	static inline void perf_cgroup_sched_in(struct task_struct *prev,
				875	struct task_struct *task)
				876	{
				877	struct perf_cgroup *cgrp1;
				878	struct perf_cgroup *cgrp2 = NULL;
				879
				880	rcu_read_lock();
				881	/*
				882	* we come here when we know perf_cgroup_events > 0
				883	* we do not need to pass the ctx here because we know
				884	* we are holding the rcu lock
				885	*/
				886	cgrp1 = perf_cgroup_from_task(task, NULL);
				887	cgrp2 = perf_cgroup_from_task(prev, NULL);
				888
				889	/*
				890	* only need to schedule in cgroup events if we are changing
				891	* cgroup during ctxsw. Cgroup events were not scheduled
				892	* out of ctxsw out if that was not the case.
				893	*/
				894	if (cgrp1 != cgrp2)
				895	perf_cgroup_switch(task, PERF_CGROUP_SWIN);
				896
				897	rcu_read_unlock();
				898	}
				899
				900	static inline int perf_cgroup_connect(int fd, struct perf_event *event,
				901	struct perf_event_attr *attr,
				902	struct perf_event *group_leader)
				903	{
				904	struct perf_cgroup *cgrp;
				905	struct cgroup_subsys_state *css;
				906	struct fd f = fdget(fd);
				907	int ret = 0;
				908
				909	if (!f.file)
				910	return -EBADF;
				911
				912	css = css_tryget_online_from_dir(f.file->f_path.dentry,
				913	&perf_event_cgrp_subsys);
				914	if (IS_ERR(css)) {
				915	ret = PTR_ERR(css);
				916	goto out;
				917	}
				918
				919	cgrp = container_of(css, struct perf_cgroup, css);
				920	event->cgrp = cgrp;
				921
				922	/*
				923	* all events in a group must monitor
				924	* the same cgroup because a task belongs
				925	* to only one perf cgroup at a time
				926	*/
				927	if (group_leader && group_leader->cgrp != cgrp) {
				928	perf_detach_cgroup(event);
				929	ret = -EINVAL;
				930	}
				931	out:
				932	fdput(f);
				933	return ret;
				934	}
				935
				936	static inline void
				937	perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
				938	{
				939	struct perf_cgroup_info *t;
				940	t = per_cpu_ptr(event->cgrp->info, event->cpu);
				941	event->shadow_ctx_time = now - t->timestamp;
				942	}
				943
				944	/*
				945	* Update cpuctx->cgrp so that it is set when first cgroup event is added and
				946	* cleared when last cgroup event is removed.
				947	*/
				948	static inline void
				949	list_update_cgroup_event(struct perf_event *event,
				950	struct perf_event_context *ctx, bool add)
				951	{
				952	struct perf_cpu_context *cpuctx;
				953	struct list_head *cpuctx_entry;
				954
				955	if (!is_cgroup_event(event))
				956	return;
				957
				958	/*
				959	* Because cgroup events are always per-cpu events,
				960	* this will always be called from the right CPU.
				961	*/
				962	cpuctx = __get_cpu_context(ctx);
				963
				964	/*
				965	* Since setting cpuctx->cgrp is conditional on the current @cgrp
				966	* matching the event's cgroup, we must do this for every new event,
				967	* because if the first would mismatch, the second would not try again
				968	* and we would leave cpuctx->cgrp unset.
				969	*/
				970	if (add && !cpuctx->cgrp) {
				971	struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
				972
				973	if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
				974	cpuctx->cgrp = cgrp;
				975	}
				976
				977	if (add && ctx->nr_cgroups++)
				978	return;
				979	else if (!add && --ctx->nr_cgroups)
				980	return;
				981
				982	/* no cgroup running */
				983	if (!add)
				984	cpuctx->cgrp = NULL;
				985
				986	cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
				987	if (add)
				988	list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
				989	else
				990	list_del(cpuctx_entry);
				991	}
				992
				993	#else /* !CONFIG_CGROUP_PERF */
				994
				995	static inline bool
				996	perf_cgroup_match(struct perf_event *event)
				997	{
				998	return true;
				999	}
				1000
				/* !CONFIG_CGROUP_PERF stub: nothing to detach. */
				static inline void perf_detach_cgroup(struct perf_event *event)
				{
				}
				1003
				/* !CONFIG_CGROUP_PERF stub: no event is ever a cgroup event. */
				static inline int is_cgroup_event(struct perf_event *event)
				{
					return 0;
				}
				1008
				/* !CONFIG_CGROUP_PERF stub: intentionally empty. */
				static inline void update_cgrp_time_from_event(struct perf_event *event)
				{
				}
				1012
				/* !CONFIG_CGROUP_PERF stub: intentionally empty. */
				static inline void
				update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
				{
				}
				1016
				/* !CONFIG_CGROUP_PERF stub: intentionally empty. */
				static inline void
				perf_cgroup_sched_out(struct task_struct *task, struct task_struct *next)
				{
				}
				1021
				/* !CONFIG_CGROUP_PERF stub: intentionally empty. */
				static inline void
				perf_cgroup_sched_in(struct task_struct *prev, struct task_struct *task)
				{
				}
				1026
				1027	static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
				1028	struct perf_event_attr *attr,
				1029	struct perf_event *group_leader)
				1030	{
				1031	return -EINVAL;
				1032	}
				1033
				/* !CONFIG_CGROUP_PERF stub: intentionally empty. */
				static inline void perf_cgroup_set_timestamp(struct task_struct *task,
									     struct perf_event_context *ctx)
				{
				}
				1039
				/*
				 * !CONFIG_CGROUP_PERF stub: intentionally empty. Both tasks are taken by
				 * pointer, like the neighbouring sched_in/sched_out stubs.
				 */
				static inline void
				perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
				{
				}
				1044
				1045	static inline void
				1046	perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
				1047	{
				1048	}
				1049
				1050	static inline u64 perf_cgroup_event_time(struct perf_event *event)
				1051	{
				1052	return 0;
				1053	}
				1054
				1055	static inline void
				1056	list_update_cgroup_event(struct perf_event *event,
				1057	struct perf_event_context *ctx, bool add)
				1058	{
				1059	}
				1060
				1061	#endif
				1062
				/*
				 * Default multiplexing interval in milliseconds. It is derived from the
				 * timer tick (1000 / HZ), just like the original code.
				 */
				#define PERF_CPU_HRTIMER (1000 / HZ)
				1068	/*
				1069	* function must be called with interrupts disabled
				1070	*/
				1071	static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
				1072	{
				1073	struct perf_cpu_context *cpuctx;
				1074	bool rotations;
				1075
				1076	lockdep_assert_irqs_disabled();
				1077
				1078	cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
				1079	rotations = perf_rotate_context(cpuctx);
				1080
				1081	raw_spin_lock(&cpuctx->hrtimer_lock);
				1082	if (rotations)
				1083	hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
				1084	else
				1085	cpuctx->hrtimer_active = 0;
				1086	raw_spin_unlock(&cpuctx->hrtimer_lock);
				1087
				1088	return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
				1089	}
				1090
				1091	static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
				1092	{
				1093	struct hrtimer *timer = &cpuctx->hrtimer;
				1094	struct pmu *pmu = cpuctx->ctx.pmu;
				1095	u64 interval;
				1096
				1097	/* no multiplexing needed for SW PMU */
				1098	if (pmu->task_ctx_nr == perf_sw_context)
				1099	return;
				1100
				1101	/*
				1102	* check default is sane, if not set then force to
				1103	* default interval (1/tick)
				1104	*/
				1105	interval = pmu->hrtimer_interval_ms;
				1106	if (interval < 1)
				1107	interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
				1108
				1109	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
				1110
				1111	raw_spin_lock_init(&cpuctx->hrtimer_lock);
				1112	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
				1113	timer->function = perf_mux_hrtimer_handler;
				1114	}
				1115
				1116	static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
				1117	{
				1118	struct hrtimer *timer = &cpuctx->hrtimer;
				1119	struct pmu *pmu = cpuctx->ctx.pmu;
				1120	unsigned long flags;
				1121
				1122	/* not for SW PMU */
				1123	if (pmu->task_ctx_nr == perf_sw_context)
				1124	return 0;
				1125
				1126	raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
				1127	if (!cpuctx->hrtimer_active) {
				1128	cpuctx->hrtimer_active = 1;
				1129	hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
				1130	hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
				1131	}
				1132	raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
				1133
				1134	return 0;
				1135	}
				1136
				/*
				 * void-pointer adapter around perf_mux_hrtimer_restart(); @arg is passed
				 * through as the cpu context.
				 */
				static int perf_mux_hrtimer_restart_ipi(void *arg)
				{
					struct perf_cpu_context *cpuctx = arg;

					return perf_mux_hrtimer_restart(cpuctx);
				}
				1141
				1142	void perf_pmu_disable(struct pmu *pmu)
				1143	{
				1144	int *count = this_cpu_ptr(pmu->pmu_disable_count);
				1145	if (!(*count)++)
				1146	pmu->pmu_disable(pmu);
				1147	}
				1148
				1149	void perf_pmu_enable(struct pmu *pmu)
				1150	{
				1151	int *count = this_cpu_ptr(pmu->pmu_disable_count);
				1152	if (!--(*count))
				1153	pmu->pmu_enable(pmu);
				1154	}
				1155
				1156	static DEFINE_PER_CPU(struct list_head, active_ctx_list);
				1157
				1158	/*
				1159	* perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
				1160	* perf_event_task_tick() are fully serialized because they're strictly cpu
				1161	* affine and perf_event_ctx{activate,deactivate} are called with IRQs
				1162	* disabled, while perf_event_task_tick is called from IRQ context.
				1163	*/
				1164	static void perf_event_ctx_activate(struct perf_event_context *ctx)
				1165	{
				1166	struct list_head *head = this_cpu_ptr(&active_ctx_list);
				1167
				1168	lockdep_assert_irqs_disabled();
				1169
				1170	WARN_ON(!list_empty(&ctx->active_ctx_list));
				1171
				1172	list_add(&ctx->active_ctx_list, head);
				1173	}
				1174
				1175	static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
				1176	{
				1177	lockdep_assert_irqs_disabled();
				1178
				1179	WARN_ON(list_empty(&ctx->active_ctx_list));
				1180
				1181	list_del_init(&ctx->active_ctx_list);
				1182	}
				1183
				1184	static void get_ctx(struct perf_event_context *ctx)
				1185	{
				1186	refcount_inc(&ctx->refcount);
				1187	}
				1188
				1189	static void free_ctx(struct rcu_head *head)
				1190	{
				1191	struct perf_event_context *ctx;
				1192
				1193	ctx = container_of(head, struct perf_event_context, rcu_head);
				1194	kfree(ctx->task_ctx_data);
				1195	kfree(ctx);
				1196	}
				1197
				1198	static void put_ctx(struct perf_event_context *ctx)
				1199	{
				1200	if (refcount_dec_and_test(&ctx->refcount)) {
				1201	if (ctx->parent_ctx)
				1202	put_ctx(ctx->parent_ctx);
				1203	if (ctx->task && ctx->task != TASK_TOMBSTONE)
				1204	put_task_struct(ctx->task);
				1205	call_rcu(&ctx->rcu_head, free_ctx);
				1206	}
				1207	}
				1208
				1209	/*
				1210	* Because of perf_event::ctx migration in sys_perf_event_open::move_group and
				1211	* perf_pmu_migrate_context() we need some magic.
				1212	*
				1213	* Those places that change perf_event::ctx will hold both
				1214	* perf_event_ctx::mutex of the 'old' and 'new' ctx value.
				1215	*
				1216	* Lock ordering is by mutex address. There are two other sites where
				1217	* perf_event_context::mutex nests and those are:
				1218	*
				1219	* - perf_event_exit_task_context() [ child , 0 ]
				1220	* perf_event_exit_event()
				1221	* put_event() [ parent, 1 ]
				1222	*
				1223	* - perf_event_init_context() [ parent, 0 ]
				1224	* inherit_task_group()
				1225	* inherit_group()
				1226	* inherit_event()
				1227	* perf_event_alloc()
				1228	* perf_init_event()
				1229	* perf_try_init_event() [ child , 1 ]
				1230	*
				1231	* While it appears there is an obvious deadlock here -- the parent and child
				1232	* nesting levels are inverted between the two. This is in fact safe because
				1233	* life-time rules separate them. That is an exiting task cannot fork, and a
				1234	* spawning task cannot (yet) exit.
				1235	*
				1236	* But remember that these are parent<->child context relations, and
				1237	* migration does not affect children, therefore these two orderings should not
				1238	* interact.
				1239	*
				1240	* The change in perf_event::ctx does not affect children (as claimed above)
				1241	* because the sys_perf_event_open() case will install a new event and break
				1242	* the ctx parent<->child relation, and perf_pmu_migrate_context() is only
				1243	* concerned with cpuctx and that doesn't have children.
				1244	*
				1245	* The places that change perf_event::ctx will issue:
				1246	*
				1247	* perf_remove_from_context();
				1248	* synchronize_rcu();
				1249	* perf_install_in_context();
				1250	*
				1251	* to affect the change. The remove_from_context() + synchronize_rcu() should
				1252	* quiesce the event, after which we can install it in the new location. This
				1253	* means that only external vectors (perf_fops, prctl) can perturb the event
				1254	* while in transit. Therefore all such accessors should also acquire
				1255	* perf_event_context::mutex to serialize against this.
				1256	*
				1257	* However; because event->ctx can change while we're waiting to acquire
				1258	* ctx->mutex we must be careful and use the below perf_event_ctx_lock()
				1259	* function.
				1260	*
				1261	* Lock order:
				1262	* exec_update_lock
				1263	* task_struct::perf_event_mutex
				1264	* perf_event_context::mutex
				1265	* perf_event::child_mutex;
				1266	* perf_event_context::lock
				1267	* perf_event::mmap_mutex
				1268	* mmap_sem
				1269	* perf_addr_filters_head::lock
				1270	*
				1271	* cpu_hotplug_lock
				1272	* pmus_lock
				1273	* cpuctx->mutex / perf_event_context::mutex
				1274	*/
				1275	static struct perf_event_context *
				1276	perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
				1277	{
				1278	struct perf_event_context *ctx;
				1279
				1280	again:
				1281	rcu_read_lock();
				1282	ctx = READ_ONCE(event->ctx);
				1283	if (!refcount_inc_not_zero(&ctx->refcount)) {
				1284	rcu_read_unlock();
				1285	goto again;
				1286	}
				1287	rcu_read_unlock();
				1288
				1289	mutex_lock_nested(&ctx->mutex, nesting);
				1290	if (event->ctx != ctx) {
				1291	mutex_unlock(&ctx->mutex);
				1292	put_ctx(ctx);
				1293	goto again;
				1294	}
				1295
				1296	return ctx;
				1297	}
				1298
				/* perf_event_ctx_lock_nested() with nesting level 0. */
				static inline struct perf_event_context *
				perf_event_ctx_lock(struct perf_event *event)
				{
					struct perf_event_context *ctx = perf_event_ctx_lock_nested(event, 0);

					return ctx;
				}
				1304
				1305	static void perf_event_ctx_unlock(struct perf_event *event,
				1306	struct perf_event_context *ctx)
				1307	{
				1308	mutex_unlock(&ctx->mutex);
				1309	put_ctx(ctx);
				1310	}
				1311
				1312	/*
				1313	* This must be done under the ctx->lock, such as to serialize against
				1314	* context_equiv(), therefore we cannot call put_ctx() since that might end up
				1315	* calling scheduler related locks and ctx->lock nests inside those.
				1316	*/
				1317	static __must_check struct perf_event_context *
				1318	unclone_ctx(struct perf_event_context *ctx)
				1319	{
				1320	struct perf_event_context *parent_ctx = ctx->parent_ctx;
				1321
				1322	lockdep_assert_held(&ctx->lock);
				1323
				1324	if (parent_ctx)
				1325	ctx->parent_ctx = NULL;
				1326	ctx->generation++;
				1327
				1328	return parent_ctx;
				1329	}
				1330
				1331	static u32 perf_event_pid_type(struct perf_event event, struct task_struct p,
				1332	enum pid_type type)
				1333	{
				1334	u32 nr;
				1335	/*
				1336	* only top level events have the pid namespace they were created in
				1337	*/
				1338	if (event->parent)
				1339	event = event->parent;
				1340
				1341	nr = __task_pid_nr_ns(p, type, event->ns);
				1342	/* avoid -1 if it is idle thread or runs in another ns */
				1343	if (!nr && !pid_alive(p))
				1344	nr = -1;
				1345	return nr;
				1346	}
				1347
				1348	static u32 perf_event_pid(struct perf_event event, struct task_struct p)
				1349	{
				1350	return perf_event_pid_type(event, p, PIDTYPE_TGID);
				1351	}
				1352
				1353	static u32 perf_event_tid(struct perf_event event, struct task_struct p)
				1354	{
				1355	return perf_event_pid_type(event, p, PIDTYPE_PID);
				1356	}
				1357
				1358	/*
				1359	* If we inherit events we want to return the parent event id
				1360	* to userspace.
				1361	*/
				1362	static u64 primary_event_id(struct perf_event *event)
				1363	{
				1364	u64 id = event->id;
				1365
				1366	if (event->parent)
				1367	id = event->parent->id;
				1368
				1369	return id;
				1370	}
				1371
				1372	/*
				1373	* Get the perf_event_context for a task and lock it.
				1374	*
				1375	* This has to cope with with the fact that until it is locked,
				1376	* the context could get moved to another task.
				1377	*/
				1378	static struct perf_event_context *
				1379	perf_lock_task_context(struct task_struct task, int ctxn, unsigned long flags)
				1380	{
				1381	struct perf_event_context *ctx;
				1382
				1383	retry:
				1384	/*
				1385	* One of the few rules of preemptible RCU is that one cannot do
				1386	* rcu_read_unlock() while holding a scheduler (or nested) lock when
				1387	* part of the read side critical section was irqs-enabled -- see
				1388	* rcu_read_unlock_special().
				1389	*
				1390	* Since ctx->lock nests under rq->lock we must ensure the entire read
				1391	* side critical section has interrupts disabled.
				1392	*/
				1393	local_irq_save(*flags);
				1394	rcu_read_lock();
				1395	ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
				1396	if (ctx) {
				1397	/*
				1398	* If this context is a clone of another, it might
				1399	* get swapped for another underneath us by
				1400	* perf_event_task_sched_out, though the
				1401	* rcu_read_lock() protects us from any context
				1402	* getting freed. Lock the context and check if it
				1403	* got swapped before we could get the lock, and retry
				1404	* if so. If we locked the right context, then it
				1405	* can't get swapped on us any more.
				1406	*/
				1407	raw_spin_lock(&ctx->lock);
				1408	if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
				1409	raw_spin_unlock(&ctx->lock);
				1410	rcu_read_unlock();
				1411	local_irq_restore(*flags);
				1412	goto retry;
				1413	}
				1414
				1415	if (ctx->task == TASK_TOMBSTONE \|\|
				1416	!refcount_inc_not_zero(&ctx->refcount)) {
				1417	raw_spin_unlock(&ctx->lock);
				1418	ctx = NULL;
				1419	} else {
				1420	WARN_ON_ONCE(ctx->task != task);
				1421	}
				1422	}
				1423	rcu_read_unlock();
				1424	if (!ctx)
				1425	local_irq_restore(*flags);
				1426	return ctx;
				1427	}
				1428
				1429	/*
				1430	* Get the context for a task and increment its pin_count so it
				1431	* can't get swapped to another task. This also increments its
				1432	* reference count so that the context can't get freed.
				1433	*/
				1434	static struct perf_event_context *
				1435	perf_pin_task_context(struct task_struct *task, int ctxn)
				1436	{
				1437	struct perf_event_context *ctx;
				1438	unsigned long flags;
				1439
				1440	ctx = perf_lock_task_context(task, ctxn, &flags);
				1441	if (ctx) {
				1442	++ctx->pin_count;
				1443	raw_spin_unlock_irqrestore(&ctx->lock, flags);
				1444	}
				1445	return ctx;
				1446	}
				1447
				1448	static void perf_unpin_context(struct perf_event_context *ctx)
				1449	{
				1450	unsigned long flags;
				1451
				1452	raw_spin_lock_irqsave(&ctx->lock, flags);
				1453	--ctx->pin_count;
				1454	raw_spin_unlock_irqrestore(&ctx->lock, flags);
				1455	}
				1456
				1457	/*
				1458	* Update the record of the current time in a context.
				1459	*/
				1460	static void update_context_time(struct perf_event_context *ctx)
				1461	{
				1462	u64 now = perf_clock();
				1463
				1464	ctx->time += now - ctx->timestamp;
				1465	ctx->timestamp = now;
				1466	}
				1467
				1468	static u64 perf_event_time(struct perf_event *event)
				1469	{
				1470	struct perf_event_context *ctx = event->ctx;
				1471
				1472	if (is_cgroup_event(event))
				1473	return perf_cgroup_event_time(event);
				1474
				1475	return ctx ? ctx->time : 0;
				1476	}
				1477
				1478	static enum event_type_t get_event_type(struct perf_event *event)
				1479	{
				1480	struct perf_event_context *ctx = event->ctx;
				1481	enum event_type_t event_type;
				1482
				1483	lockdep_assert_held(&ctx->lock);
				1484
				1485	/*
				1486	* It's 'group type', really, because if our group leader is
				1487	* pinned, so are we.
				1488	*/
				1489	if (event->group_leader != event)
				1490	event = event->group_leader;
				1491
				1492	event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
				1493	if (!ctx->task)
				1494	event_type \|= EVENT_CPU;
				1495
				1496	return event_type;
				1497	}
				1498
				1499	/*
				1500	* Helper function to initialize event group nodes.
				1501	*/
				1502	static void init_event_group(struct perf_event *event)
				1503	{
				1504	RB_CLEAR_NODE(&event->group_node);
				1505	event->group_index = 0;
				1506	}
				1507
				1508	/*
				1509	* Extract pinned or flexible groups from the context
				1510	* based on event attrs bits.
				1511	*/
				1512	static struct perf_event_groups *
				1513	get_event_groups(struct perf_event event, struct perf_event_context ctx)
				1514	{
				1515	if (event->attr.pinned)
				1516	return &ctx->pinned_groups;
				1517	else
				1518	return &ctx->flexible_groups;
				1519	}
				1520
				1521	/*
				1522	* Helper function to initializes perf_event_group trees.
				1523	*/
				1524	static void perf_event_groups_init(struct perf_event_groups *groups)
				1525	{
				1526	groups->tree = RB_ROOT;
				1527	groups->index = 0;
				1528	}
				1529
				1530	/*
				1531	* Compare function for event groups;
				1532	*
				1533	* Implements complex key that first sorts by CPU and then by virtual index
				1534	* which provides ordering when rotating groups for the same CPU.
				1535	*/
				1536	static bool
				1537	perf_event_groups_less(struct perf_event left, struct perf_event right)
				1538	{
				1539	if (left->cpu < right->cpu)
				1540	return true;
				1541	if (left->cpu > right->cpu)
				1542	return false;
				1543
				1544	if (left->group_index < right->group_index)
				1545	return true;
				1546	if (left->group_index > right->group_index)
				1547	return false;
				1548
				1549	return false;
				1550	}
				1551
				1552	/*
				1553	* Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
				1554	* key (see perf_event_groups_less). This places it last inside the CPU
				1555	* subtree.
				1556	*/
				1557	static void
				1558	perf_event_groups_insert(struct perf_event_groups *groups,
				1559	struct perf_event *event)
				1560	{
				1561	struct perf_event *node_event;
				1562	struct rb_node *parent;
				1563	struct rb_node **node;
				1564
				1565	event->group_index = ++groups->index;
				1566
				1567	node = &groups->tree.rb_node;
				1568	parent = *node;
				1569
				1570	while (*node) {
				1571	parent = *node;
				1572	node_event = container_of(*node, struct perf_event, group_node);
				1573
				1574	if (perf_event_groups_less(event, node_event))
				1575	node = &parent->rb_left;
				1576	else
				1577	node = &parent->rb_right;
				1578	}
				1579
				1580	rb_link_node(&event->group_node, parent, node);
				1581	rb_insert_color(&event->group_node, &groups->tree);
				1582	}
				1583
				/*
				 * Helper function to insert @event into the pinned or flexible groups
				 * tree of @ctx, as selected by get_event_groups().
				 */
				static void
				add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
				{
					perf_event_groups_insert(get_event_groups(event, ctx), event);
				}
				1595
				1596	/*
				1597	* Delete a group from a tree.
				1598	*/
				1599	static void
				1600	perf_event_groups_delete(struct perf_event_groups *groups,
				1601	struct perf_event *event)
				1602	{
				1603	WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) \|\|
				1604	RB_EMPTY_ROOT(&groups->tree));
				1605
				1606	rb_erase(&event->group_node, &groups->tree);
				1607	init_event_group(event);
				1608	}
				1609
				/*
				 * Helper function to delete @event from the pinned or flexible groups
				 * tree of @ctx, as selected by get_event_groups().
				 */
				static void
				del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
				{
					perf_event_groups_delete(get_event_groups(event, ctx), event);
				}
				1621
				1622	/*
				1623	* Get the leftmost event in the @cpu subtree.
				1624	*/
				1625	static struct perf_event *
				1626	perf_event_groups_first(struct perf_event_groups *groups, int cpu)
				1627	{
				1628	struct perf_event node_event = NULL, match = NULL;
				1629	struct rb_node *node = groups->tree.rb_node;
				1630
				1631	while (node) {
				1632	node_event = container_of(node, struct perf_event, group_node);
				1633
				1634	if (cpu < node_event->cpu) {
				1635	node = node->rb_left;
				1636	} else if (cpu > node_event->cpu) {
				1637	node = node->rb_right;
				1638	} else {
				1639	match = node_event;
				1640	node = node->rb_left;
				1641	}
				1642	}
				1643
				1644	return match;
				1645	}
				1646
				1647	/*
				1648	* Like rb_entry_next_safe() for the @cpu subtree.
				1649	*/
				1650	static struct perf_event *
				1651	perf_event_groups_next(struct perf_event *event)
				1652	{
				1653	struct perf_event *next;
				1654
				1655	next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
				1656	if (next && next->cpu == event->cpu)
				1657	return next;
				1658
				1659	return NULL;
				1660	}
				1661
				/*
				 * Iterate @event over every entry of the @groups tree, from rb_first()
				 * onwards via rb_next().
				 */
				#define perf_event_groups_for_each(event, groups)			\
					for (event = rb_entry_safe(rb_first(&((groups)->tree)),		\
								   typeof(*event), group_node);		\
					     event;							\
					     event = rb_entry_safe(rb_next(&event->group_node),		\
								   typeof(*event), group_node))
				1670
				1671	/*
				1672	* Add an event from the lists for its context.
				1673	* Must be called with ctx->mutex and ctx->lock held.
				1674	*/
				1675	static void
				1676	list_add_event(struct perf_event event, struct perf_event_context ctx)
				1677	{
				1678	lockdep_assert_held(&ctx->lock);
				1679
				1680	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
				1681	event->attach_state \|= PERF_ATTACH_CONTEXT;
				1682
				1683	event->tstamp = perf_event_time(event);
				1684
				1685	/*
				1686	* If we're a stand alone event or group leader, we go to the context
				1687	* list, group events are kept attached to the group so that
				1688	* perf_group_detach can, at all times, locate all siblings.
				1689	*/
				1690	if (event->group_leader == event) {
				1691	event->group_caps = event->event_caps;
				1692	add_event_to_groups(event, ctx);
				1693	}
				1694
				1695	list_update_cgroup_event(event, ctx, true);
				1696
				1697	list_add_rcu(&event->event_entry, &ctx->event_list);
				1698	ctx->nr_events++;
				1699	if (event->attr.inherit_stat)
				1700	ctx->nr_stat++;
				1701
				1702	ctx->generation++;
				1703	}
				1704
				1705	/*
				1706	* Initialize event state based on the perf_event_attr::disabled.
				1707	*/
				1708	static inline void perf_event__state_init(struct perf_event *event)
				1709	{
				1710	event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
				1711	PERF_EVENT_STATE_INACTIVE;
				1712	}
				1713
				1714	static int __perf_event_read_size(u64 read_format, int nr_siblings)
				1715	{
				1716	int entry = sizeof(u64); /* value */
				1717	int size = 0;
				1718	int nr = 1;
				1719
				1720	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
				1721	size += sizeof(u64);
				1722
				1723	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
				1724	size += sizeof(u64);
				1725
				1726	if (read_format & PERF_FORMAT_ID)
				1727	entry += sizeof(u64);
				1728
				1729	if (read_format & PERF_FORMAT_LOST)
				1730	entry += sizeof(u64);
				1731
				1732	if (read_format & PERF_FORMAT_GROUP) {
				1733	nr += nr_siblings;
				1734	size += sizeof(u64);
				1735	}
				1736
				1737	/*
				1738	* Since perf_event_validate_size() limits this to 16k and inhibits
				1739	* adding more siblings, this will never overflow.
				1740	*/
				1741	return size + nr * entry;
				1742	}
				1743
				1744	static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
				1745	{
				1746	struct perf_sample_data *data;
				1747	u16 size = 0;
				1748
				1749	if (sample_type & PERF_SAMPLE_IP)
				1750	size += sizeof(data->ip);
				1751
				1752	if (sample_type & PERF_SAMPLE_ADDR)
				1753	size += sizeof(data->addr);
				1754
				1755	if (sample_type & PERF_SAMPLE_PERIOD)
				1756	size += sizeof(data->period);
				1757
				1758	if (sample_type & PERF_SAMPLE_WEIGHT)
				1759	size += sizeof(data->weight);
				1760
				1761	if (sample_type & PERF_SAMPLE_READ)
				1762	size += event->read_size;
				1763
				1764	if (sample_type & PERF_SAMPLE_DATA_SRC)
				1765	size += sizeof(data->data_src.val);
				1766
				1767	if (sample_type & PERF_SAMPLE_TRANSACTION)
				1768	size += sizeof(data->txn);
				1769
				1770	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
				1771	size += sizeof(data->phys_addr);
				1772
				1773	event->header_size = size;
				1774	}
				1775
				1776	/*
				1777	* Called at perf_event creation and when events are attached/detached from a
				1778	* group.
				1779	*/
				1780	static void perf_event__header_size(struct perf_event *event)
				1781	{
				1782	event->read_size =
				1783	__perf_event_read_size(event->attr.read_format,
				1784	event->group_leader->nr_siblings);
				1785	__perf_event_header_size(event, event->attr.sample_type);
				1786	}
				1787
				1788	static void perf_event__id_header_size(struct perf_event *event)
				1789	{
				1790	struct perf_sample_data *data;
				1791	u64 sample_type = event->attr.sample_type;
				1792	u16 size = 0;
				1793
				1794	if (sample_type & PERF_SAMPLE_TID)
				1795	size += sizeof(data->tid_entry);
				1796
				1797	if (sample_type & PERF_SAMPLE_TIME)
				1798	size += sizeof(data->time);
				1799
				1800	if (sample_type & PERF_SAMPLE_IDENTIFIER)
				1801	size += sizeof(data->id);
				1802
				1803	if (sample_type & PERF_SAMPLE_ID)
				1804	size += sizeof(data->id);
				1805
				1806	if (sample_type & PERF_SAMPLE_STREAM_ID)
				1807	size += sizeof(data->stream_id);
				1808
				1809	if (sample_type & PERF_SAMPLE_CPU)
				1810	size += sizeof(data->cpu_entry);
				1811
				1812	event->id_header_size = size;
				1813	}
				1814
				1815	/*
				1816	* Check that adding an event to the group does not result in anybody
				1817	* overflowing the 64k event limit imposed by the output buffer.
				1818	*
				1819	* Specifically, check that the read_size for the event does not exceed 16k,
				1820	* read_size being the one term that grows with groups size. Since read_size
				1821	* depends on per-event read_format, also (re)check the existing events.
				1822	*
				1823	* This leaves 48k for the constant size fields and things like callchains,
				1824	* branch stacks and register sets.
				1825	*/
				1826	static bool perf_event_validate_size(struct perf_event *event)
				1827	{
				1828	struct perf_event sibling, group_leader = event->group_leader;
				1829
				1830	if (__perf_event_read_size(event->attr.read_format,
				1831	group_leader->nr_siblings + 1) > 16*1024)
				1832	return false;
				1833
				1834	if (__perf_event_read_size(group_leader->attr.read_format,
				1835	group_leader->nr_siblings + 1) > 16*1024)
				1836	return false;
				1837
				1838	/*
				1839	* When creating a new group leader, group_leader->ctx is initialized
				1840	* after the size has been validated, but we cannot safely use
				1841	* for_each_sibling_event() until group_leader->ctx is set. A new group
				1842	* leader cannot have any siblings yet, so we can safely skip checking
				1843	* the non-existent siblings.
				1844	*/
				1845	if (event == group_leader)
				1846	return true;
				1847
				1848	for_each_sibling_event(sibling, group_leader) {
				1849	if (__perf_event_read_size(sibling->attr.read_format,
				1850	group_leader->nr_siblings + 1) > 16*1024)
				1851	return false;
				1852	}
				1853
				1854	return true;
				1855	}
				1856
				1857	static void perf_group_attach(struct perf_event *event)
				1858	{
				1859	struct perf_event group_leader = event->group_leader, pos;
				1860
				1861	lockdep_assert_held(&event->ctx->lock);
				1862
				1863	/*
				1864	* We can have double attach due to group movement in perf_event_open.
				1865	*/
				1866	if (event->attach_state & PERF_ATTACH_GROUP)
				1867	return;
				1868
				1869	event->attach_state \|= PERF_ATTACH_GROUP;
				1870
				1871	if (group_leader == event)
				1872	return;
				1873
				1874	WARN_ON_ONCE(group_leader->ctx != event->ctx);
				1875
				1876	group_leader->group_caps &= event->event_caps;
				1877
				1878	list_add_tail(&event->sibling_list, &group_leader->sibling_list);
				1879	group_leader->nr_siblings++;
				1880	group_leader->group_generation++;
				1881
				1882	perf_event__header_size(group_leader);
				1883
				1884	for_each_sibling_event(pos, group_leader)
				1885	perf_event__header_size(pos);
				1886	}
				1887
				1888	/*
				1889	* Remove an event from the lists for its context.
				1890	* Must be called with ctx->mutex and ctx->lock held.
				1891	*/
				1892	static void
				1893	list_del_event(struct perf_event event, struct perf_event_context ctx)
				1894	{
				1895	WARN_ON_ONCE(event->ctx != ctx);
				1896	lockdep_assert_held(&ctx->lock);
				1897
				1898	/*
				1899	* We can have double detach due to exit/hot-unplug + close.
				1900	*/
				1901	if (!(event->attach_state & PERF_ATTACH_CONTEXT))
				1902	return;
				1903
				1904	event->attach_state &= ~PERF_ATTACH_CONTEXT;
				1905
				1906	list_update_cgroup_event(event, ctx, false);
				1907
				1908	ctx->nr_events--;
				1909	if (event->attr.inherit_stat)
				1910	ctx->nr_stat--;
				1911
				1912	list_del_rcu(&event->event_entry);
				1913
				1914	if (event->group_leader == event)
				1915	del_event_from_groups(event, ctx);
				1916
				1917	/*
				1918	* If event was in error state, then keep it
				1919	* that way, otherwise bogus counts will be
				1920	* returned on read(). The only way to get out
				1921	* of error state is by explicit re-enabling
				1922	* of the event
				1923	*/
				1924	if (event->state > PERF_EVENT_STATE_OFF)
				1925	perf_event_set_state(event, PERF_EVENT_STATE_OFF);
				1926
				1927	ctx->generation++;
				1928	}
				1929
				1930	static int
				1931	perf_aux_output_match(struct perf_event event, struct perf_event aux_event)
				1932	{
				1933	if (!has_aux(aux_event))
				1934	return 0;
				1935
				1936	if (!event->pmu->aux_output_match)
				1937	return 0;
				1938
				1939	return event->pmu->aux_output_match(aux_event);
				1940	}
				1941
				/* Forward declarations for helpers used before their definitions. */
				static void put_event(struct perf_event *event);
				static void event_sched_out(struct perf_event *event,
							    struct perf_cpu_context *cpuctx,
							    struct perf_event_context *ctx);
				1946
				1947	static void perf_put_aux_event(struct perf_event *event)
				1948	{
				1949	struct perf_event_context *ctx = event->ctx;
				1950	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
				1951	struct perf_event *iter;
				1952
				1953	/*
				1954	* If event uses aux_event tear down the link
				1955	*/
				1956	if (event->aux_event) {
				1957	iter = event->aux_event;
				1958	event->aux_event = NULL;
				1959	put_event(iter);
				1960	return;
				1961	}
				1962
				1963	/*
				1964	* If the event is an aux_event, tear down all links to
				1965	* it from other events.
				1966	*/
				1967	for_each_sibling_event(iter, event->group_leader) {
				1968	if (iter->aux_event != event)
				1969	continue;
				1970
				1971	iter->aux_event = NULL;
				1972	put_event(event);
				1973
				1974	/*
				1975	* If it's ACTIVE, schedule it out and put it into ERROR
				1976	* state so that we don't try to schedule it again. Note
				1977	* that perf_event_enable() will clear the ERROR status.
				1978	*/
				1979	event_sched_out(iter, cpuctx, ctx);
				1980	perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
				1981	}
				1982	}
				1983
				1984	static int perf_get_aux_event(struct perf_event *event,
				1985	struct perf_event *group_leader)
				1986	{
				1987	/*
				1988	* Our group leader must be an aux event if we want to be
				1989	* an aux_output. This way, the aux event will precede its
				1990	* aux_output events in the group, and therefore will always
				1991	* schedule first.
				1992	*/
				1993	if (!group_leader)
				1994	return 0;
				1995
				1996	if (!perf_aux_output_match(event, group_leader))
				1997	return 0;
				1998
				1999	if (!atomic_long_inc_not_zero(&group_leader->refcount))
				2000	return 0;
				2001
				2002	/*
				2003	* Link aux_outputs to their aux event; this is undone in
				2004	* perf_group_detach() by perf_put_aux_event(). When the
				2005	* group in torn down, the aux_output events loose their
				2006	* link to the aux_event and can't schedule any more.
				2007	*/
				2008	event->aux_event = group_leader;
				2009
				2010	return 1;
				2011	}
				2012
				2013	static void perf_group_detach(struct perf_event *event)
				2014	{
				2015	struct perf_event sibling, tmp;
				2016	struct perf_event_context *ctx = event->ctx;
				2017
				2018	lockdep_assert_held(&ctx->lock);
				2019
				2020	/*
				2021	* We can have double detach due to exit/hot-unplug + close.
				2022	*/
				2023	if (!(event->attach_state & PERF_ATTACH_GROUP))
				2024	return;
				2025
				2026	event->attach_state &= ~PERF_ATTACH_GROUP;
				2027
				2028	perf_put_aux_event(event);
				2029
				2030	/*
				2031	* If this is a sibling, remove it from its group.
				2032	*/
				2033	if (event->group_leader != event) {
				2034	list_del_init(&event->sibling_list);
				2035	event->group_leader->nr_siblings--;
				2036	event->group_leader->group_generation++;
				2037	goto out;
				2038	}
				2039
				2040	/*
				2041	* If this was a group event with sibling events then
				2042	* upgrade the siblings to singleton events by adding them
				2043	* to whatever list we are on.
				2044	*/
				2045	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
				2046
				2047	sibling->group_leader = sibling;
				2048	list_del_init(&sibling->sibling_list);
				2049
				2050	/* Inherit group flags from the previous leader */
				2051	sibling->group_caps = event->group_caps;
				2052
				2053	if (!RB_EMPTY_NODE(&event->group_node)) {
				2054	add_event_to_groups(sibling, event->ctx);
				2055
				2056	if (sibling->state == PERF_EVENT_STATE_ACTIVE) {
				2057	struct list_head *list = sibling->attr.pinned ?
				2058	&ctx->pinned_active : &ctx->flexible_active;
				2059
				2060	list_add_tail(&sibling->active_list, list);
				2061	}
				2062	}
				2063
				2064	WARN_ON_ONCE(sibling->ctx != event->ctx);
				2065	}
				2066
				2067	out:
				2068	perf_event__header_size(event->group_leader);
				2069
				2070	for_each_sibling_event(tmp, event->group_leader)
				2071	perf_event__header_size(tmp);
				2072	}
				2073
				2074	static bool is_orphaned_event(struct perf_event *event)
				2075	{
				2076	return event->state == PERF_EVENT_STATE_DEAD;
				2077	}
				2078
				2079	static inline int __pmu_filter_match(struct perf_event *event)
				2080	{
				2081	struct pmu *pmu = event->pmu;
				2082	return pmu->filter_match ? pmu->filter_match(event) : 1;
				2083	}
				2084
				2085	/*
				2086	* Check whether we should attempt to schedule an event group based on
				2087	* PMU-specific filtering. An event group can consist of HW and SW events,
				2088	* potentially with a SW leader, so we must check all the filters, to
				2089	* determine whether a group is schedulable:
				2090	*/
				2091	static inline int pmu_filter_match(struct perf_event *event)
				2092	{
				2093	struct perf_event *sibling;
				2094
				2095	if (!__pmu_filter_match(event))
				2096	return 0;
				2097
				2098	for_each_sibling_event(sibling, event) {
				2099	if (!__pmu_filter_match(sibling))
				2100	return 0;
				2101	}
				2102
				2103	return 1;
				2104	}
				2105
				2106	static inline int
				2107	event_filter_match(struct perf_event *event)
				2108	{
				2109	return (event->cpu == -1 \|\| event->cpu == smp_processor_id()) &&
				2110	perf_cgroup_match(event) && pmu_filter_match(event);
				2111	}
				2112
				2113	static void
				2114	event_sched_out(struct perf_event *event,
				2115	struct perf_cpu_context *cpuctx,
				2116	struct perf_event_context *ctx)
				2117	{
				2118	enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
				2119
				2120	WARN_ON_ONCE(event->ctx != ctx);
				2121	lockdep_assert_held(&ctx->lock);
				2122
				2123	if (event->state != PERF_EVENT_STATE_ACTIVE)
				2124	return;
				2125
				2126	/*
				2127	* Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
				2128	* we can schedule events _OUT_ individually through things like
				2129	* __perf_remove_from_context().
				2130	*/
				2131	list_del_init(&event->active_list);
				2132
				2133	perf_pmu_disable(event->pmu);
				2134
				2135	event->pmu->del(event, 0);
				2136	event->oncpu = -1;
				2137
				2138	if (READ_ONCE(event->pending_disable) >= 0) {
				2139	WRITE_ONCE(event->pending_disable, -1);
				2140	state = PERF_EVENT_STATE_OFF;
				2141	}
				2142	perf_event_set_state(event, state);
				2143
				2144	if (!is_software_event(event))
				2145	cpuctx->active_oncpu--;
				2146	if (!--ctx->nr_active)
				2147	perf_event_ctx_deactivate(ctx);
				2148	if (event->attr.freq && event->attr.sample_freq)
				2149	ctx->nr_freq--;
				2150	if (event->attr.exclusive \|\| !cpuctx->active_oncpu)
				2151	cpuctx->exclusive = 0;
				2152
				2153	perf_pmu_enable(event->pmu);
				2154	}
				2155
				2156	static void
				2157	group_sched_out(struct perf_event *group_event,
				2158	struct perf_cpu_context *cpuctx,
				2159	struct perf_event_context *ctx)
				2160	{
				2161	struct perf_event *event;
				2162
				2163	if (group_event->state != PERF_EVENT_STATE_ACTIVE)
				2164	return;
				2165
				2166	perf_pmu_disable(ctx->pmu);
				2167
				2168	event_sched_out(group_event, cpuctx, ctx);
				2169
				2170	/*
				2171	* Schedule out siblings (if any):
				2172	*/
				2173	for_each_sibling_event(event, group_event)
				2174	event_sched_out(event, cpuctx, ctx);
				2175
				2176	perf_pmu_enable(ctx->pmu);
				2177
				2178	if (group_event->attr.exclusive)
				2179	cpuctx->exclusive = 0;
				2180	}
				2181
				2182	#define DETACH_GROUP 0x01UL
				2183
				2184	/*
				2185	* Cross CPU call to remove a performance event
				2186	*
				2187	* We disable the event on the hardware level first. After that we
				2188	* remove it from the context list.
				2189	*/
				2190	static void
				2191	__perf_remove_from_context(struct perf_event *event,
				2192	struct perf_cpu_context *cpuctx,
				2193	struct perf_event_context *ctx,
				2194	void *info)
				2195	{
				2196	unsigned long flags = (unsigned long)info;
				2197
				2198	if (ctx->is_active & EVENT_TIME) {
				2199	update_context_time(ctx);
				2200	update_cgrp_time_from_cpuctx(cpuctx);
				2201	}
				2202
				2203	event_sched_out(event, cpuctx, ctx);
				2204	if (flags & DETACH_GROUP)
				2205	perf_group_detach(event);
				2206	list_del_event(event, ctx);
				2207
				2208	if (!ctx->nr_events && ctx->is_active) {
				2209	ctx->is_active = 0;
				2210	ctx->rotate_necessary = 0;
				2211	if (ctx->task) {
				2212	WARN_ON_ONCE(cpuctx->task_ctx != ctx);
				2213	cpuctx->task_ctx = NULL;
				2214	}
				2215	}
				2216	}
				2217
				2218	/*
				2219	* Remove the event from a task's (or a CPU's) list of events.
				2220	*
				2221	* If event->ctx is a cloned context, callers must make sure that
				2222	* every task struct that event->ctx->task could possibly point to
				2223	* remains valid. This is OK when called from perf_release since
				2224	* that only calls us on the top-level context, which can't be a clone.
				2225	* When called from perf_event_exit_task, it's OK because the
				2226	* context has been detached from its task.
				2227	*/
				2228	static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
				2229	{
				2230	struct perf_event_context *ctx = event->ctx;
				2231
				2232	lockdep_assert_held(&ctx->mutex);
				2233
				2234	event_function_call(event, __perf_remove_from_context, (void *)flags);
				2235
				2236	/*
				2237	* The above event_function_call() can NO-OP when it hits
				2238	* TASK_TOMBSTONE. In that case we must already have been detached
				2239	* from the context (by perf_event_exit_event()) but the grouping
				2240	* might still be in-tact.
				2241	*/
				2242	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
				2243	if ((flags & DETACH_GROUP) &&
				2244	(event->attach_state & PERF_ATTACH_GROUP)) {
				2245	/*
				2246	* Since in that case we cannot possibly be scheduled, simply
				2247	* detach now.
				2248	*/
				2249	raw_spin_lock_irq(&ctx->lock);
				2250	perf_group_detach(event);
				2251	raw_spin_unlock_irq(&ctx->lock);
				2252	}
				2253	}
				2254
				2255	/*
				2256	* Cross CPU call to disable a performance event
				2257	*/
				2258	static void __perf_event_disable(struct perf_event *event,
				2259	struct perf_cpu_context *cpuctx,
				2260	struct perf_event_context *ctx,
				2261	void *info)
				2262	{
				2263	if (event->state < PERF_EVENT_STATE_INACTIVE)
				2264	return;
				2265
				2266	if (ctx->is_active & EVENT_TIME) {
				2267	update_context_time(ctx);
				2268	update_cgrp_time_from_event(event);
				2269	}
				2270
				2271	if (event == event->group_leader)
				2272	group_sched_out(event, cpuctx, ctx);
				2273	else
				2274	event_sched_out(event, cpuctx, ctx);
				2275
				2276	perf_event_set_state(event, PERF_EVENT_STATE_OFF);
				2277	}
				2278
				2279	/*
				2280	* Disable an event.
				2281	*
				2282	* If event->ctx is a cloned context, callers must make sure that
				2283	* every task struct that event->ctx->task could possibly point to
				2284	* remains valid. This condition is satisfied when called through
				2285	* perf_event_for_each_child or perf_event_for_each because they
				2286	* hold the top-level event's child_mutex, so any descendant that
				2287	* goes to exit will block in perf_event_exit_event().
				2288	*
				2289	* When called from perf_pending_event it's OK because event->ctx
				2290	* is the current context on this CPU and preemption is disabled,
				2291	* hence we can't get into perf_event_task_sched_out for this context.
				2292	*/
				2293	static void _perf_event_disable(struct perf_event *event)
				2294	{
				2295	struct perf_event_context *ctx = event->ctx;
				2296
				2297	raw_spin_lock_irq(&ctx->lock);
				2298	if (event->state <= PERF_EVENT_STATE_OFF) {
				2299	raw_spin_unlock_irq(&ctx->lock);
				2300	return;
				2301	}
				2302	raw_spin_unlock_irq(&ctx->lock);
				2303
				2304	event_function_call(event, __perf_event_disable, NULL);
				2305	}
				2306
				2307	void perf_event_disable_local(struct perf_event *event)
				2308	{
				2309	event_function_local(event, __perf_event_disable, NULL);
				2310	}
				2311
				2312	/*
				2313	* Strictly speaking kernel users cannot create groups and therefore this
				2314	* interface does not need the perf_event_ctx_lock() magic.
				2315	*/
				2316	void perf_event_disable(struct perf_event *event)
				2317	{
				2318	struct perf_event_context *ctx;
				2319
				2320	ctx = perf_event_ctx_lock(event);
				2321	_perf_event_disable(event);
				2322	perf_event_ctx_unlock(event, ctx);
				2323	}
				2324	EXPORT_SYMBOL_GPL(perf_event_disable);
				2325
				2326	void perf_event_disable_inatomic(struct perf_event *event)
				2327	{
				2328	WRITE_ONCE(event->pending_disable, smp_processor_id());
				2329	/* can fail, see perf_pending_event_disable() */
				2330	irq_work_queue(&event->pending);
				2331	}
				2332
				2333	static void perf_set_shadow_time(struct perf_event *event,
				2334	struct perf_event_context *ctx)
				2335	{
				2336	/*
				2337	* use the correct time source for the time snapshot
				2338	*
				2339	* We could get by without this by leveraging the
				2340	* fact that to get to this function, the caller
				2341	* has most likely already called update_context_time()
				2342	* and update_cgrp_time_xx() and thus both timestamp
				2343	* are identical (or very close). Given that tstamp is,
				2344	* already adjusted for cgroup, we could say that:
				2345	* tstamp - ctx->timestamp
				2346	* is equivalent to
				2347	* tstamp - cgrp->timestamp.
				2348	*
				2349	* Then, in perf_output_read(), the calculation would
				2350	* work with no changes because:
				2351	* - event is guaranteed scheduled in
				2352	* - no scheduled out in between
				2353	* - thus the timestamp would be the same
				2354	*
				2355	* But this is a bit hairy.
				2356	*
				2357	* So instead, we have an explicit cgroup call to remain
				2358	* within the time time source all along. We believe it
				2359	* is cleaner and simpler to understand.
				2360	*/
				2361	if (is_cgroup_event(event))
				2362	perf_cgroup_set_shadow_time(event, event->tstamp);
				2363	else
				2364	event->shadow_ctx_time = event->tstamp - ctx->timestamp;
				2365	}
				2366
				2367	#define MAX_INTERRUPTS (~0ULL)
				2368
				2369	static void perf_log_throttle(struct perf_event *event, int enable);
				2370	static void perf_log_itrace_start(struct perf_event *event);
				2371
				2372	static int
				2373	event_sched_in(struct perf_event *event,
				2374	struct perf_cpu_context *cpuctx,
				2375	struct perf_event_context *ctx)
				2376	{
				2377	int ret = 0;
				2378
				2379	lockdep_assert_held(&ctx->lock);
				2380
				2381	if (event->state <= PERF_EVENT_STATE_OFF)
				2382	return 0;
				2383
				2384	WRITE_ONCE(event->oncpu, smp_processor_id());
				2385	/*
				2386	* Order event::oncpu write to happen before the ACTIVE state is
				2387	* visible. This allows perf_event_{stop,read}() to observe the correct
				2388	* ->oncpu if it sees ACTIVE.
				2389	*/
				2390	smp_wmb();
				2391	perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
				2392
				2393	/*
				2394	* Unthrottle events, since we scheduled we might have missed several
				2395	* ticks already, also for a heavily scheduling task there is little
				2396	* guarantee it'll get a tick in a timely manner.
				2397	*/
				2398	if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
				2399	perf_log_throttle(event, 1);
				2400	event->hw.interrupts = 0;
				2401	}
				2402
				2403	perf_pmu_disable(event->pmu);
				2404
				2405	perf_set_shadow_time(event, ctx);
				2406
				2407	perf_log_itrace_start(event);
				2408
				2409	if (event->pmu->add(event, PERF_EF_START)) {
				2410	perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
				2411	event->oncpu = -1;
				2412	ret = -EAGAIN;
				2413	goto out;
				2414	}
				2415
				2416	if (!is_software_event(event))
				2417	cpuctx->active_oncpu++;
				2418	if (!ctx->nr_active++)
				2419	perf_event_ctx_activate(ctx);
				2420	if (event->attr.freq && event->attr.sample_freq)
				2421	ctx->nr_freq++;
				2422
				2423	if (event->attr.exclusive)
				2424	cpuctx->exclusive = 1;
				2425
				2426	out:
				2427	perf_pmu_enable(event->pmu);
				2428
				2429	return ret;
				2430	}
				2431
				2432	static int
				2433	group_sched_in(struct perf_event *group_event,
				2434	struct perf_cpu_context *cpuctx,
				2435	struct perf_event_context *ctx)
				2436	{
				2437	struct perf_event event, partial_group = NULL;
				2438	struct pmu *pmu = ctx->pmu;
				2439
				2440	if (group_event->state == PERF_EVENT_STATE_OFF)
				2441	return 0;
				2442
				2443	pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
				2444
				2445	if (event_sched_in(group_event, cpuctx, ctx)) {
				2446	pmu->cancel_txn(pmu);
				2447	perf_mux_hrtimer_restart(cpuctx);
				2448	return -EAGAIN;
				2449	}
				2450
				2451	/*
				2452	* Schedule in siblings as one group (if any):
				2453	*/
				2454	for_each_sibling_event(event, group_event) {
				2455	if (event_sched_in(event, cpuctx, ctx)) {
				2456	partial_group = event;
				2457	goto group_error;
				2458	}
				2459	}
				2460
				2461	if (!pmu->commit_txn(pmu))
				2462	return 0;
				2463
				2464	group_error:
				2465	/*
				2466	* Groups can be scheduled in as one unit only, so undo any
				2467	* partial group before returning:
				2468	* The events up to the failed event are scheduled out normally.
				2469	*/
				2470	for_each_sibling_event(event, group_event) {
				2471	if (event == partial_group)
				2472	break;
				2473
				2474	event_sched_out(event, cpuctx, ctx);
				2475	}
				2476	event_sched_out(group_event, cpuctx, ctx);
				2477
				2478	pmu->cancel_txn(pmu);
				2479
				2480	perf_mux_hrtimer_restart(cpuctx);
				2481
				2482	return -EAGAIN;
				2483	}
				2484
				2485	/*
				2486	* Work out whether we can put this event group on the CPU now.
				2487	*/
				2488	static int group_can_go_on(struct perf_event *event,
				2489	struct perf_cpu_context *cpuctx,
				2490	int can_add_hw)
				2491	{
				2492	/*
				2493	* Groups consisting entirely of software events can always go on.
				2494	*/
				2495	if (event->group_caps & PERF_EV_CAP_SOFTWARE)
				2496	return 1;
				2497	/*
				2498	* If an exclusive group is already on, no other hardware
				2499	* events can go on.
				2500	*/
				2501	if (cpuctx->exclusive)
				2502	return 0;
				2503	/*
				2504	* If this group is exclusive and there are already
				2505	* events on the CPU, it can't go on.
				2506	*/
				2507	if (event->attr.exclusive && cpuctx->active_oncpu)
				2508	return 0;
				2509	/*
				2510	* Otherwise, try to add it if all previous groups were able
				2511	* to go on.
				2512	*/
				2513	return can_add_hw;
				2514	}
				2515
				2516	static void add_event_to_ctx(struct perf_event *event,
				2517	struct perf_event_context *ctx)
				2518	{
				2519	list_add_event(event, ctx);
				2520	perf_group_attach(event);
				2521	}
				2522
				2523	static void ctx_sched_out(struct perf_event_context *ctx,
				2524	struct perf_cpu_context *cpuctx,
				2525	enum event_type_t event_type);
				2526	static void
				2527	ctx_sched_in(struct perf_event_context *ctx,
				2528	struct perf_cpu_context *cpuctx,
				2529	enum event_type_t event_type,
				2530	struct task_struct *task);
				2531
				2532	static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
				2533	struct perf_event_context *ctx,
				2534	enum event_type_t event_type)
				2535	{
				2536	if (!cpuctx->task_ctx)
				2537	return;
				2538
				2539	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
				2540	return;
				2541
				2542	ctx_sched_out(ctx, cpuctx, event_type);
				2543	}
				2544
				2545	static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
				2546	struct perf_event_context *ctx,
				2547	struct task_struct *task)
				2548	{
				2549	cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
				2550	if (ctx)
				2551	ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
				2552	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
				2553	if (ctx)
				2554	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
				2555	}
				2556
				2557	/*
				2558	* We want to maintain the following priority of scheduling:
				2559	* - CPU pinned (EVENT_CPU \| EVENT_PINNED)
				2560	* - task pinned (EVENT_PINNED)
				2561	* - CPU flexible (EVENT_CPU \| EVENT_FLEXIBLE)
				2562	* - task flexible (EVENT_FLEXIBLE).
				2563	*
				2564	* In order to avoid unscheduling and scheduling back in everything every
				2565	* time an event is added, only do it for the groups of equal priority and
				2566	* below.
				2567	*
				2568	* This can be called after a batch operation on task events, in which case
				2569	* event_type is a bit mask of the types of events involved. For CPU events,
				2570	* event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
				2571	*/
				2572	static void ctx_resched(struct perf_cpu_context *cpuctx,
				2573	struct perf_event_context *task_ctx,
				2574	enum event_type_t event_type)
				2575	{
				2576	enum event_type_t ctx_event_type;
				2577	bool cpu_event = !!(event_type & EVENT_CPU);
				2578
				2579	/*
				2580	* If pinned groups are involved, flexible groups also need to be
				2581	* scheduled out.
				2582	*/
				2583	if (event_type & EVENT_PINNED)
				2584	event_type \|= EVENT_FLEXIBLE;
				2585
				2586	ctx_event_type = event_type & EVENT_ALL;
				2587
				2588	perf_pmu_disable(cpuctx->ctx.pmu);
				2589	if (task_ctx)
				2590	task_ctx_sched_out(cpuctx, task_ctx, event_type);
				2591
				2592	/*
				2593	* Decide which cpu ctx groups to schedule out based on the types
				2594	* of events that caused rescheduling:
				2595	* - EVENT_CPU: schedule out corresponding groups;
				2596	* - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
				2597	* - otherwise, do nothing more.
				2598	*/
				2599	if (cpu_event)
				2600	cpu_ctx_sched_out(cpuctx, ctx_event_type);
				2601	else if (ctx_event_type & EVENT_PINNED)
				2602	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
				2603
				2604	perf_event_sched_in(cpuctx, task_ctx, current);
				2605	perf_pmu_enable(cpuctx->ctx.pmu);
				2606	}
				2607
				2608	void perf_pmu_resched(struct pmu *pmu)
				2609	{
				2610	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
				2611	struct perf_event_context *task_ctx = cpuctx->task_ctx;
				2612
				2613	perf_ctx_lock(cpuctx, task_ctx);
				2614	ctx_resched(cpuctx, task_ctx, EVENT_ALL\|EVENT_CPU);
				2615	perf_ctx_unlock(cpuctx, task_ctx);
				2616	}
				2617
				2618	/*
				2619	* Cross CPU call to install and enable a performance event
				2620	*
				2621	* Very similar to remote_function() + event_function() but cannot assume that
				2622	* things like ctx->is_active and cpuctx->task_ctx are set.
				2623	*/
				2624	static int __perf_install_in_context(void *info)
				2625	{
				2626	struct perf_event *event = info;
				2627	struct perf_event_context *ctx = event->ctx;
				2628	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
				2629	struct perf_event_context *task_ctx = cpuctx->task_ctx;
				2630	bool reprogram = true;
				2631	int ret = 0;
				2632
				2633	raw_spin_lock(&cpuctx->ctx.lock);
				2634	if (ctx->task) {
				2635	raw_spin_lock(&ctx->lock);
				2636	task_ctx = ctx;
				2637
				2638	reprogram = (ctx->task == current);
				2639
				2640	/*
				2641	* If the task is running, it must be running on this CPU,
				2642	* otherwise we cannot reprogram things.
				2643	*
				2644	* If its not running, we don't care, ctx->lock will
				2645	* serialize against it becoming runnable.
				2646	*/
				2647	if (task_curr(ctx->task) && !reprogram) {
				2648	ret = -ESRCH;
				2649	goto unlock;
				2650	}
				2651
				2652	WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
				2653	} else if (task_ctx) {
				2654	raw_spin_lock(&task_ctx->lock);
				2655	}
				2656
				2657	#ifdef CONFIG_CGROUP_PERF
				2658	if (is_cgroup_event(event)) {
				2659	/*
				2660	* If the current cgroup doesn't match the event's
				2661	* cgroup, we should not try to schedule it.
				2662	*/
				2663	struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
				2664	reprogram = cgroup_is_descendant(cgrp->css.cgroup,
				2665	event->cgrp->css.cgroup);
				2666	}
				2667	#endif
				2668
				2669	if (reprogram) {
				2670	ctx_sched_out(ctx, cpuctx, EVENT_TIME);
				2671	add_event_to_ctx(event, ctx);
				2672	ctx_resched(cpuctx, task_ctx, get_event_type(event));
				2673	} else {
				2674	add_event_to_ctx(event, ctx);
				2675	}
				2676
				2677	unlock:
				2678	perf_ctx_unlock(cpuctx, task_ctx);
				2679
				2680	return ret;
				2681	}
				2682
				2683	static bool exclusive_event_installable(struct perf_event *event,
				2684	struct perf_event_context *ctx);
				2685
				2686	/*
				2687	* Attach a performance event to a context.
				2688	*
				2689	* Very similar to event_function_call, see comment there.
				2690	*/
				2691	static void
				2692	perf_install_in_context(struct perf_event_context *ctx,
				2693	struct perf_event *event,
				2694	int cpu)
				2695	{
				2696	struct task_struct *task = READ_ONCE(ctx->task);
				2697
				2698	lockdep_assert_held(&ctx->mutex);
				2699
				2700	WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
				2701
				2702	if (event->cpu != -1)
				2703	event->cpu = cpu;
				2704
				2705	/*
				2706	* Ensures that if we can observe event->ctx, both the event and ctx
				2707	* will be 'complete'. See perf_iterate_sb_cpu().
				2708	*/
				2709	smp_store_release(&event->ctx, ctx);
				2710
				2711	if (!task) {
				2712	cpu_function_call(cpu, __perf_install_in_context, event);
				2713	return;
				2714	}
				2715
				2716	/*
				2717	* Should not happen, we validate the ctx is still alive before calling.
				2718	*/
				2719	if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
				2720	return;
				2721
				2722	/*
				2723	* Installing events is tricky because we cannot rely on ctx->is_active
				2724	* to be set in case this is the nr_events 0 -> 1 transition.
				2725	*
				2726	* Instead we use task_curr(), which tells us if the task is running.
				2727	* However, since we use task_curr() outside of rq::lock, we can race
				2728	* against the actual state. This means the result can be wrong.
				2729	*
				2730	* If we get a false positive, we retry, this is harmless.
				2731	*
				2732	* If we get a false negative, things are complicated. If we are after
				2733	* perf_event_context_sched_in() ctx::lock will serialize us, and the
				2734	* value must be correct. If we're before, it doesn't matter since
				2735	* perf_event_context_sched_in() will program the counter.
				2736	*
				2737	* However, this hinges on the remote context switch having observed
				2738	* our task->perf_event_ctxp[] store, such that it will in fact take
				2739	* ctx::lock in perf_event_context_sched_in().
				2740	*
				2741	* We do this by task_function_call(), if the IPI fails to hit the task
				2742	* we know any future context switch of task must see the
				2743	* perf_event_ctpx[] store.
				2744	*/
				2745
				2746	/*
				2747	* This smp_mb() orders the task->perf_event_ctxp[] store with the
				2748	* task_cpu() load, such that if the IPI then does not find the task
				2749	* running, a future context switch of that task must observe the
				2750	* store.
				2751	*/
				2752	smp_mb();
				2753	again:
				2754	if (!task_function_call(task, __perf_install_in_context, event))
				2755	return;
				2756
				2757	raw_spin_lock_irq(&ctx->lock);
				2758	task = ctx->task;
				2759	if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
				2760	/*
				2761	* Cannot happen because we already checked above (which also
				2762	* cannot happen), and we hold ctx->mutex, which serializes us
				2763	* against perf_event_exit_task_context().
				2764	*/
				2765	raw_spin_unlock_irq(&ctx->lock);
				2766	return;
				2767	}
				2768	/*
				2769	* If the task is not running, ctx->lock will avoid it becoming so,
				2770	* thus we can safely install the event.
				2771	*/
				2772	if (task_curr(task)) {
				2773	raw_spin_unlock_irq(&ctx->lock);
				2774	goto again;
				2775	}
				2776	add_event_to_ctx(event, ctx);
				2777	raw_spin_unlock_irq(&ctx->lock);
				2778	}
				2779
				2780	/*
				2781	* Cross CPU call to enable a performance event
				2782	*/
				2783	static void __perf_event_enable(struct perf_event *event,
				2784	struct perf_cpu_context *cpuctx,
				2785	struct perf_event_context *ctx,
				2786	void *info)
				2787	{
				2788	struct perf_event *leader = event->group_leader;
				2789	struct perf_event_context *task_ctx;
				2790
				2791	if (event->state >= PERF_EVENT_STATE_INACTIVE \|\|
				2792	event->state <= PERF_EVENT_STATE_ERROR)
				2793	return;
				2794
				2795	if (ctx->is_active)
				2796	ctx_sched_out(ctx, cpuctx, EVENT_TIME);
				2797
				2798	perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
				2799
				2800	if (!ctx->is_active)
				2801	return;
				2802
				2803	if (!event_filter_match(event)) {
				2804	ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
				2805	return;
				2806	}
				2807
				2808	/*
				2809	* If the event is in a group and isn't the group leader,
				2810	* then don't put it on unless the group is on.
				2811	*/
				2812	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
				2813	ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
				2814	return;
				2815	}
				2816
				2817	task_ctx = cpuctx->task_ctx;
				2818	if (ctx->task)
				2819	WARN_ON_ONCE(task_ctx != ctx);
				2820
				2821	ctx_resched(cpuctx, task_ctx, get_event_type(event));
				2822	}
				2823
				2824	/*
				2825	* Enable an event.
				2826	*
				2827	* If event->ctx is a cloned context, callers must make sure that
				2828	* every task struct that event->ctx->task could possibly point to
				2829	* remains valid. This condition is satisfied when called through
				2830	* perf_event_for_each_child or perf_event_for_each as described
				2831	* for perf_event_disable.
				2832	*/
				2833	static void _perf_event_enable(struct perf_event *event)
				2834	{
				2835	struct perf_event_context *ctx = event->ctx;
				2836
				2837	raw_spin_lock_irq(&ctx->lock);
				2838	if (event->state >= PERF_EVENT_STATE_INACTIVE \|\|
				2839	event->state < PERF_EVENT_STATE_ERROR) {
				2840	raw_spin_unlock_irq(&ctx->lock);
				2841	return;
				2842	}
				2843
				2844	/*
				2845	* If the event is in error state, clear that first.
				2846	*
				2847	* That way, if we see the event in error state below, we know that it
				2848	* has gone back into error state, as distinct from the task having
				2849	* been scheduled away before the cross-call arrived.
				2850	*/
				2851	if (event->state == PERF_EVENT_STATE_ERROR)
				2852	event->state = PERF_EVENT_STATE_OFF;
				2853	raw_spin_unlock_irq(&ctx->lock);
				2854
				2855	event_function_call(event, __perf_event_enable, NULL);
				2856	}
				2857
				2858	/*
				2859	* See perf_event_disable();
				2860	*/
				2861	void perf_event_enable(struct perf_event *event)
				2862	{
				2863	struct perf_event_context *ctx;
				2864
				2865	ctx = perf_event_ctx_lock(event);
				2866	_perf_event_enable(event);
				2867	perf_event_ctx_unlock(event, ctx);
				2868	}
				2869	EXPORT_SYMBOL_GPL(perf_event_enable);
				2870
				2871	struct stop_event_data {
				2872	struct perf_event *event;
				2873	unsigned int restart;
				2874	};
				2875
				2876	static int __perf_event_stop(void *info)
				2877	{
				2878	struct stop_event_data *sd = info;
				2879	struct perf_event *event = sd->event;
				2880
				2881	/* if it's already INACTIVE, do nothing */
				2882	if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
				2883	return 0;
				2884
				2885	/* matches smp_wmb() in event_sched_in() */
				2886	smp_rmb();
				2887
				2888	/*
				2889	* There is a window with interrupts enabled before we get here,
				2890	* so we need to check again lest we try to stop another CPU's event.
				2891	*/
				2892	if (READ_ONCE(event->oncpu) != smp_processor_id())
				2893	return -EAGAIN;
				2894
				2895	event->pmu->stop(event, PERF_EF_UPDATE);
				2896
				2897	/*
				2898	* May race with the actual stop (through perf_pmu_output_stop()),
				2899	* but it is only used for events with AUX ring buffer, and such
				2900	* events will refuse to restart because of rb::aux_mmap_count==0,
				2901	* see comments in perf_aux_output_begin().
				2902	*
				2903	* Since this is happening on an event-local CPU, no trace is lost
				2904	* while restarting.
				2905	*/
				2906	if (sd->restart)
				2907	event->pmu->start(event, 0);
				2908
				2909	return 0;
				2910	}
				2911
				2912	static int perf_event_stop(struct perf_event *event, int restart)
				2913	{
				2914	struct stop_event_data sd = {
				2915	.event = event,
				2916	.restart = restart,
				2917	};
				2918	int ret = 0;
				2919
				2920	do {
				2921	if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
				2922	return 0;
				2923
				2924	/* matches smp_wmb() in event_sched_in() */
				2925	smp_rmb();
				2926
				2927	/*
				2928	* We only want to restart ACTIVE events, so if the event goes
				2929	* inactive here (event->oncpu==-1), there's nothing more to do;
				2930	* fall through with ret==-ENXIO.
				2931	*/
				2932	ret = cpu_function_call(READ_ONCE(event->oncpu),
				2933	__perf_event_stop, &sd);
				2934	} while (ret == -EAGAIN);
				2935
				2936	return ret;
				2937	}
				2938
				2939	/*
				2940	* In order to contain the amount of racy and tricky in the address filter
				2941	* configuration management, it is a two part process:
				2942	*
				2943	* (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
				2944	* we update the addresses of corresponding vmas in
				2945	* event::addr_filter_ranges array and bump the event::addr_filters_gen;
				2946	* (p2) when an event is scheduled in (pmu::add), it calls
				2947	* perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
				2948	* if the generation has changed since the previous call.
				2949	*
				2950	* If (p1) happens while the event is active, we restart it to force (p2).
				2951	*
				2952	* (1) perf_addr_filters_apply(): adjusting filters' offsets based on
				2953	* pre-existing mappings, called once when new filters arrive via SET_FILTER
				2954	* ioctl;
				2955	* (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
				2956	* registered mapping, called for every new mmap(), with mm::mmap_sem down
				2957	* for reading;
				2958	* (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
				2959	* of exec.
				2960	*/
				2961	void perf_event_addr_filters_sync(struct perf_event *event)
				2962	{
				2963	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
				2964
				2965	if (!has_addr_filter(event))
				2966	return;
				2967
				2968	raw_spin_lock(&ifh->lock);
				2969	if (event->addr_filters_gen != event->hw.addr_filters_gen) {
				2970	event->pmu->addr_filters_sync(event);
				2971	event->hw.addr_filters_gen = event->addr_filters_gen;
				2972	}
				2973	raw_spin_unlock(&ifh->lock);
				2974	}
				2975	EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
				2976
				2977	static int _perf_event_refresh(struct perf_event *event, int refresh)
				2978	{
				2979	/*
				2980	* not supported on inherited events
				2981	*/
				2982	if (event->attr.inherit \|\| !is_sampling_event(event))
				2983	return -EINVAL;
				2984
				2985	atomic_add(refresh, &event->event_limit);
				2986	_perf_event_enable(event);
				2987
				2988	return 0;
				2989	}
				2990
				/*
				 * perf_event_refresh - locked wrapper around _perf_event_refresh().
				 *
				 * Holds the event's context lock across the call and returns whatever
				 * _perf_event_refresh() returned.  See perf_event_disable().
				 */
				int perf_event_refresh(struct perf_event *event, int refresh)
				{
					struct perf_event_context *locked_ctx = perf_event_ctx_lock(event);
					int err = _perf_event_refresh(event, refresh);

					perf_event_ctx_unlock(event, locked_ctx);
					return err;
				}
				EXPORT_SYMBOL_GPL(perf_event_refresh);
				3006
				3007	static int perf_event_modify_breakpoint(struct perf_event *bp,
				3008	struct perf_event_attr *attr)
				3009	{
				3010	int err;
				3011
				3012	_perf_event_disable(bp);
				3013
				3014	err = modify_user_hw_breakpoint_check(bp, attr, true);
				3015
				3016	if (!bp->attr.disabled)
				3017	_perf_event_enable(bp);
				3018
				3019	return err;
				3020	}
				3021
				3022	static int perf_event_modify_attr(struct perf_event *event,
				3023	struct perf_event_attr *attr)
				3024	{
				3025	if (event->attr.type != attr->type)
				3026	return -EINVAL;
				3027
				3028	switch (event->attr.type) {
				3029	case PERF_TYPE_BREAKPOINT:
				3030	return perf_event_modify_breakpoint(event, attr);
				3031	default:
				3032	/* Place holder for future additions. */
				3033	return -EOPNOTSUPP;
				3034	}
				3035	}
				3036
				3037	static void ctx_sched_out(struct perf_event_context *ctx,
				3038	struct perf_cpu_context *cpuctx,
				3039	enum event_type_t event_type)
				3040	{
				3041	struct perf_event event, tmp;
				3042	int is_active = ctx->is_active;
				3043
				3044	lockdep_assert_held(&ctx->lock);
				3045
				3046	if (likely(!ctx->nr_events)) {
				3047	/*
				3048	* See __perf_remove_from_context().
				3049	*/
				3050	WARN_ON_ONCE(ctx->is_active);
				3051	if (ctx->task)
				3052	WARN_ON_ONCE(cpuctx->task_ctx);
				3053	return;
				3054	}
				3055
				3056	ctx->is_active &= ~event_type;
				3057	if (!(ctx->is_active & EVENT_ALL))
				3058	ctx->is_active = 0;
				3059
				3060	if (ctx->task) {
				3061	WARN_ON_ONCE(cpuctx->task_ctx != ctx);
				3062	if (!ctx->is_active)
				3063	cpuctx->task_ctx = NULL;
				3064	}
				3065
				3066	/*
				3067	* Always update time if it was set; not only when it changes.
				3068	* Otherwise we can 'forget' to update time for any but the last
				3069	* context we sched out. For example:
				3070	*
				3071	* ctx_sched_out(.event_type = EVENT_FLEXIBLE)
				3072	* ctx_sched_out(.event_type = EVENT_PINNED)
				3073	*
				3074	* would only update time for the pinned events.
				3075	*/
				3076	if (is_active & EVENT_TIME) {
				3077	/* update (and stop) ctx time */
				3078	update_context_time(ctx);
				3079	update_cgrp_time_from_cpuctx(cpuctx);
				3080	}
				3081
				3082	is_active ^= ctx->is_active; /* changed bits */
				3083
				3084	if (!ctx->nr_active \|\| !(is_active & EVENT_ALL))
				3085	return;
				3086
				3087	perf_pmu_disable(ctx->pmu);
				3088	if (is_active & EVENT_PINNED) {
				3089	list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
				3090	group_sched_out(event, cpuctx, ctx);
				3091	}
				3092
				3093	if (is_active & EVENT_FLEXIBLE) {
				3094	list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
				3095	group_sched_out(event, cpuctx, ctx);
				3096
				3097	/*
				3098	* Since we cleared EVENT_FLEXIBLE, also clear
				3099	* rotate_necessary, is will be reset by
				3100	* ctx_flexible_sched_in() when needed.
				3101	*/
				3102	ctx->rotate_necessary = 0;
				3103	}
				3104	perf_pmu_enable(ctx->pmu);
				3105	}
				3106
				3107	/*
				3108	* Test whether two contexts are equivalent, i.e. whether they have both been
				3109	* cloned from the same version of the same context.
				3110	*
				3111	* Equivalence is measured using a generation number in the context that is
				3112	* incremented on each modification to it; see unclone_ctx(), list_add_event()
				3113	* and list_del_event().
				3114	*/
				3115	static int context_equiv(struct perf_event_context *ctx1,
				3116	struct perf_event_context *ctx2)
				3117	{
				3118	lockdep_assert_held(&ctx1->lock);
				3119	lockdep_assert_held(&ctx2->lock);
				3120
				3121	/* Pinning disables the swap optimization */
				3122	if (ctx1->pin_count \|\| ctx2->pin_count)
				3123	return 0;
				3124
				3125	/* If ctx1 is the parent of ctx2 */
				3126	if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
				3127	return 1;
				3128
				3129	/* If ctx2 is the parent of ctx1 */
				3130	if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
				3131	return 1;
				3132
				3133	/*
				3134	* If ctx1 and ctx2 have the same parent; we flatten the parent
				3135	* hierarchy, see perf_event_init_context().
				3136	*/
				3137	if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
				3138	ctx1->parent_gen == ctx2->parent_gen)
				3139	return 1;
				3140
				3141	/* Unmatched */
				3142	return 0;
				3143	}
				3144
				3145	static void __perf_event_sync_stat(struct perf_event *event,
				3146	struct perf_event *next_event)
				3147	{
				3148	u64 value;
				3149
				3150	if (!event->attr.inherit_stat)
				3151	return;
				3152
				3153	/*
				3154	* Update the event value, we cannot use perf_event_read()
				3155	* because we're in the middle of a context switch and have IRQs
				3156	* disabled, which upsets smp_call_function_single(), however
				3157	* we know the event must be on the current CPU, therefore we
				3158	* don't need to use it.
				3159	*/
				3160	if (event->state == PERF_EVENT_STATE_ACTIVE)
				3161	event->pmu->read(event);
				3162
				3163	perf_event_update_time(event);
				3164
				3165	/*
				3166	* In order to keep per-task stats reliable we need to flip the event
				3167	* values when we flip the contexts.
				3168	*/
				3169	value = local64_read(&next_event->count);
				3170	value = local64_xchg(&event->count, value);
				3171	local64_set(&next_event->count, value);
				3172
				3173	swap(event->total_time_enabled, next_event->total_time_enabled);
				3174	swap(event->total_time_running, next_event->total_time_running);
				3175
				3176	/*
				3177	* Since we swizzled the values, update the user visible data too.
				3178	*/
				3179	perf_event_update_userpage(event);
				3180	perf_event_update_userpage(next_event);
				3181	}
				3182
				3183	static void perf_event_sync_stat(struct perf_event_context *ctx,
				3184	struct perf_event_context *next_ctx)
				3185	{
				3186	struct perf_event event, next_event;
				3187
				3188	if (!ctx->nr_stat)
				3189	return;
				3190
				3191	update_context_time(ctx);
				3192
				3193	event = list_first_entry(&ctx->event_list,
				3194	struct perf_event, event_entry);
				3195
				3196	next_event = list_first_entry(&next_ctx->event_list,
				3197	struct perf_event, event_entry);
				3198
				3199	while (&event->event_entry != &ctx->event_list &&
				3200	&next_event->event_entry != &next_ctx->event_list) {
				3201
				3202	__perf_event_sync_stat(event, next_event);
				3203
				3204	event = list_next_entry(event, event_entry);
				3205	next_event = list_next_entry(next_event, event_entry);
				3206	}
				3207	}
				3208
				3209	static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
				3210	struct task_struct *next)
				3211	{
				3212	struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
				3213	struct perf_event_context *next_ctx;
				3214	struct perf_event_context parent, next_parent;
				3215	struct perf_cpu_context *cpuctx;
				3216	int do_switch = 1;
				3217
				3218	if (likely(!ctx))
				3219	return;
				3220
				3221	cpuctx = __get_cpu_context(ctx);
				3222	if (!cpuctx->task_ctx)
				3223	return;
				3224
				3225	rcu_read_lock();
				3226	next_ctx = next->perf_event_ctxp[ctxn];
				3227	if (!next_ctx)
				3228	goto unlock;
				3229
				3230	parent = rcu_dereference(ctx->parent_ctx);
				3231	next_parent = rcu_dereference(next_ctx->parent_ctx);
				3232
				3233	/* If neither context have a parent context; they cannot be clones. */
				3234	if (!parent && !next_parent)
				3235	goto unlock;
				3236
				3237	if (next_parent == ctx \|\| next_ctx == parent \|\| next_parent == parent) {
				3238	/*
				3239	* Looks like the two contexts are clones, so we might be
				3240	* able to optimize the context switch. We lock both
				3241	* contexts and check that they are clones under the
				3242	* lock (including re-checking that neither has been
				3243	* uncloned in the meantime). It doesn't matter which
				3244	* order we take the locks because no other cpu could
				3245	* be trying to lock both of these tasks.
				3246	*/
				3247	raw_spin_lock(&ctx->lock);
				3248	raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
				3249	if (context_equiv(ctx, next_ctx)) {
				3250	WRITE_ONCE(ctx->task, next);
				3251	WRITE_ONCE(next_ctx->task, task);
				3252
				3253	swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
				3254
				3255	/*
				3256	* RCU_INIT_POINTER here is safe because we've not
				3257	* modified the ctx and the above modification of
				3258	* ctx->task and ctx->task_ctx_data are immaterial
				3259	* since those values are always verified under
				3260	* ctx->lock which we're now holding.
				3261	*/
				3262	RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
				3263	RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
				3264
				3265	do_switch = 0;
				3266
				3267	perf_event_sync_stat(ctx, next_ctx);
				3268	}
				3269	raw_spin_unlock(&next_ctx->lock);
				3270	raw_spin_unlock(&ctx->lock);
				3271	}
				3272	unlock:
				3273	rcu_read_unlock();
				3274
				3275	if (do_switch) {
				3276	raw_spin_lock(&ctx->lock);
				3277	task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
				3278	raw_spin_unlock(&ctx->lock);
				3279	}
				3280	}
				3281
				3282	static DEFINE_PER_CPU(struct list_head, sched_cb_list);
				3283
				3284	void perf_sched_cb_dec(struct pmu *pmu)
				3285	{
				3286	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
				3287
				3288	this_cpu_dec(perf_sched_cb_usages);
				3289
				3290	if (!--cpuctx->sched_cb_usage)
				3291	list_del(&cpuctx->sched_cb_entry);
				3292	}
				3293
				3294
				3295	void perf_sched_cb_inc(struct pmu *pmu)
				3296	{
				3297	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
				3298
				3299	if (!cpuctx->sched_cb_usage++)
				3300	list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
				3301
				3302	this_cpu_inc(perf_sched_cb_usages);
				3303	}
				3304
				3305	/*
				3306	* This function provides the context switch callback to the lower code
				3307	* layer. It is invoked ONLY when the context switch callback is enabled.
				3308	*
				3309	* This callback is relevant even to per-cpu events; for example multi event
				3310	* PEBS requires this to provide PID/TID information. This requires we flush
				3311	* all queued PEBS records before we context switch to a new task.
				3312	*/
				3313	static void perf_pmu_sched_task(struct task_struct *prev,
				3314	struct task_struct *next,
				3315	bool sched_in)
				3316	{
				3317	struct perf_cpu_context *cpuctx;
				3318	struct pmu *pmu;
				3319
				3320	if (prev == next)
				3321	return;
				3322
				3323	list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
				3324	pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
				3325
				3326	if (WARN_ON_ONCE(!pmu->sched_task))
				3327	continue;
				3328
				3329	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
				3330	perf_pmu_disable(pmu);
				3331
				3332	pmu->sched_task(cpuctx->task_ctx, sched_in);
				3333
				3334	perf_pmu_enable(pmu);
				3335	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
				3336	}
				3337	}
				3338
				3339	static void perf_event_switch(struct task_struct *task,
				3340	struct task_struct *next_prev, bool sched_in);
				3341
				3342	#define for_each_task_context_nr(ctxn) \
				3343	for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
				3344
				3345	/*
				3346	* Called from scheduler to remove the events of the current task,
				3347	* with interrupts disabled.
				3348	*
				3349	* We stop each event and update the event value in event->count.
				3350	*
				3351	* This does not protect us against NMI, but disable()
				3352	* sets the disabled bit in the control field of event _before_
				3353	* accessing the event control register. If a NMI hits, then it will
				3354	* not restart the event.
				3355	*/
				3356	void __perf_event_task_sched_out(struct task_struct *task,
				3357	struct task_struct *next)
				3358	{
				3359	int ctxn;
				3360
				3361	if (__this_cpu_read(perf_sched_cb_usages))
				3362	perf_pmu_sched_task(task, next, false);
				3363
				3364	if (atomic_read(&nr_switch_events))
				3365	perf_event_switch(task, next, false);
				3366
				3367	for_each_task_context_nr(ctxn)
				3368	perf_event_context_sched_out(task, ctxn, next);
				3369
				3370	/*
				3371	* if cgroup events exist on this CPU, then we need
				3372	* to check if we have to switch out PMU state.
				3373	* cgroup event are system-wide mode only
				3374	*/
				3375	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
				3376	perf_cgroup_sched_out(task, next);
				3377	}
				3378
				3379	/*
				3380	* Called with IRQs disabled
				3381	*/
				3382	static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
				3383	enum event_type_t event_type)
				3384	{
				3385	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
				3386	}
				3387
				3388	static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
				3389	int (func)(struct perf_event , void ), void data)
				3390	{
				3391	struct perf_event *evt, evt1, *evt2;
				3392	int ret;
				3393
				3394	evt1 = perf_event_groups_first(groups, -1);
				3395	evt2 = perf_event_groups_first(groups, cpu);
				3396
				3397	while (evt1 \|\| evt2) {
				3398	if (evt1 && evt2) {
				3399	if (evt1->group_index < evt2->group_index)
				3400	evt = &evt1;
				3401	else
				3402	evt = &evt2;
				3403	} else if (evt1) {
				3404	evt = &evt1;
				3405	} else {
				3406	evt = &evt2;
				3407	}
				3408
				3409	ret = func(*evt, data);
				3410	if (ret)
				3411	return ret;
				3412
				3413	evt = perf_event_groups_next(evt);
				3414	}
				3415
				3416	return 0;
				3417	}
				3418
				/*
				 * State shared by the pinned_sched_in()/flexible_sched_in() callbacks
				 * during one visit_groups_merge() walk.
				 */
				struct sched_in_data {
					struct perf_event_context	*ctx;		/* context being scheduled in */
					struct perf_cpu_context		*cpuctx;	/* CPU context it is scheduled on */
					int				can_add_hw;	/* passed to group_can_go_on() */
				};
				3424
				3425	static int pinned_sched_in(struct perf_event event, void data)
				3426	{
				3427	struct sched_in_data *sid = data;
				3428
				3429	if (event->state <= PERF_EVENT_STATE_OFF)
				3430	return 0;
				3431
				3432	if (!event_filter_match(event))
				3433	return 0;
				3434
				3435	if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
				3436	if (!group_sched_in(event, sid->cpuctx, sid->ctx))
				3437	list_add_tail(&event->active_list, &sid->ctx->pinned_active);
				3438	}
				3439
				3440	/*
				3441	* If this pinned group hasn't been scheduled,
				3442	* put it in error state.
				3443	*/
				3444	if (event->state == PERF_EVENT_STATE_INACTIVE)
				3445	perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
				3446
				3447	return 0;
				3448	}
				3449
				3450	static int flexible_sched_in(struct perf_event event, void data)
				3451	{
				3452	struct sched_in_data *sid = data;
				3453
				3454	if (event->state <= PERF_EVENT_STATE_OFF)
				3455	return 0;
				3456
				3457	if (!event_filter_match(event))
				3458	return 0;
				3459
				3460	if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
				3461	int ret = group_sched_in(event, sid->cpuctx, sid->ctx);
				3462	if (ret) {
				3463	sid->can_add_hw = 0;
				3464	sid->ctx->rotate_necessary = 1;
				3465	return 0;
				3466	}
				3467	list_add_tail(&event->active_list, &sid->ctx->flexible_active);
				3468	}
				3469
				3470	return 0;
				3471	}
				3472
				3473	static void
				3474	ctx_pinned_sched_in(struct perf_event_context *ctx,
				3475	struct perf_cpu_context *cpuctx)
				3476	{
				3477	struct sched_in_data sid = {
				3478	.ctx = ctx,
				3479	.cpuctx = cpuctx,
				3480	.can_add_hw = 1,
				3481	};
				3482
				3483	visit_groups_merge(&ctx->pinned_groups,
				3484	smp_processor_id(),
				3485	pinned_sched_in, &sid);
				3486	}
				3487
				3488	static void
				3489	ctx_flexible_sched_in(struct perf_event_context *ctx,
				3490	struct perf_cpu_context *cpuctx)
				3491	{
				3492	struct sched_in_data sid = {
				3493	.ctx = ctx,
				3494	.cpuctx = cpuctx,
				3495	.can_add_hw = 1,
				3496	};
				3497
				3498	visit_groups_merge(&ctx->flexible_groups,
				3499	smp_processor_id(),
				3500	flexible_sched_in, &sid);
				3501	}
				3502
				3503	static void
				3504	ctx_sched_in(struct perf_event_context *ctx,
				3505	struct perf_cpu_context *cpuctx,
				3506	enum event_type_t event_type,
				3507	struct task_struct *task)
				3508	{
				3509	int is_active = ctx->is_active;
				3510	u64 now;
				3511
				3512	lockdep_assert_held(&ctx->lock);
				3513
				3514	if (likely(!ctx->nr_events))
				3515	return;
				3516
				3517	ctx->is_active \|= (event_type \| EVENT_TIME);
				3518	if (ctx->task) {
				3519	if (!is_active)
				3520	cpuctx->task_ctx = ctx;
				3521	else
				3522	WARN_ON_ONCE(cpuctx->task_ctx != ctx);
				3523	}
				3524
				3525	is_active ^= ctx->is_active; /* changed bits */
				3526
				3527	if (is_active & EVENT_TIME) {
				3528	/* start ctx time */
				3529	now = perf_clock();
				3530	ctx->timestamp = now;
				3531	perf_cgroup_set_timestamp(task, ctx);
				3532	}
				3533
				3534	/*
				3535	* First go through the list and put on any pinned groups
				3536	* in order to give them the best chance of going on.
				3537	*/
				3538	if (is_active & EVENT_PINNED)
				3539	ctx_pinned_sched_in(ctx, cpuctx);
				3540
				3541	/* Then walk through the lower prio flexible groups */
				3542	if (is_active & EVENT_FLEXIBLE)
				3543	ctx_flexible_sched_in(ctx, cpuctx);
				3544	}
				3545
				3546	static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
				3547	enum event_type_t event_type,
				3548	struct task_struct *task)
				3549	{
				3550	struct perf_event_context *ctx = &cpuctx->ctx;
				3551
				3552	ctx_sched_in(ctx, cpuctx, event_type, task);
				3553	}
				3554
				3555	static void perf_event_context_sched_in(struct perf_event_context *ctx,
				3556	struct task_struct *task)
				3557	{
				3558	struct perf_cpu_context *cpuctx;
				3559
				3560	cpuctx = __get_cpu_context(ctx);
				3561	if (cpuctx->task_ctx == ctx)
				3562	return;
				3563
				3564	perf_ctx_lock(cpuctx, ctx);
				3565	/*
				3566	* We must check ctx->nr_events while holding ctx->lock, such
				3567	* that we serialize against perf_install_in_context().
				3568	*/
				3569	if (!ctx->nr_events)
				3570	goto unlock;
				3571
				3572	perf_pmu_disable(ctx->pmu);
				3573	/*
				3574	* We want to keep the following priority order:
				3575	* cpu pinned (that don't need to move), task pinned,
				3576	* cpu flexible, task flexible.
				3577	*
				3578	* However, if task's ctx is not carrying any pinned
				3579	* events, no need to flip the cpuctx's events around.
				3580	*/
				3581	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
				3582	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
				3583	perf_event_sched_in(cpuctx, ctx, task);
				3584	perf_pmu_enable(ctx->pmu);
				3585
				3586	unlock:
				3587	perf_ctx_unlock(cpuctx, ctx);
				3588	}
				3589
				3590	/*
				3591	* Called from scheduler to add the events of the current task
				3592	* with interrupts disabled.
				3593	*
				3594	* We restore the event value and then enable it.
				3595	*
				3596	* This does not protect us against NMI, but enable()
				3597	* sets the enabled bit in the control field of event _before_
				3598	* accessing the event control register. If a NMI hits, then it will
				3599	* keep the event running.
				3600	*/
				3601	void __perf_event_task_sched_in(struct task_struct *prev,
				3602	struct task_struct *task)
				3603	{
				3604	struct perf_event_context *ctx;
				3605	int ctxn;
				3606
				3607	/*
				3608	* If cgroup events exist on this CPU, then we need to check if we have
				3609	* to switch in PMU state; cgroup event are system-wide mode only.
				3610	*
				3611	* Since cgroup events are CPU events, we must schedule these in before
				3612	* we schedule in the task events.
				3613	*/
				3614	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
				3615	perf_cgroup_sched_in(prev, task);
				3616
				3617	for_each_task_context_nr(ctxn) {
				3618	ctx = task->perf_event_ctxp[ctxn];
				3619	if (likely(!ctx))
				3620	continue;
				3621
				3622	perf_event_context_sched_in(ctx, task);
				3623	}
				3624
				3625	if (atomic_read(&nr_switch_events))
				3626	perf_event_switch(task, prev, true);
				3627
				3628	if (__this_cpu_read(perf_sched_cb_usages))
				3629	perf_pmu_sched_task(prev, task, true);
				3630	}
				3631
				3632	static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
				3633	{
				3634	u64 frequency = event->attr.sample_freq;
				3635	u64 sec = NSEC_PER_SEC;
				3636	u64 divisor, dividend;
				3637
				3638	int count_fls, nsec_fls, frequency_fls, sec_fls;
				3639
				3640	count_fls = fls64(count);
				3641	nsec_fls = fls64(nsec);
				3642	frequency_fls = fls64(frequency);
				3643	sec_fls = 30;
				3644
				3645	/*
				3646	* We got @count in @nsec, with a target of sample_freq HZ
				3647	* the target period becomes:
				3648	*
				3649	* @count * 10^9
				3650	* period = -------------------
				3651	* @nsec * sample_freq
				3652	*
				3653	*/
				3654
				3655	/*
				3656	* Reduce accuracy by one bit such that @a and @b converge
				3657	* to a similar magnitude.
				3658	*/
				3659	#define REDUCE_FLS(a, b) \
				3660	do { \
				3661	if (a##_fls > b##_fls) { \
				3662	a >>= 1; \
				3663	a##_fls--; \
				3664	} else { \
				3665	b >>= 1; \
				3666	b##_fls--; \
				3667	} \
				3668	} while (0)
				3669
				3670	/*
				3671	* Reduce accuracy until either term fits in a u64, then proceed with
				3672	* the other, so that finally we can do a u64/u64 division.
				3673	*/
				3674	while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
				3675	REDUCE_FLS(nsec, frequency);
				3676	REDUCE_FLS(sec, count);
				3677	}
				3678
				3679	if (count_fls + sec_fls > 64) {
				3680	divisor = nsec * frequency;
				3681
				3682	while (count_fls + sec_fls > 64) {
				3683	REDUCE_FLS(count, sec);
				3684	divisor >>= 1;
				3685	}
				3686
				3687	dividend = count * sec;
				3688	} else {
				3689	dividend = count * sec;
				3690
				3691	while (nsec_fls + frequency_fls > 64) {
				3692	REDUCE_FLS(nsec, frequency);
				3693	dividend >>= 1;
				3694	}
				3695
				3696	divisor = nsec * frequency;
				3697	}
				3698
				3699	if (!divisor)
				3700	return dividend;
				3701
				3702	return div64_u64(dividend, divisor);
				3703	}
				3704
				3705	static DEFINE_PER_CPU(int, perf_throttled_count);
				3706	static DEFINE_PER_CPU(u64, perf_throttled_seq);
				3707
				3708	static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
				3709	{
				3710	struct hw_perf_event *hwc = &event->hw;
				3711	s64 period, sample_period;
				3712	s64 delta;
				3713
				3714	period = perf_calculate_period(event, nsec, count);
				3715
				3716	delta = (s64)(period - hwc->sample_period);
				3717	if (delta >= 0)
				3718	delta += 7;
				3719	else
				3720	delta -= 7;
				3721	delta /= 8; /* low pass filter */
				3722
				3723	sample_period = hwc->sample_period + delta;
				3724
				3725	if (!sample_period)
				3726	sample_period = 1;
				3727
				3728	hwc->sample_period = sample_period;
				3729
				3730	if (local64_read(&hwc->period_left) > 8*sample_period) {
				3731	if (disable)
				3732	event->pmu->stop(event, PERF_EF_UPDATE);
				3733
				3734	local64_set(&hwc->period_left, 0);
				3735
				3736	if (disable)
				3737	event->pmu->start(event, PERF_EF_RELOAD);
				3738	}
				3739	}
				3740
				3741	/*
				3742	* combine freq adjustment with unthrottling to avoid two passes over the
				3743	* events. At the same time, make sure, having freq events does not change
				3744	* the rate of unthrottling as that would introduce bias.
				3745	*/
				3746	static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
				3747	int needs_unthr)
				3748	{
				3749	struct perf_event *event;
				3750	struct hw_perf_event *hwc;
				3751	u64 now, period = TICK_NSEC;
				3752	s64 delta;
				3753
				3754	/*
				3755	* only need to iterate over all events iff:
				3756	* - context have events in frequency mode (needs freq adjust)
				3757	* - there are events to unthrottle on this cpu
				3758	*/
				3759	if (!(ctx->nr_freq \|\| needs_unthr))
				3760	return;
				3761
				3762	raw_spin_lock(&ctx->lock);
				3763	perf_pmu_disable(ctx->pmu);
				3764
				3765	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
				3766	if (event->state != PERF_EVENT_STATE_ACTIVE)
				3767	continue;
				3768
				3769	if (!event_filter_match(event))
				3770	continue;
				3771
				3772	perf_pmu_disable(event->pmu);
				3773
				3774	hwc = &event->hw;
				3775
				3776	if (hwc->interrupts == MAX_INTERRUPTS) {
				3777	hwc->interrupts = 0;
				3778	perf_log_throttle(event, 1);
				3779	event->pmu->start(event, 0);
				3780	}
				3781
				3782	if (!event->attr.freq \|\| !event->attr.sample_freq)
				3783	goto next;
				3784
				3785	/*
				3786	* stop the event and update event->count
				3787	*/
				3788	event->pmu->stop(event, PERF_EF_UPDATE);
				3789
				3790	now = local64_read(&event->count);
				3791	delta = now - hwc->freq_count_stamp;
				3792	hwc->freq_count_stamp = now;
				3793
				3794	/*
				3795	* restart the event
				3796	* reload only if value has changed
				3797	* we have stopped the event so tell that
				3798	* to perf_adjust_period() to avoid stopping it
				3799	* twice.
				3800	*/
				3801	if (delta > 0)
				3802	perf_adjust_period(event, period, delta, false);
				3803
				3804	event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
				3805	next:
				3806	perf_pmu_enable(event->pmu);
				3807	}
				3808
				3809	perf_pmu_enable(ctx->pmu);
				3810	raw_spin_unlock(&ctx->lock);
				3811	}
				3812
				3813	/*
				3814	* Move @event to the tail of the @ctx's elegible events.
				3815	*/
				3816	static void rotate_ctx(struct perf_event_context ctx, struct perf_event event)
				3817	{
				3818	/*
				3819	* Rotate the first entry last of non-pinned groups. Rotation might be
				3820	* disabled by the inheritance code.
				3821	*/
				3822	if (ctx->rotate_disable)
				3823	return;
				3824
				3825	perf_event_groups_delete(&ctx->flexible_groups, event);
				3826	perf_event_groups_insert(&ctx->flexible_groups, event);
				3827	}
				3828
				3829	/* pick an event from the flexible_groups to rotate */
				3830	static inline struct perf_event *
				3831	ctx_event_to_rotate(struct perf_event_context *ctx)
				3832	{
				3833	struct perf_event *event;
				3834
				3835	/* pick the first active flexible event */
				3836	event = list_first_entry_or_null(&ctx->flexible_active,
				3837	struct perf_event, active_list);
				3838
				3839	/* if no active flexible event, pick the first event */
				3840	if (!event) {
				3841	event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
				3842	typeof(*event), group_node);
				3843	}
				3844
				3845	/*
				3846	* Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
				3847	* finds there are unschedulable events, it will set it again.
				3848	*/
				3849	ctx->rotate_necessary = 0;
				3850
				3851	return event;
				3852	}
				3853
				3854	static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
				3855	{
				3856	struct perf_event cpu_event = NULL, task_event = NULL;
				3857	struct perf_event_context *task_ctx = NULL;
				3858	int cpu_rotate, task_rotate;
				3859
				3860	/*
				3861	* Since we run this from IRQ context, nobody can install new
				3862	* events, thus the event count values are stable.
				3863	*/
				3864
				3865	cpu_rotate = cpuctx->ctx.rotate_necessary;
				3866	task_ctx = cpuctx->task_ctx;
				3867	task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
				3868
				3869	if (!(cpu_rotate \|\| task_rotate))
				3870	return false;
				3871
				3872	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
				3873	perf_pmu_disable(cpuctx->ctx.pmu);
				3874
				3875	if (task_rotate)
				3876	task_event = ctx_event_to_rotate(task_ctx);
				3877	if (cpu_rotate)
				3878	cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
				3879
				3880	/*
				3881	* As per the order given at ctx_resched() first 'pop' task flexible
				3882	* and then, if needed CPU flexible.
				3883	*/
				3884	if (task_event \|\| (task_ctx && cpu_event))
				3885	ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
				3886	if (cpu_event)
				3887	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
				3888
				3889	if (task_event)
				3890	rotate_ctx(task_ctx, task_event);
				3891	if (cpu_event)
				3892	rotate_ctx(&cpuctx->ctx, cpu_event);
				3893
				3894	perf_event_sched_in(cpuctx, task_ctx, current);
				3895
				3896	perf_pmu_enable(cpuctx->ctx.pmu);
				3897	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
				3898
				3899	return true;
				3900	}
				3901
				3902	void perf_event_task_tick(void)
				3903	{
				3904	struct list_head *head = this_cpu_ptr(&active_ctx_list);
				3905	struct perf_event_context ctx, tmp;
				3906	int throttled;
				3907
				3908	lockdep_assert_irqs_disabled();
				3909
				3910	__this_cpu_inc(perf_throttled_seq);
				3911	throttled = __this_cpu_xchg(perf_throttled_count, 0);
				3912	tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
				3913
				3914	list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
				3915	perf_adjust_freq_unthr_context(ctx, throttled);
				3916	}
				3917
				3918	static int event_enable_on_exec(struct perf_event *event,
				3919	struct perf_event_context *ctx)
				3920	{
				3921	if (!event->attr.enable_on_exec)
				3922	return 0;
				3923
				3924	event->attr.enable_on_exec = 0;
				3925	if (event->state >= PERF_EVENT_STATE_INACTIVE)
				3926	return 0;
				3927
				3928	perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
				3929
				3930	return 1;
				3931	}
				3932
				3933	/*
				3934	* Enable all of a task's events that have been marked enable-on-exec.
				3935	* This expects task == current.
				3936	*/
				3937	static void perf_event_enable_on_exec(int ctxn)
				3938	{
				3939	struct perf_event_context ctx, clone_ctx = NULL;
				3940	enum event_type_t event_type = 0;
				3941	struct perf_cpu_context *cpuctx;
				3942	struct perf_event *event;
				3943	unsigned long flags;
				3944	int enabled = 0;
				3945
				3946	local_irq_save(flags);
				3947	ctx = current->perf_event_ctxp[ctxn];
				3948	if (!ctx \|\| !ctx->nr_events)
				3949	goto out;
				3950
				3951	cpuctx = __get_cpu_context(ctx);
				3952	perf_ctx_lock(cpuctx, ctx);
				3953	ctx_sched_out(ctx, cpuctx, EVENT_TIME);
				3954	list_for_each_entry(event, &ctx->event_list, event_entry) {
				3955	enabled \|= event_enable_on_exec(event, ctx);
				3956	event_type \|= get_event_type(event);
				3957	}
				3958
				3959	/*
				3960	* Unclone and reschedule this context if we enabled any event.
				3961	*/
				3962	if (enabled) {
				3963	clone_ctx = unclone_ctx(ctx);
				3964	ctx_resched(cpuctx, ctx, event_type);
				3965	} else {
				3966	ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
				3967	}
				3968	perf_ctx_unlock(cpuctx, ctx);
				3969
				3970	out:
				3971	local_irq_restore(flags);
				3972
				3973	if (clone_ctx)
				3974	put_ctx(clone_ctx);
				3975	}
				3976
				3977	struct perf_read_data {
				3978	struct perf_event *event;
				3979	bool group;
				3980	int ret;
				3981	};
				3982
				3983	static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
				3984	{
				3985	u16 local_pkg, event_pkg;
				3986
				3987	if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
				3988	int local_cpu = smp_processor_id();
				3989
				3990	event_pkg = topology_physical_package_id(event_cpu);
				3991	local_pkg = topology_physical_package_id(local_cpu);
				3992
				3993	if (event_pkg == local_pkg)
				3994	return local_cpu;
				3995	}
				3996
				3997	return event_cpu;
				3998	}
				3999
				4000	/*
				4001	* Cross CPU call to read the hardware event
				4002	*/
				4003	static void __perf_event_read(void *info)
				4004	{
				4005	struct perf_read_data *data = info;
				4006	struct perf_event sub, event = data->event;
				4007	struct perf_event_context *ctx = event->ctx;
				4008	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
				4009	struct pmu *pmu = event->pmu;
				4010
				4011	/*
				4012	* If this is a task context, we need to check whether it is
				4013	* the current task context of this cpu. If not it has been
				4014	* scheduled out before the smp call arrived. In that case
				4015	* event->count would have been updated to a recent sample
				4016	* when the event was scheduled out.
				4017	*/
				4018	if (ctx->task && cpuctx->task_ctx != ctx)
				4019	return;
				4020
				4021	raw_spin_lock(&ctx->lock);
				4022	if (ctx->is_active & EVENT_TIME) {
				4023	update_context_time(ctx);
				4024	update_cgrp_time_from_event(event);
				4025	}
				4026
				4027	perf_event_update_time(event);
				4028	if (data->group)
				4029	perf_event_update_sibling_time(event);
				4030
				4031	if (event->state != PERF_EVENT_STATE_ACTIVE)
				4032	goto unlock;
				4033
				4034	if (!data->group) {
				4035	pmu->read(event);
				4036	data->ret = 0;
				4037	goto unlock;
				4038	}
				4039
				4040	pmu->start_txn(pmu, PERF_PMU_TXN_READ);
				4041
				4042	pmu->read(event);
				4043
				4044	for_each_sibling_event(sub, event) {
				4045	if (sub->state == PERF_EVENT_STATE_ACTIVE) {
				4046	/*
				4047	* Use sibling's PMU rather than @event's since
				4048	* sibling could be on different (eg: software) PMU.
				4049	*/
				4050	sub->pmu->read(sub);
				4051	}
				4052	}
				4053
				4054	data->ret = pmu->commit_txn(pmu);
				4055
				4056	unlock:
				4057	raw_spin_unlock(&ctx->lock);
				4058	}
				4059
				4060	static inline u64 perf_event_count(struct perf_event *event)
				4061	{
				4062	return local64_read(&event->count) + atomic64_read(&event->child_count);
				4063	}
				4064
				4065	/*
				4066	* NMI-safe method to read a local event, that is an event that
				4067	* is:
				4068	* - either for the current task, or for this CPU
				4069	* - does not have inherit set, for inherited task events
				4070	* will not be local and we cannot read them atomically
				4071	* - must not have a pmu::count method
				4072	*/
				4073	int perf_event_read_local(struct perf_event event, u64 value,
				4074	u64 enabled, u64 running)
				4075	{
				4076	unsigned long flags;
				4077	int ret = 0;
				4078
				4079	/*
				4080	* Disabling interrupts avoids all counter scheduling (context
				4081	* switches, timer based rotation and IPIs).
				4082	*/
				4083	local_irq_save(flags);
				4084
				4085	/*
				4086	* It must not be an event with inherit set, we cannot read
				4087	* all child counters from atomic context.
				4088	*/
				4089	if (event->attr.inherit) {
				4090	ret = -EOPNOTSUPP;
				4091	goto out;
				4092	}
				4093
				4094	/* If this is a per-task event, it must be for current */
				4095	if ((event->attach_state & PERF_ATTACH_TASK) &&
				4096	event->hw.target != current) {
				4097	ret = -EINVAL;
				4098	goto out;
				4099	}
				4100
				4101	/* If this is a per-CPU event, it must be for this CPU */
				4102	if (!(event->attach_state & PERF_ATTACH_TASK) &&
				4103	event->cpu != smp_processor_id()) {
				4104	ret = -EINVAL;
				4105	goto out;
				4106	}
				4107
				4108	/* If this is a pinned event it must be running on this CPU */
				4109	if (event->attr.pinned && event->oncpu != smp_processor_id()) {
				4110	ret = -EBUSY;
				4111	goto out;
				4112	}
				4113
				4114	/*
				4115	* If the event is currently on this CPU, its either a per-task event,
				4116	* or local to this CPU. Furthermore it means its ACTIVE (otherwise
				4117	* oncpu == -1).
				4118	*/
				4119	if (event->oncpu == smp_processor_id())
				4120	event->pmu->read(event);
				4121
				4122	*value = local64_read(&event->count);
				4123	if (enabled \|\| running) {
				4124	u64 now = event->shadow_ctx_time + perf_clock();
				4125	u64 __enabled, __running;
				4126
				4127	__perf_update_times(event, now, &__enabled, &__running);
				4128	if (enabled)
				4129	*enabled = __enabled;
				4130	if (running)
				4131	*running = __running;
				4132	}
				4133	out:
				4134	local_irq_restore(flags);
				4135
				4136	return ret;
				4137	}
				4138
				4139	static int perf_event_read(struct perf_event *event, bool group)
				4140	{
				4141	enum perf_event_state state = READ_ONCE(event->state);
				4142	int event_cpu, ret = 0;
				4143
				4144	/*
				4145	* If event is enabled and currently active on a CPU, update the
				4146	* value in the event structure:
				4147	*/
				4148	again:
				4149	if (state == PERF_EVENT_STATE_ACTIVE) {
				4150	struct perf_read_data data;
				4151
				4152	/*
				4153	* Orders the ->state and ->oncpu loads such that if we see
				4154	* ACTIVE we must also see the right ->oncpu.
				4155	*
				4156	* Matches the smp_wmb() from event_sched_in().
				4157	*/
				4158	smp_rmb();
				4159
				4160	event_cpu = READ_ONCE(event->oncpu);
				4161	if ((unsigned)event_cpu >= nr_cpu_ids)
				4162	return 0;
				4163
				4164	data = (struct perf_read_data){
				4165	.event = event,
				4166	.group = group,
				4167	.ret = 0,
				4168	};
				4169
				4170	preempt_disable();
				4171	event_cpu = __perf_event_read_cpu(event, event_cpu);
				4172
				4173	/*
				4174	* Purposely ignore the smp_call_function_single() return
				4175	* value.
				4176	*
				4177	* If event_cpu isn't a valid CPU it means the event got
				4178	* scheduled out and that will have updated the event count.
				4179	*
				4180	* Therefore, either way, we'll have an up-to-date event count
				4181	* after this.
				4182	*/
				4183	(void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
				4184	preempt_enable();
				4185	ret = data.ret;
				4186
				4187	} else if (state == PERF_EVENT_STATE_INACTIVE) {
				4188	struct perf_event_context *ctx = event->ctx;
				4189	unsigned long flags;
				4190
				4191	raw_spin_lock_irqsave(&ctx->lock, flags);
				4192	state = event->state;
				4193	if (state != PERF_EVENT_STATE_INACTIVE) {
				4194	raw_spin_unlock_irqrestore(&ctx->lock, flags);
				4195	goto again;
				4196	}
				4197
				4198	/*
				4199	* May read while context is not active (e.g., thread is
				4200	* blocked), in that case we cannot update context time
				4201	*/
				4202	if (ctx->is_active & EVENT_TIME) {
				4203	update_context_time(ctx);
				4204	update_cgrp_time_from_event(event);
				4205	}
				4206
				4207	perf_event_update_time(event);
				4208	if (group)
				4209	perf_event_update_sibling_time(event);
				4210	raw_spin_unlock_irqrestore(&ctx->lock, flags);
				4211	}
				4212
				4213	return ret;
				4214	}
				4215
				4216	/*
				4217	* Initialize the perf_event context in a task_struct:
				4218	*/
				4219	static void __perf_event_init_context(struct perf_event_context *ctx)
				4220	{
				4221	raw_spin_lock_init(&ctx->lock);
				4222	mutex_init(&ctx->mutex);
				4223	INIT_LIST_HEAD(&ctx->active_ctx_list);
				4224	perf_event_groups_init(&ctx->pinned_groups);
				4225	perf_event_groups_init(&ctx->flexible_groups);
				4226	INIT_LIST_HEAD(&ctx->event_list);
				4227	INIT_LIST_HEAD(&ctx->pinned_active);
				4228	INIT_LIST_HEAD(&ctx->flexible_active);
				4229	refcount_set(&ctx->refcount, 1);
				4230	}
				4231
				4232	static struct perf_event_context *
				4233	alloc_perf_context(struct pmu pmu, struct task_struct task)
				4234	{
				4235	struct perf_event_context *ctx;
				4236
				4237	ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
				4238	if (!ctx)
				4239	return NULL;
				4240
				4241	__perf_event_init_context(ctx);
				4242	if (task)
				4243	ctx->task = get_task_struct(task);
				4244	ctx->pmu = pmu;
				4245
				4246	return ctx;
				4247	}
				4248
				4249	static struct task_struct *
				4250	find_lively_task_by_vpid(pid_t vpid)
				4251	{
				4252	struct task_struct *task;
				4253
				4254	rcu_read_lock();
				4255	if (!vpid)
				4256	task = current;
				4257	else
				4258	task = find_task_by_vpid(vpid);
				4259	if (task)
				4260	get_task_struct(task);
				4261	rcu_read_unlock();
				4262
				4263	if (!task)
				4264	return ERR_PTR(-ESRCH);
				4265
				4266	return task;
				4267	}
				4268
				4269	/*
				4270	* Returns a matching context with refcount and pincount.
				4271	*/
				4272	static struct perf_event_context *
				4273	find_get_context(struct pmu pmu, struct task_struct task,
				4274	struct perf_event *event)
				4275	{
				4276	struct perf_event_context ctx, clone_ctx = NULL;
				4277	struct perf_cpu_context *cpuctx;
				4278	void *task_ctx_data = NULL;
				4279	unsigned long flags;
				4280	int ctxn, err;
				4281	int cpu = event->cpu;
				4282
				4283	if (!task) {
				4284	/* Must be root to operate on a CPU event: */
				4285	err = perf_allow_cpu(&event->attr);
				4286	if (err)
				4287	return ERR_PTR(err);
				4288
				4289	cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
				4290	ctx = &cpuctx->ctx;
				4291	get_ctx(ctx);
				4292	raw_spin_lock_irqsave(&ctx->lock, flags);
				4293	++ctx->pin_count;
				4294	raw_spin_unlock_irqrestore(&ctx->lock, flags);
				4295
				4296	return ctx;
				4297	}
				4298
				4299	err = -EINVAL;
				4300	ctxn = pmu->task_ctx_nr;
				4301	if (ctxn < 0)
				4302	goto errout;
				4303
				4304	if (event->attach_state & PERF_ATTACH_TASK_DATA) {
				4305	task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
				4306	if (!task_ctx_data) {
				4307	err = -ENOMEM;
				4308	goto errout;
				4309	}
				4310	}
				4311
				4312	retry:
				4313	ctx = perf_lock_task_context(task, ctxn, &flags);
				4314	if (ctx) {
				4315	clone_ctx = unclone_ctx(ctx);
				4316	++ctx->pin_count;
				4317
				4318	if (task_ctx_data && !ctx->task_ctx_data) {
				4319	ctx->task_ctx_data = task_ctx_data;
				4320	task_ctx_data = NULL;
				4321	}
				4322	raw_spin_unlock_irqrestore(&ctx->lock, flags);
				4323
				4324	if (clone_ctx)
				4325	put_ctx(clone_ctx);
				4326	} else {
				4327	ctx = alloc_perf_context(pmu, task);
				4328	err = -ENOMEM;
				4329	if (!ctx)
				4330	goto errout;
				4331
				4332	if (task_ctx_data) {
				4333	ctx->task_ctx_data = task_ctx_data;
				4334	task_ctx_data = NULL;
				4335	}
				4336
				4337	err = 0;
				4338	mutex_lock(&task->perf_event_mutex);
				4339	/*
				4340	* If it has already passed perf_event_exit_task().
				4341	* we must see PF_EXITING, it takes this mutex too.
				4342	*/
				4343	if (task->flags & PF_EXITING)
				4344	err = -ESRCH;
				4345	else if (task->perf_event_ctxp[ctxn])
				4346	err = -EAGAIN;
				4347	else {
				4348	get_ctx(ctx);
				4349	++ctx->pin_count;
				4350	rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
				4351	}
				4352	mutex_unlock(&task->perf_event_mutex);
				4353
				4354	if (unlikely(err)) {
				4355	put_ctx(ctx);
				4356
				4357	if (err == -EAGAIN)
				4358	goto retry;
				4359	goto errout;
				4360	}
				4361	}
				4362
				4363	kfree(task_ctx_data);
				4364	return ctx;
				4365
				4366	errout:
				4367	kfree(task_ctx_data);
				4368	return ERR_PTR(err);
				4369	}
				4370
				4371	static void perf_event_free_filter(struct perf_event *event);
				4372	static void perf_event_free_bpf_prog(struct perf_event *event);
				4373
				4374	static void free_event_rcu(struct rcu_head *head)
				4375	{
				4376	struct perf_event *event;
				4377
				4378	event = container_of(head, struct perf_event, rcu_head);
				4379	if (event->ns)
				4380	put_pid_ns(event->ns);
				4381	perf_event_free_filter(event);
				4382	kfree(event);
				4383	}
				4384
				4385	static void ring_buffer_attach(struct perf_event *event,
				4386	struct ring_buffer *rb);
				4387
				4388	static void detach_sb_event(struct perf_event *event)
				4389	{
				4390	struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
				4391
				4392	raw_spin_lock(&pel->lock);
				4393	list_del_rcu(&event->sb_list);
				4394	raw_spin_unlock(&pel->lock);
				4395	}
				4396
				4397	static bool is_sb_event(struct perf_event *event)
				4398	{
				4399	struct perf_event_attr *attr = &event->attr;
				4400
				4401	if (event->parent)
				4402	return false;
				4403
				4404	if (event->attach_state & PERF_ATTACH_TASK)
				4405	return false;
				4406
				4407	if (attr->mmap \|\| attr->mmap_data \|\| attr->mmap2 \|\|
				4408	attr->comm \|\| attr->comm_exec \|\|
				4409	attr->task \|\| attr->ksymbol \|\|
				4410	attr->context_switch \|\|
				4411	attr->bpf_event)
				4412	return true;
				4413	return false;
				4414	}
				4415
				/* Detach @event from the pmu_sb_events list if is_sb_event() says it is on it. */
				static void unaccount_pmu_sb_event(struct perf_event *event)
				{
					if (is_sb_event(event))
						detach_sb_event(event);
				}
				4421
				4422	static void unaccount_event_cpu(struct perf_event *event, int cpu)
				4423	{
				4424	if (event->parent)
				4425	return;
				4426
				4427	if (is_cgroup_event(event))
				4428	atomic_dec(&per_cpu(perf_cgroup_events, cpu));
				4429	}
				4430
				#ifdef CONFIG_NO_HZ_FULL
				static DEFINE_SPINLOCK(nr_freq_lock);
				#endif

				/*
				 * Drop one nr_freq_events reference under nr_freq_lock and clear the perf
				 * tick dependency when the count reaches zero.  Compiles to an empty
				 * function without CONFIG_NO_HZ_FULL.
				 */
				static void unaccount_freq_event_nohz(void)
				{
				#ifdef CONFIG_NO_HZ_FULL
					spin_lock(&nr_freq_lock);
					if (atomic_dec_and_test(&nr_freq_events))
						tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
					spin_unlock(&nr_freq_lock);
				#endif
				}
				4444
				4445	static void unaccount_freq_event(void)
				4446	{
				4447	if (tick_nohz_full_enabled())
				4448	unaccount_freq_event_nohz();
				4449	else
				4450	atomic_dec(&nr_freq_events);
				4451	}
				4452
				4453	static void unaccount_event(struct perf_event *event)
				4454	{
				4455	bool dec = false;
				4456
				4457	if (event->parent)
				4458	return;
				4459
				4460	if (event->attach_state & PERF_ATTACH_TASK)
				4461	dec = true;
				4462	if (event->attr.mmap \|\| event->attr.mmap_data)
				4463	atomic_dec(&nr_mmap_events);
				4464	if (event->attr.comm)
				4465	atomic_dec(&nr_comm_events);
				4466	if (event->attr.namespaces)
				4467	atomic_dec(&nr_namespaces_events);
				4468	if (event->attr.task)
				4469	atomic_dec(&nr_task_events);
				4470	if (event->attr.freq)
				4471	unaccount_freq_event();
				4472	if (event->attr.context_switch) {
				4473	dec = true;
				4474	atomic_dec(&nr_switch_events);
				4475	}
				4476	if (is_cgroup_event(event))
				4477	dec = true;
				4478	if (has_branch_stack(event))
				4479	dec = true;
				4480	if (event->attr.ksymbol)
				4481	atomic_dec(&nr_ksymbol_events);
				4482	if (event->attr.bpf_event)
				4483	atomic_dec(&nr_bpf_events);
				4484
				4485	if (dec) {
				4486	if (!atomic_add_unless(&perf_sched_count, -1, 1))
				4487	schedule_delayed_work(&perf_sched_work, HZ);
				4488	}
				4489
				4490	unaccount_event_cpu(event, event->cpu);
				4491
				4492	unaccount_pmu_sb_event(event);
				4493	}
				4494
				4495	static void perf_sched_delayed(struct work_struct *work)
				4496	{
				4497	mutex_lock(&perf_sched_mutex);
				4498	if (atomic_dec_and_test(&perf_sched_count))
				4499	static_branch_disable(&perf_sched_events);
				4500	mutex_unlock(&perf_sched_mutex);
				4501	}
				4502
				4503	/*
				4504	* The following implement mutual exclusion of events on "exclusive" pmus
				4505	* (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
				4506	* at a time, so we disallow creating events that might conflict, namely:
				4507	*
				4508	* 1) cpu-wide events in the presence of per-task events,
				4509	* 2) per-task events in the presence of cpu-wide events,
				4510	* 3) two matching events on the same context.
				4511	*
				4512	* The former two cases are handled in the allocation path (perf_event_alloc(),
				4513	* _free_event()), the latter -- before the first perf_install_in_context().
				4514	*/
				4515	static int exclusive_event_init(struct perf_event *event)
				4516	{
				4517	struct pmu *pmu = event->pmu;
				4518
				4519	if (!is_exclusive_pmu(pmu))
				4520	return 0;
				4521
				4522	/*
				4523	* Prevent co-existence of per-task and cpu-wide events on the
				4524	* same exclusive pmu.
				4525	*
				4526	* Negative pmu::exclusive_cnt means there are cpu-wide
				4527	* events on this "exclusive" pmu, positive means there are
				4528	* per-task events.
				4529	*
				4530	* Since this is called in perf_event_alloc() path, event::ctx
				4531	* doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
				4532	* to mean "per-task event", because unlike other attach states it
				4533	* never gets cleared.
				4534	*/
				4535	if (event->attach_state & PERF_ATTACH_TASK) {
				4536	if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
				4537	return -EBUSY;
				4538	} else {
				4539	if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
				4540	return -EBUSY;
				4541	}
				4542
				4543	return 0;
				4544	}
				4545
				4546	static void exclusive_event_destroy(struct perf_event *event)
				4547	{
				4548	struct pmu *pmu = event->pmu;
				4549
				4550	if (!is_exclusive_pmu(pmu))
				4551	return;
				4552
				4553	/* see comment in exclusive_event_init() */
				4554	if (event->attach_state & PERF_ATTACH_TASK)
				4555	atomic_dec(&pmu->exclusive_cnt);
				4556	else
				4557	atomic_inc(&pmu->exclusive_cnt);
				4558	}
				4559
				4560	static bool exclusive_event_match(struct perf_event e1, struct perf_event e2)
				4561	{
				4562	if ((e1->pmu == e2->pmu) &&
				4563	(e1->cpu == e2->cpu \|\|
				4564	e1->cpu == -1 \|\|
				4565	e2->cpu == -1))
				4566	return true;
				4567	return false;
				4568	}
				4569
				4570	static bool exclusive_event_installable(struct perf_event *event,
				4571	struct perf_event_context *ctx)
				4572	{
				4573	struct perf_event *iter_event;
				4574	struct pmu *pmu = event->pmu;
				4575
				4576	lockdep_assert_held(&ctx->mutex);
				4577
				4578	if (!is_exclusive_pmu(pmu))
				4579	return true;
				4580
				4581	list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
				4582	if (exclusive_event_match(iter_event, event))
				4583	return false;
				4584	}
				4585
				4586	return true;
				4587	}
				4588
				4589	static void perf_addr_filters_splice(struct perf_event *event,
				4590	struct list_head *head);
				4591
				4592	static void _free_event(struct perf_event *event)
				4593	{
				4594	irq_work_sync(&event->pending);
				4595
				4596	unaccount_event(event);
				4597
				4598	security_perf_event_free(event);
				4599
				4600	if (event->rb) {
				4601	/*
				4602	* Can happen when we close an event with re-directed output.
				4603	*
				4604	* Since we have a 0 refcount, perf_mmap_close() will skip
				4605	* over us; possibly making our ring_buffer_put() the last.
				4606	*/
				4607	mutex_lock(&event->mmap_mutex);
				4608	ring_buffer_attach(event, NULL);
				4609	mutex_unlock(&event->mmap_mutex);
				4610	}
				4611
				4612	if (is_cgroup_event(event))
				4613	perf_detach_cgroup(event);
				4614
				4615	if (!event->parent) {
				4616	if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
				4617	put_callchain_buffers();
				4618	}
				4619
				4620	perf_event_free_bpf_prog(event);
				4621	perf_addr_filters_splice(event, NULL);
				4622	kfree(event->addr_filter_ranges);
				4623
				4624	if (event->destroy)
				4625	event->destroy(event);
				4626
				4627	/*
				4628	* Must be after ->destroy(), due to uprobe_perf_close() using
				4629	* hw.target.
				4630	*/
				4631	if (event->hw.target)
				4632	put_task_struct(event->hw.target);
				4633
				4634	/*
				4635	* perf_event_free_task() relies on put_ctx() being 'last', in particular
				4636	* all task references must be cleaned up.
				4637	*/
				4638	if (event->ctx)
				4639	put_ctx(event->ctx);
				4640
				4641	exclusive_event_destroy(event);
				4642	module_put(event->pmu->module);
				4643
				4644	call_rcu(&event->rcu_head, free_event_rcu);
				4645	}
				4646
				4647	/*
				4648	* Used to free events which have a known refcount of 1, such as in error paths
				4649	* where the event isn't exposed yet and inherited events.
				4650	*/
				4651	static void free_event(struct perf_event *event)
				4652	{
				4653	if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
				4654	"unexpected event refcount: %ld; ptr=%p\n",
				4655	atomic_long_read(&event->refcount), event)) {
				4656	/* leak to avoid use-after-free */
				4657	return;
				4658	}
				4659
				4660	_free_event(event);
				4661	}
				4662
				4663	/*
				4664	* Remove user event from the owner task.
				4665	*/
				4666	static void perf_remove_from_owner(struct perf_event *event)
				4667	{
				4668	struct task_struct *owner;
				4669
				4670	rcu_read_lock();
				4671	/*
				4672	* Matches the smp_store_release() in perf_event_exit_task(). If we
				4673	* observe !owner it means the list deletion is complete and we can
				4674	* indeed free this event, otherwise we need to serialize on
				4675	* owner->perf_event_mutex.
				4676	*/
				4677	owner = READ_ONCE(event->owner);
				4678	if (owner) {
				4679	/*
				4680	* Since delayed_put_task_struct() also drops the last
				4681	* task reference we can safely take a new reference
				4682	* while holding the rcu_read_lock().
				4683	*/
				4684	get_task_struct(owner);
				4685	}
				4686	rcu_read_unlock();
				4687
				4688	if (owner) {
				4689	/*
				4690	* If we're here through perf_event_exit_task() we're already
				4691	* holding ctx->mutex which would be an inversion wrt. the
				4692	* normal lock order.
				4693	*
				4694	* However we can safely take this lock because its the child
				4695	* ctx->mutex.
				4696	*/
				4697	mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
				4698
				4699	/*
				4700	* We have to re-check the event->owner field, if it is cleared
				4701	* we raced with perf_event_exit_task(), acquiring the mutex
				4702	* ensured they're done, and we can proceed with freeing the
				4703	* event.
				4704	*/
				4705	if (event->owner) {
				4706	list_del_init(&event->owner_entry);
				4707	smp_store_release(&event->owner, NULL);
				4708	}
				4709	mutex_unlock(&owner->perf_event_mutex);
				4710	put_task_struct(owner);
				4711	}
				4712	}
				4713
				4714	static void put_event(struct perf_event *event)
				4715	{
				4716	if (!atomic_long_dec_and_test(&event->refcount))
				4717	return;
				4718
				4719	_free_event(event);
				4720	}
				4721
				4722	/*
				4723	* Kill an event dead; while event:refcount will preserve the event
				4724	* object, it will not preserve its functionality. Once the last 'user'
				4725	* gives up the object, we'll destroy the thing.
				4726	*/
				4727	int perf_event_release_kernel(struct perf_event *event)
				4728	{
				4729	struct perf_event_context *ctx = event->ctx;
				4730	struct perf_event child, tmp;
				4731	LIST_HEAD(free_list);
				4732
				4733	/*
				4734	* If we got here through err_file: fput(event_file); we will not have
				4735	* attached to a context yet.
				4736	*/
				4737	if (!ctx) {
				4738	WARN_ON_ONCE(event->attach_state &
				4739	(PERF_ATTACH_CONTEXT\|PERF_ATTACH_GROUP));
				4740	goto no_ctx;
				4741	}
				4742
				4743	if (!is_kernel_event(event))
				4744	perf_remove_from_owner(event);
				4745
				4746	ctx = perf_event_ctx_lock(event);
				4747	WARN_ON_ONCE(ctx->parent_ctx);
				4748	perf_remove_from_context(event, DETACH_GROUP);
				4749
				4750	raw_spin_lock_irq(&ctx->lock);
				4751	/*
				4752	* Mark this event as STATE_DEAD, there is no external reference to it
				4753	* anymore.
				4754	*
				4755	* Anybody acquiring event->child_mutex after the below loop _must_
				4756	* also see this, most importantly inherit_event() which will avoid
				4757	* placing more children on the list.
				4758	*
				4759	* Thus this guarantees that we will in fact observe and kill _ALL_
				4760	* child events.
				4761	*/
				4762	event->state = PERF_EVENT_STATE_DEAD;
				4763	raw_spin_unlock_irq(&ctx->lock);
				4764
				4765	perf_event_ctx_unlock(event, ctx);
				4766
				4767	again:
				4768	mutex_lock(&event->child_mutex);
				4769	list_for_each_entry(child, &event->child_list, child_list) {
				4770	void *var = NULL;
				4771
				4772	/*
				4773	* Cannot change, child events are not migrated, see the
				4774	* comment with perf_event_ctx_lock_nested().
				4775	*/
				4776	ctx = READ_ONCE(child->ctx);
				4777	/*
				4778	* Since child_mutex nests inside ctx::mutex, we must jump
				4779	* through hoops. We start by grabbing a reference on the ctx.
				4780	*
				4781	* Since the event cannot get freed while we hold the
				4782	* child_mutex, the context must also exist and have a !0
				4783	* reference count.
				4784	*/
				4785	get_ctx(ctx);
				4786
				4787	/*
				4788	* Now that we have a ctx ref, we can drop child_mutex, and
				4789	* acquire ctx::mutex without fear of it going away. Then we
				4790	* can re-acquire child_mutex.
				4791	*/
				4792	mutex_unlock(&event->child_mutex);
				4793	mutex_lock(&ctx->mutex);
				4794	mutex_lock(&event->child_mutex);
				4795
				4796	/*
				4797	* Now that we hold ctx::mutex and child_mutex, revalidate our
				4798	* state, if child is still the first entry, it didn't get freed
				4799	* and we can continue doing so.
				4800	*/
				4801	tmp = list_first_entry_or_null(&event->child_list,
				4802	struct perf_event, child_list);
				4803	if (tmp == child) {
				4804	perf_remove_from_context(child, DETACH_GROUP);
				4805	list_move(&child->child_list, &free_list);
				4806	/*
				4807	* This matches the refcount bump in inherit_event();
				4808	* this can't be the last reference.
				4809	*/
				4810	put_event(event);
				4811	} else {
				4812	var = &ctx->refcount;
				4813	}
				4814
				4815	mutex_unlock(&event->child_mutex);
				4816	mutex_unlock(&ctx->mutex);
				4817	put_ctx(ctx);
				4818
				4819	if (var) {
				4820	/*
				4821	* If perf_event_free_task() has deleted all events from the
				4822	* ctx while the child_mutex got released above, make sure to
				4823	* notify about the preceding put_ctx().
				4824	*/
				4825	smp_mb(); /* pairs with wait_var_event() */
				4826	wake_up_var(var);
				4827	}
				4828	goto again;
				4829	}
				4830	mutex_unlock(&event->child_mutex);
				4831
				4832	list_for_each_entry_safe(child, tmp, &free_list, child_list) {
				4833	void *var = &child->ctx->refcount;
				4834
				4835	list_del(&child->child_list);
				4836	free_event(child);
				4837
				4838	/*
				4839	* Wake any perf_event_free_task() waiting for this event to be
				4840	* freed.
				4841	*/
				4842	smp_mb(); /* pairs with wait_var_event() */
				4843	wake_up_var(var);
				4844	}
				4845
				4846	no_ctx:
				4847	put_event(event); /* Must be the 'last' reference */
				4848	return 0;
				4849	}
				4850	EXPORT_SYMBOL_GPL(perf_event_release_kernel);
				4851
				4852	/*
				4853	* Called when the last reference to the file is gone.
				4854	*/
				4855	static int perf_release(struct inode inode, struct file file)
				4856	{
				4857	perf_event_release_kernel(file->private_data);
				4858	return 0;
				4859	}
				4860
				4861	static u64 __perf_event_read_value(struct perf_event event, u64 enabled, u64 *running)
				4862	{
				4863	struct perf_event *child;
				4864	u64 total = 0;
				4865
				4866	*enabled = 0;
				4867	*running = 0;
				4868
				4869	mutex_lock(&event->child_mutex);
				4870
				4871	(void)perf_event_read(event, false);
				4872	total += perf_event_count(event);
				4873
				4874	*enabled += event->total_time_enabled +
				4875	atomic64_read(&event->child_total_time_enabled);
				4876	*running += event->total_time_running +
				4877	atomic64_read(&event->child_total_time_running);
				4878
				4879	list_for_each_entry(child, &event->child_list, child_list) {
				4880	(void)perf_event_read(child, false);
				4881	total += perf_event_count(child);
				4882	*enabled += child->total_time_enabled;
				4883	*running += child->total_time_running;
				4884	}
				4885	mutex_unlock(&event->child_mutex);
				4886
				4887	return total;
				4888	}
				4889
				4890	u64 perf_event_read_value(struct perf_event event, u64 enabled, u64 *running)
				4891	{
				4892	struct perf_event_context *ctx;
				4893	u64 count;
				4894
				4895	ctx = perf_event_ctx_lock(event);
				4896	count = __perf_event_read_value(event, enabled, running);
				4897	perf_event_ctx_unlock(event, ctx);
				4898
				4899	return count;
				4900	}
				4901	EXPORT_SYMBOL_GPL(perf_event_read_value);
				4902
				4903	static int __perf_read_group_add(struct perf_event *leader,
				4904	u64 read_format, u64 *values)
				4905	{
				4906	struct perf_event_context *ctx = leader->ctx;
				4907	struct perf_event sub, parent;
				4908	unsigned long flags;
				4909	int n = 1; /* skip @nr */
				4910	int ret;
				4911
				4912	ret = perf_event_read(leader, true);
				4913	if (ret)
				4914	return ret;
				4915
				4916	raw_spin_lock_irqsave(&ctx->lock, flags);
				4917	/*
				4918	* Verify the grouping between the parent and child (inherited)
				4919	* events is still in tact.
				4920	*
				4921	* Specifically:
				4922	* - leader->ctx->lock pins leader->sibling_list
				4923	* - parent->child_mutex pins parent->child_list
				4924	* - parent->ctx->mutex pins parent->sibling_list
				4925	*
				4926	* Because parent->ctx != leader->ctx (and child_list nests inside
				4927	* ctx->mutex), group destruction is not atomic between children, also
				4928	* see perf_event_release_kernel(). Additionally, parent can grow the
				4929	* group.
				4930	*
				4931	* Therefore it is possible to have parent and child groups in a
				4932	* different configuration and summing over such a beast makes no sense
				4933	* what so ever.
				4934	*
				4935	* Reject this.
				4936	*/
				4937	parent = leader->parent;
				4938	if (parent &&
				4939	(parent->group_generation != leader->group_generation \|\|
				4940	parent->nr_siblings != leader->nr_siblings)) {
				4941	ret = -ECHILD;
				4942	goto unlock;
				4943	}
				4944
				4945	/*
				4946	* Since we co-schedule groups, {enabled,running} times of siblings
				4947	* will be identical to those of the leader, so we only publish one
				4948	* set.
				4949	*/
				4950	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
				4951	values[n++] += leader->total_time_enabled +
				4952	atomic64_read(&leader->child_total_time_enabled);
				4953	}
				4954
				4955	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
				4956	values[n++] += leader->total_time_running +
				4957	atomic64_read(&leader->child_total_time_running);
				4958	}
				4959
				4960	/*
				4961	* Write {count,id} tuples for every sibling.
				4962	*/
				4963	values[n++] += perf_event_count(leader);
				4964	if (read_format & PERF_FORMAT_ID)
				4965	values[n++] = primary_event_id(leader);
				4966	if (read_format & PERF_FORMAT_LOST)
				4967	values[n++] = atomic64_read(&leader->lost_samples);
				4968
				4969	for_each_sibling_event(sub, leader) {
				4970	values[n++] += perf_event_count(sub);
				4971	if (read_format & PERF_FORMAT_ID)
				4972	values[n++] = primary_event_id(sub);
				4973	if (read_format & PERF_FORMAT_LOST)
				4974	values[n++] = atomic64_read(&sub->lost_samples);
				4975	}
				4976
				4977	unlock:
				4978	raw_spin_unlock_irqrestore(&ctx->lock, flags);
				4979	return ret;
				4980	}
				4981
				4982	static int perf_read_group(struct perf_event *event,
				4983	u64 read_format, char __user *buf)
				4984	{
				4985	struct perf_event leader = event->group_leader, child;
				4986	struct perf_event_context *ctx = leader->ctx;
				4987	int ret;
				4988	u64 *values;
				4989
				4990	lockdep_assert_held(&ctx->mutex);
				4991
				4992	values = kzalloc(event->read_size, GFP_KERNEL);
				4993	if (!values)
				4994	return -ENOMEM;
				4995
				4996	values[0] = 1 + leader->nr_siblings;
				4997
				4998	mutex_lock(&leader->child_mutex);
				4999
				5000	ret = __perf_read_group_add(leader, read_format, values);
				5001	if (ret)
				5002	goto unlock;
				5003
				5004	list_for_each_entry(child, &leader->child_list, child_list) {
				5005	ret = __perf_read_group_add(child, read_format, values);
				5006	if (ret)
				5007	goto unlock;
				5008	}
				5009
				5010	mutex_unlock(&leader->child_mutex);
				5011
				5012	ret = event->read_size;
				5013	if (copy_to_user(buf, values, event->read_size))
				5014	ret = -EFAULT;
				5015	goto out;
				5016
				5017	unlock:
				5018	mutex_unlock(&leader->child_mutex);
				5019	out:
				5020	kfree(values);
				5021	return ret;
				5022	}
				5023
				5024	static int perf_read_one(struct perf_event *event,
				5025	u64 read_format, char __user *buf)
				5026	{
				5027	u64 enabled, running;
				5028	u64 values[5];
				5029	int n = 0;
				5030
				5031	values[n++] = __perf_event_read_value(event, &enabled, &running);
				5032	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
				5033	values[n++] = enabled;
				5034	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
				5035	values[n++] = running;
				5036	if (read_format & PERF_FORMAT_ID)
				5037	values[n++] = primary_event_id(event);
				5038	if (read_format & PERF_FORMAT_LOST)
				5039	values[n++] = atomic64_read(&event->lost_samples);
				5040
				5041	if (copy_to_user(buf, values, n * sizeof(u64)))
				5042	return -EFAULT;
				5043
				5044	return n * sizeof(u64);
				5045	}
				5046
				5047	static bool is_event_hup(struct perf_event *event)
				5048	{
				5049	bool no_children;
				5050
				5051	if (event->state > PERF_EVENT_STATE_EXIT)
				5052	return false;
				5053
				5054	mutex_lock(&event->child_mutex);
				5055	no_children = list_empty(&event->child_list);
				5056	mutex_unlock(&event->child_mutex);
				5057	return no_children;
				5058	}
				5059
				5060	/*
				5061	* Read the performance event - simple non blocking version for now
				5062	*/
				5063	static ssize_t
				5064	__perf_read(struct perf_event event, char __user buf, size_t count)
				5065	{
				5066	u64 read_format = event->attr.read_format;
				5067	int ret;
				5068
				5069	/*
				5070	* Return end-of-file for a read on an event that is in
				5071	* error state (i.e. because it was pinned but it couldn't be
				5072	* scheduled on to the CPU at some point).
				5073	*/
				5074	if (event->state == PERF_EVENT_STATE_ERROR)
				5075	return 0;
				5076
				5077	if (count < event->read_size)
				5078	return -ENOSPC;
				5079
				5080	WARN_ON_ONCE(event->ctx->parent_ctx);
				5081	if (read_format & PERF_FORMAT_GROUP)
				5082	ret = perf_read_group(event, read_format, buf);
				5083	else
				5084	ret = perf_read_one(event, read_format, buf);
				5085
				5086	return ret;
				5087	}
				5088
				5089	static ssize_t
				5090	perf_read(struct file file, char __user buf, size_t count, loff_t *ppos)
				5091	{
				5092	struct perf_event *event = file->private_data;
				5093	struct perf_event_context *ctx;
				5094	int ret;
				5095
				5096	ret = security_perf_event_read(event);
				5097	if (ret)
				5098	return ret;
				5099
				5100	ctx = perf_event_ctx_lock(event);
				5101	ret = __perf_read(event, buf, count);
				5102	perf_event_ctx_unlock(event, ctx);
				5103
				5104	return ret;
				5105	}
				5106
				5107	static __poll_t perf_poll(struct file file, poll_table wait)
				5108	{
				5109	struct perf_event *event = file->private_data;
				5110	struct ring_buffer *rb;
				5111	__poll_t events = EPOLLHUP;
				5112
				5113	poll_wait(file, &event->waitq, wait);
				5114
				5115	if (is_event_hup(event))
				5116	return events;
				5117
				5118	/*
				5119	* Pin the event->rb by taking event->mmap_mutex; otherwise
				5120	* perf_event_set_output() can swizzle our rb and make us miss wakeups.
				5121	*/
				5122	mutex_lock(&event->mmap_mutex);
				5123	rb = event->rb;
				5124	if (rb)
				5125	events = atomic_xchg(&rb->poll, 0);
				5126	mutex_unlock(&event->mmap_mutex);
				5127	return events;
				5128	}
				5129
				5130	static void _perf_event_reset(struct perf_event *event)
				5131	{
				5132	(void)perf_event_read(event, false);
				5133	local64_set(&event->count, 0);
				5134	perf_event_update_userpage(event);
				5135	}
				5136
				5137	/*
				5138	* Holding the top-level event's child_mutex means that any
				5139	* descendant process that has inherited this event will block
				5140	* in perf_event_exit_event() if it goes to exit, thus satisfying the
				5141	* task existence requirements of perf_event_enable/disable.
				5142	*/
				5143	static void perf_event_for_each_child(struct perf_event *event,
				5144	void (func)(struct perf_event ))
				5145	{
				5146	struct perf_event *child;
				5147
				5148	WARN_ON_ONCE(event->ctx->parent_ctx);
				5149
				5150	mutex_lock(&event->child_mutex);
				5151	func(event);
				5152	list_for_each_entry(child, &event->child_list, child_list)
				5153	func(child);
				5154	mutex_unlock(&event->child_mutex);
				5155	}
				5156
				5157	static void perf_event_for_each(struct perf_event *event,
				5158	void (func)(struct perf_event ))
				5159	{
				5160	struct perf_event_context *ctx = event->ctx;
				5161	struct perf_event *sibling;
				5162
				5163	lockdep_assert_held(&ctx->mutex);
				5164
				5165	event = event->group_leader;
				5166
				5167	perf_event_for_each_child(event, func);
				5168	for_each_sibling_event(sibling, event)
				5169	perf_event_for_each_child(sibling, func);
				5170	}
				5171
				5172	static void __perf_event_period(struct perf_event *event,
				5173	struct perf_cpu_context *cpuctx,
				5174	struct perf_event_context *ctx,
				5175	void *info)
				5176	{
				5177	u64 value = ((u64 )info);
				5178	bool active;
				5179
				5180	if (event->attr.freq) {
				5181	event->attr.sample_freq = value;
				5182	} else {
				5183	event->attr.sample_period = value;
				5184	event->hw.sample_period = value;
				5185	}
				5186
				5187	active = (event->state == PERF_EVENT_STATE_ACTIVE);
				5188	if (active) {
				5189	perf_pmu_disable(ctx->pmu);
				5190	/*
				5191	* We could be throttled; unthrottle now to avoid the tick
				5192	* trying to unthrottle while we already re-started the event.
				5193	*/
				5194	if (event->hw.interrupts == MAX_INTERRUPTS) {
				5195	event->hw.interrupts = 0;
				5196	perf_log_throttle(event, 1);
				5197	}
				5198	event->pmu->stop(event, PERF_EF_UPDATE);
				5199	}
				5200
				5201	local64_set(&event->hw.period_left, 0);
				5202
				5203	if (active) {
				5204	event->pmu->start(event, PERF_EF_RELOAD);
				5205	perf_pmu_enable(ctx->pmu);
				5206	}
				5207	}
				5208
				5209	static int perf_event_check_period(struct perf_event *event, u64 value)
				5210	{
				5211	return event->pmu->check_period(event, value);
				5212	}
				5213
				5214	static int perf_event_period(struct perf_event event, u64 __user arg)
				5215	{
				5216	u64 value;
				5217
				5218	if (!is_sampling_event(event))
				5219	return -EINVAL;
				5220
				5221	if (copy_from_user(&value, arg, sizeof(value)))
				5222	return -EFAULT;
				5223
				5224	if (!value)
				5225	return -EINVAL;
				5226
				5227	if (event->attr.freq && value > sysctl_perf_event_sample_rate)
				5228	return -EINVAL;
				5229
				5230	if (perf_event_check_period(event, value))
				5231	return -EINVAL;
				5232
				5233	if (!event->attr.freq && (value & (1ULL << 63)))
				5234	return -EINVAL;
				5235
				5236	event_function_call(event, __perf_event_period, &value);
				5237
				5238	return 0;
				5239	}
				5240
				5241	static const struct file_operations perf_fops;
				5242
				5243	static inline int perf_fget_light(int fd, struct fd *p)
				5244	{
				5245	struct fd f = fdget(fd);
				5246	if (!f.file)
				5247	return -EBADF;
				5248
				5249	if (f.file->f_op != &perf_fops) {
				5250	fdput(f);
				5251	return -EBADF;
				5252	}
				5253	*p = f;
				5254	return 0;
				5255	}
				5256
				5257	static int perf_event_set_output(struct perf_event *event,
				5258	struct perf_event *output_event);
				5259	static int perf_event_set_filter(struct perf_event event, void __user arg);
				5260	static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
				5261	static int perf_copy_attr(struct perf_event_attr __user *uattr,
				5262	struct perf_event_attr *attr);
				5263
				5264	static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
				5265	{
				5266	void (func)(struct perf_event );
				5267	u32 flags = arg;
				5268
				5269	switch (cmd) {
				5270	case PERF_EVENT_IOC_ENABLE:
				5271	func = _perf_event_enable;
				5272	break;
				5273	case PERF_EVENT_IOC_DISABLE:
				5274	func = _perf_event_disable;
				5275	break;
				5276	case PERF_EVENT_IOC_RESET:
				5277	func = _perf_event_reset;
				5278	break;
				5279
				5280	case PERF_EVENT_IOC_REFRESH:
				5281	return _perf_event_refresh(event, arg);
				5282
				5283	case PERF_EVENT_IOC_PERIOD:
				5284	return perf_event_period(event, (u64 __user *)arg);
				5285
				5286	case PERF_EVENT_IOC_ID:
				5287	{
				5288	u64 id = primary_event_id(event);
				5289
				5290	if (copy_to_user((void __user *)arg, &id, sizeof(id)))
				5291	return -EFAULT;
				5292	return 0;
				5293	}
				5294
				5295	case PERF_EVENT_IOC_SET_OUTPUT:
				5296	{
				5297	int ret;
				5298	if (arg != -1) {
				5299	struct perf_event *output_event;
				5300	struct fd output;
				5301	ret = perf_fget_light(arg, &output);
				5302	if (ret)
				5303	return ret;
				5304	output_event = output.file->private_data;
				5305	ret = perf_event_set_output(event, output_event);
				5306	fdput(output);
				5307	} else {
				5308	ret = perf_event_set_output(event, NULL);
				5309	}
				5310	return ret;
				5311	}
				5312
				5313	case PERF_EVENT_IOC_SET_FILTER:
				5314	return perf_event_set_filter(event, (void __user *)arg);
				5315
				5316	case PERF_EVENT_IOC_SET_BPF:
				5317	return perf_event_set_bpf_prog(event, arg);
				5318
				5319	case PERF_EVENT_IOC_PAUSE_OUTPUT: {
				5320	struct ring_buffer *rb;
				5321
				5322	rcu_read_lock();
				5323	rb = rcu_dereference(event->rb);
				5324	if (!rb \|\| !rb->nr_pages) {
				5325	rcu_read_unlock();
				5326	return -EINVAL;
				5327	}
				5328	rb_toggle_paused(rb, !!arg);
				5329	rcu_read_unlock();
				5330	return 0;
				5331	}
				5332
				5333	case PERF_EVENT_IOC_QUERY_BPF:
				5334	return perf_event_query_prog_array(event, (void __user *)arg);
				5335
				5336	case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
				5337	struct perf_event_attr new_attr;
				5338	int err = perf_copy_attr((struct perf_event_attr __user *)arg,
				5339	&new_attr);
				5340
				5341	if (err)
				5342	return err;
				5343
				5344	return perf_event_modify_attr(event, &new_attr);
				5345	}
				5346	default:
				5347	return -ENOTTY;
				5348	}
				5349
				5350	if (flags & PERF_IOC_FLAG_GROUP)
				5351	perf_event_for_each(event, func);
				5352	else
				5353	perf_event_for_each_child(event, func);
				5354
				5355	return 0;
				5356	}
				5357
				5358	static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
				5359	{
				5360	struct perf_event *event = file->private_data;
				5361	struct perf_event_context *ctx;
				5362	long ret;
				5363
				5364	/* Treat ioctl like writes as it is likely a mutating operation. */
				5365	ret = security_perf_event_write(event);
				5366	if (ret)
				5367	return ret;
				5368
				5369	ctx = perf_event_ctx_lock(event);
				5370	ret = _perf_ioctl(event, cmd, arg);
				5371	perf_event_ctx_unlock(event, ctx);
				5372
				5373	return ret;
				5374	}
				5375
				#ifdef CONFIG_COMPAT
				/*
				 * Compat ioctl entry point.
				 *
				 * For the commands listed below, a size field in @cmd equal to
				 * sizeof(compat_uptr_t) is rewritten to sizeof(void *) so that @cmd
				 * matches the native command number; the call is then forwarded to
				 * perf_ioctl(). All other commands are forwarded unchanged.
				 */
				static long perf_compat_ioctl(struct file *file, unsigned int cmd,
							      unsigned long arg)
				{
					switch (_IOC_NR(cmd)) {
					case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
					case _IOC_NR(PERF_EVENT_IOC_ID):
					case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
					case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
						/* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case) */
						if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
							cmd &= ~IOCSIZE_MASK;
							cmd |= sizeof(void *) << IOCSIZE_SHIFT;
						}
						break;
					}
					return perf_ioctl(file, cmd, arg);
				}
				#else
				# define perf_compat_ioctl NULL
				#endif
				5397
				5398	int perf_event_task_enable(void)
				5399	{
				5400	struct perf_event_context *ctx;
				5401	struct perf_event *event;
				5402
				5403	mutex_lock(&current->perf_event_mutex);
				5404	list_for_each_entry(event, &current->perf_event_list, owner_entry) {
				5405	ctx = perf_event_ctx_lock(event);
				5406	perf_event_for_each_child(event, _perf_event_enable);
				5407	perf_event_ctx_unlock(event, ctx);
				5408	}
				5409	mutex_unlock(&current->perf_event_mutex);
				5410
				5411	return 0;
				5412	}
				5413
				5414	int perf_event_task_disable(void)
				5415	{
				5416	struct perf_event_context *ctx;
				5417	struct perf_event *event;
				5418
				5419	mutex_lock(&current->perf_event_mutex);
				5420	list_for_each_entry(event, &current->perf_event_list, owner_entry) {
				5421	ctx = perf_event_ctx_lock(event);
				5422	perf_event_for_each_child(event, _perf_event_disable);
				5423	perf_event_ctx_unlock(event, ctx);
				5424	}
				5425	mutex_unlock(&current->perf_event_mutex);
				5426
				5427	return 0;
				5428	}
				5429
				5430	static int perf_event_index(struct perf_event *event)
				5431	{
				5432	if (event->hw.state & PERF_HES_STOPPED)
				5433	return 0;
				5434
				5435	if (event->state != PERF_EVENT_STATE_ACTIVE)
				5436	return 0;
				5437
				5438	return event->pmu->event_idx(event);
				5439	}
				5440
				5441	static void calc_timer_values(struct perf_event *event,
				5442	u64 *now,
				5443	u64 *enabled,
				5444	u64 *running)
				5445	{
				5446	u64 ctx_time;
				5447
				5448	*now = perf_clock();
				5449	ctx_time = event->shadow_ctx_time + *now;
				5450	__perf_update_times(event, ctx_time, enabled, running);
				5451	}
				5452
				5453	static void perf_event_init_userpage(struct perf_event *event)
				5454	{
				5455	struct perf_event_mmap_page *userpg;
				5456	struct ring_buffer *rb;
				5457
				5458	rcu_read_lock();
				5459	rb = rcu_dereference(event->rb);
				5460	if (!rb)
				5461	goto unlock;
				5462
				5463	userpg = rb->user_page;
				5464
				5465	/* Allow new userspace to detect that bit 0 is deprecated */
				5466	userpg->cap_bit0_is_deprecated = 1;
				5467	userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
				5468	userpg->data_offset = PAGE_SIZE;
				5469	userpg->data_size = perf_data_size(rb);
				5470
				5471	unlock:
				5472	rcu_read_unlock();
				5473	}
				5474
				5475	void __weak arch_perf_update_userpage(
				5476	struct perf_event event, struct perf_event_mmap_page userpg, u64 now)
				5477	{
				5478	}
				5479
				5480	/*
				5481	* Callers need to ensure there can be no nesting of this function, otherwise
				5482	* the seqlock logic goes bad. We can not serialize this because the arch
				5483	* code calls this from NMI context.
				5484	*/
				5485	void perf_event_update_userpage(struct perf_event *event)
				5486	{
				5487	struct perf_event_mmap_page *userpg;
				5488	struct ring_buffer *rb;
				5489	u64 enabled, running, now;
				5490
				5491	rcu_read_lock();
				5492	rb = rcu_dereference(event->rb);
				5493	if (!rb)
				5494	goto unlock;
				5495
				5496	/*
				5497	* compute total_time_enabled, total_time_running
				5498	* based on snapshot values taken when the event
				5499	* was last scheduled in.
				5500	*
				5501	* we cannot simply called update_context_time()
				5502	* because of locking issue as we can be called in
				5503	* NMI context
				5504	*/
				5505	calc_timer_values(event, &now, &enabled, &running);
				5506
				5507	userpg = rb->user_page;
				5508	/*
				5509	* Disable preemption to guarantee consistent time stamps are stored to
				5510	* the user page.
				5511	*/
				5512	preempt_disable();
				5513	++userpg->lock;
				5514	barrier();
				5515	userpg->index = perf_event_index(event);
				5516	userpg->offset = perf_event_count(event);
				5517	if (userpg->index)
				5518	userpg->offset -= local64_read(&event->hw.prev_count);
				5519
				5520	userpg->time_enabled = enabled +
				5521	atomic64_read(&event->child_total_time_enabled);
				5522
				5523	userpg->time_running = running +
				5524	atomic64_read(&event->child_total_time_running);
				5525
				5526	arch_perf_update_userpage(event, userpg, now);
				5527
				5528	barrier();
				5529	++userpg->lock;
				5530	preempt_enable();
				5531	unlock:
				5532	rcu_read_unlock();
				5533	}
				5534	EXPORT_SYMBOL_GPL(perf_event_update_userpage);
				5535
				5536	static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
				5537	{
				5538	struct perf_event *event = vmf->vma->vm_file->private_data;
				5539	struct ring_buffer *rb;
				5540	vm_fault_t ret = VM_FAULT_SIGBUS;
				5541
				5542	if (vmf->flags & FAULT_FLAG_MKWRITE) {
				5543	if (vmf->pgoff == 0)
				5544	ret = 0;
				5545	return ret;
				5546	}
				5547
				5548	rcu_read_lock();
				5549	rb = rcu_dereference(event->rb);
				5550	if (!rb)
				5551	goto unlock;
				5552
				5553	if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
				5554	goto unlock;
				5555
				5556	vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
				5557	if (!vmf->page)
				5558	goto unlock;
				5559
				5560	get_page(vmf->page);
				5561	vmf->page->mapping = vmf->vma->vm_file->f_mapping;
				5562	vmf->page->index = vmf->pgoff;
				5563
				5564	ret = 0;
				5565	unlock:
				5566	rcu_read_unlock();
				5567
				5568	return ret;
				5569	}
				5570
				5571	static void ring_buffer_attach(struct perf_event *event,
				5572	struct ring_buffer *rb)
				5573	{
				5574	struct ring_buffer *old_rb = NULL;
				5575	unsigned long flags;
				5576
				5577	if (event->rb) {
				5578	/*
				5579	* Should be impossible, we set this when removing
				5580	* event->rb_entry and wait/clear when adding event->rb_entry.
				5581	*/
				5582	WARN_ON_ONCE(event->rcu_pending);
				5583
				5584	old_rb = event->rb;
				5585	spin_lock_irqsave(&old_rb->event_lock, flags);
				5586	list_del_rcu(&event->rb_entry);
				5587	spin_unlock_irqrestore(&old_rb->event_lock, flags);
				5588
				5589	event->rcu_batches = get_state_synchronize_rcu();
				5590	event->rcu_pending = 1;
				5591	}
				5592
				5593	if (rb) {
				5594	if (event->rcu_pending) {
				5595	cond_synchronize_rcu(event->rcu_batches);
				5596	event->rcu_pending = 0;
				5597	}
				5598
				5599	spin_lock_irqsave(&rb->event_lock, flags);
				5600	list_add_rcu(&event->rb_entry, &rb->event_list);
				5601	spin_unlock_irqrestore(&rb->event_lock, flags);
				5602	}
				5603
				5604	/*
				5605	* Avoid racing with perf_mmap_close(AUX): stop the event
				5606	* before swizzling the event::rb pointer; if it's getting
				5607	* unmapped, its aux_mmap_count will be 0 and it won't
				5608	* restart. See the comment in __perf_pmu_output_stop().
				5609	*
				5610	* Data will inevitably be lost when set_output is done in
				5611	* mid-air, but then again, whoever does it like this is
				5612	* not in for the data anyway.
				5613	*/
				5614	if (has_aux(event))
				5615	perf_event_stop(event, 0);
				5616
				5617	rcu_assign_pointer(event->rb, rb);
				5618
				5619	if (old_rb) {
				5620	ring_buffer_put(old_rb);
				5621	/*
				5622	* Since we detached before setting the new rb, so that we
				5623	* could attach the new rb, we could have missed a wakeup.
				5624	* Provide it now.
				5625	*/
				5626	wake_up_all(&event->waitq);
				5627	}
				5628	}
				5629
				5630	static void ring_buffer_wakeup(struct perf_event *event)
				5631	{
				5632	struct ring_buffer *rb;
				5633
				5634	rcu_read_lock();
				5635	rb = rcu_dereference(event->rb);
				5636	if (rb) {
				5637	list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
				5638	wake_up_all(&event->waitq);
				5639	}
				5640	rcu_read_unlock();
				5641	}
				5642
				5643	struct ring_buffer ring_buffer_get(struct perf_event event)
				5644	{
				5645	struct ring_buffer *rb;
				5646
				5647	rcu_read_lock();
				5648	rb = rcu_dereference(event->rb);
				5649	if (rb) {
				5650	if (!refcount_inc_not_zero(&rb->refcount))
				5651	rb = NULL;
				5652	}
				5653	rcu_read_unlock();
				5654
				5655	return rb;
				5656	}
				5657
				5658	void ring_buffer_put(struct ring_buffer *rb)
				5659	{
				5660	if (!refcount_dec_and_test(&rb->refcount))
				5661	return;
				5662
				5663	WARN_ON_ONCE(!list_empty(&rb->event_list));
				5664
				5665	call_rcu(&rb->rcu_head, rb_free_rcu);
				5666	}
				5667
				5668	static void perf_mmap_open(struct vm_area_struct *vma)
				5669	{
				5670	struct perf_event *event = vma->vm_file->private_data;
				5671
				5672	atomic_inc(&event->mmap_count);
				5673	atomic_inc(&event->rb->mmap_count);
				5674
				5675	if (vma->vm_pgoff)
				5676	atomic_inc(&event->rb->aux_mmap_count);
				5677
				5678	if (event->pmu->event_mapped)
				5679	event->pmu->event_mapped(event, vma->vm_mm);
				5680	}
				5681
				5682	static void perf_pmu_output_stop(struct perf_event *event);
				5683
				5684	/*
				5685	* A buffer can be mmap()ed multiple times; either directly through the same
				5686	* event, or through other events by use of perf_event_set_output().
				5687	*
				5688	* In order to undo the VM accounting done by perf_mmap() we need to destroy
				5689	* the buffer here, where we still have a VM context. This means we need
				5690	* to detach all events redirecting to us.
				5691	*/
				5692	static void perf_mmap_close(struct vm_area_struct *vma)
				5693	{
				5694	struct perf_event *event = vma->vm_file->private_data;
				5695	struct ring_buffer *rb = ring_buffer_get(event);
				5696	struct user_struct *mmap_user = rb->mmap_user;
				5697	int mmap_locked = rb->mmap_locked;
				5698	unsigned long size = perf_data_size(rb);
				5699	bool detach_rest = false;
				5700
				5701	if (event->pmu->event_unmapped)
				5702	event->pmu->event_unmapped(event, vma->vm_mm);
				5703
				5704	/*
				5705	* rb->aux_mmap_count will always drop before rb->mmap_count and
				5706	* event->mmap_count, so it is ok to use event->mmap_mutex to
				5707	* serialize with perf_mmap here.
				5708	*/
				5709	if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
				5710	atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
				5711	/*
				5712	* Stop all AUX events that are writing to this buffer,
				5713	* so that we can free its AUX pages and corresponding PMU
				5714	* data. Note that after rb::aux_mmap_count dropped to zero,
				5715	* they won't start any more (see perf_aux_output_begin()).
				5716	*/
				5717	perf_pmu_output_stop(event);
				5718
				5719	/* now it's safe to free the pages */
				5720	atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
				5721	atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
				5722
				5723	/* this has to be the last one */
				5724	rb_free_aux(rb);
				5725	WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
				5726
				5727	mutex_unlock(&event->mmap_mutex);
				5728	}
				5729
				5730	if (atomic_dec_and_test(&rb->mmap_count))
				5731	detach_rest = true;
				5732
				5733	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
				5734	goto out_put;
				5735
				5736	ring_buffer_attach(event, NULL);
				5737	mutex_unlock(&event->mmap_mutex);
				5738
				5739	/* If there's still other mmap()s of this buffer, we're done. */
				5740	if (!detach_rest)
				5741	goto out_put;
				5742
				5743	/*
				5744	* No other mmap()s, detach from all other events that might redirect
				5745	* into the now unreachable buffer. Somewhat complicated by the
				5746	* fact that rb::event_lock otherwise nests inside mmap_mutex.
				5747	*/
				5748	again:
				5749	rcu_read_lock();
				5750	list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
				5751	if (!atomic_long_inc_not_zero(&event->refcount)) {
				5752	/*
				5753	* This event is en-route to free_event() which will
				5754	* detach it and remove it from the list.
				5755	*/
				5756	continue;
				5757	}
				5758	rcu_read_unlock();
				5759
				5760	mutex_lock(&event->mmap_mutex);
				5761	/*
				5762	* Check we didn't race with perf_event_set_output() which can
				5763	* swizzle the rb from under us while we were waiting to
				5764	* acquire mmap_mutex.
				5765	*
				5766	* If we find a different rb; ignore this event, a next
				5767	* iteration will no longer find it on the list. We have to
				5768	* still restart the iteration to make sure we're not now
				5769	* iterating the wrong list.
				5770	*/
				5771	if (event->rb == rb)
				5772	ring_buffer_attach(event, NULL);
				5773
				5774	mutex_unlock(&event->mmap_mutex);
				5775	put_event(event);
				5776
				5777	/*
				5778	* Restart the iteration; either we're on the wrong list or
				5779	* destroyed its integrity by doing a deletion.
				5780	*/
				5781	goto again;
				5782	}
				5783	rcu_read_unlock();
				5784
				5785	/*
				5786	* It could be there's still a few 0-ref events on the list; they'll
				5787	* get cleaned up by free_event() -- they'll also still have their
				5788	* ref on the rb and will free it whenever they are done with it.
				5789	*
				5790	* Aside from that, this buffer is 'fully' detached and unmapped,
				5791	* undo the VM accounting.
				5792	*/
				5793
				5794	atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
				5795	&mmap_user->locked_vm);
				5796	atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
				5797	free_uid(mmap_user);
				5798
				5799	out_put:
				5800	ring_buffer_put(rb); /* could be last */
				5801	}
				5802
				5803	static const struct vm_operations_struct perf_mmap_vmops = {
				5804	.open = perf_mmap_open,
				5805	.close = perf_mmap_close, /* non mergeable */
				5806	.fault = perf_mmap_fault,
				5807	.page_mkwrite = perf_mmap_fault,
				5808	};
				5809
				5810	static int perf_mmap(struct file file, struct vm_area_struct vma)
				5811	{
				5812	struct perf_event *event = file->private_data;
				5813	unsigned long user_locked, user_lock_limit;
				5814	struct user_struct *user = current_user();
				5815	unsigned long locked, lock_limit;
				5816	struct ring_buffer *rb = NULL;
				5817	unsigned long vma_size;
				5818	unsigned long nr_pages;
				5819	long user_extra = 0, extra = 0;
				5820	int ret = 0, flags = 0;
				5821
				5822	/*
				5823	* Don't allow mmap() of inherited per-task counters. This would
				5824	* create a performance issue due to all children writing to the
				5825	* same rb.
				5826	*/
				5827	if (event->cpu == -1 && event->attr.inherit)
				5828	return -EINVAL;
				5829
				5830	if (!(vma->vm_flags & VM_SHARED))
				5831	return -EINVAL;
				5832
				5833	ret = security_perf_event_read(event);
				5834	if (ret)
				5835	return ret;
				5836
				5837	vma_size = vma->vm_end - vma->vm_start;
				5838
				5839	if (vma->vm_pgoff == 0) {
				5840	nr_pages = (vma_size / PAGE_SIZE) - 1;
				5841	} else {
				5842	/*
				5843	* AUX area mapping: if rb->aux_nr_pages != 0, it's already
				5844	* mapped, all subsequent mappings should have the same size
				5845	* and offset. Must be above the normal perf buffer.
				5846	*/
				5847	u64 aux_offset, aux_size;
				5848
				5849	if (!event->rb)
				5850	return -EINVAL;
				5851
				5852	nr_pages = vma_size / PAGE_SIZE;
				5853	if (nr_pages > INT_MAX)
				5854	return -ENOMEM;
				5855
				5856	mutex_lock(&event->mmap_mutex);
				5857	ret = -EINVAL;
				5858
				5859	rb = event->rb;
				5860	if (!rb)
				5861	goto aux_unlock;
				5862
				5863	aux_offset = READ_ONCE(rb->user_page->aux_offset);
				5864	aux_size = READ_ONCE(rb->user_page->aux_size);
				5865
				5866	if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
				5867	goto aux_unlock;
				5868
				5869	if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
				5870	goto aux_unlock;
				5871
				5872	/* already mapped with a different offset */
				5873	if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
				5874	goto aux_unlock;
				5875
				5876	if (aux_size != vma_size \|\| aux_size != nr_pages * PAGE_SIZE)
				5877	goto aux_unlock;
				5878
				5879	/* already mapped with a different size */
				5880	if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
				5881	goto aux_unlock;
				5882
				5883	if (!is_power_of_2(nr_pages))
				5884	goto aux_unlock;
				5885
				5886	if (!atomic_inc_not_zero(&rb->mmap_count))
				5887	goto aux_unlock;
				5888
				5889	if (rb_has_aux(rb)) {
				5890	atomic_inc(&rb->aux_mmap_count);
				5891	ret = 0;
				5892	goto unlock;
				5893	}
				5894
				5895	atomic_set(&rb->aux_mmap_count, 1);
				5896	user_extra = nr_pages;
				5897
				5898	goto accounting;
				5899	}
				5900
				5901	/*
				5902	* If we have rb pages ensure they're a power-of-two number, so we
				5903	* can do bitmasks instead of modulo.
				5904	*/
				5905	if (nr_pages != 0 && !is_power_of_2(nr_pages))
				5906	return -EINVAL;
				5907
				5908	if (vma_size != PAGE_SIZE * (1 + nr_pages))
				5909	return -EINVAL;
				5910
				5911	WARN_ON_ONCE(event->ctx->parent_ctx);
				5912	again:
				5913	mutex_lock(&event->mmap_mutex);
				5914	if (event->rb) {
				5915	if (event->rb->nr_pages != nr_pages) {
				5916	ret = -EINVAL;
				5917	goto unlock;
				5918	}
				5919
				5920	if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
				5921	/*
				5922	* Raced against perf_mmap_close(); remove the
				5923	* event and try again.
				5924	*/
				5925	ring_buffer_attach(event, NULL);
				5926	mutex_unlock(&event->mmap_mutex);
				5927	goto again;
				5928	}
				5929
				5930	goto unlock;
				5931	}
				5932
				5933	user_extra = nr_pages + 1;
				5934
				5935	accounting:
				5936	user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
				5937
				5938	/*
				5939	* Increase the limit linearly with more CPUs:
				5940	*/
				5941	user_lock_limit *= num_online_cpus();
				5942
				5943	user_locked = atomic_long_read(&user->locked_vm);
				5944
				5945	/*
				5946	* sysctl_perf_event_mlock may have changed, so that
				5947	* user->locked_vm > user_lock_limit
				5948	*/
				5949	if (user_locked > user_lock_limit)
				5950	user_locked = user_lock_limit;
				5951	user_locked += user_extra;
				5952
				5953	if (user_locked <= user_lock_limit) {
				5954	/* charge all to locked_vm */
				5955	} else if (atomic_long_read(&user->locked_vm) >= user_lock_limit) {
				5956	/* charge all to pinned_vm */
				5957	extra = user_extra;
				5958	user_extra = 0;
				5959	} else {
				5960	/*
				5961	* charge locked_vm until it hits user_lock_limit;
				5962	* charge the rest from pinned_vm
				5963	*/
				5964	extra = user_locked - user_lock_limit;
				5965	user_extra -= extra;
				5966	}
				5967
				5968	lock_limit = rlimit(RLIMIT_MEMLOCK);
				5969	lock_limit >>= PAGE_SHIFT;
				5970	locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
				5971
				5972	if ((locked > lock_limit) && perf_is_paranoid() &&
				5973	!capable(CAP_IPC_LOCK)) {
				5974	ret = -EPERM;
				5975	goto unlock;
				5976	}
				5977
				5978	WARN_ON(!rb && event->rb);
				5979
				5980	if (vma->vm_flags & VM_WRITE)
				5981	flags \|= RING_BUFFER_WRITABLE;
				5982
				5983	if (!rb) {
				5984	rb = rb_alloc(nr_pages,
				5985	event->attr.watermark ? event->attr.wakeup_watermark : 0,
				5986	event->cpu, flags);
				5987
				5988	if (!rb) {
				5989	ret = -ENOMEM;
				5990	goto unlock;
				5991	}
				5992
				5993	atomic_set(&rb->mmap_count, 1);
				5994	rb->mmap_user = get_current_user();
				5995	rb->mmap_locked = extra;
				5996
				5997	ring_buffer_attach(event, rb);
				5998
				5999	perf_event_init_userpage(event);
				6000	perf_event_update_userpage(event);
				6001	} else {
				6002	ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
				6003	event->attr.aux_watermark, flags);
				6004	if (!ret)
				6005	rb->aux_mmap_locked = extra;
				6006	}
				6007
				6008	unlock:
				6009	if (!ret) {
				6010	atomic_long_add(user_extra, &user->locked_vm);
				6011	atomic64_add(extra, &vma->vm_mm->pinned_vm);
				6012
				6013	atomic_inc(&event->mmap_count);
				6014	} else if (rb) {
				6015	atomic_dec(&rb->mmap_count);
				6016	}
				6017	aux_unlock:
				6018	mutex_unlock(&event->mmap_mutex);
				6019
				6020	/*
				6021	* Since pinned accounting is per vm we cannot allow fork() to copy our
				6022	* vma.
				6023	*/
				6024	vma->vm_flags \|= VM_DONTCOPY \| VM_DONTEXPAND \| VM_DONTDUMP;
				6025	vma->vm_ops = &perf_mmap_vmops;
				6026
				6027	if (event->pmu->event_mapped)
				6028	event->pmu->event_mapped(event, vma->vm_mm);
				6029
				6030	return ret;
				6031	}
				6032
				6033	static int perf_fasync(int fd, struct file *filp, int on)
				6034	{
				6035	struct inode *inode = file_inode(filp);
				6036	struct perf_event *event = filp->private_data;
				6037	int retval;
				6038
				6039	inode_lock(inode);
				6040	retval = fasync_helper(fd, filp, on, &event->fasync);
				6041	inode_unlock(inode);
				6042
				6043	if (retval < 0)
				6044	return retval;
				6045
				6046	return 0;
				6047	}
				6048
				6049	static const struct file_operations perf_fops = {
				6050	.llseek = no_llseek,
				6051	.release = perf_release,
				6052	.read = perf_read,
				6053	.poll = perf_poll,
				6054	.unlocked_ioctl = perf_ioctl,
				6055	.compat_ioctl = perf_compat_ioctl,
				6056	.mmap = perf_mmap,
				6057	.fasync = perf_fasync,
				6058	};
				6059
				6060	/*
				6061	* Perf event wakeup
				6062	*
				6063	* If there's data, ensure we set the poll() state and publish everything
				6064	* to user-space before waking everybody up.
				6065	*/
				6066
				6067	static inline struct fasync_struct *perf_event_fasync(struct perf_event event)
				6068	{
				6069	/* only the parent has fasync state */
				6070	if (event->parent)
				6071	event = event->parent;
				6072	return &event->fasync;
				6073	}
				6074
				6075	void perf_event_wakeup(struct perf_event *event)
				6076	{
				6077	ring_buffer_wakeup(event);
				6078
				6079	if (event->pending_kill) {
				6080	kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
				6081	event->pending_kill = 0;
				6082	}
				6083	}
				6084
				6085	static void perf_pending_event_disable(struct perf_event *event)
				6086	{
				6087	int cpu = READ_ONCE(event->pending_disable);
				6088
				6089	if (cpu < 0)
				6090	return;
				6091
				6092	if (cpu == smp_processor_id()) {
				6093	WRITE_ONCE(event->pending_disable, -1);
				6094	perf_event_disable_local(event);
				6095	return;
				6096	}
				6097
				6098	/*
				6099	* CPU-A CPU-B
				6100	*
				6101	* perf_event_disable_inatomic()
				6102	* @pending_disable = CPU-A;
				6103	* irq_work_queue();
				6104	*
				6105	* sched-out
				6106	* @pending_disable = -1;
				6107	*
				6108	* sched-in
				6109	* perf_event_disable_inatomic()
				6110	* @pending_disable = CPU-B;
				6111	* irq_work_queue(); // FAILS
				6112	*
				6113	* irq_work_run()
				6114	* perf_pending_event()
				6115	*
				6116	* But the event runs on CPU-B and wants disabling there.
				6117	*/
				6118	irq_work_queue_on(&event->pending, cpu);
				6119	}
				6120
				6121	static void perf_pending_event(struct irq_work *entry)
				6122	{
				6123	struct perf_event *event = container_of(entry, struct perf_event, pending);
				6124	int rctx;
				6125
				6126	rctx = perf_swevent_get_recursion_context();
				6127	/*
				6128	* If we 'fail' here, that's OK, it means recursion is already disabled
				6129	* and we won't recurse 'further'.
				6130	*/
				6131
				6132	perf_pending_event_disable(event);
				6133
				6134	if (event->pending_wakeup) {
				6135	event->pending_wakeup = 0;
				6136	perf_event_wakeup(event);
				6137	}
				6138
				6139	if (rctx >= 0)
				6140	perf_swevent_put_recursion_context(rctx);
				6141	}
				6142
				6143	/*
				6144	* We assume there is only KVM supporting the callbacks.
				6145	* Later on, we might change it to a list if there is
				6146	* another virtualization implementation supporting the callbacks.
				6147	*/
				6148	struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
				6149
				6150	int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
				6151	{
				6152	if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs)))
				6153	return -EBUSY;
				6154
				6155	rcu_assign_pointer(perf_guest_cbs, cbs);
				6156	return 0;
				6157	}
				6158	EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
				6159
				6160	int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
				6161	{
				6162	if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs))
				6163	return -EINVAL;
				6164
				6165	rcu_assign_pointer(perf_guest_cbs, NULL);
				6166	synchronize_rcu();
				6167	return 0;
				6168	}
				6169	EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
				6170
				6171	static void
				6172	perf_output_sample_regs(struct perf_output_handle *handle,
				6173	struct pt_regs *regs, u64 mask)
				6174	{
				6175	int bit;
				6176	DECLARE_BITMAP(_mask, 64);
				6177
				6178	bitmap_from_u64(_mask, mask);
				6179	for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
				6180	u64 val;
				6181
				6182	val = perf_reg_value(regs, bit);
				6183	perf_output_put(handle, val);
				6184	}
				6185	}
				6186
				6187	static void perf_sample_regs_user(struct perf_regs *regs_user,
				6188	struct pt_regs *regs,
				6189	struct pt_regs *regs_user_copy)
				6190	{
				6191	if (user_mode(regs)) {
				6192	regs_user->abi = perf_reg_abi(current);
				6193	regs_user->regs = regs;
				6194	} else if (!(current->flags & PF_KTHREAD)) {
				6195	perf_get_regs_user(regs_user, regs, regs_user_copy);
				6196	} else {
				6197	regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
				6198	regs_user->regs = NULL;
				6199	}
				6200	}
				6201
				6202	static void perf_sample_regs_intr(struct perf_regs *regs_intr,
				6203	struct pt_regs *regs)
				6204	{
				6205	regs_intr->regs = regs;
				6206	regs_intr->abi = perf_reg_abi(current);
				6207	}
				6208
				6209
				6210	/*
				6211	* Get remaining task size from user stack pointer.
				6212	*
				6213	* It'd be better to take stack vma map and limit this more
				6214	* precisely, but there's no way to get it safely under interrupt,
				6215	* so using TASK_SIZE as limit.
				6216	*/
				6217	static u64 perf_ustack_task_size(struct pt_regs *regs)
				6218	{
				6219	unsigned long addr = perf_user_stack_pointer(regs);
				6220
				6221	if (!addr \|\| addr >= TASK_SIZE)
				6222	return 0;
				6223
				6224	return TASK_SIZE - addr;
				6225	}
				6226
				6227	static u16
				6228	perf_sample_ustack_size(u16 stack_size, u16 header_size,
				6229	struct pt_regs *regs)
				6230	{
				6231	u64 task_size;
				6232
				6233	/* No regs, no stack pointer, no dump. */
				6234	if (!regs)
				6235	return 0;
				6236
				6237	/*
				6238	* Check if we fit in with the requested stack size into the:
				6239	* - TASK_SIZE
				6240	* If we don't, we limit the size to the TASK_SIZE.
				6241	*
				6242	* - remaining sample size
				6243	* If we don't, we customize the stack size to
				6244	* fit in to the remaining sample size.
				6245	*/
				6246
				6247	task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
				6248	stack_size = min(stack_size, (u16) task_size);
				6249
				6250	/* Current header size plus static size and dynamic size. */
				6251	header_size += 2 * sizeof(u64);
				6252
				6253	/* Do we fit in with the current stack dump size? */
				6254	if ((u16) (header_size + stack_size) < header_size) {
				6255	/*
				6256	* If we overflow the maximum size for the sample,
				6257	* we customize the stack dump size to fit in.
				6258	*/
				6259	stack_size = USHRT_MAX - header_size - sizeof(u64);
				6260	stack_size = round_up(stack_size, sizeof(u64));
				6261	}
				6262
				6263	return stack_size;
				6264	}
				6265
				6266	static void
				6267	perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
				6268	struct pt_regs *regs)
				6269	{
				6270	/* Case of a kernel thread, nothing to dump */
				6271	if (!regs) {
				6272	u64 size = 0;
				6273	perf_output_put(handle, size);
				6274	} else {
				6275	unsigned long sp;
				6276	unsigned int rem;
				6277	u64 dyn_size;
				6278	mm_segment_t fs;
				6279
				6280	/*
				6281	* We dump:
				6282	* static size
				6283	* - the size requested by user or the best one we can fit
				6284	* in to the sample max size
				6285	* data
				6286	* - user stack dump data
				6287	* dynamic size
				6288	* - the actual dumped size
				6289	*/
				6290
				6291	/* Static size. */
				6292	perf_output_put(handle, dump_size);
				6293
				6294	/* Data. */
				6295	sp = perf_user_stack_pointer(regs);
				6296	fs = get_fs();
				6297	set_fs(USER_DS);
				6298	rem = __output_copy_user(handle, (void *) sp, dump_size);
				6299	set_fs(fs);
				6300	dyn_size = dump_size - rem;
				6301
				6302	perf_output_skip(handle, rem);
				6303
				6304	/* Dynamic size. */
				6305	perf_output_put(handle, dyn_size);
				6306	}
				6307	}
				6308
				6309	static void __perf_event_header__init_id(struct perf_event_header *header,
				6310	struct perf_sample_data *data,
				6311	struct perf_event *event)
				6312	{
				6313	u64 sample_type = event->attr.sample_type;
				6314
				6315	data->type = sample_type;
				6316	header->size += event->id_header_size;
				6317
				6318	if (sample_type & PERF_SAMPLE_TID) {
				6319	/* namespace issues */
				6320	data->tid_entry.pid = perf_event_pid(event, current);
				6321	data->tid_entry.tid = perf_event_tid(event, current);
				6322	}
				6323
				6324	if (sample_type & PERF_SAMPLE_TIME)
				6325	data->time = perf_event_clock(event);
				6326
				6327	if (sample_type & (PERF_SAMPLE_ID \| PERF_SAMPLE_IDENTIFIER))
				6328	data->id = primary_event_id(event);
				6329
				6330	if (sample_type & PERF_SAMPLE_STREAM_ID)
				6331	data->stream_id = event->id;
				6332
				6333	if (sample_type & PERF_SAMPLE_CPU) {
				6334	data->cpu_entry.cpu = raw_smp_processor_id();
				6335	data->cpu_entry.reserved = 0;
				6336	}
				6337	}
				6338
				6339	void perf_event_header__init_id(struct perf_event_header *header,
				6340	struct perf_sample_data *data,
				6341	struct perf_event *event)
				6342	{
				6343	if (event->attr.sample_id_all)
				6344	__perf_event_header__init_id(header, data, event);
				6345	}
				6346
				6347	static void __perf_event__output_id_sample(struct perf_output_handle *handle,
				6348	struct perf_sample_data *data)
				6349	{
				6350	u64 sample_type = data->type;
				6351
				6352	if (sample_type & PERF_SAMPLE_TID)
				6353	perf_output_put(handle, data->tid_entry);
				6354
				6355	if (sample_type & PERF_SAMPLE_TIME)
				6356	perf_output_put(handle, data->time);
				6357
				6358	if (sample_type & PERF_SAMPLE_ID)
				6359	perf_output_put(handle, data->id);
				6360
				6361	if (sample_type & PERF_SAMPLE_STREAM_ID)
				6362	perf_output_put(handle, data->stream_id);
				6363
				6364	if (sample_type & PERF_SAMPLE_CPU)
				6365	perf_output_put(handle, data->cpu_entry);
				6366
				6367	if (sample_type & PERF_SAMPLE_IDENTIFIER)
				6368	perf_output_put(handle, data->id);
				6369	}
				6370
				6371	void perf_event__output_id_sample(struct perf_event *event,
				6372	struct perf_output_handle *handle,
				6373	struct perf_sample_data *sample)
				6374	{
				6375	if (event->attr.sample_id_all)
				6376	__perf_event__output_id_sample(handle, sample);
				6377	}
				6378
				6379	static void perf_output_read_one(struct perf_output_handle *handle,
				6380	struct perf_event *event,
				6381	u64 enabled, u64 running)
				6382	{
				6383	u64 read_format = event->attr.read_format;
				6384	u64 values[5];
				6385	int n = 0;
				6386
				6387	values[n++] = perf_event_count(event);
				6388	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
				6389	values[n++] = enabled +
				6390	atomic64_read(&event->child_total_time_enabled);
				6391	}
				6392	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
				6393	values[n++] = running +
				6394	atomic64_read(&event->child_total_time_running);
				6395	}
				6396	if (read_format & PERF_FORMAT_ID)
				6397	values[n++] = primary_event_id(event);
				6398	if (read_format & PERF_FORMAT_LOST)
				6399	values[n++] = atomic64_read(&event->lost_samples);
				6400
				6401	__output_copy(handle, values, n * sizeof(u64));
				6402	}
				6403
				6404	static void perf_output_read_group(struct perf_output_handle *handle,
				6405	struct perf_event *event,
				6406	u64 enabled, u64 running)
				6407	{
				6408	struct perf_event leader = event->group_leader, sub;
				6409	u64 read_format = event->attr.read_format;
				6410	unsigned long flags;
				6411	u64 values[6];
				6412	int n = 0;
				6413
				6414	/*
				6415	* Disabling interrupts avoids all counter scheduling
				6416	* (context switches, timer based rotation and IPIs).
				6417	*/
				6418	local_irq_save(flags);
				6419
				6420	values[n++] = 1 + leader->nr_siblings;
				6421
				6422	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
				6423	values[n++] = enabled;
				6424
				6425	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
				6426	values[n++] = running;
				6427
				6428	if ((leader != event) &&
				6429	(leader->state == PERF_EVENT_STATE_ACTIVE))
				6430	leader->pmu->read(leader);
				6431
				6432	values[n++] = perf_event_count(leader);
				6433	if (read_format & PERF_FORMAT_ID)
				6434	values[n++] = primary_event_id(leader);
				6435	if (read_format & PERF_FORMAT_LOST)
				6436	values[n++] = atomic64_read(&leader->lost_samples);
				6437
				6438	__output_copy(handle, values, n * sizeof(u64));
				6439
				6440	for_each_sibling_event(sub, leader) {
				6441	n = 0;
				6442
				6443	if ((sub != event) &&
				6444	(sub->state == PERF_EVENT_STATE_ACTIVE))
				6445	sub->pmu->read(sub);
				6446
				6447	values[n++] = perf_event_count(sub);
				6448	if (read_format & PERF_FORMAT_ID)
				6449	values[n++] = primary_event_id(sub);
				6450	if (read_format & PERF_FORMAT_LOST)
				6451	values[n++] = atomic64_read(&sub->lost_samples);
				6452
				6453	__output_copy(handle, values, n * sizeof(u64));
				6454	}
				6455
				6456	local_irq_restore(flags);
				6457	}
				6458
				6459	#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED\|\
				6460	PERF_FORMAT_TOTAL_TIME_RUNNING)
				6461
				6462	/*
				6463	* XXX PERF_SAMPLE_READ vs inherited events seems difficult.
				6464	*
				6465	* The problem is that its both hard and excessively expensive to iterate the
				6466	* child list, not to mention that its impossible to IPI the children running
				6467	* on another CPU, from interrupt/NMI context.
				6468	*/
				6469	static void perf_output_read(struct perf_output_handle *handle,
				6470	struct perf_event *event)
				6471	{
				6472	u64 enabled = 0, running = 0, now;
				6473	u64 read_format = event->attr.read_format;
				6474
				6475	/*
				6476	* compute total_time_enabled, total_time_running
				6477	* based on snapshot values taken when the event
				6478	* was last scheduled in.
				6479	*
				6480	* we cannot simply called update_context_time()
				6481	* because of locking issue as we are called in
				6482	* NMI context
				6483	*/
				6484	if (read_format & PERF_FORMAT_TOTAL_TIMES)
				6485	calc_timer_values(event, &now, &enabled, &running);
				6486
				6487	if (event->attr.read_format & PERF_FORMAT_GROUP)
				6488	perf_output_read_group(handle, event, enabled, running);
				6489	else
				6490	perf_output_read_one(handle, event, enabled, running);
				6491	}
				6492
				6493	void perf_output_sample(struct perf_output_handle *handle,
				6494	struct perf_event_header *header,
				6495	struct perf_sample_data *data,
				6496	struct perf_event *event)
				6497	{
				6498	u64 sample_type = data->type;
				6499
				6500	perf_output_put(handle, *header);
				6501
				6502	if (sample_type & PERF_SAMPLE_IDENTIFIER)
				6503	perf_output_put(handle, data->id);
				6504
				6505	if (sample_type & PERF_SAMPLE_IP)
				6506	perf_output_put(handle, data->ip);
				6507
				6508	if (sample_type & PERF_SAMPLE_TID)
				6509	perf_output_put(handle, data->tid_entry);
				6510
				6511	if (sample_type & PERF_SAMPLE_TIME)
				6512	perf_output_put(handle, data->time);
				6513
				6514	if (sample_type & PERF_SAMPLE_ADDR)
				6515	perf_output_put(handle, data->addr);
				6516
				6517	if (sample_type & PERF_SAMPLE_ID)
				6518	perf_output_put(handle, data->id);
				6519
				6520	if (sample_type & PERF_SAMPLE_STREAM_ID)
				6521	perf_output_put(handle, data->stream_id);
				6522
				6523	if (sample_type & PERF_SAMPLE_CPU)
				6524	perf_output_put(handle, data->cpu_entry);
				6525
				6526	if (sample_type & PERF_SAMPLE_PERIOD)
				6527	perf_output_put(handle, data->period);
				6528
				6529	if (sample_type & PERF_SAMPLE_READ)
				6530	perf_output_read(handle, event);
				6531
				6532	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
				6533	int size = 1;
				6534
				6535	size += data->callchain->nr;
				6536	size *= sizeof(u64);
				6537	__output_copy(handle, data->callchain, size);
				6538	}
				6539
				6540	if (sample_type & PERF_SAMPLE_RAW) {
				6541	struct perf_raw_record *raw = data->raw;
				6542
				6543	if (raw) {
				6544	struct perf_raw_frag *frag = &raw->frag;
				6545
				6546	perf_output_put(handle, raw->size);
				6547	do {
				6548	if (frag->copy) {
				6549	__output_custom(handle, frag->copy,
				6550	frag->data, frag->size);
				6551	} else {
				6552	__output_copy(handle, frag->data,
				6553	frag->size);
				6554	}
				6555	if (perf_raw_frag_last(frag))
				6556	break;
				6557	frag = frag->next;
				6558	} while (1);
				6559	if (frag->pad)
				6560	__output_skip(handle, NULL, frag->pad);
				6561	} else {
				6562	struct {
				6563	u32 size;
				6564	u32 data;
				6565	} raw = {
				6566	.size = sizeof(u32),
				6567	.data = 0,
				6568	};
				6569	perf_output_put(handle, raw);
				6570	}
				6571	}
				6572
				6573	if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
				6574	if (data->br_stack) {
				6575	size_t size;
				6576
				6577	size = data->br_stack->nr
				6578	* sizeof(struct perf_branch_entry);
				6579
				6580	perf_output_put(handle, data->br_stack->nr);
				6581	perf_output_copy(handle, data->br_stack->entries, size);
				6582	} else {
				6583	/*
				6584	* we always store at least the value of nr
				6585	*/
				6586	u64 nr = 0;
				6587	perf_output_put(handle, nr);
				6588	}
				6589	}
				6590
				6591	if (sample_type & PERF_SAMPLE_REGS_USER) {
				6592	u64 abi = data->regs_user.abi;
				6593
				6594	/*
				6595	* If there are no regs to dump, notice it through
				6596	* first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
				6597	*/
				6598	perf_output_put(handle, abi);
				6599
				6600	if (abi) {
				6601	u64 mask = event->attr.sample_regs_user;
				6602	perf_output_sample_regs(handle,
				6603	data->regs_user.regs,
				6604	mask);
				6605	}
				6606	}
				6607
				6608	if (sample_type & PERF_SAMPLE_STACK_USER) {
				6609	perf_output_sample_ustack(handle,
				6610	data->stack_user_size,
				6611	data->regs_user.regs);
				6612	}
				6613
				6614	if (sample_type & PERF_SAMPLE_WEIGHT)
				6615	perf_output_put(handle, data->weight);
				6616
				6617	if (sample_type & PERF_SAMPLE_DATA_SRC)
				6618	perf_output_put(handle, data->data_src.val);
				6619
				6620	if (sample_type & PERF_SAMPLE_TRANSACTION)
				6621	perf_output_put(handle, data->txn);
				6622
				6623	if (sample_type & PERF_SAMPLE_REGS_INTR) {
				6624	u64 abi = data->regs_intr.abi;
				6625	/*
				6626	* If there are no regs to dump, notice it through
				6627	* first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
				6628	*/
				6629	perf_output_put(handle, abi);
				6630
				6631	if (abi) {
				6632	u64 mask = event->attr.sample_regs_intr;
				6633
				6634	perf_output_sample_regs(handle,
				6635	data->regs_intr.regs,
				6636	mask);
				6637	}
				6638	}
				6639
				6640	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
				6641	perf_output_put(handle, data->phys_addr);
				6642
				6643	if (!event->attr.watermark) {
				6644	int wakeup_events = event->attr.wakeup_events;
				6645
				6646	if (wakeup_events) {
				6647	struct ring_buffer *rb = handle->rb;
				6648	int events = local_inc_return(&rb->events);
				6649
				6650	if (events >= wakeup_events) {
				6651	local_sub(wakeup_events, &rb->events);
				6652	local_inc(&rb->wakeup);
				6653	}
				6654	}
				6655	}
				6656	}
				6657
				6658	static u64 perf_virt_to_phys(u64 virt)
				6659	{
				6660	u64 phys_addr = 0;
				6661
				6662	if (!virt)
				6663	return 0;
				6664
				6665	if (virt >= TASK_SIZE) {
				6666	/* If it's vmalloc()d memory, leave phys_addr as 0 */
				6667	if (virt_addr_valid((void *)(uintptr_t)virt) &&
				6668	!(virt >= VMALLOC_START && virt < VMALLOC_END))
				6669	phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
				6670	} else {
				6671	/*
				6672	* Walking the pages tables for user address.
				6673	* Interrupts are disabled, so it prevents any tear down
				6674	* of the page tables.
				6675	* Try IRQ-safe __get_user_pages_fast first.
				6676	* If failed, leave phys_addr as 0.
				6677	*/
				6678	if (current->mm != NULL) {
				6679	struct page *p;
				6680
				6681	pagefault_disable();
				6682	if (__get_user_pages_fast(virt, 1, 0, &p) == 1) {
				6683	phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
				6684	put_page(p);
				6685	}
				6686	pagefault_enable();
				6687	}
				6688	}
				6689
				6690	return phys_addr;
				6691	}
				6692
				6693	static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
				6694
				6695	struct perf_callchain_entry *
				6696	perf_callchain(struct perf_event event, struct pt_regs regs)
				6697	{
				6698	bool kernel = !event->attr.exclude_callchain_kernel;
				6699	bool user = !event->attr.exclude_callchain_user;
				6700	/* Disallow cross-task user callchains. */
				6701	bool crosstask = event->ctx->task && event->ctx->task != current;
				6702	const u32 max_stack = event->attr.sample_max_stack;
				6703	struct perf_callchain_entry *callchain;
				6704
				6705	if (!kernel && !user)
				6706	return &__empty_callchain;
				6707
				6708	callchain = get_perf_callchain(regs, 0, kernel, user,
				6709	max_stack, crosstask, true);
				6710	return callchain ?: &__empty_callchain;
				6711	}
				6712
				6713	void perf_prepare_sample(struct perf_event_header *header,
				6714	struct perf_sample_data *data,
				6715	struct perf_event *event,
				6716	struct pt_regs *regs)
				6717	{
				6718	u64 sample_type = event->attr.sample_type;
				6719
				6720	header->type = PERF_RECORD_SAMPLE;
				6721	header->size = sizeof(*header) + event->header_size;
				6722
				6723	header->misc = 0;
				6724	header->misc \|= perf_misc_flags(regs);
				6725
				6726	__perf_event_header__init_id(header, data, event);
				6727
				6728	if (sample_type & PERF_SAMPLE_IP)
				6729	data->ip = perf_instruction_pointer(regs);
				6730
				6731	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
				6732	int size = 1;
				6733
				6734	if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
				6735	data->callchain = perf_callchain(event, regs);
				6736
				6737	size += data->callchain->nr;
				6738
				6739	header->size += size * sizeof(u64);
				6740	}
				6741
				6742	if (sample_type & PERF_SAMPLE_RAW) {
				6743	struct perf_raw_record *raw = data->raw;
				6744	int size;
				6745
				6746	if (raw) {
				6747	struct perf_raw_frag *frag = &raw->frag;
				6748	u32 sum = 0;
				6749
				6750	do {
				6751	sum += frag->size;
				6752	if (perf_raw_frag_last(frag))
				6753	break;
				6754	frag = frag->next;
				6755	} while (1);
				6756
				6757	size = round_up(sum + sizeof(u32), sizeof(u64));
				6758	raw->size = size - sizeof(u32);
				6759	frag->pad = raw->size - sum;
				6760	} else {
				6761	size = sizeof(u64);
				6762	}
				6763
				6764	header->size += size;
				6765	}
				6766
				6767	if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
				6768	int size = sizeof(u64); /* nr */
				6769	if (data->br_stack) {
				6770	size += data->br_stack->nr
				6771	* sizeof(struct perf_branch_entry);
				6772	}
				6773	header->size += size;
				6774	}
				6775
				6776	if (sample_type & (PERF_SAMPLE_REGS_USER \| PERF_SAMPLE_STACK_USER))
				6777	perf_sample_regs_user(&data->regs_user, regs,
				6778	&data->regs_user_copy);
				6779
				6780	if (sample_type & PERF_SAMPLE_REGS_USER) {
				6781	/* regs dump ABI info */
				6782	int size = sizeof(u64);
				6783
				6784	if (data->regs_user.regs) {
				6785	u64 mask = event->attr.sample_regs_user;
				6786	size += hweight64(mask) * sizeof(u64);
				6787	}
				6788
				6789	header->size += size;
				6790	}
				6791
				6792	if (sample_type & PERF_SAMPLE_STACK_USER) {
				6793	/*
				6794	* Either we need PERF_SAMPLE_STACK_USER bit to be always
				6795	* processed as the last one or have additional check added
				6796	* in case new sample type is added, because we could eat
				6797	* up the rest of the sample size.
				6798	*/
				6799	u16 stack_size = event->attr.sample_stack_user;
				6800	u16 size = sizeof(u64);
				6801
				6802	stack_size = perf_sample_ustack_size(stack_size, header->size,
				6803	data->regs_user.regs);
				6804
				6805	/*
				6806	* If there is something to dump, add space for the dump
				6807	* itself and for the field that tells the dynamic size,
				6808	* which is how many have been actually dumped.
				6809	*/
				6810	if (stack_size)
				6811	size += sizeof(u64) + stack_size;
				6812
				6813	data->stack_user_size = stack_size;
				6814	header->size += size;
				6815	}
				6816
				6817	if (sample_type & PERF_SAMPLE_REGS_INTR) {
				6818	/* regs dump ABI info */
				6819	int size = sizeof(u64);
				6820
				6821	perf_sample_regs_intr(&data->regs_intr, regs);
				6822
				6823	if (data->regs_intr.regs) {
				6824	u64 mask = event->attr.sample_regs_intr;
				6825
				6826	size += hweight64(mask) * sizeof(u64);
				6827	}
				6828
				6829	header->size += size;
				6830	}
				6831
				6832	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
				6833	data->phys_addr = perf_virt_to_phys(data->addr);
				6834	}
				6835
				6836	static __always_inline int
				6837	__perf_event_output(struct perf_event *event,
				6838	struct perf_sample_data *data,
				6839	struct pt_regs *regs,
				6840	int (output_begin)(struct perf_output_handle ,
				6841	struct perf_event *,
				6842	unsigned int))
				6843	{
				6844	struct perf_output_handle handle;
				6845	struct perf_event_header header;
				6846	int err;
				6847
				6848	/* protect the callchain buffers */
				6849	rcu_read_lock();
				6850
				6851	perf_prepare_sample(&header, data, event, regs);
				6852
				6853	err = output_begin(&handle, event, header.size);
				6854	if (err)
				6855	goto exit;
				6856
				6857	perf_output_sample(&handle, &header, data, event);
				6858
				6859	perf_output_end(&handle);
				6860
				6861	exit:
				6862	rcu_read_unlock();
				6863	return err;
				6864	}
				6865
				6866	void
				6867	perf_event_output_forward(struct perf_event *event,
				6868	struct perf_sample_data *data,
				6869	struct pt_regs *regs)
				6870	{
				6871	__perf_event_output(event, data, regs, perf_output_begin_forward);
				6872	}
				6873
				6874	void
				6875	perf_event_output_backward(struct perf_event *event,
				6876	struct perf_sample_data *data,
				6877	struct pt_regs *regs)
				6878	{
				6879	__perf_event_output(event, data, regs, perf_output_begin_backward);
				6880	}
				6881
				6882	int
				6883	perf_event_output(struct perf_event *event,
				6884	struct perf_sample_data *data,
				6885	struct pt_regs *regs)
				6886	{
				6887	return __perf_event_output(event, data, regs, perf_output_begin);
				6888	}
				6889
				6890	/*
				6891	* read event_id
				6892	*/
				6893
				6894	struct perf_read_event {
				6895	struct perf_event_header header;
				6896
				6897	u32 pid;
				6898	u32 tid;
				6899	};
				6900
				6901	static void
				6902	perf_event_read_event(struct perf_event *event,
				6903	struct task_struct *task)
				6904	{
				6905	struct perf_output_handle handle;
				6906	struct perf_sample_data sample;
				6907	struct perf_read_event read_event = {
				6908	.header = {
				6909	.type = PERF_RECORD_READ,
				6910	.misc = 0,
				6911	.size = sizeof(read_event) + event->read_size,
				6912	},
				6913	.pid = perf_event_pid(event, task),
				6914	.tid = perf_event_tid(event, task),
				6915	};
				6916	int ret;
				6917
				6918	perf_event_header__init_id(&read_event.header, &sample, event);
				6919	ret = perf_output_begin(&handle, event, read_event.header.size);
				6920	if (ret)
				6921	return;
				6922
				6923	perf_output_put(&handle, read_event);
				6924	perf_output_read(&handle, event);
				6925	perf_event__output_id_sample(event, &handle, &sample);
				6926
				6927	perf_output_end(&handle);
				6928	}
				6929
				6930	typedef void (perf_iterate_f)(struct perf_event event, void data);
				6931
				6932	static void
				6933	perf_iterate_ctx(struct perf_event_context *ctx,
				6934	perf_iterate_f output,
				6935	void *data, bool all)
				6936	{
				6937	struct perf_event *event;
				6938
				6939	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
				6940	if (!all) {
				6941	if (event->state < PERF_EVENT_STATE_INACTIVE)
				6942	continue;
				6943	if (!event_filter_match(event))
				6944	continue;
				6945	}
				6946
				6947	output(event, data);
				6948	}
				6949	}
				6950
				6951	static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
				6952	{
				6953	struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
				6954	struct perf_event *event;
				6955
				6956	list_for_each_entry_rcu(event, &pel->list, sb_list) {
				6957	/*
				6958	* Skip events that are not fully formed yet; ensure that
				6959	* if we observe event->ctx, both event and ctx will be
				6960	* complete enough. See perf_install_in_context().
				6961	*/
				6962	if (!smp_load_acquire(&event->ctx))
				6963	continue;
				6964
				6965	if (event->state < PERF_EVENT_STATE_INACTIVE)
				6966	continue;
				6967	if (!event_filter_match(event))
				6968	continue;
				6969	output(event, data);
				6970	}
				6971	}
				6972
				6973	/*
				6974	* Iterate all events that need to receive side-band events.
				6975	*
				6976	* For new callers; ensure that account_pmu_sb_event() includes
				6977	* your event, otherwise it might not get delivered.
				6978	*/
				6979	static void
				6980	perf_iterate_sb(perf_iterate_f output, void *data,
				6981	struct perf_event_context *task_ctx)
				6982	{
				6983	struct perf_event_context *ctx;
				6984	int ctxn;
				6985
				6986	rcu_read_lock();
				6987	preempt_disable();
				6988
				6989	/*
				6990	* If we have task_ctx != NULL we only notify the task context itself.
				6991	* The task_ctx is set only for EXIT events before releasing task
				6992	* context.
				6993	*/
				6994	if (task_ctx) {
				6995	perf_iterate_ctx(task_ctx, output, data, false);
				6996	goto done;
				6997	}
				6998
				6999	perf_iterate_sb_cpu(output, data);
				7000
				7001	for_each_task_context_nr(ctxn) {
				7002	ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
				7003	if (ctx)
				7004	perf_iterate_ctx(ctx, output, data, false);
				7005	}
				7006	done:
				7007	preempt_enable();
				7008	rcu_read_unlock();
				7009	}
				7010
				7011	/*
				7012	* Clear all file-based filters at exec, they'll have to be
				7013	* re-instated when/if these objects are mmapped again.
				7014	*/
				7015	static void perf_event_addr_filters_exec(struct perf_event event, void data)
				7016	{
				7017	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
				7018	struct perf_addr_filter *filter;
				7019	unsigned int restart = 0, count = 0;
				7020	unsigned long flags;
				7021
				7022	if (!has_addr_filter(event))
				7023	return;
				7024
				7025	raw_spin_lock_irqsave(&ifh->lock, flags);
				7026	list_for_each_entry(filter, &ifh->list, entry) {
				7027	if (filter->path.dentry) {
				7028	event->addr_filter_ranges[count].start = 0;
				7029	event->addr_filter_ranges[count].size = 0;
				7030	restart++;
				7031	}
				7032
				7033	count++;
				7034	}
				7035
				7036	if (restart)
				7037	event->addr_filters_gen++;
				7038	raw_spin_unlock_irqrestore(&ifh->lock, flags);
				7039
				7040	if (restart)
				7041	perf_event_stop(event, 1);
				7042	}
				7043
				7044	void perf_event_exec(void)
				7045	{
				7046	struct perf_event_context *ctx;
				7047	int ctxn;
				7048
				7049	rcu_read_lock();
				7050	for_each_task_context_nr(ctxn) {
				7051	ctx = current->perf_event_ctxp[ctxn];
				7052	if (!ctx)
				7053	continue;
				7054
				7055	perf_event_enable_on_exec(ctxn);
				7056
				7057	perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
				7058	true);
				7059	}
				7060	rcu_read_unlock();
				7061	}
				7062
				7063	struct remote_output {
				7064	struct ring_buffer *rb;
				7065	int err;
				7066	};
				7067
				7068	static void __perf_event_output_stop(struct perf_event event, void data)
				7069	{
				7070	struct perf_event *parent = event->parent;
				7071	struct remote_output *ro = data;
				7072	struct ring_buffer *rb = ro->rb;
				7073	struct stop_event_data sd = {
				7074	.event = event,
				7075	};
				7076
				7077	if (!has_aux(event))
				7078	return;
				7079
				7080	if (!parent)
				7081	parent = event;
				7082
				7083	/*
				7084	* In case of inheritance, it will be the parent that links to the
				7085	* ring-buffer, but it will be the child that's actually using it.
				7086	*
				7087	* We are using event::rb to determine if the event should be stopped,
				7088	* however this may race with ring_buffer_attach() (through set_output),
				7089	* which will make us skip the event that actually needs to be stopped.
				7090	* So ring_buffer_attach() has to stop an aux event before re-assigning
				7091	* its rb pointer.
				7092	*/
				7093	if (rcu_dereference(parent->rb) == rb)
				7094	ro->err = __perf_event_stop(&sd);
				7095	}
				7096
				7097	static int __perf_pmu_output_stop(void *info)
				7098	{
				7099	struct perf_event *event = info;
				7100	struct pmu *pmu = event->ctx->pmu;
				7101	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
				7102	struct remote_output ro = {
				7103	.rb = event->rb,
				7104	};
				7105
				7106	rcu_read_lock();
				7107	perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
				7108	if (cpuctx->task_ctx)
				7109	perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
				7110	&ro, false);
				7111	rcu_read_unlock();
				7112
				7113	return ro.err;
				7114	}
				7115
				7116	static void perf_pmu_output_stop(struct perf_event *event)
				7117	{
				7118	struct perf_event *iter;
				7119	int err, cpu;
				7120
				7121	restart:
				7122	rcu_read_lock();
				7123	list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
				7124	/*
				7125	* For per-CPU events, we need to make sure that neither they
				7126	* nor their children are running; for cpu==-1 events it's
				7127	* sufficient to stop the event itself if it's active, since
				7128	* it can't have children.
				7129	*/
				7130	cpu = iter->cpu;
				7131	if (cpu == -1)
				7132	cpu = READ_ONCE(iter->oncpu);
				7133
				7134	if (cpu == -1)
				7135	continue;
				7136
				7137	err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
				7138	if (err == -EAGAIN) {
				7139	rcu_read_unlock();
				7140	goto restart;
				7141	}
				7142	}
				7143	rcu_read_unlock();
				7144	}
				7145
				7146	/*
				7147	* task tracking -- fork/exit
				7148	*
				7149	* enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
				7150	*/
				7151
				7152	struct perf_task_event {
				7153	struct task_struct *task;
				7154	struct perf_event_context *task_ctx;
				7155
				7156	struct {
				7157	struct perf_event_header header;
				7158
				7159	u32 pid;
				7160	u32 ppid;
				7161	u32 tid;
				7162	u32 ptid;
				7163	u64 time;
				7164	} event_id;
				7165	};
				7166
				7167	static int perf_event_task_match(struct perf_event *event)
				7168	{
				7169	return event->attr.comm \|\| event->attr.mmap \|\|
				7170	event->attr.mmap2 \|\| event->attr.mmap_data \|\|
				7171	event->attr.task;
				7172	}
				7173
				7174	static void perf_event_task_output(struct perf_event *event,
				7175	void *data)
				7176	{
				7177	struct perf_task_event *task_event = data;
				7178	struct perf_output_handle handle;
				7179	struct perf_sample_data sample;
				7180	struct task_struct *task = task_event->task;
				7181	int ret, size = task_event->event_id.header.size;
				7182
				7183	if (!perf_event_task_match(event))
				7184	return;
				7185
				7186	perf_event_header__init_id(&task_event->event_id.header, &sample, event);
				7187
				7188	ret = perf_output_begin(&handle, event,
				7189	task_event->event_id.header.size);
				7190	if (ret)
				7191	goto out;
				7192
				7193	task_event->event_id.pid = perf_event_pid(event, task);
				7194	task_event->event_id.tid = perf_event_tid(event, task);
				7195
				7196	if (task_event->event_id.header.type == PERF_RECORD_EXIT) {
				7197	task_event->event_id.ppid = perf_event_pid(event,
				7198	task->real_parent);
				7199	task_event->event_id.ptid = perf_event_pid(event,
				7200	task->real_parent);
				7201	} else { /* PERF_RECORD_FORK */
				7202	task_event->event_id.ppid = perf_event_pid(event, current);
				7203	task_event->event_id.ptid = perf_event_tid(event, current);
				7204	}
				7205
				7206	task_event->event_id.time = perf_event_clock(event);
				7207
				7208	perf_output_put(&handle, task_event->event_id);
				7209
				7210	perf_event__output_id_sample(event, &handle, &sample);
				7211
				7212	perf_output_end(&handle);
				7213	out:
				7214	task_event->event_id.header.size = size;
				7215	}
				7216
				7217	static void perf_event_task(struct task_struct *task,
				7218	struct perf_event_context *task_ctx,
				7219	int new)
				7220	{
				7221	struct perf_task_event task_event;
				7222
				7223	if (!atomic_read(&nr_comm_events) &&
				7224	!atomic_read(&nr_mmap_events) &&
				7225	!atomic_read(&nr_task_events))
				7226	return;
				7227
				7228	task_event = (struct perf_task_event){
				7229	.task = task,
				7230	.task_ctx = task_ctx,
				7231	.event_id = {
				7232	.header = {
				7233	.type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
				7234	.misc = 0,
				7235	.size = sizeof(task_event.event_id),
				7236	},
				7237	/* .pid */
				7238	/* .ppid */
				7239	/* .tid */
				7240	/* .ptid */
				7241	/* .time */
				7242	},
				7243	};
				7244
				7245	perf_iterate_sb(perf_event_task_output,
				7246	&task_event,
				7247	task_ctx);
				7248	}
				7249
				7250	void perf_event_fork(struct task_struct *task)
				7251	{
				7252	perf_event_task(task, NULL, 1);
				7253	perf_event_namespaces(task);
				7254	}
				7255
				7256	/*
				7257	* comm tracking
				7258	*/
				7259
				7260	struct perf_comm_event {
				7261	struct task_struct *task;
				7262	char *comm;
				7263	int comm_size;
				7264
				7265	struct {
				7266	struct perf_event_header header;
				7267
				7268	u32 pid;
				7269	u32 tid;
				7270	} event_id;
				7271	};
				7272
				7273	static int perf_event_comm_match(struct perf_event *event)
				7274	{
				7275	return event->attr.comm;
				7276	}
				7277
				7278	static void perf_event_comm_output(struct perf_event *event,
				7279	void *data)
				7280	{
				7281	struct perf_comm_event *comm_event = data;
				7282	struct perf_output_handle handle;
				7283	struct perf_sample_data sample;
				7284	int size = comm_event->event_id.header.size;
				7285	int ret;
				7286
				7287	if (!perf_event_comm_match(event))
				7288	return;
				7289
				7290	perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
				7291	ret = perf_output_begin(&handle, event,
				7292	comm_event->event_id.header.size);
				7293
				7294	if (ret)
				7295	goto out;
				7296
				7297	comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
				7298	comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
				7299
				7300	perf_output_put(&handle, comm_event->event_id);
				7301	__output_copy(&handle, comm_event->comm,
				7302	comm_event->comm_size);
				7303
				7304	perf_event__output_id_sample(event, &handle, &sample);
				7305
				7306	perf_output_end(&handle);
				7307	out:
				7308	comm_event->event_id.header.size = size;
				7309	}
				7310
				7311	static void perf_event_comm_event(struct perf_comm_event *comm_event)
				7312	{
				7313	char comm[TASK_COMM_LEN];
				7314	unsigned int size;
				7315
				7316	memset(comm, 0, sizeof(comm));
				7317	strlcpy(comm, comm_event->task->comm, sizeof(comm));
				7318	size = ALIGN(strlen(comm)+1, sizeof(u64));
				7319
				7320	comm_event->comm = comm;
				7321	comm_event->comm_size = size;
				7322
				7323	comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
				7324
				7325	perf_iterate_sb(perf_event_comm_output,
				7326	comm_event,
				7327	NULL);
				7328	}
				7329
				7330	void perf_event_comm(struct task_struct *task, bool exec)
				7331	{
				7332	struct perf_comm_event comm_event;
				7333
				7334	if (!atomic_read(&nr_comm_events))
				7335	return;
				7336
				7337	comm_event = (struct perf_comm_event){
				7338	.task = task,
				7339	/* .comm */
				7340	/* .comm_size */
				7341	.event_id = {
				7342	.header = {
				7343	.type = PERF_RECORD_COMM,
				7344	.misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
				7345	/* .size */
				7346	},
				7347	/* .pid */
				7348	/* .tid */
				7349	},
				7350	};
				7351
				7352	perf_event_comm_event(&comm_event);
				7353	}
				7354
				7355	/*
				7356	* namespaces tracking
				7357	*/
				7358
				7359	struct perf_namespaces_event {
				7360	struct task_struct *task;
				7361
				7362	struct {
				7363	struct perf_event_header header;
				7364
				7365	u32 pid;
				7366	u32 tid;
				7367	u64 nr_namespaces;
				7368	struct perf_ns_link_info link_info[NR_NAMESPACES];
				7369	} event_id;
				7370	};
				7371
				7372	static int perf_event_namespaces_match(struct perf_event *event)
				7373	{
				7374	return event->attr.namespaces;
				7375	}
				7376
				7377	static void perf_event_namespaces_output(struct perf_event *event,
				7378	void *data)
				7379	{
				7380	struct perf_namespaces_event *namespaces_event = data;
				7381	struct perf_output_handle handle;
				7382	struct perf_sample_data sample;
				7383	u16 header_size = namespaces_event->event_id.header.size;
				7384	int ret;
				7385
				7386	if (!perf_event_namespaces_match(event))
				7387	return;
				7388
				7389	perf_event_header__init_id(&namespaces_event->event_id.header,
				7390	&sample, event);
				7391	ret = perf_output_begin(&handle, event,
				7392	namespaces_event->event_id.header.size);
				7393	if (ret)
				7394	goto out;
				7395
				7396	namespaces_event->event_id.pid = perf_event_pid(event,
				7397	namespaces_event->task);
				7398	namespaces_event->event_id.tid = perf_event_tid(event,
				7399	namespaces_event->task);
				7400
				7401	perf_output_put(&handle, namespaces_event->event_id);
				7402
				7403	perf_event__output_id_sample(event, &handle, &sample);
				7404
				7405	perf_output_end(&handle);
				7406	out:
				7407	namespaces_event->event_id.header.size = header_size;
				7408	}
				7409
				7410	static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
				7411	struct task_struct *task,
				7412	const struct proc_ns_operations *ns_ops)
				7413	{
				7414	struct path ns_path;
				7415	struct inode *ns_inode;
				7416	void *error;
				7417
				7418	error = ns_get_path(&ns_path, task, ns_ops);
				7419	if (!error) {
				7420	ns_inode = ns_path.dentry->d_inode;
				7421	ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
				7422	ns_link_info->ino = ns_inode->i_ino;
				7423	path_put(&ns_path);
				7424	}
				7425	}
				7426
				7427	void perf_event_namespaces(struct task_struct *task)
				7428	{
				7429	struct perf_namespaces_event namespaces_event;
				7430	struct perf_ns_link_info *ns_link_info;
				7431
				7432	if (!atomic_read(&nr_namespaces_events))
				7433	return;
				7434
				7435	namespaces_event = (struct perf_namespaces_event){
				7436	.task = task,
				7437	.event_id = {
				7438	.header = {
				7439	.type = PERF_RECORD_NAMESPACES,
				7440	.misc = 0,
				7441	.size = sizeof(namespaces_event.event_id),
				7442	},
				7443	/* .pid */
				7444	/* .tid */
				7445	.nr_namespaces = NR_NAMESPACES,
				7446	/* .link_info[NR_NAMESPACES] */
				7447	},
				7448	};
				7449
				7450	ns_link_info = namespaces_event.event_id.link_info;
				7451
				7452	perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
				7453	task, &mntns_operations);
				7454
				7455	#ifdef CONFIG_USER_NS
				7456	perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
				7457	task, &userns_operations);
				7458	#endif
				7459	#ifdef CONFIG_NET_NS
				7460	perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
				7461	task, &netns_operations);
				7462	#endif
				7463	#ifdef CONFIG_UTS_NS
				7464	perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
				7465	task, &utsns_operations);
				7466	#endif
				7467	#ifdef CONFIG_IPC_NS
				7468	perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
				7469	task, &ipcns_operations);
				7470	#endif
				7471	#ifdef CONFIG_PID_NS
				7472	perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
				7473	task, &pidns_operations);
				7474	#endif
				7475	#ifdef CONFIG_CGROUPS
				7476	perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
				7477	task, &cgroupns_operations);
				7478	#endif
				7479
				7480	perf_iterate_sb(perf_event_namespaces_output,
				7481	&namespaces_event,
				7482	NULL);
				7483	}
				7484
				7485	/*
				7486	* mmap tracking
				7487	*/
				7488
				7489	struct perf_mmap_event {
				7490	struct vm_area_struct *vma;
				7491
				7492	const char *file_name;
				7493	int file_size;
				7494	int maj, min;
				7495	u64 ino;
				7496	u64 ino_generation;
				7497	u32 prot, flags;
				7498
				7499	struct {
				7500	struct perf_event_header header;
				7501
				7502	u32 pid;
				7503	u32 tid;
				7504	u64 start;
				7505	u64 len;
				7506	u64 pgoff;
				7507	} event_id;
				7508	};
				7509
				7510	static int perf_event_mmap_match(struct perf_event *event,
				7511	void *data)
				7512	{
				7513	struct perf_mmap_event *mmap_event = data;
				7514	struct vm_area_struct *vma = mmap_event->vma;
				7515	int executable = vma->vm_flags & VM_EXEC;
				7516
				7517	return (!executable && event->attr.mmap_data) \|\|
				7518	(executable && (event->attr.mmap \|\| event->attr.mmap2));
				7519	}
				7520
				7521	static void perf_event_mmap_output(struct perf_event *event,
				7522	void *data)
				7523	{
				7524	struct perf_mmap_event *mmap_event = data;
				7525	struct perf_output_handle handle;
				7526	struct perf_sample_data sample;
				7527	int size = mmap_event->event_id.header.size;
				7528	u32 type = mmap_event->event_id.header.type;
				7529	int ret;
				7530
				7531	if (!perf_event_mmap_match(event, data))
				7532	return;
				7533
				7534	if (event->attr.mmap2) {
				7535	mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
				7536	mmap_event->event_id.header.size += sizeof(mmap_event->maj);
				7537	mmap_event->event_id.header.size += sizeof(mmap_event->min);
				7538	mmap_event->event_id.header.size += sizeof(mmap_event->ino);
				7539	mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
				7540	mmap_event->event_id.header.size += sizeof(mmap_event->prot);
				7541	mmap_event->event_id.header.size += sizeof(mmap_event->flags);
				7542	}
				7543
				7544	perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
				7545	ret = perf_output_begin(&handle, event,
				7546	mmap_event->event_id.header.size);
				7547	if (ret)
				7548	goto out;
				7549
				7550	mmap_event->event_id.pid = perf_event_pid(event, current);
				7551	mmap_event->event_id.tid = perf_event_tid(event, current);
				7552
				7553	perf_output_put(&handle, mmap_event->event_id);
				7554
				7555	if (event->attr.mmap2) {
				7556	perf_output_put(&handle, mmap_event->maj);
				7557	perf_output_put(&handle, mmap_event->min);
				7558	perf_output_put(&handle, mmap_event->ino);
				7559	perf_output_put(&handle, mmap_event->ino_generation);
				7560	perf_output_put(&handle, mmap_event->prot);
				7561	perf_output_put(&handle, mmap_event->flags);
				7562	}
				7563
				7564	__output_copy(&handle, mmap_event->file_name,
				7565	mmap_event->file_size);
				7566
				7567	perf_event__output_id_sample(event, &handle, &sample);
				7568
				7569	perf_output_end(&handle);
				7570	out:
				7571	mmap_event->event_id.header.size = size;
				7572	mmap_event->event_id.header.type = type;
				7573	}
				7574
				7575	static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
				7576	{
				7577	struct vm_area_struct *vma = mmap_event->vma;
				7578	struct file *file = vma->vm_file;
				7579	int maj = 0, min = 0;
				7580	u64 ino = 0, gen = 0;
				7581	u32 prot = 0, flags = 0;
				7582	unsigned int size;
				7583	char tmp[16];
				7584	char *buf = NULL;
				7585	char *name;
				7586
				7587	if (vma->vm_flags & VM_READ)
				7588	prot \|= PROT_READ;
				7589	if (vma->vm_flags & VM_WRITE)
				7590	prot \|= PROT_WRITE;
				7591	if (vma->vm_flags & VM_EXEC)
				7592	prot \|= PROT_EXEC;
				7593
				7594	if (vma->vm_flags & VM_MAYSHARE)
				7595	flags = MAP_SHARED;
				7596	else
				7597	flags = MAP_PRIVATE;
				7598
				7599	if (vma->vm_flags & VM_DENYWRITE)
				7600	flags \|= MAP_DENYWRITE;
				7601	if (vma->vm_flags & VM_MAYEXEC)
				7602	flags \|= MAP_EXECUTABLE;
				7603	if (vma->vm_flags & VM_LOCKED)
				7604	flags \|= MAP_LOCKED;
				7605	if (vma->vm_flags & VM_HUGETLB)
				7606	flags \|= MAP_HUGETLB;
				7607
				7608	if (file) {
				7609	struct inode *inode;
				7610	dev_t dev;
				7611
				7612	buf = kmalloc(PATH_MAX, GFP_KERNEL);
				7613	if (!buf) {
				7614	name = "//enomem";
				7615	goto cpy_name;
				7616	}
				7617	/*
				7618	* d_path() works from the end of the rb backwards, so we
				7619	* need to add enough zero bytes after the string to handle
				7620	* the 64bit alignment we do later.
				7621	*/
				7622	name = file_path(file, buf, PATH_MAX - sizeof(u64));
				7623	if (IS_ERR(name)) {
				7624	name = "//toolong";
				7625	goto cpy_name;
				7626	}
				7627	inode = file_inode(vma->vm_file);
				7628	dev = inode->i_sb->s_dev;
				7629	ino = inode->i_ino;
				7630	gen = inode->i_generation;
				7631	maj = MAJOR(dev);
				7632	min = MINOR(dev);
				7633
				7634	goto got_name;
				7635	} else {
				7636	if (vma->vm_ops && vma->vm_ops->name) {
				7637	name = (char *) vma->vm_ops->name(vma);
				7638	if (name)
				7639	goto cpy_name;
				7640	}
				7641
				7642	name = (char *)arch_vma_name(vma);
				7643	if (name)
				7644	goto cpy_name;
				7645
				7646	if (vma->vm_start <= vma->vm_mm->start_brk &&
				7647	vma->vm_end >= vma->vm_mm->brk) {
				7648	name = "[heap]";
				7649	goto cpy_name;
				7650	}
				7651	if (vma->vm_start <= vma->vm_mm->start_stack &&
				7652	vma->vm_end >= vma->vm_mm->start_stack) {
				7653	name = "[stack]";
				7654	goto cpy_name;
				7655	}
				7656
				7657	name = "//anon";
				7658	goto cpy_name;
				7659	}
				7660
				7661	cpy_name:
				7662	strlcpy(tmp, name, sizeof(tmp));
				7663	name = tmp;
				7664	got_name:
				7665	/*
				7666	* Since our buffer works in 8 byte units we need to align our string
				7667	* size to a multiple of 8. However, we must guarantee the tail end is
				7668	* zero'd out to avoid leaking random bits to userspace.
				7669	*/
				7670	size = strlen(name)+1;
				7671	while (!IS_ALIGNED(size, sizeof(u64)))
				7672	name[size++] = '\0';
				7673
				7674	mmap_event->file_name = name;
				7675	mmap_event->file_size = size;
				7676	mmap_event->maj = maj;
				7677	mmap_event->min = min;
				7678	mmap_event->ino = ino;
				7679	mmap_event->ino_generation = gen;
				7680	mmap_event->prot = prot;
				7681	mmap_event->flags = flags;
				7682
				7683	if (!(vma->vm_flags & VM_EXEC))
				7684	mmap_event->event_id.header.misc \|= PERF_RECORD_MISC_MMAP_DATA;
				7685
				7686	mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
				7687
				7688	perf_iterate_sb(perf_event_mmap_output,
				7689	mmap_event,
				7690	NULL);
				7691
				7692	kfree(buf);
				7693	}
				7694
				7695	/*
				7696	* Check whether inode and address range match filter criteria.
				7697	*/
				7698	static bool perf_addr_filter_match(struct perf_addr_filter *filter,
				7699	struct file *file, unsigned long offset,
				7700	unsigned long size)
				7701	{
				7702	/* d_inode(NULL) won't be equal to any mapped user-space file */
				7703	if (!filter->path.dentry)
				7704	return false;
				7705
				7706	if (d_inode(filter->path.dentry) != file_inode(file))
				7707	return false;
				7708
				7709	if (filter->offset > offset + size)
				7710	return false;
				7711
				7712	if (filter->offset + filter->size < offset)
				7713	return false;
				7714
				7715	return true;
				7716	}
				7717
				7718	static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
				7719	struct vm_area_struct *vma,
				7720	struct perf_addr_filter_range *fr)
				7721	{
				7722	unsigned long vma_size = vma->vm_end - vma->vm_start;
				7723	unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
				7724	struct file *file = vma->vm_file;
				7725
				7726	if (!perf_addr_filter_match(filter, file, off, vma_size))
				7727	return false;
				7728
				7729	if (filter->offset < off) {
				7730	fr->start = vma->vm_start;
				7731	fr->size = min(vma_size, filter->size - (off - filter->offset));
				7732	} else {
				7733	fr->start = vma->vm_start + filter->offset - off;
				7734	fr->size = min(vma->vm_end - fr->start, filter->size);
				7735	}
				7736
				7737	return true;
				7738	}
				7739
				7740	static void __perf_addr_filters_adjust(struct perf_event event, void data)
				7741	{
				7742	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
				7743	struct vm_area_struct *vma = data;
				7744	struct perf_addr_filter *filter;
				7745	unsigned int restart = 0, count = 0;
				7746	unsigned long flags;
				7747
				7748	if (!has_addr_filter(event))
				7749	return;
				7750
				7751	if (!vma->vm_file)
				7752	return;
				7753
				7754	raw_spin_lock_irqsave(&ifh->lock, flags);
				7755	list_for_each_entry(filter, &ifh->list, entry) {
				7756	if (perf_addr_filter_vma_adjust(filter, vma,
				7757	&event->addr_filter_ranges[count]))
				7758	restart++;
				7759
				7760	count++;
				7761	}
				7762
				7763	if (restart)
				7764	event->addr_filters_gen++;
				7765	raw_spin_unlock_irqrestore(&ifh->lock, flags);
				7766
				7767	if (restart)
				7768	perf_event_stop(event, 1);
				7769	}
				7770
				7771	/*
				7772	* Adjust all task's events' filters to the new vma
				7773	*/
				7774	static void perf_addr_filters_adjust(struct vm_area_struct *vma)
				7775	{
				7776	struct perf_event_context *ctx;
				7777	int ctxn;
				7778
				7779	/*
				7780	* Data tracing isn't supported yet and as such there is no need
				7781	* to keep track of anything that isn't related to executable code:
				7782	*/
				7783	if (!(vma->vm_flags & VM_EXEC))
				7784	return;
				7785
				7786	rcu_read_lock();
				7787	for_each_task_context_nr(ctxn) {
				7788	ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
				7789	if (!ctx)
				7790	continue;
				7791
				7792	perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
				7793	}
				7794	rcu_read_unlock();
				7795	}
				7796
				7797	void perf_event_mmap(struct vm_area_struct *vma)
				7798	{
				7799	struct perf_mmap_event mmap_event;
				7800
				7801	if (!atomic_read(&nr_mmap_events))
				7802	return;
				7803
				7804	mmap_event = (struct perf_mmap_event){
				7805	.vma = vma,
				7806	/* .file_name */
				7807	/* .file_size */
				7808	.event_id = {
				7809	.header = {
				7810	.type = PERF_RECORD_MMAP,
				7811	.misc = PERF_RECORD_MISC_USER,
				7812	/* .size */
				7813	},
				7814	/* .pid */
				7815	/* .tid */
				7816	.start = vma->vm_start,
				7817	.len = vma->vm_end - vma->vm_start,
				7818	.pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
				7819	},
				7820	/* .maj (attr_mmap2 only) */
				7821	/* .min (attr_mmap2 only) */
				7822	/* .ino (attr_mmap2 only) */
				7823	/* .ino_generation (attr_mmap2 only) */
				7824	/* .prot (attr_mmap2 only) */
				7825	/* .flags (attr_mmap2 only) */
				7826	};
				7827
				7828	perf_addr_filters_adjust(vma);
				7829	perf_event_mmap_event(&mmap_event);
				7830	}
				7831
				7832	void perf_event_aux_event(struct perf_event *event, unsigned long head,
				7833	unsigned long size, u64 flags)
				7834	{
				7835	struct perf_output_handle handle;
				7836	struct perf_sample_data sample;
				7837	struct perf_aux_event {
				7838	struct perf_event_header header;
				7839	u64 offset;
				7840	u64 size;
				7841	u64 flags;
				7842	} rec = {
				7843	.header = {
				7844	.type = PERF_RECORD_AUX,
				7845	.misc = 0,
				7846	.size = sizeof(rec),
				7847	},
				7848	.offset = head,
				7849	.size = size,
				7850	.flags = flags,
				7851	};
				7852	int ret;
				7853
				7854	perf_event_header__init_id(&rec.header, &sample, event);
				7855	ret = perf_output_begin(&handle, event, rec.header.size);
				7856
				7857	if (ret)
				7858	return;
				7859
				7860	perf_output_put(&handle, rec);
				7861	perf_event__output_id_sample(event, &handle, &sample);
				7862
				7863	perf_output_end(&handle);
				7864	}
				7865
				7866	/*
				7867	* Lost/dropped samples logging
				7868	*/
				7869	void perf_log_lost_samples(struct perf_event *event, u64 lost)
				7870	{
				7871	struct perf_output_handle handle;
				7872	struct perf_sample_data sample;
				7873	int ret;
				7874
				7875	struct {
				7876	struct perf_event_header header;
				7877	u64 lost;
				7878	} lost_samples_event = {
				7879	.header = {
				7880	.type = PERF_RECORD_LOST_SAMPLES,
				7881	.misc = 0,
				7882	.size = sizeof(lost_samples_event),
				7883	},
				7884	.lost = lost,
				7885	};
				7886
				7887	perf_event_header__init_id(&lost_samples_event.header, &sample, event);
				7888
				7889	ret = perf_output_begin(&handle, event,
				7890	lost_samples_event.header.size);
				7891	if (ret)
				7892	return;
				7893
				7894	perf_output_put(&handle, lost_samples_event);
				7895	perf_event__output_id_sample(event, &handle, &sample);
				7896	perf_output_end(&handle);
				7897	}
				7898
				7899	/*
				7900	* context_switch tracking
				7901	*/
				7902
				7903	struct perf_switch_event {
				7904	struct task_struct *task;
				7905	struct task_struct *next_prev;
				7906
				7907	struct {
				7908	struct perf_event_header header;
				7909	u32 next_prev_pid;
				7910	u32 next_prev_tid;
				7911	} event_id;
				7912	};
				7913
				7914	static int perf_event_switch_match(struct perf_event *event)
				7915	{
				7916	return event->attr.context_switch;
				7917	}
				7918
				7919	static void perf_event_switch_output(struct perf_event event, void data)
				7920	{
				7921	struct perf_switch_event *se = data;
				7922	struct perf_output_handle handle;
				7923	struct perf_sample_data sample;
				7924	int ret;
				7925
				7926	if (!perf_event_switch_match(event))
				7927	return;
				7928
				7929	/* Only CPU-wide events are allowed to see next/prev pid/tid */
				7930	if (event->ctx->task) {
				7931	se->event_id.header.type = PERF_RECORD_SWITCH;
				7932	se->event_id.header.size = sizeof(se->event_id.header);
				7933	} else {
				7934	se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
				7935	se->event_id.header.size = sizeof(se->event_id);
				7936	se->event_id.next_prev_pid =
				7937	perf_event_pid(event, se->next_prev);
				7938	se->event_id.next_prev_tid =
				7939	perf_event_tid(event, se->next_prev);
				7940	}
				7941
				7942	perf_event_header__init_id(&se->event_id.header, &sample, event);
				7943
				7944	ret = perf_output_begin(&handle, event, se->event_id.header.size);
				7945	if (ret)
				7946	return;
				7947
				7948	if (event->ctx->task)
				7949	perf_output_put(&handle, se->event_id.header);
				7950	else
				7951	perf_output_put(&handle, se->event_id);
				7952
				7953	perf_event__output_id_sample(event, &handle, &sample);
				7954
				7955	perf_output_end(&handle);
				7956	}
				7957
				7958	static void perf_event_switch(struct task_struct *task,
				7959	struct task_struct *next_prev, bool sched_in)
				7960	{
				7961	struct perf_switch_event switch_event;
				7962
				7963	/* N.B. caller checks nr_switch_events != 0 */
				7964
				7965	switch_event = (struct perf_switch_event){
				7966	.task = task,
				7967	.next_prev = next_prev,
				7968	.event_id = {
				7969	.header = {
				7970	/* .type */
				7971	.misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
				7972	/* .size */
				7973	},
				7974	/* .next_prev_pid */
				7975	/* .next_prev_tid */
				7976	},
				7977	};
				7978
				7979	if (!sched_in && task->state == TASK_RUNNING)
				7980	switch_event.event_id.header.misc \|=
				7981	PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
				7982
				7983	perf_iterate_sb(perf_event_switch_output,
				7984	&switch_event,
				7985	NULL);
				7986	}
				7987
				7988	/*
				7989	* IRQ throttle logging
				7990	*/
				7991
				7992	static void perf_log_throttle(struct perf_event *event, int enable)
				7993	{
				7994	struct perf_output_handle handle;
				7995	struct perf_sample_data sample;
				7996	int ret;
				7997
				7998	struct {
				7999	struct perf_event_header header;
				8000	u64 time;
				8001	u64 id;
				8002	u64 stream_id;
				8003	} throttle_event = {
				8004	.header = {
				8005	.type = PERF_RECORD_THROTTLE,
				8006	.misc = 0,
				8007	.size = sizeof(throttle_event),
				8008	},
				8009	.time = perf_event_clock(event),
				8010	.id = primary_event_id(event),
				8011	.stream_id = event->id,
				8012	};
				8013
				8014	if (enable)
				8015	throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
				8016
				8017	perf_event_header__init_id(&throttle_event.header, &sample, event);
				8018
				8019	ret = perf_output_begin(&handle, event,
				8020	throttle_event.header.size);
				8021	if (ret)
				8022	return;
				8023
				8024	perf_output_put(&handle, throttle_event);
				8025	perf_event__output_id_sample(event, &handle, &sample);
				8026	perf_output_end(&handle);
				8027	}
				8028
				8029	/*
				8030	* ksymbol register/unregister tracking
				8031	*/
				8032
				8033	struct perf_ksymbol_event {
				8034	const char *name;
				8035	int name_len;
				8036	struct {
				8037	struct perf_event_header header;
				8038	u64 addr;
				8039	u32 len;
				8040	u16 ksym_type;
				8041	u16 flags;
				8042	} event_id;
				8043	};
				8044
				8045	static int perf_event_ksymbol_match(struct perf_event *event)
				8046	{
				8047	return event->attr.ksymbol;
				8048	}
				8049
				8050	static void perf_event_ksymbol_output(struct perf_event event, void data)
				8051	{
				8052	struct perf_ksymbol_event *ksymbol_event = data;
				8053	struct perf_output_handle handle;
				8054	struct perf_sample_data sample;
				8055	int ret;
				8056
				8057	if (!perf_event_ksymbol_match(event))
				8058	return;
				8059
				8060	perf_event_header__init_id(&ksymbol_event->event_id.header,
				8061	&sample, event);
				8062	ret = perf_output_begin(&handle, event,
				8063	ksymbol_event->event_id.header.size);
				8064	if (ret)
				8065	return;
				8066
				8067	perf_output_put(&handle, ksymbol_event->event_id);
				8068	__output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
				8069	perf_event__output_id_sample(event, &handle, &sample);
				8070
				8071	perf_output_end(&handle);
				8072	}
				8073
				8074	void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
				8075	const char *sym)
				8076	{
				8077	struct perf_ksymbol_event ksymbol_event;
				8078	char name[KSYM_NAME_LEN];
				8079	u16 flags = 0;
				8080	int name_len;
				8081
				8082	if (!atomic_read(&nr_ksymbol_events))
				8083	return;
				8084
				8085	if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX \|\|
				8086	ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
				8087	goto err;
				8088
				8089	strlcpy(name, sym, KSYM_NAME_LEN);
				8090	name_len = strlen(name) + 1;
				8091	while (!IS_ALIGNED(name_len, sizeof(u64)))
				8092	name[name_len++] = '\0';
				8093	BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
				8094
				8095	if (unregister)
				8096	flags \|= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
				8097
				8098	ksymbol_event = (struct perf_ksymbol_event){
				8099	.name = name,
				8100	.name_len = name_len,
				8101	.event_id = {
				8102	.header = {
				8103	.type = PERF_RECORD_KSYMBOL,
				8104	.size = sizeof(ksymbol_event.event_id) +
				8105	name_len,
				8106	},
				8107	.addr = addr,
				8108	.len = len,
				8109	.ksym_type = ksym_type,
				8110	.flags = flags,
				8111	},
				8112	};
				8113
				8114	perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
				8115	return;
				8116	err:
				8117	WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
				8118	}
				8119
				8120	/*
				8121	* bpf program load/unload tracking
				8122	*/
				8123
				8124	struct perf_bpf_event {
				8125	struct bpf_prog *prog;
				8126	struct {
				8127	struct perf_event_header header;
				8128	u16 type;
				8129	u16 flags;
				8130	u32 id;
				8131	u8 tag[BPF_TAG_SIZE];
				8132	} event_id;
				8133	};
				8134
				8135	static int perf_event_bpf_match(struct perf_event *event)
				8136	{
				8137	return event->attr.bpf_event;
				8138	}
				8139
				8140	static void perf_event_bpf_output(struct perf_event event, void data)
				8141	{
				8142	struct perf_bpf_event *bpf_event = data;
				8143	struct perf_output_handle handle;
				8144	struct perf_sample_data sample;
				8145	int ret;
				8146
				8147	if (!perf_event_bpf_match(event))
				8148	return;
				8149
				8150	perf_event_header__init_id(&bpf_event->event_id.header,
				8151	&sample, event);
				8152	ret = perf_output_begin(&handle, event,
				8153	bpf_event->event_id.header.size);
				8154	if (ret)
				8155	return;
				8156
				8157	perf_output_put(&handle, bpf_event->event_id);
				8158	perf_event__output_id_sample(event, &handle, &sample);
				8159
				8160	perf_output_end(&handle);
				8161	}
				8162
				8163	static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
				8164	enum perf_bpf_event_type type)
				8165	{
				8166	bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
				8167	char sym[KSYM_NAME_LEN];
				8168	int i;
				8169
				8170	if (prog->aux->func_cnt == 0) {
				8171	bpf_get_prog_name(prog, sym);
				8172	perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
				8173	(u64)(unsigned long)prog->bpf_func,
				8174	prog->jited_len, unregister, sym);
				8175	} else {
				8176	for (i = 0; i < prog->aux->func_cnt; i++) {
				8177	struct bpf_prog *subprog = prog->aux->func[i];
				8178
				8179	bpf_get_prog_name(subprog, sym);
				8180	perf_event_ksymbol(
				8181	PERF_RECORD_KSYMBOL_TYPE_BPF,
				8182	(u64)(unsigned long)subprog->bpf_func,
				8183	subprog->jited_len, unregister, sym);
				8184	}
				8185	}
				8186	}
				8187
				8188	void perf_event_bpf_event(struct bpf_prog *prog,
				8189	enum perf_bpf_event_type type,
				8190	u16 flags)
				8191	{
				8192	struct perf_bpf_event bpf_event;
				8193
				8194	if (type <= PERF_BPF_EVENT_UNKNOWN \|\|
				8195	type >= PERF_BPF_EVENT_MAX)
				8196	return;
				8197
				8198	switch (type) {
				8199	case PERF_BPF_EVENT_PROG_LOAD:
				8200	case PERF_BPF_EVENT_PROG_UNLOAD:
				8201	if (atomic_read(&nr_ksymbol_events))
				8202	perf_event_bpf_emit_ksymbols(prog, type);
				8203	break;
				8204	default:
				8205	break;
				8206	}
				8207
				8208	if (!atomic_read(&nr_bpf_events))
				8209	return;
				8210
				8211	bpf_event = (struct perf_bpf_event){
				8212	.prog = prog,
				8213	.event_id = {
				8214	.header = {
				8215	.type = PERF_RECORD_BPF_EVENT,
				8216	.size = sizeof(bpf_event.event_id),
				8217	},
				8218	.type = type,
				8219	.flags = flags,
				8220	.id = prog->aux->id,
				8221	},
				8222	};
				8223
				8224	BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));
				8225
				8226	memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
				8227	perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
				8228	}
				8229
				8230	void perf_event_itrace_started(struct perf_event *event)
				8231	{
				8232	event->attach_state \|= PERF_ATTACH_ITRACE;
				8233	}
				8234
				8235	static void perf_log_itrace_start(struct perf_event *event)
				8236	{
				8237	struct perf_output_handle handle;
				8238	struct perf_sample_data sample;
				8239	struct perf_aux_event {
				8240	struct perf_event_header header;
				8241	u32 pid;
				8242	u32 tid;
				8243	} rec;
				8244	int ret;
				8245
				8246	if (event->parent)
				8247	event = event->parent;
				8248
				8249	if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) \|\|
				8250	event->attach_state & PERF_ATTACH_ITRACE)
				8251	return;
				8252
				8253	rec.header.type = PERF_RECORD_ITRACE_START;
				8254	rec.header.misc = 0;
				8255	rec.header.size = sizeof(rec);
				8256	rec.pid = perf_event_pid(event, current);
				8257	rec.tid = perf_event_tid(event, current);
				8258
				8259	perf_event_header__init_id(&rec.header, &sample, event);
				8260	ret = perf_output_begin(&handle, event, rec.header.size);
				8261
				8262	if (ret)
				8263	return;
				8264
				8265	perf_output_put(&handle, rec);
				8266	perf_event__output_id_sample(event, &handle, &sample);
				8267
				8268	perf_output_end(&handle);
				8269	}
				8270
				8271	static int
				8272	__perf_event_account_interrupt(struct perf_event *event, int throttle)
				8273	{
				8274	struct hw_perf_event *hwc = &event->hw;
				8275	int ret = 0;
				8276	u64 seq;
				8277
				8278	seq = __this_cpu_read(perf_throttled_seq);
				8279	if (seq != hwc->interrupts_seq) {
				8280	hwc->interrupts_seq = seq;
				8281	hwc->interrupts = 1;
				8282	} else {
				8283	hwc->interrupts++;
				8284	if (unlikely(throttle &&
				8285	hwc->interrupts > max_samples_per_tick)) {
				8286	__this_cpu_inc(perf_throttled_count);
				8287	tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
				8288	hwc->interrupts = MAX_INTERRUPTS;
				8289	perf_log_throttle(event, 0);
				8290	ret = 1;
				8291	}
				8292	}
				8293
				8294	if (event->attr.freq) {
				8295	u64 now = perf_clock();
				8296	s64 delta = now - hwc->freq_time_stamp;
				8297
				8298	hwc->freq_time_stamp = now;
				8299
				8300	if (delta > 0 && delta < 2*TICK_NSEC)
				8301	perf_adjust_period(event, delta, hwc->last_period, true);
				8302	}
				8303
				8304	return ret;
				8305	}
				8306
				8307	int perf_event_account_interrupt(struct perf_event *event)
				8308	{
				8309	return __perf_event_account_interrupt(event, 1);
				8310	}
				8311
				8312	/*
				8313	* Generic event overflow handling, sampling.
				8314	*/
				8315
				8316	static int __perf_event_overflow(struct perf_event *event,
				8317	int throttle, struct perf_sample_data *data,
				8318	struct pt_regs *regs)
				8319	{
				8320	int events = atomic_read(&event->event_limit);
				8321	int ret = 0;
				8322
				8323	/*
				8324	* Non-sampling counters might still use the PMI to fold short
				8325	* hardware counters, ignore those.
				8326	*/
				8327	if (unlikely(!is_sampling_event(event)))
				8328	return 0;
				8329
				8330	ret = __perf_event_account_interrupt(event, throttle);
				8331
				8332	/*
				8333	* XXX event_limit might not quite work as expected on inherited
				8334	* events
				8335	*/
				8336
				8337	event->pending_kill = POLL_IN;
				8338	if (events && atomic_dec_and_test(&event->event_limit)) {
				8339	ret = 1;
				8340	event->pending_kill = POLL_HUP;
				8341
				8342	perf_event_disable_inatomic(event);
				8343	}
				8344
				8345	READ_ONCE(event->overflow_handler)(event, data, regs);
				8346
				8347	if (*perf_event_fasync(event) && event->pending_kill) {
				8348	event->pending_wakeup = 1;
				8349	irq_work_queue(&event->pending);
				8350	}
				8351
				8352	return ret;
				8353	}
				8354
				8355	int perf_event_overflow(struct perf_event *event,
				8356	struct perf_sample_data *data,
				8357	struct pt_regs *regs)
				8358	{
				8359	return __perf_event_overflow(event, 1, data, regs);
				8360	}
				8361
				8362	/*
				8363	* Generic software event infrastructure
				8364	*/
				8365
				8366	struct swevent_htable {
				8367	struct swevent_hlist *swevent_hlist;
				8368	struct mutex hlist_mutex;
				8369	int hlist_refcount;
				8370
				8371	/* Recursion avoidance in each contexts */
				8372	int recursion[PERF_NR_CONTEXTS];
				8373	};
				8374
				8375	static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
				8376
				8377	/*
				8378	* We directly increment event->count and keep a second value in
				8379	* event->hw.period_left to count intervals. This period event
				8380	* is kept in the range [-sample_period, 0] so that we can use the
				8381	* sign as trigger.
				8382	*/
				8383
				8384	u64 perf_swevent_set_period(struct perf_event *event)
				8385	{
				8386	struct hw_perf_event *hwc = &event->hw;
				8387	u64 period = hwc->last_period;
				8388	u64 nr, offset;
				8389	s64 old, val;
				8390
				8391	hwc->last_period = hwc->sample_period;
				8392
				8393	again:
				8394	old = val = local64_read(&hwc->period_left);
				8395	if (val < 0)
				8396	return 0;
				8397
				8398	nr = div64_u64(period + val, period);
				8399	offset = nr * period;
				8400	val -= offset;
				8401	if (local64_cmpxchg(&hwc->period_left, old, val) != old)
				8402	goto again;
				8403
				8404	return nr;
				8405	}
				8406
				8407	static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
				8408	struct perf_sample_data *data,
				8409	struct pt_regs *regs)
				8410	{
				8411	struct hw_perf_event *hwc = &event->hw;
				8412	int throttle = 0;
				8413
				8414	if (!overflow)
				8415	overflow = perf_swevent_set_period(event);
				8416
				8417	if (hwc->interrupts == MAX_INTERRUPTS)
				8418	return;
				8419
				8420	for (; overflow; overflow--) {
				8421	if (__perf_event_overflow(event, throttle,
				8422	data, regs)) {
				8423	/*
				8424	* We inhibit the overflow from happening when
				8425	* hwc->interrupts == MAX_INTERRUPTS.
				8426	*/
				8427	break;
				8428	}
				8429	throttle = 1;
				8430	}
				8431	}
				8432
				8433	static void perf_swevent_event(struct perf_event *event, u64 nr,
				8434	struct perf_sample_data *data,
				8435	struct pt_regs *regs)
				8436	{
				8437	struct hw_perf_event *hwc = &event->hw;
				8438
				8439	local64_add(nr, &event->count);
				8440
				8441	if (!regs)
				8442	return;
				8443
				8444	if (!is_sampling_event(event))
				8445	return;
				8446
				8447	if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
				8448	data->period = nr;
				8449	return perf_swevent_overflow(event, 1, data, regs);
				8450	} else
				8451	data->period = event->hw.last_period;
				8452
				8453	if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
				8454	return perf_swevent_overflow(event, 1, data, regs);
				8455
				8456	if (local64_add_negative(nr, &hwc->period_left))
				8457	return;
				8458
				8459	perf_swevent_overflow(event, 0, data, regs);
				8460	}
				8461
				8462	static int perf_exclude_event(struct perf_event *event,
				8463	struct pt_regs *regs)
				8464	{
				8465	if (event->hw.state & PERF_HES_STOPPED)
				8466	return 1;
				8467
				8468	if (regs) {
				8469	if (event->attr.exclude_user && user_mode(regs))
				8470	return 1;
				8471
				8472	if (event->attr.exclude_kernel && !user_mode(regs))
				8473	return 1;
				8474	}
				8475
				8476	return 0;
				8477	}
				8478
				8479	static int perf_swevent_match(struct perf_event *event,
				8480	enum perf_type_id type,
				8481	u32 event_id,
				8482	struct perf_sample_data *data,
				8483	struct pt_regs *regs)
				8484	{
				8485	if (event->attr.type != type)
				8486	return 0;
				8487
				8488	if (event->attr.config != event_id)
				8489	return 0;
				8490
				8491	if (perf_exclude_event(event, regs))
				8492	return 0;
				8493
				8494	return 1;
				8495	}
				8496
				8497	static inline u64 swevent_hash(u64 type, u32 event_id)
				8498	{
				8499	u64 val = event_id \| (type << 32);
				8500
				8501	return hash_64(val, SWEVENT_HLIST_BITS);
				8502	}
				8503
				8504	static inline struct hlist_head *
				8505	__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
				8506	{
				8507	u64 hash = swevent_hash(type, event_id);
				8508
				8509	return &hlist->heads[hash];
				8510	}
				8511
				8512	/* For the read side: events when they trigger */
				8513	static inline struct hlist_head *
				8514	find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
				8515	{
				8516	struct swevent_hlist *hlist;
				8517
				8518	hlist = rcu_dereference(swhash->swevent_hlist);
				8519	if (!hlist)
				8520	return NULL;
				8521
				8522	return __find_swevent_head(hlist, type, event_id);
				8523	}
				8524
				8525	/* For the event head insertion and removal in the hlist */
				8526	static inline struct hlist_head *
				8527	find_swevent_head(struct swevent_htable swhash, struct perf_event event)
				8528	{
				8529	struct swevent_hlist *hlist;
				8530	u32 event_id = event->attr.config;
				8531	u64 type = event->attr.type;
				8532
				8533	/*
				8534	* Event scheduling is always serialized against hlist allocation
				8535	* and release. Which makes the protected version suitable here.
				8536	* The context lock guarantees that.
				8537	*/
				8538	hlist = rcu_dereference_protected(swhash->swevent_hlist,
				8539	lockdep_is_held(&event->ctx->lock));
				8540	if (!hlist)
				8541	return NULL;
				8542
				8543	return __find_swevent_head(hlist, type, event_id);
				8544	}
				8545
				8546	static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
				8547	u64 nr,
				8548	struct perf_sample_data *data,
				8549	struct pt_regs *regs)
				8550	{
				8551	struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
				8552	struct perf_event *event;
				8553	struct hlist_head *head;
				8554
				8555	rcu_read_lock();
				8556	head = find_swevent_head_rcu(swhash, type, event_id);
				8557	if (!head)
				8558	goto end;
				8559
				8560	hlist_for_each_entry_rcu(event, head, hlist_entry) {
				8561	if (perf_swevent_match(event, type, event_id, data, regs))
				8562	perf_swevent_event(event, nr, data, regs);
				8563	}
				8564	end:
				8565	rcu_read_unlock();
				8566	}
				8567
				8568	DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
				8569
				8570	int perf_swevent_get_recursion_context(void)
				8571	{
				8572	struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
				8573
				8574	return get_recursion_context(swhash->recursion);
				8575	}
				8576	EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
				8577
				8578	void perf_swevent_put_recursion_context(int rctx)
				8579	{
				8580	struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
				8581
				8582	put_recursion_context(swhash->recursion, rctx);
				8583	}
				8584
				8585	void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
				8586	{
				8587	struct perf_sample_data data;
				8588
				8589	if (WARN_ON_ONCE(!regs))
				8590	return;
				8591
				8592	perf_sample_data_init(&data, addr, 0);
				8593	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
				8594	}
				8595
				8596	void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
				8597	{
				8598	int rctx;
				8599
				8600	preempt_disable_notrace();
				8601	rctx = perf_swevent_get_recursion_context();
				8602	if (unlikely(rctx < 0))
				8603	goto fail;
				8604
				8605	___perf_sw_event(event_id, nr, regs, addr);
				8606
				8607	perf_swevent_put_recursion_context(rctx);
				8608	fail:
				8609	preempt_enable_notrace();
				8610	}
				8611
				8612	static void perf_swevent_read(struct perf_event *event)
				8613	{
				8614	}
				8615
				8616	static int perf_swevent_add(struct perf_event *event, int flags)
				8617	{
				8618	struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
				8619	struct hw_perf_event *hwc = &event->hw;
				8620	struct hlist_head *head;
				8621
				8622	if (is_sampling_event(event)) {
				8623	hwc->last_period = hwc->sample_period;
				8624	perf_swevent_set_period(event);
				8625	}
				8626
				8627	hwc->state = !(flags & PERF_EF_START);
				8628
				8629	head = find_swevent_head(swhash, event);
				8630	if (WARN_ON_ONCE(!head))
				8631	return -EINVAL;
				8632
				8633	hlist_add_head_rcu(&event->hlist_entry, head);
				8634	perf_event_update_userpage(event);
				8635
				8636	return 0;
				8637	}
				8638
				8639	static void perf_swevent_del(struct perf_event *event, int flags)
				8640	{
				8641	hlist_del_rcu(&event->hlist_entry);
				8642	}
				8643
				8644	static void perf_swevent_start(struct perf_event *event, int flags)
				8645	{
				8646	event->hw.state = 0;
				8647	}
				8648
				8649	static void perf_swevent_stop(struct perf_event *event, int flags)
				8650	{
				8651	event->hw.state = PERF_HES_STOPPED;
				8652	}
				8653
				8654	/* Deref the hlist from the update side */
				8655	static inline struct swevent_hlist *
				8656	swevent_hlist_deref(struct swevent_htable *swhash)
				8657	{
				8658	return rcu_dereference_protected(swhash->swevent_hlist,
				8659	lockdep_is_held(&swhash->hlist_mutex));
				8660	}
				8661
				8662	static void swevent_hlist_release(struct swevent_htable *swhash)
				8663	{
				8664	struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
				8665
				8666	if (!hlist)
				8667	return;
				8668
				8669	RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
				8670	kfree_rcu(hlist, rcu_head);
				8671	}
				8672
				8673	static void swevent_hlist_put_cpu(int cpu)
				8674	{
				8675	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
				8676
				8677	mutex_lock(&swhash->hlist_mutex);
				8678
				8679	if (!--swhash->hlist_refcount)
				8680	swevent_hlist_release(swhash);
				8681
				8682	mutex_unlock(&swhash->hlist_mutex);
				8683	}
				8684
				8685	static void swevent_hlist_put(void)
				8686	{
				8687	int cpu;
				8688
				8689	for_each_possible_cpu(cpu)
				8690	swevent_hlist_put_cpu(cpu);
				8691	}
				8692
				8693	static int swevent_hlist_get_cpu(int cpu)
				8694	{
				8695	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
				8696	int err = 0;
				8697
				8698	mutex_lock(&swhash->hlist_mutex);
				8699	if (!swevent_hlist_deref(swhash) &&
				8700	cpumask_test_cpu(cpu, perf_online_mask)) {
				8701	struct swevent_hlist *hlist;
				8702
				8703	hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
				8704	if (!hlist) {
				8705	err = -ENOMEM;
				8706	goto exit;
				8707	}
				8708	rcu_assign_pointer(swhash->swevent_hlist, hlist);
				8709	}
				8710	swhash->hlist_refcount++;
				8711	exit:
				8712	mutex_unlock(&swhash->hlist_mutex);
				8713
				8714	return err;
				8715	}
				8716
				8717	static int swevent_hlist_get(void)
				8718	{
				8719	int err, cpu, failed_cpu;
				8720
				8721	mutex_lock(&pmus_lock);
				8722	for_each_possible_cpu(cpu) {
				8723	err = swevent_hlist_get_cpu(cpu);
				8724	if (err) {
				8725	failed_cpu = cpu;
				8726	goto fail;
				8727	}
				8728	}
				8729	mutex_unlock(&pmus_lock);
				8730	return 0;
				8731	fail:
				8732	for_each_possible_cpu(cpu) {
				8733	if (cpu == failed_cpu)
				8734	break;
				8735	swevent_hlist_put_cpu(cpu);
				8736	}
				8737	mutex_unlock(&pmus_lock);
				8738	return err;
				8739	}
				8740
				8741	struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
				8742
				8743	static void sw_perf_event_destroy(struct perf_event *event)
				8744	{
				8745	u64 event_id = event->attr.config;
				8746
				8747	WARN_ON(event->parent);
				8748
				8749	static_key_slow_dec(&perf_swevent_enabled[event_id]);
				8750	swevent_hlist_put();
				8751	}
				8752
				8753	static int perf_swevent_init(struct perf_event *event)
				8754	{
				8755	u64 event_id = event->attr.config;
				8756
				8757	if (event->attr.type != PERF_TYPE_SOFTWARE)
				8758	return -ENOENT;
				8759
				8760	/*
				8761	* no branch sampling for software events
				8762	*/
				8763	if (has_branch_stack(event))
				8764	return -EOPNOTSUPP;
				8765
				8766	switch (event_id) {
				8767	case PERF_COUNT_SW_CPU_CLOCK:
				8768	case PERF_COUNT_SW_TASK_CLOCK:
				8769	return -ENOENT;
				8770
				8771	default:
				8772	break;
				8773	}
				8774
				8775	if (event_id >= PERF_COUNT_SW_MAX)
				8776	return -ENOENT;
				8777
				8778	if (!event->parent) {
				8779	int err;
				8780
				8781	err = swevent_hlist_get();
				8782	if (err)
				8783	return err;
				8784
				8785	static_key_slow_inc(&perf_swevent_enabled[event_id]);
				8786	event->destroy = sw_perf_event_destroy;
				8787	}
				8788
				8789	return 0;
				8790	}
				8791
				8792	static struct pmu perf_swevent = {
				8793	.task_ctx_nr = perf_sw_context,
				8794
				8795	.capabilities = PERF_PMU_CAP_NO_NMI,
				8796
				8797	.event_init = perf_swevent_init,
				8798	.add = perf_swevent_add,
				8799	.del = perf_swevent_del,
				8800	.start = perf_swevent_start,
				8801	.stop = perf_swevent_stop,
				8802	.read = perf_swevent_read,
				8803	};
				8804
				8805	#ifdef CONFIG_EVENT_TRACING
				8806
				8807	static int perf_tp_filter_match(struct perf_event *event,
				8808	struct perf_sample_data *data)
				8809	{
				8810	void *record = data->raw->frag.data;
				8811
				8812	/* only top level events have filters set */
				8813	if (event->parent)
				8814	event = event->parent;
				8815
				8816	if (likely(!event->filter) \|\| filter_match_preds(event->filter, record))
				8817	return 1;
				8818	return 0;
				8819	}
				8820
				8821	static int perf_tp_event_match(struct perf_event *event,
				8822	struct perf_sample_data *data,
				8823	struct pt_regs *regs)
				8824	{
				8825	if (event->hw.state & PERF_HES_STOPPED)
				8826	return 0;
				8827	/*
				8828	* If exclude_kernel, only trace user-space tracepoints (uprobes)
				8829	*/
				8830	if (event->attr.exclude_kernel && !user_mode(regs))
				8831	return 0;
				8832
				8833	if (!perf_tp_filter_match(event, data))
				8834	return 0;
				8835
				8836	return 1;
				8837	}
				8838
				8839	void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
				8840	struct trace_event_call *call, u64 count,
				8841	struct pt_regs regs, struct hlist_head head,
				8842	struct task_struct *task)
				8843	{
				8844	if (bpf_prog_array_valid(call)) {
				8845	(struct pt_regs *)raw_data = regs;
				8846	if (!trace_call_bpf(call, raw_data) \|\| hlist_empty(head)) {
				8847	perf_swevent_put_recursion_context(rctx);
				8848	return;
				8849	}
				8850	}
				8851	perf_tp_event(call->event.type, count, raw_data, size, regs, head,
				8852	rctx, task);
				8853	}
				8854	EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
				8855
				8856	void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
				8857	struct pt_regs regs, struct hlist_head head, int rctx,
				8858	struct task_struct *task)
				8859	{
				8860	struct perf_sample_data data;
				8861	struct perf_event *event;
				8862
				8863	struct perf_raw_record raw = {
				8864	.frag = {
				8865	.size = entry_size,
				8866	.data = record,
				8867	},
				8868	};
				8869
				8870	perf_sample_data_init(&data, 0, 0);
				8871	data.raw = &raw;
				8872
				8873	perf_trace_buf_update(record, event_type);
				8874
				8875	hlist_for_each_entry_rcu(event, head, hlist_entry) {
				8876	if (perf_tp_event_match(event, &data, regs))
				8877	perf_swevent_event(event, count, &data, regs);
				8878	}
				8879
				8880	/*
				8881	* If we got specified a target task, also iterate its context and
				8882	* deliver this event there too.
				8883	*/
				8884	if (task && task != current) {
				8885	struct perf_event_context *ctx;
				8886	struct trace_entry *entry = record;
				8887
				8888	rcu_read_lock();
				8889	ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
				8890	if (!ctx)
				8891	goto unlock;
				8892
				8893	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
				8894	if (event->cpu != smp_processor_id())
				8895	continue;
				8896	if (event->attr.type != PERF_TYPE_TRACEPOINT)
				8897	continue;
				8898	if (event->attr.config != entry->type)
				8899	continue;
				8900	if (perf_tp_event_match(event, &data, regs))
				8901	perf_swevent_event(event, count, &data, regs);
				8902	}
				8903	unlock:
				8904	rcu_read_unlock();
				8905	}
				8906
				8907	perf_swevent_put_recursion_context(rctx);
				8908	}
				8909	EXPORT_SYMBOL_GPL(perf_tp_event);
				8910
				8911	static void tp_perf_event_destroy(struct perf_event *event)
				8912	{
				8913	perf_trace_destroy(event);
				8914	}
				8915
				8916	static int perf_tp_event_init(struct perf_event *event)
				8917	{
				8918	int err;
				8919
				8920	if (event->attr.type != PERF_TYPE_TRACEPOINT)
				8921	return -ENOENT;
				8922
				8923	/*
				8924	* no branch sampling for tracepoint events
				8925	*/
				8926	if (has_branch_stack(event))
				8927	return -EOPNOTSUPP;
				8928
				8929	err = perf_trace_init(event);
				8930	if (err)
				8931	return err;
				8932
				8933	event->destroy = tp_perf_event_destroy;
				8934
				8935	return 0;
				8936	}
				8937
				8938	static struct pmu perf_tracepoint = {
				8939	.task_ctx_nr = perf_sw_context,
				8940
				8941	.event_init = perf_tp_event_init,
				8942	.add = perf_trace_add,
				8943	.del = perf_trace_del,
				8944	.start = perf_swevent_start,
				8945	.stop = perf_swevent_stop,
				8946	.read = perf_swevent_read,
				8947	};
				8948
				8949	#if defined(CONFIG_KPROBE_EVENTS) \|\| defined(CONFIG_UPROBE_EVENTS)
				8950	/*
				8951	* Flags in config, used by dynamic PMU kprobe and uprobe
				8952	* The flags should match following PMU_FORMAT_ATTR().
				8953	*
				8954	* PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
				8955	* if not set, create kprobe/uprobe
				8956	*
				8957	* The following values specify a reference counter (or semaphore in the
				8958	* terminology of tools like dtrace, systemtap, etc.) Userspace Statically
				8959	* Defined Tracepoints (USDT). Currently, we use 40 bit for the offset.
				8960	*
				8961	* PERF_UPROBE_REF_CTR_OFFSET_BITS # of bits in config as th offset
				8962	* PERF_UPROBE_REF_CTR_OFFSET_SHIFT # of bits to shift left
				8963	*/
				8964	enum perf_probe_config {
				8965	PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */
				8966	PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
				8967	PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
				8968	};
				8969
				8970	PMU_FORMAT_ATTR(retprobe, "config:0");
				8971	#endif
				8972
				8973	#ifdef CONFIG_KPROBE_EVENTS
				8974	static struct attribute *kprobe_attrs[] = {
				8975	&format_attr_retprobe.attr,
				8976	NULL,
				8977	};
				8978
				8979	static struct attribute_group kprobe_format_group = {
				8980	.name = "format",
				8981	.attrs = kprobe_attrs,
				8982	};
				8983
				8984	static const struct attribute_group *kprobe_attr_groups[] = {
				8985	&kprobe_format_group,
				8986	NULL,
				8987	};
				8988
				8989	static int perf_kprobe_event_init(struct perf_event *event);
				8990	static struct pmu perf_kprobe = {
				8991	.task_ctx_nr = perf_sw_context,
				8992	.event_init = perf_kprobe_event_init,
				8993	.add = perf_trace_add,
				8994	.del = perf_trace_del,
				8995	.start = perf_swevent_start,
				8996	.stop = perf_swevent_stop,
				8997	.read = perf_swevent_read,
				8998	.attr_groups = kprobe_attr_groups,
				8999	};
				9000
				9001	static int perf_kprobe_event_init(struct perf_event *event)
				9002	{
				9003	int err;
				9004	bool is_retprobe;
				9005
				9006	if (event->attr.type != perf_kprobe.type)
				9007	return -ENOENT;
				9008
				9009	if (!capable(CAP_SYS_ADMIN))
				9010	return -EACCES;
				9011
				9012	/*
				9013	* no branch sampling for probe events
				9014	*/
				9015	if (has_branch_stack(event))
				9016	return -EOPNOTSUPP;
				9017
				9018	is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
				9019	err = perf_kprobe_init(event, is_retprobe);
				9020	if (err)
				9021	return err;
				9022
				9023	event->destroy = perf_kprobe_destroy;
				9024
				9025	return 0;
				9026	}
				9027	#endif /* CONFIG_KPROBE_EVENTS */
				9028
				9029	#ifdef CONFIG_UPROBE_EVENTS
				9030	PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");
				9031
				9032	static struct attribute *uprobe_attrs[] = {
				9033	&format_attr_retprobe.attr,
				9034	&format_attr_ref_ctr_offset.attr,
				9035	NULL,
				9036	};
				9037
				9038	static struct attribute_group uprobe_format_group = {
				9039	.name = "format",
				9040	.attrs = uprobe_attrs,
				9041	};
				9042
				9043	static const struct attribute_group *uprobe_attr_groups[] = {
				9044	&uprobe_format_group,
				9045	NULL,
				9046	};
				9047
				9048	static int perf_uprobe_event_init(struct perf_event *event);
				9049	static struct pmu perf_uprobe = {
				9050	.task_ctx_nr = perf_sw_context,
				9051	.event_init = perf_uprobe_event_init,
				9052	.add = perf_trace_add,
				9053	.del = perf_trace_del,
				9054	.start = perf_swevent_start,
				9055	.stop = perf_swevent_stop,
				9056	.read = perf_swevent_read,
				9057	.attr_groups = uprobe_attr_groups,
				9058	};
				9059
				9060	static int perf_uprobe_event_init(struct perf_event *event)
				9061	{
				9062	int err;
				9063	unsigned long ref_ctr_offset;
				9064	bool is_retprobe;
				9065
				9066	if (event->attr.type != perf_uprobe.type)
				9067	return -ENOENT;
				9068
				9069	if (!capable(CAP_SYS_ADMIN))
				9070	return -EACCES;
				9071
				9072	/*
				9073	* no branch sampling for probe events
				9074	*/
				9075	if (has_branch_stack(event))
				9076	return -EOPNOTSUPP;
				9077
				9078	is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
				9079	ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
				9080	err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
				9081	if (err)
				9082	return err;
				9083
				9084	event->destroy = perf_uprobe_destroy;
				9085
				9086	return 0;
				9087	}
				9088	#endif /* CONFIG_UPROBE_EVENTS */
				9089
				9090	static inline void perf_tp_register(void)
				9091	{
				9092	perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
				9093	#ifdef CONFIG_KPROBE_EVENTS
				9094	perf_pmu_register(&perf_kprobe, "kprobe", -1);
				9095	#endif
				9096	#ifdef CONFIG_UPROBE_EVENTS
				9097	perf_pmu_register(&perf_uprobe, "uprobe", -1);
				9098	#endif
				9099	}
				9100
				9101	static void perf_event_free_filter(struct perf_event *event)
				9102	{
				9103	ftrace_profile_free_filter(event);
				9104	}
				9105
				9106	#ifdef CONFIG_BPF_SYSCALL
				9107	static void bpf_overflow_handler(struct perf_event *event,
				9108	struct perf_sample_data *data,
				9109	struct pt_regs *regs)
				9110	{
				9111	struct bpf_perf_event_data_kern ctx = {
				9112	.data = data,
				9113	.event = event,
				9114	};
				9115	int ret = 0;
				9116
				9117	ctx.regs = perf_arch_bpf_user_pt_regs(regs);
				9118	preempt_disable();
				9119	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
				9120	goto out;
				9121	rcu_read_lock();
				9122	ret = BPF_PROG_RUN(event->prog, &ctx);
				9123	rcu_read_unlock();
				9124	out:
				9125	__this_cpu_dec(bpf_prog_active);
				9126	preempt_enable();
				9127	if (!ret)
				9128	return;
				9129
				9130	event->orig_overflow_handler(event, data, regs);
				9131	}
				9132
				9133	static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
				9134	{
				9135	struct bpf_prog *prog;
				9136
				9137	if (event->overflow_handler_context)
				9138	/* hw breakpoint or kernel counter */
				9139	return -EINVAL;
				9140
				9141	if (event->prog)
				9142	return -EEXIST;
				9143
				9144	prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
				9145	if (IS_ERR(prog))
				9146	return PTR_ERR(prog);
				9147
				9148	event->prog = prog;
				9149	event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
				9150	WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
				9151	return 0;
				9152	}
				9153
				9154	static void perf_event_free_bpf_handler(struct perf_event *event)
				9155	{
				9156	struct bpf_prog *prog = event->prog;
				9157
				9158	if (!prog)
				9159	return;
				9160
				9161	WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
				9162	event->prog = NULL;
				9163	bpf_prog_put(prog);
				9164	}
				9165	#else
				9166	static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
				9167	{
				9168	return -EOPNOTSUPP;
				9169	}
				9170	static void perf_event_free_bpf_handler(struct perf_event *event)
				9171	{
				9172	}
				9173	#endif
				9174
				9175	/*
				9176	* returns true if the event is a tracepoint, or a kprobe/upprobe created
				9177	* with perf_event_open()
				9178	*/
				9179	static inline bool perf_event_is_tracing(struct perf_event *event)
				9180	{
				9181	if (event->pmu == &perf_tracepoint)
				9182	return true;
				9183	#ifdef CONFIG_KPROBE_EVENTS
				9184	if (event->pmu == &perf_kprobe)
				9185	return true;
				9186	#endif
				9187	#ifdef CONFIG_UPROBE_EVENTS
				9188	if (event->pmu == &perf_uprobe)
				9189	return true;
				9190	#endif
				9191	return false;
				9192	}
				9193
				9194	static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
				9195	{
				9196	bool is_kprobe, is_tracepoint, is_syscall_tp;
				9197	struct bpf_prog *prog;
				9198	int ret;
				9199
				9200	if (!perf_event_is_tracing(event))
				9201	return perf_event_set_bpf_handler(event, prog_fd);
				9202
				9203	is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
				9204	is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
				9205	is_syscall_tp = is_syscall_trace_event(event->tp_event);
				9206	if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
				9207	/* bpf programs can only be attached to u/kprobe or tracepoint */
				9208	return -EINVAL;
				9209
				9210	prog = bpf_prog_get(prog_fd);
				9211	if (IS_ERR(prog))
				9212	return PTR_ERR(prog);
				9213
				9214	if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) \|\|
				9215	(is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) \|\|
				9216	(is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
				9217	/* valid fd, but invalid bpf program type */
				9218	bpf_prog_put(prog);
				9219	return -EINVAL;
				9220	}
				9221
				9222	/* Kprobe override only works for kprobes, not uprobes. */
				9223	if (prog->kprobe_override &&
				9224	!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
				9225	bpf_prog_put(prog);
				9226	return -EINVAL;
				9227	}
				9228
				9229	if (is_tracepoint \|\| is_syscall_tp) {
				9230	int off = trace_event_get_offsets(event->tp_event);
				9231
				9232	if (prog->aux->max_ctx_offset > off) {
				9233	bpf_prog_put(prog);
				9234	return -EACCES;
				9235	}
				9236	}
				9237
				9238	ret = perf_event_attach_bpf_prog(event, prog);
				9239	if (ret)
				9240	bpf_prog_put(prog);
				9241	return ret;
				9242	}
				9243
				9244	static void perf_event_free_bpf_prog(struct perf_event *event)
				9245	{
				9246	if (!perf_event_is_tracing(event)) {
				9247	perf_event_free_bpf_handler(event);
				9248	return;
				9249	}
				9250	perf_event_detach_bpf_prog(event);
				9251	}
				9252
				9253	#else
				9254
				9255	static inline void perf_tp_register(void)
				9256	{
				9257	}
				9258
				9259	static void perf_event_free_filter(struct perf_event *event)
				9260	{
				9261	}
				9262
				9263	static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
				9264	{
				9265	return -ENOENT;
				9266	}
				9267
				9268	static void perf_event_free_bpf_prog(struct perf_event *event)
				9269	{
				9270	}
				9271	#endif /* CONFIG_EVENT_TRACING */
				9272
				9273	#ifdef CONFIG_HAVE_HW_BREAKPOINT
				9274	void perf_bp_event(struct perf_event bp, void data)
				9275	{
				9276	struct perf_sample_data sample;
				9277	struct pt_regs *regs = data;
				9278
				9279	perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
				9280
				9281	if (!bp->hw.state && !perf_exclude_event(bp, regs))
				9282	perf_swevent_event(bp, 1, &sample, regs);
				9283	}
				9284	#endif
				9285
				9286	/*
				9287	* Allocate a new address filter
				9288	*/
				9289	static struct perf_addr_filter *
				9290	perf_addr_filter_new(struct perf_event event, struct list_head filters)
				9291	{
				9292	int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
				9293	struct perf_addr_filter *filter;
				9294
				9295	filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
				9296	if (!filter)
				9297	return NULL;
				9298
				9299	INIT_LIST_HEAD(&filter->entry);
				9300	list_add_tail(&filter->entry, filters);
				9301
				9302	return filter;
				9303	}
				9304
				9305	static void free_filters_list(struct list_head *filters)
				9306	{
				9307	struct perf_addr_filter filter, iter;
				9308
				9309	list_for_each_entry_safe(filter, iter, filters, entry) {
				9310	path_put(&filter->path);
				9311	list_del(&filter->entry);
				9312	kfree(filter);
				9313	}
				9314	}
				9315
				9316	/*
				9317	* Free existing address filters and optionally install new ones
				9318	*/
				9319	static void perf_addr_filters_splice(struct perf_event *event,
				9320	struct list_head *head)
				9321	{
				9322	unsigned long flags;
				9323	LIST_HEAD(list);
				9324
				9325	if (!has_addr_filter(event))
				9326	return;
				9327
				9328	/* don't bother with children, they don't have their own filters */
				9329	if (event->parent)
				9330	return;
				9331
				9332	raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
				9333
				9334	list_splice_init(&event->addr_filters.list, &list);
				9335	if (head)
				9336	list_splice(head, &event->addr_filters.list);
				9337
				9338	raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
				9339
				9340	free_filters_list(&list);
				9341	}
				9342
				9343	/*
				9344	* Scan through mm's vmas and see if one of them matches the
				9345	* @filter; if so, adjust filter's address range.
				9346	* Called with mm::mmap_sem down for reading.
				9347	*/
				9348	static void perf_addr_filter_apply(struct perf_addr_filter *filter,
				9349	struct mm_struct *mm,
				9350	struct perf_addr_filter_range *fr)
				9351	{
				9352	struct vm_area_struct *vma;
				9353
				9354	for (vma = mm->mmap; vma; vma = vma->vm_next) {
				9355	if (!vma->vm_file)
				9356	continue;
				9357
				9358	if (perf_addr_filter_vma_adjust(filter, vma, fr))
				9359	return;
				9360	}
				9361	}
				9362
				9363	/*
				9364	* Update event's address range filters based on the
				9365	* task's existing mappings, if any.
				9366	*/
				9367	static void perf_event_addr_filters_apply(struct perf_event *event)
				9368	{
				9369	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
				9370	struct task_struct *task = READ_ONCE(event->ctx->task);
				9371	struct perf_addr_filter *filter;
				9372	struct mm_struct *mm = NULL;
				9373	unsigned int count = 0;
				9374	unsigned long flags;
				9375
				9376	/*
				9377	* We may observe TASK_TOMBSTONE, which means that the event tear-down
				9378	* will stop on the parent's child_mutex that our caller is also holding
				9379	*/
				9380	if (task == TASK_TOMBSTONE)
				9381	return;
				9382
				9383	if (ifh->nr_file_filters) {
				9384	mm = get_task_mm(task);
				9385	if (!mm)
				9386	goto restart;
				9387
				9388	down_read(&mm->mmap_sem);
				9389	}
				9390
				9391	raw_spin_lock_irqsave(&ifh->lock, flags);
				9392	list_for_each_entry(filter, &ifh->list, entry) {
				9393	if (filter->path.dentry) {
				9394	/*
				9395	* Adjust base offset if the filter is associated to a
				9396	* binary that needs to be mapped:
				9397	*/
				9398	event->addr_filter_ranges[count].start = 0;
				9399	event->addr_filter_ranges[count].size = 0;
				9400
				9401	perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
				9402	} else {
				9403	event->addr_filter_ranges[count].start = filter->offset;
				9404	event->addr_filter_ranges[count].size = filter->size;
				9405	}
				9406
				9407	count++;
				9408	}
				9409
				9410	event->addr_filters_gen++;
				9411	raw_spin_unlock_irqrestore(&ifh->lock, flags);
				9412
				9413	if (ifh->nr_file_filters) {
				9414	up_read(&mm->mmap_sem);
				9415
				9416	mmput(mm);
				9417	}
				9418
				9419	restart:
				9420	perf_event_stop(event, 1);
				9421	}
				9422
				9423	/*
				9424	* Address range filtering: limiting the data to certain
				9425	* instruction address ranges. Filters are ioctl()ed to us from
				9426	* userspace as ascii strings.
				9427	*
				9428	* Filter string format:
				9429	*
				9430	* ACTION RANGE_SPEC
				9431	* where ACTION is one of the
				9432	* * "filter": limit the trace to this region
				9433	* * "start": start tracing from this address
				9434	* * "stop": stop tracing at this address/region;
				9435	* RANGE_SPEC is
				9436	* * for kernel addresses: <start address>[/<size>]
				9437	* * for object files: <start address>[/<size>]@</path/to/object/file>
				9438	*
				9439	* if <size> is not specified or is zero, the range is treated as a single
				9440	* address; not valid for ACTION=="filter".
				9441	*/
				9442	enum {
				9443	IF_ACT_NONE = -1,
				9444	IF_ACT_FILTER,
				9445	IF_ACT_START,
				9446	IF_ACT_STOP,
				9447	IF_SRC_FILE,
				9448	IF_SRC_KERNEL,
				9449	IF_SRC_FILEADDR,
				9450	IF_SRC_KERNELADDR,
				9451	};
				9452
				9453	enum {
				9454	IF_STATE_ACTION = 0,
				9455	IF_STATE_SOURCE,
				9456	IF_STATE_END,
				9457	};
				9458
				9459	static const match_table_t if_tokens = {
				9460	{ IF_ACT_FILTER, "filter" },
				9461	{ IF_ACT_START, "start" },
				9462	{ IF_ACT_STOP, "stop" },
				9463	{ IF_SRC_FILE, "%u/%u@%s" },
				9464	{ IF_SRC_KERNEL, "%u/%u" },
				9465	{ IF_SRC_FILEADDR, "%u@%s" },
				9466	{ IF_SRC_KERNELADDR, "%u" },
				9467	{ IF_ACT_NONE, NULL },
				9468	};
				9469
				9470	/*
				9471	* Address filter string parser
				9472	*/
				9473	static int
				9474	perf_event_parse_addr_filter(struct perf_event event, char fstr,
				9475	struct list_head *filters)
				9476	{
				9477	struct perf_addr_filter *filter = NULL;
				9478	char start, orig, *filename = NULL;
				9479	substring_t args[MAX_OPT_ARGS];
				9480	int state = IF_STATE_ACTION, token;
				9481	unsigned int kernel = 0;
				9482	int ret = -EINVAL;
				9483
				9484	orig = fstr = kstrdup(fstr, GFP_KERNEL);
				9485	if (!fstr)
				9486	return -ENOMEM;
				9487
				9488	while ((start = strsep(&fstr, " ,\n")) != NULL) {
				9489	static const enum perf_addr_filter_action_t actions[] = {
				9490	[IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER,
				9491	[IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START,
				9492	[IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP,
				9493	};
				9494	ret = -EINVAL;
				9495
				9496	if (!*start)
				9497	continue;
				9498
				9499	/* filter definition begins */
				9500	if (state == IF_STATE_ACTION) {
				9501	filter = perf_addr_filter_new(event, filters);
				9502	if (!filter)
				9503	goto fail;
				9504	}
				9505
				9506	token = match_token(start, if_tokens, args);
				9507	switch (token) {
				9508	case IF_ACT_FILTER:
				9509	case IF_ACT_START:
				9510	case IF_ACT_STOP:
				9511	if (state != IF_STATE_ACTION)
				9512	goto fail;
				9513
				9514	filter->action = actions[token];
				9515	state = IF_STATE_SOURCE;
				9516	break;
				9517
				9518	case IF_SRC_KERNELADDR:
				9519	case IF_SRC_KERNEL:
				9520	kernel = 1;
				9521	/* fall through */
				9522
				9523	case IF_SRC_FILEADDR:
				9524	case IF_SRC_FILE:
				9525	if (state != IF_STATE_SOURCE)
				9526	goto fail;
				9527
				9528	*args[0].to = 0;
				9529	ret = kstrtoul(args[0].from, 0, &filter->offset);
				9530	if (ret)
				9531	goto fail;
				9532
				9533	if (token == IF_SRC_KERNEL \|\| token == IF_SRC_FILE) {
				9534	*args[1].to = 0;
				9535	ret = kstrtoul(args[1].from, 0, &filter->size);
				9536	if (ret)
				9537	goto fail;
				9538	}
				9539
				9540	if (token == IF_SRC_FILE \|\| token == IF_SRC_FILEADDR) {
				9541	int fpos = token == IF_SRC_FILE ? 2 : 1;
				9542
				9543	kfree(filename);
				9544	filename = match_strdup(&args[fpos]);
				9545	if (!filename) {
				9546	ret = -ENOMEM;
				9547	goto fail;
				9548	}
				9549	}
				9550
				9551	state = IF_STATE_END;
				9552	break;
				9553
				9554	default:
				9555	goto fail;
				9556	}
				9557
				9558	/*
				9559	* Filter definition is fully parsed, validate and install it.
				9560	* Make sure that it doesn't contradict itself or the event's
				9561	* attribute.
				9562	*/
				9563	if (state == IF_STATE_END) {
				9564	ret = -EINVAL;
				9565	if (kernel && event->attr.exclude_kernel)
				9566	goto fail;
				9567
				9568	/*
				9569	* ACTION "filter" must have a non-zero length region
				9570	* specified.
				9571	*/
				9572	if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
				9573	!filter->size)
				9574	goto fail;
				9575
				9576	if (!kernel) {
				9577	if (!filename)
				9578	goto fail;
				9579
				9580	/*
				9581	* For now, we only support file-based filters
				9582	* in per-task events; doing so for CPU-wide
				9583	* events requires additional context switching
				9584	* trickery, since same object code will be
				9585	* mapped at different virtual addresses in
				9586	* different processes.
				9587	*/
				9588	ret = -EOPNOTSUPP;
				9589	if (!event->ctx->task)
				9590	goto fail;
				9591
				9592	/* look up the path and grab its inode */
				9593	ret = kern_path(filename, LOOKUP_FOLLOW,
				9594	&filter->path);
				9595	if (ret)
				9596	goto fail;
				9597
				9598	ret = -EINVAL;
				9599	if (!filter->path.dentry \|\|
				9600	!S_ISREG(d_inode(filter->path.dentry)
				9601	->i_mode))
				9602	goto fail;
				9603
				9604	event->addr_filters.nr_file_filters++;
				9605	}
				9606
				9607	/* ready to consume more filters */
				9608	kfree(filename);
				9609	filename = NULL;
				9610	state = IF_STATE_ACTION;
				9611	filter = NULL;
				9612	kernel = 0;
				9613	}
				9614	}
				9615
				9616	if (state != IF_STATE_ACTION)
				9617	goto fail;
				9618
				9619	kfree(filename);
				9620	kfree(orig);
				9621
				9622	return 0;
				9623
				9624	fail:
				9625	kfree(filename);
				9626	free_filters_list(filters);
				9627	kfree(orig);
				9628
				9629	return ret;
				9630	}
				9631
				9632	static int
				9633	perf_event_set_addr_filter(struct perf_event event, char filter_str)
				9634	{
				9635	LIST_HEAD(filters);
				9636	int ret;
				9637
				9638	/*
				9639	* Since this is called in perf_ioctl() path, we're already holding
				9640	* ctx::mutex.
				9641	*/
				9642	lockdep_assert_held(&event->ctx->mutex);
				9643
				9644	if (WARN_ON_ONCE(event->parent))
				9645	return -EINVAL;
				9646
				9647	ret = perf_event_parse_addr_filter(event, filter_str, &filters);
				9648	if (ret)
				9649	goto fail_clear_files;
				9650
				9651	ret = event->pmu->addr_filters_validate(&filters);
				9652	if (ret)
				9653	goto fail_free_filters;
				9654
				9655	/* remove existing filters, if any */
				9656	perf_addr_filters_splice(event, &filters);
				9657
				9658	/* install new filters */
				9659	perf_event_for_each_child(event, perf_event_addr_filters_apply);
				9660
				9661	return ret;
				9662
				9663	fail_free_filters:
				9664	free_filters_list(&filters);
				9665
				9666	fail_clear_files:
				9667	event->addr_filters.nr_file_filters = 0;
				9668
				9669	return ret;
				9670	}
				9671
				9672	static int perf_event_set_filter(struct perf_event event, void __user arg)
				9673	{
				9674	int ret = -EINVAL;
				9675	char *filter_str;
				9676
				9677	filter_str = strndup_user(arg, PAGE_SIZE);
				9678	if (IS_ERR(filter_str))
				9679	return PTR_ERR(filter_str);
				9680
				9681	#ifdef CONFIG_EVENT_TRACING
				9682	if (perf_event_is_tracing(event)) {
				9683	struct perf_event_context *ctx = event->ctx;
				9684
				9685	/*
				9686	* Beware, here be dragons!!
				9687	*
				9688	* the tracepoint muck will deadlock against ctx->mutex, but
				9689	* the tracepoint stuff does not actually need it. So
				9690	* temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
				9691	* already have a reference on ctx.
				9692	*
				9693	* This can result in event getting moved to a different ctx,
				9694	* but that does not affect the tracepoint state.
				9695	*/
				9696	mutex_unlock(&ctx->mutex);
				9697	ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
				9698	mutex_lock(&ctx->mutex);
				9699	} else
				9700	#endif
				9701	if (has_addr_filter(event))
				9702	ret = perf_event_set_addr_filter(event, filter_str);
				9703
				9704	kfree(filter_str);
				9705	return ret;
				9706	}
				9707
				9708	/*
				9709	* hrtimer based swevent callback
				9710	*/
				9711
				9712	static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
				9713	{
				9714	enum hrtimer_restart ret = HRTIMER_RESTART;
				9715	struct perf_sample_data data;
				9716	struct pt_regs *regs;
				9717	struct perf_event *event;
				9718	u64 period;
				9719
				9720	event = container_of(hrtimer, struct perf_event, hw.hrtimer);
				9721
				9722	if (event->state != PERF_EVENT_STATE_ACTIVE)
				9723	return HRTIMER_NORESTART;
				9724
				9725	event->pmu->read(event);
				9726
				9727	perf_sample_data_init(&data, 0, event->hw.last_period);
				9728	regs = get_irq_regs();
				9729
				9730	if (regs && !perf_exclude_event(event, regs)) {
				9731	if (!(event->attr.exclude_idle && is_idle_task(current)))
				9732	if (__perf_event_overflow(event, 1, &data, regs))
				9733	ret = HRTIMER_NORESTART;
				9734	}
				9735
				9736	period = max_t(u64, 10000, event->hw.sample_period);
				9737	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
				9738
				9739	return ret;
				9740	}
				9741
				9742	static void perf_swevent_start_hrtimer(struct perf_event *event)
				9743	{
				9744	struct hw_perf_event *hwc = &event->hw;
				9745	s64 period;
				9746
				9747	if (!is_sampling_event(event))
				9748	return;
				9749
				9750	period = local64_read(&hwc->period_left);
				9751	if (period) {
				9752	if (period < 0)
				9753	period = 10000;
				9754
				9755	local64_set(&hwc->period_left, 0);
				9756	} else {
				9757	period = max_t(u64, 10000, hwc->sample_period);
				9758	}
				9759	hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
				9760	HRTIMER_MODE_REL_PINNED_HARD);
				9761	}
				9762
				9763	static void perf_swevent_cancel_hrtimer(struct perf_event *event)
				9764	{
				9765	struct hw_perf_event *hwc = &event->hw;
				9766
				9767	if (is_sampling_event(event)) {
				9768	ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
				9769	local64_set(&hwc->period_left, ktime_to_ns(remaining));
				9770
				9771	hrtimer_cancel(&hwc->hrtimer);
				9772	}
				9773	}
				9774
				9775	static void perf_swevent_init_hrtimer(struct perf_event *event)
				9776	{
				9777	struct hw_perf_event *hwc = &event->hw;
				9778
				9779	if (!is_sampling_event(event))
				9780	return;
				9781
				9782	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
				9783	hwc->hrtimer.function = perf_swevent_hrtimer;
				9784
				9785	/*
				9786	* Since hrtimers have a fixed rate, we can do a static freq->period
				9787	* mapping and avoid the whole period adjust feedback stuff.
				9788	*/
				9789	if (event->attr.freq) {
				9790	long freq = event->attr.sample_freq;
				9791
				9792	event->attr.sample_period = NSEC_PER_SEC / freq;
				9793	hwc->sample_period = event->attr.sample_period;
				9794	local64_set(&hwc->period_left, hwc->sample_period);
				9795	hwc->last_period = hwc->sample_period;
				9796	event->attr.freq = 0;
				9797	}
				9798	}
				9799
				9800	/*
				9801	* Software event: cpu wall time clock
				9802	*/
				9803
				9804	static void cpu_clock_event_update(struct perf_event *event)
				9805	{
				9806	s64 prev;
				9807	u64 now;
				9808
				9809	now = local_clock();
				9810	prev = local64_xchg(&event->hw.prev_count, now);
				9811	local64_add(now - prev, &event->count);
				9812	}
				9813
				9814	static void cpu_clock_event_start(struct perf_event *event, int flags)
				9815	{
				9816	local64_set(&event->hw.prev_count, local_clock());
				9817	perf_swevent_start_hrtimer(event);
				9818	}
				9819
				9820	static void cpu_clock_event_stop(struct perf_event *event, int flags)
				9821	{
				9822	perf_swevent_cancel_hrtimer(event);
				9823	cpu_clock_event_update(event);
				9824	}
				9825
				9826	static int cpu_clock_event_add(struct perf_event *event, int flags)
				9827	{
				9828	if (flags & PERF_EF_START)
				9829	cpu_clock_event_start(event, flags);
				9830	perf_event_update_userpage(event);
				9831
				9832	return 0;
				9833	}
				9834
				9835	static void cpu_clock_event_del(struct perf_event *event, int flags)
				9836	{
				9837	cpu_clock_event_stop(event, flags);
				9838	}
				9839
				9840	static void cpu_clock_event_read(struct perf_event *event)
				9841	{
				9842	cpu_clock_event_update(event);
				9843	}
				9844
				9845	static int cpu_clock_event_init(struct perf_event *event)
				9846	{
				9847	if (event->attr.type != PERF_TYPE_SOFTWARE)
				9848	return -ENOENT;
				9849
				9850	if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
				9851	return -ENOENT;
				9852
				9853	/*
				9854	* no branch sampling for software events
				9855	*/
				9856	if (has_branch_stack(event))
				9857	return -EOPNOTSUPP;
				9858
				9859	perf_swevent_init_hrtimer(event);
				9860
				9861	return 0;
				9862	}
				9863
				9864	static struct pmu perf_cpu_clock = {
				9865	.task_ctx_nr = perf_sw_context,
				9866
				9867	.capabilities = PERF_PMU_CAP_NO_NMI,
				9868
				9869	.event_init = cpu_clock_event_init,
				9870	.add = cpu_clock_event_add,
				9871	.del = cpu_clock_event_del,
				9872	.start = cpu_clock_event_start,
				9873	.stop = cpu_clock_event_stop,
				9874	.read = cpu_clock_event_read,
				9875	};
				9876
				9877	/*
				9878	* Software event: task time clock
				9879	*/
				9880
				9881	static void task_clock_event_update(struct perf_event *event, u64 now)
				9882	{
				9883	u64 prev;
				9884	s64 delta;
				9885
				9886	prev = local64_xchg(&event->hw.prev_count, now);
				9887	delta = now - prev;
				9888	local64_add(delta, &event->count);
				9889	}
				9890
				9891	static void task_clock_event_start(struct perf_event *event, int flags)
				9892	{
				9893	local64_set(&event->hw.prev_count, event->ctx->time);
				9894	perf_swevent_start_hrtimer(event);
				9895	}
				9896
				9897	static void task_clock_event_stop(struct perf_event *event, int flags)
				9898	{
				9899	perf_swevent_cancel_hrtimer(event);
				9900	task_clock_event_update(event, event->ctx->time);
				9901	}
				9902
				9903	static int task_clock_event_add(struct perf_event *event, int flags)
				9904	{
				9905	if (flags & PERF_EF_START)
				9906	task_clock_event_start(event, flags);
				9907	perf_event_update_userpage(event);
				9908
				9909	return 0;
				9910	}
				9911
				9912	static void task_clock_event_del(struct perf_event *event, int flags)
				9913	{
				9914	task_clock_event_stop(event, PERF_EF_UPDATE);
				9915	}
				9916
				9917	static void task_clock_event_read(struct perf_event *event)
				9918	{
				9919	u64 now = perf_clock();
				9920	u64 delta = now - event->ctx->timestamp;
				9921	u64 time = event->ctx->time + delta;
				9922
				9923	task_clock_event_update(event, time);
				9924	}
				9925
				9926	static int task_clock_event_init(struct perf_event *event)
				9927	{
				9928	if (event->attr.type != PERF_TYPE_SOFTWARE)
				9929	return -ENOENT;
				9930
				9931	if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
				9932	return -ENOENT;
				9933
				9934	/*
				9935	* no branch sampling for software events
				9936	*/
				9937	if (has_branch_stack(event))
				9938	return -EOPNOTSUPP;
				9939
				9940	perf_swevent_init_hrtimer(event);
				9941
				9942	return 0;
				9943	}
				9944
				9945	static struct pmu perf_task_clock = {
				9946	.task_ctx_nr = perf_sw_context,
				9947
				9948	.capabilities = PERF_PMU_CAP_NO_NMI,
				9949
				9950	.event_init = task_clock_event_init,
				9951	.add = task_clock_event_add,
				9952	.del = task_clock_event_del,
				9953	.start = task_clock_event_start,
				9954	.stop = task_clock_event_stop,
				9955	.read = task_clock_event_read,
				9956	};
				9957
				9958	static void perf_pmu_nop_void(struct pmu *pmu)
				9959	{
				9960	}
				9961
				9962	static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
				9963	{
				9964	}
				9965
				9966	static int perf_pmu_nop_int(struct pmu *pmu)
				9967	{
				9968	return 0;
				9969	}
				9970
				9971	static int perf_event_nop_int(struct perf_event *event, u64 value)
				9972	{
				9973	return 0;
				9974	}
				9975
				9976	static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
				9977
				9978	static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
				9979	{
				9980	__this_cpu_write(nop_txn_flags, flags);
				9981
				9982	if (flags & ~PERF_PMU_TXN_ADD)
				9983	return;
				9984
				9985	perf_pmu_disable(pmu);
				9986	}
				9987
				9988	static int perf_pmu_commit_txn(struct pmu *pmu)
				9989	{
				9990	unsigned int flags = __this_cpu_read(nop_txn_flags);
				9991
				9992	__this_cpu_write(nop_txn_flags, 0);
				9993
				9994	if (flags & ~PERF_PMU_TXN_ADD)
				9995	return 0;
				9996
				9997	perf_pmu_enable(pmu);
				9998	return 0;
				9999	}
				10000
				10001	static void perf_pmu_cancel_txn(struct pmu *pmu)
				10002	{
				10003	unsigned int flags = __this_cpu_read(nop_txn_flags);
				10004
				10005	__this_cpu_write(nop_txn_flags, 0);
				10006
				10007	if (flags & ~PERF_PMU_TXN_ADD)
				10008	return;
				10009
				10010	perf_pmu_enable(pmu);
				10011	}
				10012
				10013	static int perf_event_idx_default(struct perf_event *event)
				10014	{
				10015	return 0;
				10016	}
				10017
				10018	/*
				10019	* Ensures all contexts with the same task_ctx_nr have the same
				10020	* pmu_cpu_context too.
				10021	*/
				10022	static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
				10023	{
				10024	struct pmu *pmu;
				10025
				10026	if (ctxn < 0)
				10027	return NULL;
				10028
				10029	list_for_each_entry(pmu, &pmus, entry) {
				10030	if (pmu->task_ctx_nr == ctxn)
				10031	return pmu->pmu_cpu_context;
				10032	}
				10033
				10034	return NULL;
				10035	}
				10036
				10037	static void free_pmu_context(struct pmu *pmu)
				10038	{
				10039	/*
				10040	* Static contexts such as perf_sw_context have a global lifetime
				10041	* and may be shared between different PMUs. Avoid freeing them
				10042	* when a single PMU is going away.
				10043	*/
				10044	if (pmu->task_ctx_nr > perf_invalid_context)
				10045	return;
				10046
				10047	free_percpu(pmu->pmu_cpu_context);
				10048	}
				10049
				10050	/*
				10051	* Let userspace know that this PMU supports address range filtering:
				10052	*/
				10053	static ssize_t nr_addr_filters_show(struct device *dev,
				10054	struct device_attribute *attr,
				10055	char *page)
				10056	{
				10057	struct pmu *pmu = dev_get_drvdata(dev);
				10058
				10059	return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
				10060	}
				10061	DEVICE_ATTR_RO(nr_addr_filters);
				10062
				10063	static struct idr pmu_idr;
				10064
				10065	static ssize_t
				10066	type_show(struct device dev, struct device_attribute attr, char *page)
				10067	{
				10068	struct pmu *pmu = dev_get_drvdata(dev);
				10069
				10070	return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
				10071	}
				10072	static DEVICE_ATTR_RO(type);
				10073
				10074	static ssize_t
				10075	perf_event_mux_interval_ms_show(struct device *dev,
				10076	struct device_attribute *attr,
				10077	char *page)
				10078	{
				10079	struct pmu *pmu = dev_get_drvdata(dev);
				10080
				10081	return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
				10082	}
				10083
				10084	static DEFINE_MUTEX(mux_interval_mutex);
				10085
				10086	static ssize_t
				10087	perf_event_mux_interval_ms_store(struct device *dev,
				10088	struct device_attribute *attr,
				10089	const char *buf, size_t count)
				10090	{
				10091	struct pmu *pmu = dev_get_drvdata(dev);
				10092	int timer, cpu, ret;
				10093
				10094	ret = kstrtoint(buf, 0, &timer);
				10095	if (ret)
				10096	return ret;
				10097
				10098	if (timer < 1)
				10099	return -EINVAL;
				10100
				10101	/* same value, noting to do */
				10102	if (timer == pmu->hrtimer_interval_ms)
				10103	return count;
				10104
				10105	mutex_lock(&mux_interval_mutex);
				10106	pmu->hrtimer_interval_ms = timer;
				10107
				10108	/* update all cpuctx for this PMU */
				10109	cpus_read_lock();
				10110	for_each_online_cpu(cpu) {
				10111	struct perf_cpu_context *cpuctx;
				10112	cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
				10113	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
				10114
				10115	cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpuctx);
				10116	}
				10117	cpus_read_unlock();
				10118	mutex_unlock(&mux_interval_mutex);
				10119
				10120	return count;
				10121	}
				10122	static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
				10123
				10124	static struct attribute *pmu_dev_attrs[] = {
				10125	&dev_attr_type.attr,
				10126	&dev_attr_perf_event_mux_interval_ms.attr,
				10127	&dev_attr_nr_addr_filters.attr,
				10128	NULL,
				10129	};
				10130
				10131	static umode_t pmu_dev_is_visible(struct kobject kobj, struct attribute a, int n)
				10132	{
				10133	struct device *dev = kobj_to_dev(kobj);
				10134	struct pmu *pmu = dev_get_drvdata(dev);
				10135
				10136	if (n == 2 && !pmu->nr_addr_filters)
				10137	return 0;
				10138
				10139	return a->mode;
				10140	}
				10141
/* Default attribute group; per-attribute visibility is decided per PMU. */
static struct attribute_group pmu_dev_attr_group = {
	.is_visible = pmu_dev_is_visible,
	.attrs = pmu_dev_attrs,
};

/* NULL-terminated list of groups installed as pmu_bus.dev_groups. */
static const struct attribute_group *pmu_dev_groups[] = {
	&pmu_dev_attr_group,
	NULL,
};

/*
 * Checked before creating or removing a PMU's device.
 * NOTE(review): presumably set once the bus has been registered; the
 * writer is not visible in this part of the file - confirm.
 */
static int pmu_bus_running;
/* The "event_source" bus that PMU devices are placed on. */
static struct bus_type pmu_bus = {
	.name = "event_source",
	.dev_groups = pmu_dev_groups,
};
				10157
				10158	static void pmu_dev_release(struct device *dev)
				10159	{
				10160	kfree(dev);
				10161	}
				10162
				10163	static int pmu_dev_alloc(struct pmu *pmu)
				10164	{
				10165	int ret = -ENOMEM;
				10166
				10167	pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
				10168	if (!pmu->dev)
				10169	goto out;
				10170
				10171	pmu->dev->groups = pmu->attr_groups;
				10172	device_initialize(pmu->dev);
				10173
				10174	dev_set_drvdata(pmu->dev, pmu);
				10175	pmu->dev->bus = &pmu_bus;
				10176	pmu->dev->release = pmu_dev_release;
				10177
				10178	ret = dev_set_name(pmu->dev, "%s", pmu->name);
				10179	if (ret)
				10180	goto free_dev;
				10181
				10182	ret = device_add(pmu->dev);
				10183	if (ret)
				10184	goto free_dev;
				10185
				10186	if (pmu->attr_update) {
				10187	ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
				10188	if (ret)
				10189	goto del_dev;
				10190	}
				10191
				10192	out:
				10193	return ret;
				10194
				10195	del_dev:
				10196	device_del(pmu->dev);
				10197
				10198	free_dev:
				10199	put_device(pmu->dev);
				10200	goto out;
				10201	}
				10202
				10203	static struct lock_class_key cpuctx_mutex;
				10204	static struct lock_class_key cpuctx_lock;
				10205
				10206	int perf_pmu_register(struct pmu pmu, const char name, int type)
				10207	{
				10208	int cpu, ret;
				10209
				10210	mutex_lock(&pmus_lock);
				10211	ret = -ENOMEM;
				10212	pmu->pmu_disable_count = alloc_percpu(int);
				10213	if (!pmu->pmu_disable_count)
				10214	goto unlock;
				10215
				10216	pmu->type = -1;
				10217	if (!name)
				10218	goto skip_type;
				10219	pmu->name = name;
				10220
				10221	if (type < 0) {
				10222	type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
				10223	if (type < 0) {
				10224	ret = type;
				10225	goto free_pdc;
				10226	}
				10227	}
				10228	pmu->type = type;
				10229
				10230	if (pmu_bus_running) {
				10231	ret = pmu_dev_alloc(pmu);
				10232	if (ret)
				10233	goto free_idr;
				10234	}
				10235
				10236	skip_type:
				10237	if (pmu->task_ctx_nr == perf_hw_context) {
				10238	static int hw_context_taken = 0;
				10239
				10240	/*
				10241	* Other than systems with heterogeneous CPUs, it never makes
				10242	* sense for two PMUs to share perf_hw_context. PMUs which are
				10243	* uncore must use perf_invalid_context.
				10244	*/
				10245	if (WARN_ON_ONCE(hw_context_taken &&
				10246	!(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
				10247	pmu->task_ctx_nr = perf_invalid_context;
				10248
				10249	hw_context_taken = 1;
				10250	}
				10251
				10252	pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
				10253	if (pmu->pmu_cpu_context)
				10254	goto got_cpu_context;
				10255
				10256	ret = -ENOMEM;
				10257	pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
				10258	if (!pmu->pmu_cpu_context)
				10259	goto free_dev;
				10260
				10261	for_each_possible_cpu(cpu) {
				10262	struct perf_cpu_context *cpuctx;
				10263
				10264	cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
				10265	__perf_event_init_context(&cpuctx->ctx);
				10266	lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
				10267	lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
				10268	cpuctx->ctx.pmu = pmu;
				10269	cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
				10270
				10271	__perf_mux_hrtimer_init(cpuctx, cpu);
				10272	}
				10273
				10274	got_cpu_context:
				10275	if (!pmu->start_txn) {
				10276	if (pmu->pmu_enable) {
				10277	/*
				10278	* If we have pmu_enable/pmu_disable calls, install
				10279	* transaction stubs that use that to try and batch
				10280	* hardware accesses.
				10281	*/
				10282	pmu->start_txn = perf_pmu_start_txn;
				10283	pmu->commit_txn = perf_pmu_commit_txn;
				10284	pmu->cancel_txn = perf_pmu_cancel_txn;
				10285	} else {
				10286	pmu->start_txn = perf_pmu_nop_txn;
				10287	pmu->commit_txn = perf_pmu_nop_int;
				10288	pmu->cancel_txn = perf_pmu_nop_void;
				10289	}
				10290	}
				10291
				10292	if (!pmu->pmu_enable) {
				10293	pmu->pmu_enable = perf_pmu_nop_void;
				10294	pmu->pmu_disable = perf_pmu_nop_void;
				10295	}
				10296
				10297	if (!pmu->check_period)
				10298	pmu->check_period = perf_event_nop_int;
				10299
				10300	if (!pmu->event_idx)
				10301	pmu->event_idx = perf_event_idx_default;
				10302
				10303	list_add_rcu(&pmu->entry, &pmus);
				10304	atomic_set(&pmu->exclusive_cnt, 0);
				10305	ret = 0;
				10306	unlock:
				10307	mutex_unlock(&pmus_lock);
				10308
				10309	return ret;
				10310
				10311	free_dev:
				10312	device_del(pmu->dev);
				10313	put_device(pmu->dev);
				10314
				10315	free_idr:
				10316	if (pmu->type >= PERF_TYPE_MAX)
				10317	idr_remove(&pmu_idr, pmu->type);
				10318
				10319	free_pdc:
				10320	free_percpu(pmu->pmu_disable_count);
				10321	goto unlock;
				10322	}
				10323	EXPORT_SYMBOL_GPL(perf_pmu_register);
				10324
				10325	void perf_pmu_unregister(struct pmu *pmu)
				10326	{
				10327	mutex_lock(&pmus_lock);
				10328	list_del_rcu(&pmu->entry);
				10329
				10330	/*
				10331	* We dereference the pmu list under both SRCU and regular RCU, so
				10332	* synchronize against both of those.
				10333	*/
				10334	synchronize_srcu(&pmus_srcu);
				10335	synchronize_rcu();
				10336
				10337	free_percpu(pmu->pmu_disable_count);
				10338	if (pmu->type >= PERF_TYPE_MAX)
				10339	idr_remove(&pmu_idr, pmu->type);
				10340	if (pmu_bus_running) {
				10341	if (pmu->nr_addr_filters)
				10342	device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
				10343	device_del(pmu->dev);
				10344	put_device(pmu->dev);
				10345	}
				10346	free_pmu_context(pmu);
				10347	mutex_unlock(&pmus_lock);
				10348	}
				10349	EXPORT_SYMBOL_GPL(perf_pmu_unregister);
				10350
				10351	static inline bool has_extended_regs(struct perf_event *event)
				10352	{
				10353	return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) \|\|
				10354	(event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
				10355	}
				10356
				10357	static int perf_try_init_event(struct pmu pmu, struct perf_event event)
				10358	{
				10359	struct perf_event_context *ctx = NULL;
				10360	int ret;
				10361
				10362	if (!try_module_get(pmu->module))
				10363	return -ENODEV;
				10364
				10365	/*
				10366	* A number of pmu->event_init() methods iterate the sibling_list to,
				10367	* for example, validate if the group fits on the PMU. Therefore,
				10368	* if this is a sibling event, acquire the ctx->mutex to protect
				10369	* the sibling_list.
				10370	*/
				10371	if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
				10372	/*
				10373	* This ctx->mutex can nest when we're called through
				10374	* inheritance. See the perf_event_ctx_lock_nested() comment.
				10375	*/
				10376	ctx = perf_event_ctx_lock_nested(event->group_leader,
				10377	SINGLE_DEPTH_NESTING);
				10378	BUG_ON(!ctx);
				10379	}
				10380
				10381	event->pmu = pmu;
				10382	ret = pmu->event_init(event);
				10383
				10384	if (ctx)
				10385	perf_event_ctx_unlock(event->group_leader, ctx);
				10386
				10387	if (!ret) {
				10388	if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
				10389	has_extended_regs(event))
				10390	ret = -EOPNOTSUPP;
				10391
				10392	if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
				10393	event_has_any_exclude_flag(event))
				10394	ret = -EINVAL;
				10395
				10396	if (ret && event->destroy)
				10397	event->destroy(event);
				10398	}
				10399
				10400	if (ret)
				10401	module_put(pmu->module);
				10402
				10403	return ret;
				10404	}
				10405
				10406	static struct pmu perf_init_event(struct perf_event event)
				10407	{
				10408	struct pmu *pmu;
				10409	int idx;
				10410	int ret;
				10411
				10412	idx = srcu_read_lock(&pmus_srcu);
				10413
				10414	/* Try parent's PMU first: */
				10415	if (event->parent && event->parent->pmu) {
				10416	pmu = event->parent->pmu;
				10417	ret = perf_try_init_event(pmu, event);
				10418	if (!ret)
				10419	goto unlock;
				10420	}
				10421
				10422	rcu_read_lock();
				10423	pmu = idr_find(&pmu_idr, event->attr.type);
				10424	rcu_read_unlock();
				10425	if (pmu) {
				10426	ret = perf_try_init_event(pmu, event);
				10427	if (ret)
				10428	pmu = ERR_PTR(ret);
				10429	goto unlock;
				10430	}
				10431
				10432	list_for_each_entry_rcu(pmu, &pmus, entry) {
				10433	ret = perf_try_init_event(pmu, event);
				10434	if (!ret)
				10435	goto unlock;
				10436
				10437	if (ret != -ENOENT) {
				10438	pmu = ERR_PTR(ret);
				10439	goto unlock;
				10440	}
				10441	}
				10442	pmu = ERR_PTR(-ENOENT);
				10443	unlock:
				10444	srcu_read_unlock(&pmus_srcu, idx);
				10445
				10446	return pmu;
				10447	}
				10448
				10449	static void attach_sb_event(struct perf_event *event)
				10450	{
				10451	struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
				10452
				10453	raw_spin_lock(&pel->lock);
				10454	list_add_rcu(&event->sb_list, &pel->list);
				10455	raw_spin_unlock(&pel->lock);
				10456	}
				10457
				10458	/*
				10459	* We keep a list of all !task (and therefore per-cpu) events
				10460	* that need to receive side-band records.
				10461	*
				10462	* This avoids having to scan all the various PMU per-cpu contexts
				10463	* looking for them.
				10464	*/
				10465	static void account_pmu_sb_event(struct perf_event *event)
				10466	{
				10467	if (is_sb_event(event))
				10468	attach_sb_event(event);
				10469	}
				10470
				10471	static void account_event_cpu(struct perf_event *event, int cpu)
				10472	{
				10473	if (event->parent)
				10474	return;
				10475
				10476	if (is_cgroup_event(event))
				10477	atomic_inc(&per_cpu(perf_cgroup_events, cpu));
				10478	}
				10479
				10480	/* Freq events need the tick to stay alive (see perf_event_task_tick). */
				10481	static void account_freq_event_nohz(void)
				10482	{
				10483	#ifdef CONFIG_NO_HZ_FULL
				10484	/* Lock so we don't race with concurrent unaccount */
				10485	spin_lock(&nr_freq_lock);
				10486	if (atomic_inc_return(&nr_freq_events) == 1)
				10487	tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
				10488	spin_unlock(&nr_freq_lock);
				10489	#endif
				10490	}
				10491
				10492	static void account_freq_event(void)
				10493	{
				10494	if (tick_nohz_full_enabled())
				10495	account_freq_event_nohz();
				10496	else
				10497	atomic_inc(&nr_freq_events);
				10498	}
				10499
				10500
				10501	static void account_event(struct perf_event *event)
				10502	{
				10503	bool inc = false;
				10504
				10505	if (event->parent)
				10506	return;
				10507
				10508	if (event->attach_state & PERF_ATTACH_TASK)
				10509	inc = true;
				10510	if (event->attr.mmap \|\| event->attr.mmap_data)
				10511	atomic_inc(&nr_mmap_events);
				10512	if (event->attr.comm)
				10513	atomic_inc(&nr_comm_events);
				10514	if (event->attr.namespaces)
				10515	atomic_inc(&nr_namespaces_events);
				10516	if (event->attr.task)
				10517	atomic_inc(&nr_task_events);
				10518	if (event->attr.freq)
				10519	account_freq_event();
				10520	if (event->attr.context_switch) {
				10521	atomic_inc(&nr_switch_events);
				10522	inc = true;
				10523	}
				10524	if (has_branch_stack(event))
				10525	inc = true;
				10526	if (is_cgroup_event(event))
				10527	inc = true;
				10528	if (event->attr.ksymbol)
				10529	atomic_inc(&nr_ksymbol_events);
				10530	if (event->attr.bpf_event)
				10531	atomic_inc(&nr_bpf_events);
				10532
				10533	if (inc) {
				10534	/*
				10535	* We need the mutex here because static_branch_enable()
				10536	* must complete before the perf_sched_count increment
				10537	* becomes visible.
				10538	*/
				10539	if (atomic_inc_not_zero(&perf_sched_count))
				10540	goto enabled;
				10541
				10542	mutex_lock(&perf_sched_mutex);
				10543	if (!atomic_read(&perf_sched_count)) {
				10544	static_branch_enable(&perf_sched_events);
				10545	/*
				10546	* Guarantee that all CPUs observe they key change and
				10547	* call the perf scheduling hooks before proceeding to
				10548	* install events that need them.
				10549	*/
				10550	synchronize_rcu();
				10551	}
				10552	/*
				10553	* Now that we have waited for the sync_sched(), allow further
				10554	* increments to by-pass the mutex.
				10555	*/
				10556	atomic_inc(&perf_sched_count);
				10557	mutex_unlock(&perf_sched_mutex);
				10558	}
				10559	enabled:
				10560
				10561	account_event_cpu(event, event->cpu);
				10562
				10563	account_pmu_sb_event(event);
				10564	}
				10565
				10566	/*
				10567	* Allocate and initialize an event structure
				10568	*/
				10569	static struct perf_event *
				10570	perf_event_alloc(struct perf_event_attr *attr, int cpu,
				10571	struct task_struct *task,
				10572	struct perf_event *group_leader,
				10573	struct perf_event *parent_event,
				10574	perf_overflow_handler_t overflow_handler,
				10575	void *context, int cgroup_fd)
				10576	{
				10577	struct pmu *pmu;
				10578	struct perf_event *event;
				10579	struct hw_perf_event *hwc;
				10580	long err = -EINVAL;
				10581
				10582	if ((unsigned)cpu >= nr_cpu_ids) {
				10583	if (!task \|\| cpu != -1)
				10584	return ERR_PTR(-EINVAL);
				10585	}
				10586
				10587	event = kzalloc(sizeof(*event), GFP_KERNEL);
				10588	if (!event)
				10589	return ERR_PTR(-ENOMEM);
				10590
				10591	/*
				10592	* Single events are their own group leaders, with an
				10593	* empty sibling list:
				10594	*/
				10595	if (!group_leader)
				10596	group_leader = event;
				10597
				10598	mutex_init(&event->child_mutex);
				10599	INIT_LIST_HEAD(&event->child_list);
				10600
				10601	INIT_LIST_HEAD(&event->event_entry);
				10602	INIT_LIST_HEAD(&event->sibling_list);
				10603	INIT_LIST_HEAD(&event->active_list);
				10604	init_event_group(event);
				10605	INIT_LIST_HEAD(&event->rb_entry);
				10606	INIT_LIST_HEAD(&event->active_entry);
				10607	INIT_LIST_HEAD(&event->addr_filters.list);
				10608	INIT_HLIST_NODE(&event->hlist_entry);
				10609
				10610
				10611	init_waitqueue_head(&event->waitq);
				10612	event->pending_disable = -1;
				10613	init_irq_work(&event->pending, perf_pending_event);
				10614
				10615	mutex_init(&event->mmap_mutex);
				10616	raw_spin_lock_init(&event->addr_filters.lock);
				10617
				10618	atomic_long_set(&event->refcount, 1);
				10619	event->cpu = cpu;
				10620	event->attr = *attr;
				10621	event->group_leader = group_leader;
				10622	event->pmu = NULL;
				10623	event->oncpu = -1;
				10624
				10625	event->parent = parent_event;
				10626
				10627	event->ns = get_pid_ns(task_active_pid_ns(current));
				10628	event->id = atomic64_inc_return(&perf_event_id);
				10629
				10630	event->state = PERF_EVENT_STATE_INACTIVE;
				10631
				10632	if (task) {
				10633	event->attach_state = PERF_ATTACH_TASK;
				10634	/*
				10635	* XXX pmu::event_init needs to know what task to account to
				10636	* and we cannot use the ctx information because we need the
				10637	* pmu before we get a ctx.
				10638	*/
				10639	event->hw.target = get_task_struct(task);
				10640	}
				10641
				10642	event->clock = &local_clock;
				10643	if (parent_event)
				10644	event->clock = parent_event->clock;
				10645
				10646	if (!overflow_handler && parent_event) {
				10647	overflow_handler = parent_event->overflow_handler;
				10648	context = parent_event->overflow_handler_context;
				10649	#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
				10650	if (overflow_handler == bpf_overflow_handler) {
				10651	struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
				10652
				10653	if (IS_ERR(prog)) {
				10654	err = PTR_ERR(prog);
				10655	goto err_ns;
				10656	}
				10657	event->prog = prog;
				10658	event->orig_overflow_handler =
				10659	parent_event->orig_overflow_handler;
				10660	}
				10661	#endif
				10662	}
				10663
				10664	if (overflow_handler) {
				10665	event->overflow_handler = overflow_handler;
				10666	event->overflow_handler_context = context;
				10667	} else if (is_write_backward(event)){
				10668	event->overflow_handler = perf_event_output_backward;
				10669	event->overflow_handler_context = NULL;
				10670	} else {
				10671	event->overflow_handler = perf_event_output_forward;
				10672	event->overflow_handler_context = NULL;
				10673	}
				10674
				10675	perf_event__state_init(event);
				10676
				10677	pmu = NULL;
				10678
				10679	hwc = &event->hw;
				10680	hwc->sample_period = attr->sample_period;
				10681	if (attr->freq && attr->sample_freq)
				10682	hwc->sample_period = 1;
				10683	hwc->last_period = hwc->sample_period;
				10684
				10685	local64_set(&hwc->period_left, hwc->sample_period);
				10686
				10687	/*
				10688	* We currently do not support PERF_SAMPLE_READ on inherited events.
				10689	* See perf_output_read().
				10690	*/
				10691	if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
				10692	goto err_ns;
				10693
				10694	if (!has_branch_stack(event))
				10695	event->attr.branch_sample_type = 0;
				10696
				10697	if (cgroup_fd != -1) {
				10698	err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
				10699	if (err)
				10700	goto err_ns;
				10701	}
				10702
				10703	pmu = perf_init_event(event);
				10704	if (IS_ERR(pmu)) {
				10705	err = PTR_ERR(pmu);
				10706	goto err_ns;
				10707	}
				10708
				10709	/*
				10710	* Disallow uncore-cgroup events, they don't make sense as the cgroup will
				10711	* be different on other CPUs in the uncore mask.
				10712	*/
				10713	if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
				10714	err = -EINVAL;
				10715	goto err_pmu;
				10716	}
				10717
				10718	if (event->attr.aux_output &&
				10719	!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
				10720	err = -EOPNOTSUPP;
				10721	goto err_pmu;
				10722	}
				10723
				10724	err = exclusive_event_init(event);
				10725	if (err)
				10726	goto err_pmu;
				10727
				10728	if (has_addr_filter(event)) {
				10729	event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
				10730	sizeof(struct perf_addr_filter_range),
				10731	GFP_KERNEL);
				10732	if (!event->addr_filter_ranges) {
				10733	err = -ENOMEM;
				10734	goto err_per_task;
				10735	}
				10736
				10737	/*
				10738	* Clone the parent's vma offsets: they are valid until exec()
				10739	* even if the mm is not shared with the parent.
				10740	*/
				10741	if (event->parent) {
				10742	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
				10743
				10744	raw_spin_lock_irq(&ifh->lock);
				10745	memcpy(event->addr_filter_ranges,
				10746	event->parent->addr_filter_ranges,
				10747	pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
				10748	raw_spin_unlock_irq(&ifh->lock);
				10749	}
				10750
				10751	/* force hw sync on the address filters */
				10752	event->addr_filters_gen = 1;
				10753	}
				10754
				10755	if (!event->parent) {
				10756	if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
				10757	err = get_callchain_buffers(attr->sample_max_stack);
				10758	if (err)
				10759	goto err_addr_filters;
				10760	}
				10761	}
				10762
				10763	err = security_perf_event_alloc(event);
				10764	if (err)
				10765	goto err_callchain_buffer;
				10766
				10767	/* symmetric to unaccount_event() in _free_event() */
				10768	account_event(event);
				10769
				10770	return event;
				10771
				10772	err_callchain_buffer:
				10773	if (!event->parent) {
				10774	if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
				10775	put_callchain_buffers();
				10776	}
				10777	err_addr_filters:
				10778	kfree(event->addr_filter_ranges);
				10779
				10780	err_per_task:
				10781	exclusive_event_destroy(event);
				10782
				10783	err_pmu:
				10784	if (event->destroy)
				10785	event->destroy(event);
				10786	module_put(pmu->module);
				10787	err_ns:
				10788	if (is_cgroup_event(event))
				10789	perf_detach_cgroup(event);
				10790	if (event->ns)
				10791	put_pid_ns(event->ns);
				10792	if (event->hw.target)
				10793	put_task_struct(event->hw.target);
				10794	kfree(event);
				10795
				10796	return ERR_PTR(err);
				10797	}
				10798
				10799	static int perf_copy_attr(struct perf_event_attr __user *uattr,
				10800	struct perf_event_attr *attr)
				10801	{
				10802	u32 size;
				10803	int ret;
				10804
				10805	/* Zero the full structure, so that a short copy will be nice. */
				10806	memset(attr, 0, sizeof(*attr));
				10807
				10808	ret = get_user(size, &uattr->size);
				10809	if (ret)
				10810	return ret;
				10811
				10812	/* ABI compatibility quirk: */
				10813	if (!size)
				10814	size = PERF_ATTR_SIZE_VER0;
				10815	if (size < PERF_ATTR_SIZE_VER0 \|\| size > PAGE_SIZE)
				10816	goto err_size;
				10817
				10818	ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
				10819	if (ret) {
				10820	if (ret == -E2BIG)
				10821	goto err_size;
				10822	return ret;
				10823	}
				10824
				10825	attr->size = size;
				10826
				10827	if (attr->__reserved_1 \|\| attr->__reserved_2)
				10828	return -EINVAL;
				10829
				10830	if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
				10831	return -EINVAL;
				10832
				10833	if (attr->read_format & ~(PERF_FORMAT_MAX-1))
				10834	return -EINVAL;
				10835
				10836	if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
				10837	u64 mask = attr->branch_sample_type;
				10838
				10839	/* only using defined bits */
				10840	if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
				10841	return -EINVAL;
				10842
				10843	/* at least one branch bit must be set */
				10844	if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
				10845	return -EINVAL;
				10846
				10847	/* propagate priv level, when not set for branch */
				10848	if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
				10849
				10850	/* exclude_kernel checked on syscall entry */
				10851	if (!attr->exclude_kernel)
				10852	mask \|= PERF_SAMPLE_BRANCH_KERNEL;
				10853
				10854	if (!attr->exclude_user)
				10855	mask \|= PERF_SAMPLE_BRANCH_USER;
				10856
				10857	if (!attr->exclude_hv)
				10858	mask \|= PERF_SAMPLE_BRANCH_HV;
				10859	/*
				10860	* adjust user setting (for HW filter setup)
				10861	*/
				10862	attr->branch_sample_type = mask;
				10863	}
				10864	/* privileged levels capture (kernel, hv): check permissions */
				10865	if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
				10866	ret = perf_allow_kernel(attr);
				10867	if (ret)
				10868	return ret;
				10869	}
				10870	}
				10871
				10872	if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
				10873	ret = perf_reg_validate(attr->sample_regs_user);
				10874	if (ret)
				10875	return ret;
				10876	}
				10877
				10878	if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
				10879	if (!arch_perf_have_user_stack_dump())
				10880	return -ENOSYS;
				10881
				10882	/*
				10883	* We have __u32 type for the size, but so far
				10884	* we can only use __u16 as maximum due to the
				10885	* __u16 sample size limit.
				10886	*/
				10887	if (attr->sample_stack_user >= USHRT_MAX)
				10888	return -EINVAL;
				10889	else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
				10890	return -EINVAL;
				10891	}
				10892
				10893	if (!attr->sample_max_stack)
				10894	attr->sample_max_stack = sysctl_perf_event_max_stack;
				10895
				10896	if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
				10897	ret = perf_reg_validate(attr->sample_regs_intr);
				10898	out:
				10899	return ret;
				10900
				10901	err_size:
				10902	put_user(sizeof(*attr), &uattr->size);
				10903	ret = -E2BIG;
				10904	goto out;
				10905	}
				10906
				10907	static void mutex_lock_double(struct mutex a, struct mutex b)
				10908	{
				10909	if (b < a)
				10910	swap(a, b);
				10911
				10912	mutex_lock(a);
				10913	mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
				10914	}
				10915
				10916	static int
				10917	perf_event_set_output(struct perf_event event, struct perf_event output_event)
				10918	{
				10919	struct ring_buffer *rb = NULL;
				10920	int ret = -EINVAL;
				10921
				10922	if (!output_event) {
				10923	mutex_lock(&event->mmap_mutex);
				10924	goto set;
				10925	}
				10926
				10927	/* don't allow circular references */
				10928	if (event == output_event)
				10929	goto out;
				10930
				10931	/*
				10932	* Don't allow cross-cpu buffers
				10933	*/
				10934	if (output_event->cpu != event->cpu)
				10935	goto out;
				10936
				10937	/*
				10938	* If its not a per-cpu rb, it must be the same task.
				10939	*/
				10940	if (output_event->cpu == -1 && output_event->hw.target != event->hw.target)
				10941	goto out;
				10942
				10943	/*
				10944	* Mixing clocks in the same buffer is trouble you don't need.
				10945	*/
				10946	if (output_event->clock != event->clock)
				10947	goto out;
				10948
				10949	/*
				10950	* Either writing ring buffer from beginning or from end.
				10951	* Mixing is not allowed.
				10952	*/
				10953	if (is_write_backward(output_event) != is_write_backward(event))
				10954	goto out;
				10955
				10956	/*
				10957	* If both events generate aux data, they must be on the same PMU
				10958	*/
				10959	if (has_aux(event) && has_aux(output_event) &&
				10960	event->pmu != output_event->pmu)
				10961	goto out;
				10962
				10963	/*
				10964	* Hold both mmap_mutex to serialize against perf_mmap_close(). Since
				10965	* output_event is already on rb->event_list, and the list iteration
				10966	* restarts after every removal, it is guaranteed this new event is
				10967	* observed OR if output_event is already removed, it's guaranteed we
				10968	* observe !rb->mmap_count.
				10969	*/
				10970	mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
				10971	set:
				10972	/* Can't redirect output if we've got an active mmap() */
				10973	if (atomic_read(&event->mmap_count))
				10974	goto unlock;
				10975
				10976	if (output_event) {
				10977	/* get the rb we want to redirect to */
				10978	rb = ring_buffer_get(output_event);
				10979	if (!rb)
				10980	goto unlock;
				10981
				10982	/* did we race against perf_mmap_close() */
				10983	if (!atomic_read(&rb->mmap_count)) {
				10984	ring_buffer_put(rb);
				10985	goto unlock;
				10986	}
				10987	}
				10988
				10989	ring_buffer_attach(event, rb);
				10990
				10991	ret = 0;
				10992	unlock:
				10993	mutex_unlock(&event->mmap_mutex);
				10994	if (output_event)
				10995	mutex_unlock(&output_event->mmap_mutex);
				10996
				10997	out:
				10998	return ret;
				10999	}
				11000
				11001	static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
				11002	{
				11003	bool nmi_safe = false;
				11004
				11005	switch (clk_id) {
				11006	case CLOCK_MONOTONIC:
				11007	event->clock = &ktime_get_mono_fast_ns;
				11008	nmi_safe = true;
				11009	break;
				11010
				11011	case CLOCK_MONOTONIC_RAW:
				11012	event->clock = &ktime_get_raw_fast_ns;
				11013	nmi_safe = true;
				11014	break;
				11015
				11016	case CLOCK_REALTIME:
				11017	event->clock = &ktime_get_real_ns;
				11018	break;
				11019
				11020	case CLOCK_BOOTTIME:
				11021	event->clock = &ktime_get_boottime_ns;
				11022	break;
				11023
				11024	case CLOCK_TAI:
				11025	event->clock = &ktime_get_clocktai_ns;
				11026	break;
				11027
				11028	default:
				11029	return -EINVAL;
				11030	}
				11031
				11032	if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
				11033	return -EINVAL;
				11034
				11035	return 0;
				11036	}
				11037
				11038	/*
				11039	* Variation on perf_event_ctx_lock_nested(), except we take two context
				11040	* mutexes.
				11041	*/
				11042	static struct perf_event_context *
				11043	__perf_event_ctx_lock_double(struct perf_event *group_leader,
				11044	struct perf_event_context *ctx)
				11045	{
				11046	struct perf_event_context *gctx;
				11047
				11048	again:
				11049	rcu_read_lock();
				11050	gctx = READ_ONCE(group_leader->ctx);
				11051	if (!refcount_inc_not_zero(&gctx->refcount)) {
				11052	rcu_read_unlock();
				11053	goto again;
				11054	}
				11055	rcu_read_unlock();
				11056
				11057	mutex_lock_double(&gctx->mutex, &ctx->mutex);
				11058
				11059	if (group_leader->ctx != gctx) {
				11060	mutex_unlock(&ctx->mutex);
				11061	mutex_unlock(&gctx->mutex);
				11062	put_ctx(gctx);
				11063	goto again;
				11064	}
				11065
				11066	return gctx;
				11067	}
				11068
				11069	/**
				11070	* sys_perf_event_open - open a performance event, associate it to a task/cpu
				11071	*
				11072	* @attr_uptr: event_id type attributes for monitoring/sampling
				11073	* @pid: target pid
				11074	* @cpu: target cpu
				11075	* @group_fd: group leader event fd
				11076	*/
				11077	SYSCALL_DEFINE5(perf_event_open,
				11078	struct perf_event_attr __user *, attr_uptr,
				11079	pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
				11080	{
				11081	struct perf_event group_leader = NULL, output_event = NULL;
				11082	struct perf_event event, sibling;
				11083	struct perf_event_attr attr;
				11084	struct perf_event_context ctx, gctx;
				11085	struct file *event_file = NULL;
				11086	struct fd group = {NULL, 0};
				11087	struct task_struct *task = NULL;
				11088	struct pmu *pmu;
				11089	int event_fd;
				11090	int move_group = 0;
				11091	int err;
				11092	int f_flags = O_RDWR;
				11093	int cgroup_fd = -1;
				11094
				11095	/* for future expandability... */
				11096	if (flags & ~PERF_FLAG_ALL)
				11097	return -EINVAL;
				11098
				11099	/* Do we allow access to perf_event_open(2) ? */
				11100	err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
				11101	if (err)
				11102	return err;
				11103
				11104	err = perf_copy_attr(attr_uptr, &attr);
				11105	if (err)
				11106	return err;
				11107
				11108	if (!attr.exclude_kernel) {
				11109	err = perf_allow_kernel(&attr);
				11110	if (err)
				11111	return err;
				11112	}
				11113
				11114	if (attr.namespaces) {
				11115	if (!capable(CAP_SYS_ADMIN))
				11116	return -EACCES;
				11117	}
				11118
				11119	if (attr.freq) {
				11120	if (attr.sample_freq > sysctl_perf_event_sample_rate)
				11121	return -EINVAL;
				11122	} else {
				11123	if (attr.sample_period & (1ULL << 63))
				11124	return -EINVAL;
				11125	}
				11126
				11127	/* Only privileged users can get physical addresses */
				11128	if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
				11129	err = perf_allow_kernel(&attr);
				11130	if (err)
				11131	return err;
				11132	}
				11133
				11134	/* REGS_INTR can leak data, lockdown must prevent this */
				11135	if (attr.sample_type & PERF_SAMPLE_REGS_INTR) {
				11136	err = security_locked_down(LOCKDOWN_PERF);
				11137	if (err)
				11138	return err;
				11139	}
				11140
				11141	/*
				11142	* In cgroup mode, the pid argument is used to pass the fd
				11143	* opened to the cgroup directory in cgroupfs. The cpu argument
				11144	* designates the cpu on which to monitor threads from that
				11145	* cgroup.
				11146	*/
				11147	if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 \|\| cpu == -1))
				11148	return -EINVAL;
				11149
				11150	if (flags & PERF_FLAG_FD_CLOEXEC)
				11151	f_flags \|= O_CLOEXEC;
				11152
				11153	event_fd = get_unused_fd_flags(f_flags);
				11154	if (event_fd < 0)
				11155	return event_fd;
				11156
				11157	if (group_fd != -1) {
				11158	err = perf_fget_light(group_fd, &group);
				11159	if (err)
				11160	goto err_fd;
				11161	group_leader = group.file->private_data;
				11162	if (flags & PERF_FLAG_FD_OUTPUT)
				11163	output_event = group_leader;
				11164	if (flags & PERF_FLAG_FD_NO_GROUP)
				11165	group_leader = NULL;
				11166	}
				11167
				11168	if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
				11169	task = find_lively_task_by_vpid(pid);
				11170	if (IS_ERR(task)) {
				11171	err = PTR_ERR(task);
				11172	goto err_group_fd;
				11173	}
				11174	}
				11175
				11176	if (task && group_leader &&
				11177	group_leader->attr.inherit != attr.inherit) {
				11178	err = -EINVAL;
				11179	goto err_task;
				11180	}
				11181
				11182	if (flags & PERF_FLAG_PID_CGROUP)
				11183	cgroup_fd = pid;
				11184
				11185	event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
				11186	NULL, NULL, cgroup_fd);
				11187	if (IS_ERR(event)) {
				11188	err = PTR_ERR(event);
				11189	goto err_task;
				11190	}
				11191
				11192	if (is_sampling_event(event)) {
				11193	if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
				11194	err = -EOPNOTSUPP;
				11195	goto err_alloc;
				11196	}
				11197	}
				11198
				11199	/*
				11200	* Special case software events and allow them to be part of
				11201	* any hardware group.
				11202	*/
				11203	pmu = event->pmu;
				11204
				11205	if (attr.use_clockid) {
				11206	err = perf_event_set_clock(event, attr.clockid);
				11207	if (err)
				11208	goto err_alloc;
				11209	}
				11210
				11211	if (pmu->task_ctx_nr == perf_sw_context)
				11212	event->event_caps \|= PERF_EV_CAP_SOFTWARE;
				11213
				11214	if (group_leader) {
				11215	if (is_software_event(event) &&
				11216	!in_software_context(group_leader)) {
				11217	/*
				11218	* If the event is a sw event, but the group_leader
				11219	* is on hw context.
				11220	*
				11221	* Allow the addition of software events to hw
				11222	* groups, this is safe because software events
				11223	* never fail to schedule.
				11224	*/
				11225	pmu = group_leader->ctx->pmu;
				11226	} else if (!is_software_event(event) &&
				11227	is_software_event(group_leader) &&
				11228	(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
				11229	/*
				11230	* In case the group is a pure software group, and we
				11231	* try to add a hardware event, move the whole group to
				11232	* the hardware context.
				11233	*/
				11234	move_group = 1;
				11235	}
				11236	}
				11237
				11238	/*
				11239	* Get the target context (task or percpu):
				11240	*/
				11241	ctx = find_get_context(pmu, task, event);
				11242	if (IS_ERR(ctx)) {
				11243	err = PTR_ERR(ctx);
				11244	goto err_alloc;
				11245	}
				11246
				11247	/*
				11248	* Look up the group leader (we will attach this event to it):
				11249	*/
				11250	if (group_leader) {
				11251	err = -EINVAL;
				11252
				11253	/*
				11254	* Do not allow a recursive hierarchy (this new sibling
				11255	* becoming part of another group-sibling):
				11256	*/
				11257	if (group_leader->group_leader != group_leader)
				11258	goto err_context;
				11259
				11260	/* All events in a group should have the same clock */
				11261	if (group_leader->clock != event->clock)
				11262	goto err_context;
				11263
				11264	/*
				11265	* Make sure we're both events for the same CPU;
				11266	* grouping events for different CPUs is broken; since
				11267	* you can never concurrently schedule them anyhow.
				11268	*/
				11269	if (group_leader->cpu != event->cpu)
				11270	goto err_context;
				11271
				11272	/*
				11273	* Make sure we're both on the same task, or both
				11274	* per-CPU events.
				11275	*/
				11276	if (group_leader->ctx->task != ctx->task)
				11277	goto err_context;
				11278
				11279	/*
				11280	* Do not allow to attach to a group in a different task
				11281	* or CPU context. If we're moving SW events, we'll fix
				11282	* this up later, so allow that.
				11283	*
				11284	* Racy, not holding group_leader->ctx->mutex, see comment with
				11285	* perf_event_ctx_lock().
				11286	*/
				11287	if (!move_group && group_leader->ctx != ctx)
				11288	goto err_context;
				11289
				11290	/*
				11291	* Only a group leader can be exclusive or pinned
				11292	*/
				11293	if (attr.exclusive \|\| attr.pinned)
				11294	goto err_context;
				11295	}
				11296
				11297	if (output_event) {
				11298	err = perf_event_set_output(event, output_event);
				11299	if (err)
				11300	goto err_context;
				11301	}
				11302
				11303	event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
				11304	f_flags);
				11305	if (IS_ERR(event_file)) {
				11306	err = PTR_ERR(event_file);
				11307	event_file = NULL;
				11308	goto err_context;
				11309	}
				11310
				11311	if (task) {
				11312	err = down_read_interruptible(&task->signal->exec_update_lock);
				11313	if (err)
				11314	goto err_file;
				11315
				11316	/*
				11317	* Preserve ptrace permission check for backwards compatibility.
				11318	*
				11319	* We must hold exec_update_lock across this and any potential
				11320	* perf_install_in_context() call for this new event to
				11321	* serialize against exec() altering our credentials (and the
				11322	* perf_event_exit_task() that could imply).
				11323	*/
				11324	err = -EACCES;
				11325	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
				11326	goto err_cred;
				11327	}
				11328
				11329	if (move_group) {
				11330	gctx = __perf_event_ctx_lock_double(group_leader, ctx);
				11331
				11332	if (gctx->task == TASK_TOMBSTONE) {
				11333	err = -ESRCH;
				11334	goto err_locked;
				11335	}
				11336
				11337	/*
				11338	* Check if we raced against another sys_perf_event_open() call
				11339	* moving the software group underneath us.
				11340	*/
				11341	if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
				11342	/*
				11343	* If someone moved the group out from under us, check
				11344	* if this new event wound up on the same ctx, if so
				11345	* its the regular !move_group case, otherwise fail.
				11346	*/
				11347	if (gctx != ctx) {
				11348	err = -EINVAL;
				11349	goto err_locked;
				11350	} else {
				11351	perf_event_ctx_unlock(group_leader, gctx);
				11352	move_group = 0;
				11353	goto not_move_group;
				11354	}
				11355	}
				11356
				11357	/*
				11358	* Failure to create exclusive events returns -EBUSY.
				11359	*/
				11360	err = -EBUSY;
				11361	if (!exclusive_event_installable(group_leader, ctx))
				11362	goto err_locked;
				11363
				11364	for_each_sibling_event(sibling, group_leader) {
				11365	if (!exclusive_event_installable(sibling, ctx))
				11366	goto err_locked;
				11367	}
				11368	} else {
				11369	mutex_lock(&ctx->mutex);
				11370
				11371	/*
				11372	* Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx,
				11373	* see the group_leader && !move_group test earlier.
				11374	*/
				11375	if (group_leader && group_leader->ctx != ctx) {
				11376	err = -EINVAL;
				11377	goto err_locked;
				11378	}
				11379	}
				11380	not_move_group:
				11381
				11382	if (ctx->task == TASK_TOMBSTONE) {
				11383	err = -ESRCH;
				11384	goto err_locked;
				11385	}
				11386
				11387	if (!perf_event_validate_size(event)) {
				11388	err = -E2BIG;
				11389	goto err_locked;
				11390	}
				11391
				11392	if (!task) {
				11393	/*
				11394	* Check if the @cpu we're creating an event for is online.
				11395	*
				11396	* We use the perf_cpu_context::ctx::mutex to serialize against
				11397	* the hotplug notifiers. See perf_event_{init,exit}_cpu().
				11398	*/
				11399	struct perf_cpu_context *cpuctx =
				11400	container_of(ctx, struct perf_cpu_context, ctx);
				11401
				11402	if (!cpuctx->online) {
				11403	err = -ENODEV;
				11404	goto err_locked;
				11405	}
				11406	}
				11407
				11408	if (event->attr.aux_output && !perf_get_aux_event(event, group_leader)) {
				11409	err = -EINVAL;
				11410	goto err_locked;
				11411	}
				11412
				11413	/*
				11414	* Must be under the same ctx::mutex as perf_install_in_context(),
				11415	* because we need to serialize with concurrent event creation.
				11416	*/
				11417	if (!exclusive_event_installable(event, ctx)) {
				11418	err = -EBUSY;
				11419	goto err_locked;
				11420	}
				11421
				11422	WARN_ON_ONCE(ctx->parent_ctx);
				11423
				11424	/*
				11425	* This is the point on no return; we cannot fail hereafter. This is
				11426	* where we start modifying current state.
				11427	*/
				11428
				11429	if (move_group) {
				11430	/*
				11431	* See perf_event_ctx_lock() for comments on the details
				11432	* of swizzling perf_event::ctx.
				11433	*/
				11434	perf_remove_from_context(group_leader, 0);
				11435	put_ctx(gctx);
				11436
				11437	for_each_sibling_event(sibling, group_leader) {
				11438	perf_remove_from_context(sibling, 0);
				11439	put_ctx(gctx);
				11440	}
				11441
				11442	/*
				11443	* Wait for everybody to stop referencing the events through
				11444	* the old lists, before installing it on new lists.
				11445	*/
				11446	synchronize_rcu();
				11447
				11448	/*
				11449	* Install the group siblings before the group leader.
				11450	*
				11451	* Because a group leader will try and install the entire group
				11452	* (through the sibling list, which is still in-tact), we can
				11453	* end up with siblings installed in the wrong context.
				11454	*
				11455	* By installing siblings first we NO-OP because they're not
				11456	* reachable through the group lists.
				11457	*/
				11458	for_each_sibling_event(sibling, group_leader) {
				11459	perf_event__state_init(sibling);
				11460	perf_install_in_context(ctx, sibling, sibling->cpu);
				11461	get_ctx(ctx);
				11462	}
				11463
				11464	/*
				11465	* Removing from the context ends up with disabled
				11466	* event. What we want here is event in the initial
				11467	* startup state, ready to be add into new context.
				11468	*/
				11469	perf_event__state_init(group_leader);
				11470	perf_install_in_context(ctx, group_leader, group_leader->cpu);
				11471	get_ctx(ctx);
				11472	}
				11473
				11474	/*
				11475	* Precalculate sample_data sizes; do while holding ctx::mutex such
				11476	* that we're serialized against further additions and before
				11477	* perf_install_in_context() which is the point the event is active and
				11478	* can use these values.
				11479	*/
				11480	perf_event__header_size(event);
				11481	perf_event__id_header_size(event);
				11482
				11483	event->owner = current;
				11484
				11485	perf_install_in_context(ctx, event, event->cpu);
				11486	perf_unpin_context(ctx);
				11487
				11488	if (move_group)
				11489	perf_event_ctx_unlock(group_leader, gctx);
				11490	mutex_unlock(&ctx->mutex);
				11491
				11492	if (task) {
				11493	up_read(&task->signal->exec_update_lock);
				11494	put_task_struct(task);
				11495	}
				11496
				11497	mutex_lock(&current->perf_event_mutex);
				11498	list_add_tail(&event->owner_entry, &current->perf_event_list);
				11499	mutex_unlock(&current->perf_event_mutex);
				11500
				11501	/*
				11502	* Drop the reference on the group_event after placing the
				11503	* new event on the sibling_list. This ensures destruction
				11504	* of the group leader will find the pointer to itself in
				11505	* perf_group_detach().
				11506	*/
				11507	fdput(group);
				11508	fd_install(event_fd, event_file);
				11509	return event_fd;
				11510
				11511	err_locked:
				11512	if (move_group)
				11513	perf_event_ctx_unlock(group_leader, gctx);
				11514	mutex_unlock(&ctx->mutex);
				11515	err_cred:
				11516	if (task)
				11517	up_read(&task->signal->exec_update_lock);
				11518	err_file:
				11519	fput(event_file);
				11520	err_context:
				11521	perf_unpin_context(ctx);
				11522	put_ctx(ctx);
				11523	err_alloc:
				11524	/*
				11525	* If event_file is set, the fput() above will have called ->release()
				11526	* and that will take care of freeing the event.
				11527	*/
				11528	if (!event_file)
				11529	free_event(event);
				11530	err_task:
				11531	if (task)
				11532	put_task_struct(task);
				11533	err_group_fd:
				11534	fdput(group);
				11535	err_fd:
				11536	put_unused_fd(event_fd);
				11537	return err;
				11538	}
				11539
				11540	/**
				11541	* perf_event_create_kernel_counter
				11542	*
				11543	* @attr: attributes of the counter to create
				11544	* @cpu: cpu in which the counter is bound
				11545	* @task: task to profile (NULL for percpu)
				11546	*/
				11547	struct perf_event *
				11548	perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
				11549	struct task_struct *task,
				11550	perf_overflow_handler_t overflow_handler,
				11551	void *context)
				11552	{
				11553	struct perf_event_context *ctx;
				11554	struct perf_event *event;
				11555	int err;
				11556
				11557	/*
				11558	* Grouping is not supported for kernel events, neither is 'AUX',
				11559	* make sure the caller's intentions are adjusted.
				11560	*/
				11561	if (attr->aux_output)
				11562	return ERR_PTR(-EINVAL);
				11563
				11564	event = perf_event_alloc(attr, cpu, task, NULL, NULL,
				11565	overflow_handler, context, -1);
				11566	if (IS_ERR(event)) {
				11567	err = PTR_ERR(event);
				11568	goto err;
				11569	}
				11570
				11571	/* Mark owner so we could distinguish it from user events. */
				11572	event->owner = TASK_TOMBSTONE;
				11573
				11574	/*
				11575	* Get the target context (task or percpu):
				11576	*/
				11577	ctx = find_get_context(event->pmu, task, event);
				11578	if (IS_ERR(ctx)) {
				11579	err = PTR_ERR(ctx);
				11580	goto err_free;
				11581	}
				11582
				11583	WARN_ON_ONCE(ctx->parent_ctx);
				11584	mutex_lock(&ctx->mutex);
				11585	if (ctx->task == TASK_TOMBSTONE) {
				11586	err = -ESRCH;
				11587	goto err_unlock;
				11588	}
				11589
				11590	if (!task) {
				11591	/*
				11592	* Check if the @cpu we're creating an event for is online.
				11593	*
				11594	* We use the perf_cpu_context::ctx::mutex to serialize against
				11595	* the hotplug notifiers. See perf_event_{init,exit}_cpu().
				11596	*/
				11597	struct perf_cpu_context *cpuctx =
				11598	container_of(ctx, struct perf_cpu_context, ctx);
				11599	if (!cpuctx->online) {
				11600	err = -ENODEV;
				11601	goto err_unlock;
				11602	}
				11603	}
				11604
				11605	if (!exclusive_event_installable(event, ctx)) {
				11606	err = -EBUSY;
				11607	goto err_unlock;
				11608	}
				11609
				11610	perf_install_in_context(ctx, event, event->cpu);
				11611	perf_unpin_context(ctx);
				11612	mutex_unlock(&ctx->mutex);
				11613
				11614	return event;
				11615
				11616	err_unlock:
				11617	mutex_unlock(&ctx->mutex);
				11618	perf_unpin_context(ctx);
				11619	put_ctx(ctx);
				11620	err_free:
				11621	free_event(event);
				11622	err:
				11623	return ERR_PTR(err);
				11624	}
				11625	EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
				11626
				11627	void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
				11628	{
				11629	struct perf_event_context *src_ctx;
				11630	struct perf_event_context *dst_ctx;
				11631	struct perf_event event, tmp;
				11632	LIST_HEAD(events);
				11633
				11634	src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
				11635	dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
				11636
				11637	/*
				11638	* See perf_event_ctx_lock() for comments on the details
				11639	* of swizzling perf_event::ctx.
				11640	*/
				11641	mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
				11642	list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
				11643	event_entry) {
				11644	perf_remove_from_context(event, 0);
				11645	unaccount_event_cpu(event, src_cpu);
				11646	put_ctx(src_ctx);
				11647	list_add(&event->migrate_entry, &events);
				11648	}
				11649
				11650	/*
				11651	* Wait for the events to quiesce before re-instating them.
				11652	*/
				11653	synchronize_rcu();
				11654
				11655	/*
				11656	* Re-instate events in 2 passes.
				11657	*
				11658	* Skip over group leaders and only install siblings on this first
				11659	* pass, siblings will not get enabled without a leader, however a
				11660	* leader will enable its siblings, even if those are still on the old
				11661	* context.
				11662	*/
				11663	list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
				11664	if (event->group_leader == event)
				11665	continue;
				11666
				11667	list_del(&event->migrate_entry);
				11668	if (event->state >= PERF_EVENT_STATE_OFF)
				11669	event->state = PERF_EVENT_STATE_INACTIVE;
				11670	account_event_cpu(event, dst_cpu);
				11671	perf_install_in_context(dst_ctx, event, dst_cpu);
				11672	get_ctx(dst_ctx);
				11673	}
				11674
				11675	/*
				11676	* Once all the siblings are setup properly, install the group leaders
				11677	* to make it go.
				11678	*/
				11679	list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
				11680	list_del(&event->migrate_entry);
				11681	if (event->state >= PERF_EVENT_STATE_OFF)
				11682	event->state = PERF_EVENT_STATE_INACTIVE;
				11683	account_event_cpu(event, dst_cpu);
				11684	perf_install_in_context(dst_ctx, event, dst_cpu);
				11685	get_ctx(dst_ctx);
				11686	}
				11687	mutex_unlock(&dst_ctx->mutex);
				11688	mutex_unlock(&src_ctx->mutex);
				11689	}
				11690	EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
				11691
				11692	static void sync_child_event(struct perf_event *child_event,
				11693	struct task_struct *child)
				11694	{
				11695	struct perf_event *parent_event = child_event->parent;
				11696	u64 child_val;
				11697
				11698	if (child_event->attr.inherit_stat)
				11699	perf_event_read_event(child_event, child);
				11700
				11701	child_val = perf_event_count(child_event);
				11702
				11703	/*
				11704	* Add back the child's count to the parent's count:
				11705	*/
				11706	atomic64_add(child_val, &parent_event->child_count);
				11707	atomic64_add(child_event->total_time_enabled,
				11708	&parent_event->child_total_time_enabled);
				11709	atomic64_add(child_event->total_time_running,
				11710	&parent_event->child_total_time_running);
				11711	}
				11712
				11713	static void
				11714	perf_event_exit_event(struct perf_event *child_event,
				11715	struct perf_event_context *child_ctx,
				11716	struct task_struct *child)
				11717	{
				11718	struct perf_event *parent_event = child_event->parent;
				11719
				11720	/*
				11721	* Do not destroy the 'original' grouping; because of the context
				11722	* switch optimization the original events could've ended up in a
				11723	* random child task.
				11724	*
				11725	* If we were to destroy the original group, all group related
				11726	* operations would cease to function properly after this random
				11727	* child dies.
				11728	*
				11729	* Do destroy all inherited groups, we don't care about those
				11730	* and being thorough is better.
				11731	*/
				11732	raw_spin_lock_irq(&child_ctx->lock);
				11733	WARN_ON_ONCE(child_ctx->is_active);
				11734
				11735	if (parent_event)
				11736	perf_group_detach(child_event);
				11737	list_del_event(child_event, child_ctx);
				11738	perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */
				11739	raw_spin_unlock_irq(&child_ctx->lock);
				11740
				11741	/*
				11742	* Parent events are governed by their filedesc, retain them.
				11743	*/
				11744	if (!parent_event) {
				11745	perf_event_wakeup(child_event);
				11746	return;
				11747	}
				11748	/*
				11749	* Child events can be cleaned up.
				11750	*/
				11751
				11752	sync_child_event(child_event, child);
				11753
				11754	/*
				11755	* Remove this event from the parent's list
				11756	*/
				11757	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
				11758	mutex_lock(&parent_event->child_mutex);
				11759	list_del_init(&child_event->child_list);
				11760	mutex_unlock(&parent_event->child_mutex);
				11761
				11762	/*
				11763	* Kick perf_poll() for is_event_hup().
				11764	*/
				11765	perf_event_wakeup(parent_event);
				11766	free_event(child_event);
				11767	put_event(parent_event);
				11768	}
				11769
				11770	static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
				11771	{
				11772	struct perf_event_context child_ctx, clone_ctx = NULL;
				11773	struct perf_event child_event, next;
				11774
				11775	WARN_ON_ONCE(child != current);
				11776
				11777	child_ctx = perf_pin_task_context(child, ctxn);
				11778	if (!child_ctx)
				11779	return;
				11780
				11781	/*
				11782	* In order to reduce the amount of tricky in ctx tear-down, we hold
				11783	* ctx::mutex over the entire thing. This serializes against almost
				11784	* everything that wants to access the ctx.
				11785	*
				11786	* The exception is sys_perf_event_open() /
				11787	* perf_event_create_kernel_count() which does find_get_context()
				11788	* without ctx::mutex (it cannot because of the move_group double mutex
				11789	* lock thing). See the comments in perf_install_in_context().
				11790	*/
				11791	mutex_lock(&child_ctx->mutex);
				11792
				11793	/*
				11794	* In a single ctx::lock section, de-schedule the events and detach the
				11795	* context from the task such that we cannot ever get it scheduled back
				11796	* in.
				11797	*/
				11798	raw_spin_lock_irq(&child_ctx->lock);
				11799	task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
				11800
				11801	/*
				11802	* Now that the context is inactive, destroy the task <-> ctx relation
				11803	* and mark the context dead.
				11804	*/
				11805	RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
				11806	put_ctx(child_ctx); /* cannot be last */
				11807	WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
				11808	put_task_struct(current); /* cannot be last */
				11809
				11810	clone_ctx = unclone_ctx(child_ctx);
				11811	raw_spin_unlock_irq(&child_ctx->lock);
				11812
				11813	if (clone_ctx)
				11814	put_ctx(clone_ctx);
				11815
				11816	/*
				11817	* Report the task dead after unscheduling the events so that we
				11818	* won't get any samples after PERF_RECORD_EXIT. We can however still
				11819	* get a few PERF_RECORD_READ events.
				11820	*/
				11821	perf_event_task(child, child_ctx, 0);
				11822
				11823	list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
				11824	perf_event_exit_event(child_event, child_ctx, child);
				11825
				11826	mutex_unlock(&child_ctx->mutex);
				11827
				11828	put_ctx(child_ctx);
				11829	}
				11830
				11831	/*
				11832	* When a child task exits, feed back event values to parent events.
				11833	*
				11834	* Can be called with exec_update_lock held when called from
				11835	* install_exec_creds().
				11836	*/
				11837	void perf_event_exit_task(struct task_struct *child)
				11838	{
				11839	struct perf_event event, tmp;
				11840	int ctxn;
				11841
				11842	mutex_lock(&child->perf_event_mutex);
				11843	list_for_each_entry_safe(event, tmp, &child->perf_event_list,
				11844	owner_entry) {
				11845	list_del_init(&event->owner_entry);
				11846
				11847	/*
				11848	* Ensure the list deletion is visible before we clear
				11849	* the owner, closes a race against perf_release() where
				11850	* we need to serialize on the owner->perf_event_mutex.
				11851	*/
				11852	smp_store_release(&event->owner, NULL);
				11853	}
				11854	mutex_unlock(&child->perf_event_mutex);
				11855
				11856	for_each_task_context_nr(ctxn)
				11857	perf_event_exit_task_context(child, ctxn);
				11858
				11859	/*
				11860	* The perf_event_exit_task_context calls perf_event_task
				11861	* with child's task_ctx, which generates EXIT events for
				11862	* child contexts and sets child->perf_event_ctxp[] to NULL.
				11863	* At this point we need to send EXIT events to cpu contexts.
				11864	*/
				11865	perf_event_task(child, NULL, 0);
				11866	}
				11867
				11868	static void perf_free_event(struct perf_event *event,
				11869	struct perf_event_context *ctx)
				11870	{
				11871	struct perf_event *parent = event->parent;
				11872
				11873	if (WARN_ON_ONCE(!parent))
				11874	return;
				11875
				11876	mutex_lock(&parent->child_mutex);
				11877	list_del_init(&event->child_list);
				11878	mutex_unlock(&parent->child_mutex);
				11879
				11880	put_event(parent);
				11881
				11882	raw_spin_lock_irq(&ctx->lock);
				11883	perf_group_detach(event);
				11884	list_del_event(event, ctx);
				11885	raw_spin_unlock_irq(&ctx->lock);
				11886	free_event(event);
				11887	}
				11888
				11889	/*
				11890	* Free a context as created by inheritance by perf_event_init_task() below,
				11891	* used by fork() in case of fail.
				11892	*
				11893	* Even though the task has never lived, the context and events have been
				11894	* exposed through the child_list, so we must take care tearing it all down.
				11895	*/
				11896	void perf_event_free_task(struct task_struct *task)
				11897	{
				11898	struct perf_event_context *ctx;
				11899	struct perf_event event, tmp;
				11900	int ctxn;
				11901
				11902	for_each_task_context_nr(ctxn) {
				11903	ctx = task->perf_event_ctxp[ctxn];
				11904	if (!ctx)
				11905	continue;
				11906
				11907	mutex_lock(&ctx->mutex);
				11908	raw_spin_lock_irq(&ctx->lock);
				11909	/*
				11910	* Destroy the task <-> ctx relation and mark the context dead.
				11911	*
				11912	* This is important because even though the task hasn't been
				11913	* exposed yet the context has been (through child_list).
				11914	*/
				11915	RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
				11916	WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
				11917	put_task_struct(task); /* cannot be last */
				11918	raw_spin_unlock_irq(&ctx->lock);
				11919
				11920	list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
				11921	perf_free_event(event, ctx);
				11922
				11923	mutex_unlock(&ctx->mutex);
				11924
				11925	/*
				11926	* perf_event_release_kernel() could've stolen some of our
				11927	* child events and still have them on its free_list. In that
				11928	* case we must wait for these events to have been freed (in
				11929	* particular all their references to this task must've been
				11930	* dropped).
				11931	*
				11932	* Without this copy_process() will unconditionally free this
				11933	* task (irrespective of its reference count) and
				11934	* _free_event()'s put_task_struct(event->hw.target) will be a
				11935	* use-after-free.
				11936	*
				11937	* Wait for all events to drop their context reference.
				11938	*/
				11939	wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
				11940	put_ctx(ctx); /* must be last */
				11941	}
				11942	}
				11943
				11944	void perf_event_delayed_put(struct task_struct *task)
				11945	{
				11946	int ctxn;
				11947
				11948	for_each_task_context_nr(ctxn)
				11949	WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
				11950	}
				11951
				11952	struct file *perf_event_get(unsigned int fd)
				11953	{
				11954	struct file *file = fget(fd);
				11955	if (!file)
				11956	return ERR_PTR(-EBADF);
				11957
				11958	if (file->f_op != &perf_fops) {
				11959	fput(file);
				11960	return ERR_PTR(-EBADF);
				11961	}
				11962
				11963	return file;
				11964	}
				11965
				11966	const struct perf_event perf_get_event(struct file file)
				11967	{
				11968	if (file->f_op != &perf_fops)
				11969	return ERR_PTR(-EINVAL);
				11970
				11971	return file->private_data;
				11972	}
				11973
				11974	const struct perf_event_attr perf_event_attrs(struct perf_event event)
				11975	{
				11976	if (!event)
				11977	return ERR_PTR(-EINVAL);
				11978
				11979	return &event->attr;
				11980	}
				11981
				11982	/*
				11983	* Inherit an event from parent task to child task.
				11984	*
				11985	* Returns:
				11986	* - valid pointer on success
				11987	* - NULL for orphaned events
				11988	* - IS_ERR() on error
				11989	*/
				11990	static struct perf_event *
				11991	inherit_event(struct perf_event *parent_event,
				11992	struct task_struct *parent,
				11993	struct perf_event_context *parent_ctx,
				11994	struct task_struct *child,
				11995	struct perf_event *group_leader,
				11996	struct perf_event_context *child_ctx)
				11997	{
				11998	enum perf_event_state parent_state = parent_event->state;
				11999	struct perf_event *child_event;
				12000	unsigned long flags;
				12001
				12002	/*
				12003	* Instead of creating recursive hierarchies of events,
				12004	* we link inherited events back to the original parent,
				12005	* which has a filp for sure, which we use as the reference
				12006	* count:
				12007	*/
				12008	if (parent_event->parent)
				12009	parent_event = parent_event->parent;
				12010
				12011	child_event = perf_event_alloc(&parent_event->attr,
				12012	parent_event->cpu,
				12013	child,
				12014	group_leader, parent_event,
				12015	NULL, NULL, -1);
				12016	if (IS_ERR(child_event))
				12017	return child_event;
				12018
				12019
				12020	if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
				12021	!child_ctx->task_ctx_data) {
				12022	struct pmu *pmu = child_event->pmu;
				12023
				12024	child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size,
				12025	GFP_KERNEL);
				12026	if (!child_ctx->task_ctx_data) {
				12027	free_event(child_event);
				12028	return ERR_PTR(-ENOMEM);
				12029	}
				12030	}
				12031
				12032	/*
				12033	* is_orphaned_event() and list_add_tail(&parent_event->child_list)
				12034	* must be under the same lock in order to serialize against
				12035	* perf_event_release_kernel(), such that either we must observe
				12036	* is_orphaned_event() or they will observe us on the child_list.
				12037	*/
				12038	mutex_lock(&parent_event->child_mutex);
				12039	if (is_orphaned_event(parent_event) \|\|
				12040	!atomic_long_inc_not_zero(&parent_event->refcount)) {
				12041	mutex_unlock(&parent_event->child_mutex);
				12042	/* task_ctx_data is freed with child_ctx */
				12043	free_event(child_event);
				12044	return NULL;
				12045	}
				12046
				12047	get_ctx(child_ctx);
				12048
				12049	/*
				12050	* Make the child state follow the state of the parent event,
				12051	* not its attr.disabled bit. We hold the parent's mutex,
				12052	* so we won't race with perf_event_{en, dis}able_family.
				12053	*/
				12054	if (parent_state >= PERF_EVENT_STATE_INACTIVE)
				12055	child_event->state = PERF_EVENT_STATE_INACTIVE;
				12056	else
				12057	child_event->state = PERF_EVENT_STATE_OFF;
				12058
				12059	if (parent_event->attr.freq) {
				12060	u64 sample_period = parent_event->hw.sample_period;
				12061	struct hw_perf_event *hwc = &child_event->hw;
				12062
				12063	hwc->sample_period = sample_period;
				12064	hwc->last_period = sample_period;
				12065
				12066	local64_set(&hwc->period_left, sample_period);
				12067	}
				12068
				12069	child_event->ctx = child_ctx;
				12070	child_event->overflow_handler = parent_event->overflow_handler;
				12071	child_event->overflow_handler_context
				12072	= parent_event->overflow_handler_context;
				12073
				12074	/*
				12075	* Precalculate sample_data sizes
				12076	*/
				12077	perf_event__header_size(child_event);
				12078	perf_event__id_header_size(child_event);
				12079
				12080	/*
				12081	* Link it up in the child's context:
				12082	*/
				12083	raw_spin_lock_irqsave(&child_ctx->lock, flags);
				12084	add_event_to_ctx(child_event, child_ctx);
				12085	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
				12086
				12087	/*
				12088	* Link this into the parent event's child list
				12089	*/
				12090	list_add_tail(&child_event->child_list, &parent_event->child_list);
				12091	mutex_unlock(&parent_event->child_mutex);
				12092
				12093	return child_event;
				12094	}
				12095
				12096	/*
				12097	* Inherits an event group.
				12098	*
				12099	* This will quietly suppress orphaned events; !inherit_event() is not an error.
				12100	* This matches with perf_event_release_kernel() removing all child events.
				12101	*
				12102	* Returns:
				12103	* - 0 on success
				12104	* - <0 on error
				12105	*/
				12106	static int inherit_group(struct perf_event *parent_event,
				12107	struct task_struct *parent,
				12108	struct perf_event_context *parent_ctx,
				12109	struct task_struct *child,
				12110	struct perf_event_context *child_ctx)
				12111	{
				12112	struct perf_event *leader;
				12113	struct perf_event *sub;
				12114	struct perf_event *child_ctr;
				12115
				12116	leader = inherit_event(parent_event, parent, parent_ctx,
				12117	child, NULL, child_ctx);
				12118	if (IS_ERR(leader))
				12119	return PTR_ERR(leader);
				12120	/*
				12121	* @leader can be NULL here because of is_orphaned_event(). In this
				12122	* case inherit_event() will create individual events, similar to what
				12123	* perf_group_detach() would do anyway.
				12124	*/
				12125	for_each_sibling_event(sub, parent_event) {
				12126	child_ctr = inherit_event(sub, parent, parent_ctx,
				12127	child, leader, child_ctx);
				12128	if (IS_ERR(child_ctr))
				12129	return PTR_ERR(child_ctr);
				12130
				12131	if (sub->aux_event == parent_event && child_ctr &&
				12132	!perf_get_aux_event(child_ctr, leader))
				12133	return -EINVAL;
				12134	}
				12135	if (leader)
				12136	leader->group_generation = parent_event->group_generation;
				12137	return 0;
				12138	}
				12139
				12140	/*
				12141	* Creates the child task context and tries to inherit the event-group.
				12142	*
				12143	* Clears @inherited_all on !attr.inherited or error. Note that we'll leave
				12144	* inherited_all set when we 'fail' to inherit an orphaned event; this is
				12145	* consistent with perf_event_release_kernel() removing all child events.
				12146	*
				12147	* Returns:
				12148	* - 0 on success
				12149	* - <0 on error
				12150	*/
				12151	static int
				12152	inherit_task_group(struct perf_event event, struct task_struct parent,
				12153	struct perf_event_context *parent_ctx,
				12154	struct task_struct *child, int ctxn,
				12155	int *inherited_all)
				12156	{
				12157	int ret;
				12158	struct perf_event_context *child_ctx;
				12159
				12160	if (!event->attr.inherit) {
				12161	*inherited_all = 0;
				12162	return 0;
				12163	}
				12164
				12165	child_ctx = child->perf_event_ctxp[ctxn];
				12166	if (!child_ctx) {
				12167	/*
				12168	* This is executed from the parent task context, so
				12169	* inherit events that have been marked for cloning.
				12170	* First allocate and initialize a context for the
				12171	* child.
				12172	*/
				12173	child_ctx = alloc_perf_context(parent_ctx->pmu, child);
				12174	if (!child_ctx)
				12175	return -ENOMEM;
				12176
				12177	child->perf_event_ctxp[ctxn] = child_ctx;
				12178	}
				12179
				12180	ret = inherit_group(event, parent, parent_ctx,
				12181	child, child_ctx);
				12182
				12183	if (ret)
				12184	*inherited_all = 0;
				12185
				12186	return ret;
				12187	}
				12188
				12189	/*
				12190	* Initialize the perf_event context in task_struct
				12191	*/
				12192	static int perf_event_init_context(struct task_struct *child, int ctxn)
				12193	{
				12194	struct perf_event_context child_ctx, parent_ctx;
				12195	struct perf_event_context *cloned_ctx;
				12196	struct perf_event *event;
				12197	struct task_struct *parent = current;
				12198	int inherited_all = 1;
				12199	unsigned long flags;
				12200	int ret = 0;
				12201
				12202	if (likely(!parent->perf_event_ctxp[ctxn]))
				12203	return 0;
				12204
				12205	/*
				12206	* If the parent's context is a clone, pin it so it won't get
				12207	* swapped under us.
				12208	*/
				12209	parent_ctx = perf_pin_task_context(parent, ctxn);
				12210	if (!parent_ctx)
				12211	return 0;
				12212
				12213	/*
				12214	* No need to check if parent_ctx != NULL here; since we saw
				12215	* it non-NULL earlier, the only reason for it to become NULL
				12216	* is if we exit, and since we're currently in the middle of
				12217	* a fork we can't be exiting at the same time.
				12218	*/
				12219
				12220	/*
				12221	* Lock the parent list. No need to lock the child - not PID
				12222	* hashed yet and not running, so nobody can access it.
				12223	*/
				12224	mutex_lock(&parent_ctx->mutex);
				12225
				12226	/*
				12227	* We dont have to disable NMIs - we are only looking at
				12228	* the list, not manipulating it:
				12229	*/
				12230	perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
				12231	ret = inherit_task_group(event, parent, parent_ctx,
				12232	child, ctxn, &inherited_all);
				12233	if (ret)
				12234	goto out_unlock;
				12235	}
				12236
				12237	/*
				12238	* We can't hold ctx->lock when iterating the ->flexible_group list due
				12239	* to allocations, but we need to prevent rotation because
				12240	* rotate_ctx() will change the list from interrupt context.
				12241	*/
				12242	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
				12243	parent_ctx->rotate_disable = 1;
				12244	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
				12245
				12246	perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
				12247	ret = inherit_task_group(event, parent, parent_ctx,
				12248	child, ctxn, &inherited_all);
				12249	if (ret)
				12250	goto out_unlock;
				12251	}
				12252
				12253	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
				12254	parent_ctx->rotate_disable = 0;
				12255
				12256	child_ctx = child->perf_event_ctxp[ctxn];
				12257
				12258	if (child_ctx && inherited_all) {
				12259	/*
				12260	* Mark the child context as a clone of the parent
				12261	* context, or of whatever the parent is a clone of.
				12262	*
				12263	* Note that if the parent is a clone, the holding of
				12264	* parent_ctx->lock avoids it from being uncloned.
				12265	*/
				12266	cloned_ctx = parent_ctx->parent_ctx;
				12267	if (cloned_ctx) {
				12268	child_ctx->parent_ctx = cloned_ctx;
				12269	child_ctx->parent_gen = parent_ctx->parent_gen;
				12270	} else {
				12271	child_ctx->parent_ctx = parent_ctx;
				12272	child_ctx->parent_gen = parent_ctx->generation;
				12273	}
				12274	get_ctx(child_ctx->parent_ctx);
				12275	}
				12276
				12277	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
				12278	out_unlock:
				12279	mutex_unlock(&parent_ctx->mutex);
				12280
				12281	perf_unpin_context(parent_ctx);
				12282	put_ctx(parent_ctx);
				12283
				12284	return ret;
				12285	}
				12286
				12287	/*
				12288	* Initialize the perf_event context in task_struct
				12289	*/
				12290	int perf_event_init_task(struct task_struct *child)
				12291	{
				12292	int ctxn, ret;
				12293
				12294	memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
				12295	mutex_init(&child->perf_event_mutex);
				12296	INIT_LIST_HEAD(&child->perf_event_list);
				12297
				12298	for_each_task_context_nr(ctxn) {
				12299	ret = perf_event_init_context(child, ctxn);
				12300	if (ret) {
				12301	perf_event_free_task(child);
				12302	return ret;
				12303	}
				12304	}
				12305
				12306	return 0;
				12307	}
				12308
				12309	static void __init perf_event_init_all_cpus(void)
				12310	{
				12311	struct swevent_htable *swhash;
				12312	int cpu;
				12313
				12314	zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
				12315
				12316	for_each_possible_cpu(cpu) {
				12317	swhash = &per_cpu(swevent_htable, cpu);
				12318	mutex_init(&swhash->hlist_mutex);
				12319	INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
				12320
				12321	INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
				12322	raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
				12323
				12324	#ifdef CONFIG_CGROUP_PERF
				12325	INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
				12326	#endif
				12327	INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
				12328	}
				12329	}
				12330
				12331	static void perf_swevent_init_cpu(unsigned int cpu)
				12332	{
				12333	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
				12334
				12335	mutex_lock(&swhash->hlist_mutex);
				12336	if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
				12337	struct swevent_hlist *hlist;
				12338
				12339	hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
				12340	WARN_ON(!hlist);
				12341	rcu_assign_pointer(swhash->swevent_hlist, hlist);
				12342	}
				12343	mutex_unlock(&swhash->hlist_mutex);
				12344	}
				12345
				#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
				/*
				 * Cross-call handler, invoked through smp_call_function_single() by
				 * perf_event_exit_cpu_context(): schedules out the context passed in
				 * @__info and removes every event on its event_list, detaching groups.
				 */
				static void __perf_event_exit_context(void *__info)
				{
					struct perf_event_context *ctx = __info;
					struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
					struct perf_event *event;

					raw_spin_lock(&ctx->lock);
					ctx_sched_out(ctx, cpuctx, EVENT_TIME);
					list_for_each_entry(event, &ctx->event_list, event_entry)
						__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
					raw_spin_unlock(&ctx->lock);
				}

				/*
				 * Tear down the per-CPU context of every registered PMU for @cpu.
				 *
				 * For each PMU, under the context mutex, the events are removed on the
				 * target CPU and the cpu context is marked offline. Finally @cpu is
				 * cleared from perf_online_mask. The whole walk runs under pmus_lock.
				 */
				static void perf_event_exit_cpu_context(int cpu)
				{
					struct perf_cpu_context *cpuctx;
					struct perf_event_context *ctx;
					struct pmu *pmu;

					mutex_lock(&pmus_lock);
					list_for_each_entry(pmu, &pmus, entry) {
						cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
						ctx = &cpuctx->ctx;

						mutex_lock(&ctx->mutex);
						/* Last argument 1: wait until the handler has finished. */
						smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
						cpuctx->online = 0;
						mutex_unlock(&ctx->mutex);
					}
					cpumask_clear_cpu(cpu, perf_online_mask);
					mutex_unlock(&pmus_lock);
				}
				#else

				/* Neither CPU hotplug nor kexec is configured: nothing to tear down. */
				static void perf_event_exit_cpu_context(int cpu) { }

				#endif
				12384
				12385	int perf_event_init_cpu(unsigned int cpu)
				12386	{
				12387	struct perf_cpu_context *cpuctx;
				12388	struct perf_event_context *ctx;
				12389	struct pmu *pmu;
				12390
				12391	perf_swevent_init_cpu(cpu);
				12392
				12393	mutex_lock(&pmus_lock);
				12394	cpumask_set_cpu(cpu, perf_online_mask);
				12395	list_for_each_entry(pmu, &pmus, entry) {
				12396	cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
				12397	ctx = &cpuctx->ctx;
				12398
				12399	mutex_lock(&ctx->mutex);
				12400	cpuctx->online = 1;
				12401	mutex_unlock(&ctx->mutex);
				12402	}
				12403	mutex_unlock(&pmus_lock);
				12404
				12405	return 0;
				12406	}
				12407
				/*
				 * Take the perf state of @cpu offline by tearing down its per-PMU cpu
				 * contexts. Always returns 0.
				 */
				int perf_event_exit_cpu(unsigned int cpu)
				{
					perf_event_exit_cpu_context(cpu);

					return 0;
				}
				12413
				12414	static int
				12415	perf_reboot(struct notifier_block notifier, unsigned long val, void v)
				12416	{
				12417	int cpu;
				12418
				12419	for_each_online_cpu(cpu)
				12420	perf_event_exit_cpu(cpu);
				12421
				12422	return NOTIFY_OK;
				12423	}
				12424
				12425	/*
				12426	* Run the perf reboot notifier at the very last possible moment so that
				12427	* the generic watchdog code runs as long as possible.
				12428	*/
				12429	static struct notifier_block perf_reboot_notifier = {
				12430	.notifier_call = perf_reboot,
				12431	.priority = INT_MIN,
				12432	};
				12433
				12434	void __init perf_event_init(void)
				12435	{
				12436	int ret;
				12437
				12438	idr_init(&pmu_idr);
				12439
				12440	perf_event_init_all_cpus();
				12441	init_srcu_struct(&pmus_srcu);
				12442	perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
				12443	perf_pmu_register(&perf_cpu_clock, NULL, -1);
				12444	perf_pmu_register(&perf_task_clock, NULL, -1);
				12445	perf_tp_register();
				12446	perf_event_init_cpu(smp_processor_id());
				12447	register_reboot_notifier(&perf_reboot_notifier);
				12448
				12449	ret = init_hw_breakpoint();
				12450	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
				12451
				12452	/*
				12453	* Build time assertion that we keep the data_head at the intended
				12454	* location. IOW, validation we got the __reserved[] size right.
				12455	*/
				12456	BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
				12457	!= 1024);
				12458	}
				12459
				12460	ssize_t perf_event_sysfs_show(struct device dev, struct device_attribute attr,
				12461	char *page)
				12462	{
				12463	struct perf_pmu_events_attr *pmu_attr =
				12464	container_of(attr, struct perf_pmu_events_attr, attr);
				12465
				12466	if (pmu_attr->event_str)
				12467	return sprintf(page, "%s\n", pmu_attr->event_str);
				12468
				12469	return 0;
				12470	}
				12471	EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
				12472
				12473	static int __init perf_event_sysfs_init(void)
				12474	{
				12475	struct pmu *pmu;
				12476	int ret;
				12477
				12478	mutex_lock(&pmus_lock);
				12479
				12480	ret = bus_register(&pmu_bus);
				12481	if (ret)
				12482	goto unlock;
				12483
				12484	list_for_each_entry(pmu, &pmus, entry) {
				12485	if (!pmu->name \|\| pmu->type < 0)
				12486	continue;
				12487
				12488	ret = pmu_dev_alloc(pmu);
				12489	WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
				12490	}
				12491	pmu_bus_running = 1;
				12492	ret = 0;
				12493
				12494	unlock:
				12495	mutex_unlock(&pmus_lock);
				12496
				12497	return ret;
				12498	}
				12499	device_initcall(perf_event_sysfs_init);
				12500
				12501	#ifdef CONFIG_CGROUP_PERF
				12502	static struct cgroup_subsys_state *
				12503	perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
				12504	{
				12505	struct perf_cgroup *jc;
				12506
				12507	jc = kzalloc(sizeof(*jc), GFP_KERNEL);
				12508	if (!jc)
				12509	return ERR_PTR(-ENOMEM);
				12510
				12511	jc->info = alloc_percpu(struct perf_cgroup_info);
				12512	if (!jc->info) {
				12513	kfree(jc);
				12514	return ERR_PTR(-ENOMEM);
				12515	}
				12516
				12517	return &jc->css;
				12518	}
				12519
				12520	static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
				12521	{
				12522	struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
				12523
				12524	free_percpu(jc->info);
				12525	kfree(jc);
				12526	}
				12527
				12528	static int __perf_cgroup_move(void *info)
				12529	{
				12530	struct task_struct *task = info;
				12531	rcu_read_lock();
				12532	perf_cgroup_switch(task, PERF_CGROUP_SWOUT \| PERF_CGROUP_SWIN);
				12533	rcu_read_unlock();
				12534	return 0;
				12535	}
				12536
				12537	static void perf_cgroup_attach(struct cgroup_taskset *tset)
				12538	{
				12539	struct task_struct *task;
				12540	struct cgroup_subsys_state *css;
				12541
				12542	cgroup_taskset_for_each(task, css, tset)
				12543	task_function_call(task, __perf_cgroup_move, task);
				12544	}
				12545
				12546	struct cgroup_subsys perf_event_cgrp_subsys = {
				12547	.css_alloc = perf_cgroup_css_alloc,
				12548	.css_free = perf_cgroup_css_free,
				12549	.attach = perf_cgroup_attach,
				12550	/*
				12551	* Implicitly enable on dfl hierarchy so that perf events can
				12552	* always be filtered by cgroup2 path as long as perf_event
				12553	* controller is not mounted on a legacy hierarchy.
				12554	*/
				12555	.implicit_on_dfl = true,
				12556	.threaded = true,
				12557	};
				12558	#endif /* CONFIG_CGROUP_PERF */