Blame - marvell/linux/kernel/sched/core.c - T108

blob: b2dcc2405a8a24677ce8cfd9cb984eb552bc60c1 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0-only
				2	/*
				3	* kernel/sched/core.c
				4	*
				5	* Core kernel scheduler code and related syscalls
				6	*
				7	* Copyright (C) 1991-2002 Linus Torvalds
				8	*/
				9	#include "sched.h"
				10
				11	#include <linux/nospec.h>
				12
				13	#include <linux/kcov.h>
				14	#include <linux/scs.h>
				15
				16	#include <asm/switch_to.h>
				17	#include <asm/tlb.h>
				18
				19	#include "../workqueue_internal.h"
				20	#include "../smpboot.h"
				21
				22	#include "pelt.h"
				23
				24	#define CREATE_TRACE_POINTS
				25	#include <trace/events/sched.h>
				26
				27	#undef CREATE_TRACE_POINTS
				28	#include <trace/hooks/dtask.h>
				29
				30	#undef CREATE_TRACE_POINTS
				31	#include <trace/hooks/sched.h>
				32
				33	/*
				34	* Export tracepoints that act as a bare tracehook (ie: have no trace event
				35	* associated with them) to allow external modules to probe them.
				36	*/
				37	EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
				38	EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
				39	EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
				40	EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
				41	EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
				42	EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
				43
				44	DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
				45	EXPORT_SYMBOL_GPL(runqueues);
				46
				47	#ifdef CONFIG_SCHED_DEBUG
				48	/*
				49	* Debugging: various feature bits
				50	*
				51	* If SCHED_DEBUG is disabled, each compilation unit has its own copy of
				52	* sysctl_sched_features, defined in sched.h, to allow constants propagation
				53	* at compile time and compiler optimization based on features default.
				54	*/
				55	#define SCHED_FEAT(name, enabled) \
				56	(1UL << __SCHED_FEAT_##name) * enabled \|
				57	const_debug unsigned int sysctl_sched_features =
				58	#include "features.h"
				59	0;
				60	#undef SCHED_FEAT
				61	#endif
				62
				63	/*
				64	* Number of tasks to iterate in a single balance run.
				65	* Limited because this is done with IRQs disabled.
				66	*/
				67	const_debug unsigned int sysctl_sched_nr_migrate = 32;
				68
				69	/*
				70	* period over which we measure -rt task CPU usage in us.
				71	* default: 1s
				72	*/
				73	unsigned int sysctl_sched_rt_period = 1000000;
				74
				75	__read_mostly int scheduler_running;
				76
				77	/*
				78	* part of the period that we allow rt tasks to run in us.
				79	* default: 0.95s
				80	*/
				81	int sysctl_sched_rt_runtime = 950000;
				82
				83	/*
				84	* __task_rq_lock - lock the rq @p resides on.
				85	*/
				86	struct rq __task_rq_lock(struct task_struct p, struct rq_flags *rf)
				87	__acquires(rq->lock)
				88	{
				89	struct rq *rq;
				90
				91	lockdep_assert_held(&p->pi_lock);
				92
				93	for (;;) {
				94	rq = task_rq(p);
				95	raw_spin_lock(&rq->lock);
				96	if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
				97	rq_pin_lock(rq, rf);
				98	return rq;
				99	}
				100	raw_spin_unlock(&rq->lock);
				101
				102	while (unlikely(task_on_rq_migrating(p)))
				103	cpu_relax();
				104	}
				105	}
				106
				107	/*
				108	* task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
				109	*/
				110	struct rq task_rq_lock(struct task_struct p, struct rq_flags *rf)
				111	__acquires(p->pi_lock)
				112	__acquires(rq->lock)
				113	{
				114	struct rq *rq;
				115
				116	for (;;) {
				117	raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
				118	rq = task_rq(p);
				119	raw_spin_lock(&rq->lock);
				120	/*
				121	* move_queued_task() task_rq_lock()
				122	*
				123	* ACQUIRE (rq->lock)
				124	* [S] ->on_rq = MIGRATING [L] rq = task_rq()
				125	* WMB (__set_task_cpu()) ACQUIRE (rq->lock);
				126	* [S] ->cpu = new_cpu [L] task_rq()
				127	* [L] ->on_rq
				128	* RELEASE (rq->lock)
				129	*
				130	* If we observe the old CPU in task_rq_lock(), the acquire of
				131	* the old rq->lock will fully serialize against the stores.
				132	*
				133	* If we observe the new CPU in task_rq_lock(), the address
				134	* dependency headed by '[L] rq = task_rq()' and the acquire
				135	* will pair with the WMB to ensure we then also see migrating.
				136	*/
				137	if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
				138	rq_pin_lock(rq, rf);
				139	return rq;
				140	}
				141	raw_spin_unlock(&rq->lock);
				142	raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
				143
				144	while (unlikely(task_on_rq_migrating(p)))
				145	cpu_relax();
				146	}
				147	}
				148
				149	/*
				150	* RQ-clock updating methods:
				151	*/
				152
				153	static void update_rq_clock_task(struct rq *rq, s64 delta)
				154	{
				155	/*
				156	* In theory, the compile should just see 0 here, and optimize out the call
				157	* to sched_rt_avg_update. But I don't trust it...
				158	*/
				159	s64 __maybe_unused steal = 0, irq_delta = 0;
				160
				161	#ifdef CONFIG_IRQ_TIME_ACCOUNTING
				162	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
				163
				164	/*
				165	* Since irq_time is only updated on {soft,}irq_exit, we might run into
				166	* this case when a previous update_rq_clock() happened inside a
				167	* {soft,}irq region.
				168	*
				169	* When this happens, we stop ->clock_task and only update the
				170	* prev_irq_time stamp to account for the part that fit, so that a next
				171	* update will consume the rest. This ensures ->clock_task is
				172	* monotonic.
				173	*
				174	* It does however cause some slight miss-attribution of {soft,}irq
				175	* time, a more accurate solution would be to update the irq_time using
				176	* the current rq->clock timestamp, except that would require using
				177	* atomic ops.
				178	*/
				179	if (irq_delta > delta)
				180	irq_delta = delta;
				181
				182	rq->prev_irq_time += irq_delta;
				183	delta -= irq_delta;
				184	#endif
				185	#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
				186	if (static_key_false((&paravirt_steal_rq_enabled))) {
				187	steal = paravirt_steal_clock(cpu_of(rq));
				188	steal -= rq->prev_steal_time_rq;
				189
				190	if (unlikely(steal > delta))
				191	steal = delta;
				192
				193	rq->prev_steal_time_rq += steal;
				194	delta -= steal;
				195	}
				196	#endif
				197
				198	rq->clock_task += delta;
				199
				200	#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
				201	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
				202	update_irq_load_avg(rq, irq_delta + steal);
				203	#endif
				204	update_rq_clock_pelt(rq, delta);
				205	}
				206
				207	void update_rq_clock(struct rq *rq)
				208	{
				209	s64 delta;
				210
				211	lockdep_assert_held(&rq->lock);
				212
				213	if (rq->clock_update_flags & RQCF_ACT_SKIP)
				214	return;
				215
				216	#ifdef CONFIG_SCHED_DEBUG
				217	if (sched_feat(WARN_DOUBLE_CLOCK))
				218	SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
				219	rq->clock_update_flags \|= RQCF_UPDATED;
				220	#endif
				221
				222	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
				223	if (delta < 0)
				224	return;
				225	rq->clock += delta;
				226	update_rq_clock_task(rq, delta);
				227	}
				228
				229
				230	#ifdef CONFIG_SCHED_HRTICK
				231	/*
				232	* Use HR-timers to deliver accurate preemption points.
				233	*/
				234
				235	static void hrtick_clear(struct rq *rq)
				236	{
				237	if (hrtimer_active(&rq->hrtick_timer))
				238	hrtimer_cancel(&rq->hrtick_timer);
				239	}
				240
				241	/*
				242	* High-resolution timer tick.
				243	* Runs from hardirq context with interrupts disabled.
				244	*/
				245	static enum hrtimer_restart hrtick(struct hrtimer *timer)
				246	{
				247	struct rq *rq = container_of(timer, struct rq, hrtick_timer);
				248	struct rq_flags rf;
				249
				250	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
				251
				252	rq_lock(rq, &rf);
				253	update_rq_clock(rq);
				254	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
				255	rq_unlock(rq, &rf);
				256
				257	return HRTIMER_NORESTART;
				258	}
				259
				260	#ifdef CONFIG_SMP
				261
				262	static void __hrtick_restart(struct rq *rq)
				263	{
				264	struct hrtimer *timer = &rq->hrtick_timer;
				265	ktime_t time = rq->hrtick_time;
				266
				267	hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
				268	}
				269
				270	/*
				271	* called from hardirq (IPI) context
				272	*/
				273	static void __hrtick_start(void *arg)
				274	{
				275	struct rq *rq = arg;
				276	struct rq_flags rf;
				277
				278	rq_lock(rq, &rf);
				279	__hrtick_restart(rq);
				280	rq->hrtick_csd_pending = 0;
				281	rq_unlock(rq, &rf);
				282	}
				283
				284	/*
				285	* Called to set the hrtick timer state.
				286	*
				287	* called with rq->lock held and irqs disabled
				288	*/
				289	void hrtick_start(struct rq *rq, u64 delay)
				290	{
				291	struct hrtimer *timer = &rq->hrtick_timer;
				292	s64 delta;
				293
				294	/*
				295	* Don't schedule slices shorter than 10000ns, that just
				296	* doesn't make sense and can cause timer DoS.
				297	*/
				298	delta = max_t(s64, delay, 10000LL);
				299	rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
				300
				301	if (rq == this_rq()) {
				302	__hrtick_restart(rq);
				303	} else if (!rq->hrtick_csd_pending) {
				304	smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
				305	rq->hrtick_csd_pending = 1;
				306	}
				307	}
				308
				309	#else
				310	/*
				311	* Called to set the hrtick timer state.
				312	*
				313	* called with rq->lock held and irqs disabled
				314	*/
				315	void hrtick_start(struct rq *rq, u64 delay)
				316	{
				317	/*
				318	* Don't schedule slices shorter than 10000ns, that just
				319	* doesn't make sense. Rely on vruntime for fairness.
				320	*/
				321	delay = max_t(u64, delay, 10000LL);
				322	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
				323	HRTIMER_MODE_REL_PINNED_HARD);
				324	}
				325	#endif /* CONFIG_SMP */
				326
				327	static void hrtick_rq_init(struct rq *rq)
				328	{
				329	#ifdef CONFIG_SMP
				330	rq->hrtick_csd_pending = 0;
				331
				332	rq->hrtick_csd.flags = 0;
				333	rq->hrtick_csd.func = __hrtick_start;
				334	rq->hrtick_csd.info = rq;
				335	#endif
				336
				337	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
				338	rq->hrtick_timer.function = hrtick;
				339	}
				340	#else /* CONFIG_SCHED_HRTICK */
				341	static inline void hrtick_clear(struct rq *rq)
				342	{
				343	}
				344
				345	static inline void hrtick_rq_init(struct rq *rq)
				346	{
				347	}
				348	#endif /* CONFIG_SCHED_HRTICK */
				349
				350	/*
				351	* cmpxchg based fetch_or, macro so it works for different integer types
				352	*/
				353	#define fetch_or(ptr, mask) \
				354	({ \
				355	typeof(ptr) _ptr = (ptr); \
				356	typeof(mask) _mask = (mask); \
				357	typeof(_ptr) _old, _val = _ptr; \
				358	\
				359	for (;;) { \
				360	_old = cmpxchg(_ptr, _val, _val \| _mask); \
				361	if (_old == _val) \
				362	break; \
				363	_val = _old; \
				364	} \
				365	_old; \
				366	})
				367
				368	#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
				369	/*
				370	* Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
				371	* this avoids any races wrt polling state changes and thereby avoids
				372	* spurious IPIs.
				373	*/
				374	static bool set_nr_and_not_polling(struct task_struct *p)
				375	{
				376	struct thread_info *ti = task_thread_info(p);
				377	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
				378	}
				379
				380	/*
				381	* Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
				382	*
				383	* If this returns true, then the idle task promises to call
				384	* sched_ttwu_pending() and reschedule soon.
				385	*/
				386	static bool set_nr_if_polling(struct task_struct *p)
				387	{
				388	struct thread_info *ti = task_thread_info(p);
				389	typeof(ti->flags) old, val = READ_ONCE(ti->flags);
				390
				391	for (;;) {
				392	if (!(val & _TIF_POLLING_NRFLAG))
				393	return false;
				394	if (val & _TIF_NEED_RESCHED)
				395	return true;
				396	old = cmpxchg(&ti->flags, val, val \| _TIF_NEED_RESCHED);
				397	if (old == val)
				398	break;
				399	val = old;
				400	}
				401	return true;
				402	}
				403
				404	#else
				405	static bool set_nr_and_not_polling(struct task_struct *p)
				406	{
				407	set_tsk_need_resched(p);
				408	return true;
				409	}
				410
				411	#ifdef CONFIG_SMP
				412	static bool set_nr_if_polling(struct task_struct *p)
				413	{
				414	return false;
				415	}
				416	#endif
				417	#endif
				418
				419	static bool __wake_q_add(struct wake_q_head head, struct task_struct task)
				420	{
				421	struct wake_q_node *node = &task->wake_q;
				422
				423	/*
				424	* Atomically grab the task, if ->wake_q is !nil already it means
				425	* its already queued (either by us or someone else) and will get the
				426	* wakeup due to that.
				427	*
				428	* In order to ensure that a pending wakeup will observe our pending
				429	* state, even in the failed case, an explicit smp_mb() must be used.
				430	*/
				431	smp_mb__before_atomic();
				432	if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
				433	return false;
				434
				435	/*
				436	* The head is context local, there can be no concurrency.
				437	*/
				438	*head->lastp = node;
				439	head->lastp = &node->next;
				440	return true;
				441	}
				442
				443	/**
				444	* wake_q_add() - queue a wakeup for 'later' waking.
				445	* @head: the wake_q_head to add @task to
				446	* @task: the task to queue for 'later' wakeup
				447	*
				448	* Queue a task for later wakeup, most likely by the wake_up_q() call in the
				449	* same context, _HOWEVER_ this is not guaranteed, the wakeup can come
				450	* instantly.
				451	*
				452	* This function must be used as-if it were wake_up_process(); IOW the task
				453	* must be ready to be woken at this location.
				454	*/
				455	void wake_q_add(struct wake_q_head head, struct task_struct task)
				456	{
				457	if (__wake_q_add(head, task))
				458	get_task_struct(task);
				459	}
				460
				461	/**
				462	* wake_q_add_safe() - safely queue a wakeup for 'later' waking.
				463	* @head: the wake_q_head to add @task to
				464	* @task: the task to queue for 'later' wakeup
				465	*
				466	* Queue a task for later wakeup, most likely by the wake_up_q() call in the
				467	* same context, _HOWEVER_ this is not guaranteed, the wakeup can come
				468	* instantly.
				469	*
				470	* This function must be used as-if it were wake_up_process(); IOW the task
				471	* must be ready to be woken at this location.
				472	*
				473	* This function is essentially a task-safe equivalent to wake_q_add(). Callers
				474	* that already hold reference to @task can call the 'safe' version and trust
				475	* wake_q to do the right thing depending whether or not the @task is already
				476	* queued for wakeup.
				477	*/
				478	void wake_q_add_safe(struct wake_q_head head, struct task_struct task)
				479	{
				480	if (!__wake_q_add(head, task))
				481	put_task_struct(task);
				482	}
				483
				484	void wake_up_q(struct wake_q_head *head)
				485	{
				486	struct wake_q_node *node = head->first;
				487
				488	while (node != WAKE_Q_TAIL) {
				489	struct task_struct *task;
				490
				491	task = container_of(node, struct task_struct, wake_q);
				492	BUG_ON(!task);
				493	/* Task can safely be re-inserted now: */
				494	node = node->next;
				495	task->wake_q.next = NULL;
				496
				497	/*
				498	* wake_up_process() executes a full barrier, which pairs with
				499	* the queueing in wake_q_add() so as not to miss wakeups.
				500	*/
				501	wake_up_process(task);
				502	put_task_struct(task);
				503	}
				504	}
				505
				506	/*
				507	* resched_curr - mark rq's current task 'to be rescheduled now'.
				508	*
				509	* On UP this means the setting of the need_resched flag, on SMP it
				510	* might also involve a cross-CPU call to trigger the scheduler on
				511	* the target CPU.
				512	*/
				513	void resched_curr(struct rq *rq)
				514	{
				515	struct task_struct *curr = rq->curr;
				516	int cpu;
				517
				518	lockdep_assert_held(&rq->lock);
				519
				520	if (test_tsk_need_resched(curr))
				521	return;
				522
				523	cpu = cpu_of(rq);
				524
				525	if (cpu == smp_processor_id()) {
				526	set_tsk_need_resched(curr);
				527	set_preempt_need_resched();
				528	return;
				529	}
				530
				531	if (set_nr_and_not_polling(curr))
				532	smp_send_reschedule(cpu);
				533	else
				534	trace_sched_wake_idle_without_ipi(cpu);
				535	}
				536
				537	void resched_cpu(int cpu)
				538	{
				539	struct rq *rq = cpu_rq(cpu);
				540	unsigned long flags;
				541
				542	raw_spin_lock_irqsave(&rq->lock, flags);
				543	if (cpu_online(cpu) \|\| cpu == smp_processor_id())
				544	resched_curr(rq);
				545	raw_spin_unlock_irqrestore(&rq->lock, flags);
				546	}
				547
				548	#ifdef CONFIG_SMP
				549	#ifdef CONFIG_NO_HZ_COMMON
				550	/*
				551	* In the semi idle case, use the nearest busy CPU for migrating timers
				552	* from an idle CPU. This is good for power-savings.
				553	*
				554	* We don't do similar optimization for completely idle system, as
				555	* selecting an idle CPU will add more delays to the timers than intended
				556	* (as that CPU's timer base may not be uptodate wrt jiffies etc).
				557	*/
				558	int get_nohz_timer_target(void)
				559	{
				560	int i, cpu = smp_processor_id();
				561	struct sched_domain *sd;
				562
				563	if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
				564	return cpu;
				565
				566	rcu_read_lock();
				567	for_each_domain(cpu, sd) {
				568	for_each_cpu(i, sched_domain_span(sd)) {
				569	if (cpu == i)
				570	continue;
				571
				572	if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
				573	cpu = i;
				574	goto unlock;
				575	}
				576	}
				577	}
				578
				579	if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
				580	cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
				581	unlock:
				582	rcu_read_unlock();
				583	return cpu;
				584	}
				585
				586	/*
				587	* When add_timer_on() enqueues a timer into the timer wheel of an
				588	* idle CPU then this timer might expire before the next timer event
				589	* which is scheduled to wake up that CPU. In case of a completely
				590	* idle system the next event might even be infinite time into the
				591	* future. wake_up_idle_cpu() ensures that the CPU is woken up and
				592	* leaves the inner idle loop so the newly added timer is taken into
				593	* account when the CPU goes back to idle and evaluates the timer
				594	* wheel for the next timer event.
				595	*/
				596	static void wake_up_idle_cpu(int cpu)
				597	{
				598	struct rq *rq = cpu_rq(cpu);
				599
				600	if (cpu == smp_processor_id())
				601	return;
				602
				603	if (set_nr_and_not_polling(rq->idle))
				604	smp_send_reschedule(cpu);
				605	else
				606	trace_sched_wake_idle_without_ipi(cpu);
				607	}
				608
				609	static bool wake_up_full_nohz_cpu(int cpu)
				610	{
				611	/*
				612	* We just need the target to call irq_exit() and re-evaluate
				613	* the next tick. The nohz full kick at least implies that.
				614	* If needed we can still optimize that later with an
				615	* empty IRQ.
				616	*/
				617	if (cpu_is_offline(cpu))
				618	return true; /* Don't try to wake offline CPUs. */
				619	if (tick_nohz_full_cpu(cpu)) {
				620	if (cpu != smp_processor_id() \|\|
				621	tick_nohz_tick_stopped())
				622	tick_nohz_full_kick_cpu(cpu);
				623	return true;
				624	}
				625
				626	return false;
				627	}
				628
				629	/*
				630	* Wake up the specified CPU. If the CPU is going offline, it is the
				631	* caller's responsibility to deal with the lost wakeup, for example,
				632	* by hooking into the CPU_DEAD notifier like timers and hrtimers do.
				633	*/
				634	void wake_up_nohz_cpu(int cpu)
				635	{
				636	if (!wake_up_full_nohz_cpu(cpu))
				637	wake_up_idle_cpu(cpu);
				638	}
				639
				640	static inline bool got_nohz_idle_kick(void)
				641	{
				642	int cpu = smp_processor_id();
				643
				644	if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
				645	return false;
				646
				647	if (idle_cpu(cpu) && !need_resched())
				648	return true;
				649
				650	/*
				651	* We can't run Idle Load Balance on this CPU for this time so we
				652	* cancel it and clear NOHZ_BALANCE_KICK
				653	*/
				654	atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
				655	return false;
				656	}
				657
				658	#else /* CONFIG_NO_HZ_COMMON */
				659
				660	static inline bool got_nohz_idle_kick(void)
				661	{
				662	return false;
				663	}
				664
				665	#endif /* CONFIG_NO_HZ_COMMON */
				666
				667	#ifdef CONFIG_NO_HZ_FULL
				668	bool sched_can_stop_tick(struct rq *rq)
				669	{
				670	int fifo_nr_running;
				671
				672	/* Deadline tasks, even if single, need the tick */
				673	if (rq->dl.dl_nr_running)
				674	return false;
				675
				676	/*
				677	* If there are more than one RR tasks, we need the tick to effect the
				678	* actual RR behaviour.
				679	*/
				680	if (rq->rt.rr_nr_running) {
				681	if (rq->rt.rr_nr_running == 1)
				682	return true;
				683	else
				684	return false;
				685	}
				686
				687	/*
				688	* If there's no RR tasks, but FIFO tasks, we can skip the tick, no
				689	* forced preemption between FIFO tasks.
				690	*/
				691	fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
				692	if (fifo_nr_running)
				693	return true;
				694
				695	/*
				696	* If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
				697	* if there's more than one we need the tick for involuntary
				698	* preemption.
				699	*/
				700	if (rq->nr_running > 1)
				701	return false;
				702
				703	return true;
				704	}
				705	#endif /* CONFIG_NO_HZ_FULL */
				706	#endif /* CONFIG_SMP */
				707
				708	#if defined(CONFIG_RT_GROUP_SCHED) \|\| (defined(CONFIG_FAIR_GROUP_SCHED) && \
				709	(defined(CONFIG_SMP) \|\| defined(CONFIG_CFS_BANDWIDTH)))
				710	/*
				711	* Iterate task_group tree rooted at *from, calling @down when first entering a
				712	* node and @up when leaving it for the final time.
				713	*
				714	* Caller must hold rcu_lock or sufficient equivalent.
				715	*/
				716	int walk_tg_tree_from(struct task_group *from,
				717	tg_visitor down, tg_visitor up, void *data)
				718	{
				719	struct task_group parent, child;
				720	int ret;
				721
				722	parent = from;
				723
				724	down:
				725	ret = (*down)(parent, data);
				726	if (ret)
				727	goto out;
				728	list_for_each_entry_rcu(child, &parent->children, siblings) {
				729	parent = child;
				730	goto down;
				731
				732	up:
				733	continue;
				734	}
				735	ret = (*up)(parent, data);
				736	if (ret \|\| parent == from)
				737	goto out;
				738
				739	child = parent;
				740	parent = parent->parent;
				741	if (parent)
				742	goto up;
				743	out:
				744	return ret;
				745	}
				746
				747	int tg_nop(struct task_group tg, void data)
				748	{
				749	return 0;
				750	}
				751	#endif
				752
				753	static void set_load_weight(struct task_struct *p, bool update_load)
				754	{
				755	int prio = p->static_prio - MAX_RT_PRIO;
				756	struct load_weight *load = &p->se.load;
				757
				758	/*
				759	* SCHED_IDLE tasks get minimal weight:
				760	*/
				761	if (task_has_idle_policy(p)) {
				762	load->weight = scale_load(WEIGHT_IDLEPRIO);
				763	load->inv_weight = WMULT_IDLEPRIO;
				764	p->se.runnable_weight = load->weight;
				765	return;
				766	}
				767
				768	/*
				769	* SCHED_OTHER tasks have to update their load when changing their
				770	* weight
				771	*/
				772	if (update_load && p->sched_class == &fair_sched_class) {
				773	reweight_task(p, prio);
				774	} else {
				775	load->weight = scale_load(sched_prio_to_weight[prio]);
				776	load->inv_weight = sched_prio_to_wmult[prio];
				777	p->se.runnable_weight = load->weight;
				778	}
				779	}
				780
				781	#ifdef CONFIG_UCLAMP_TASK
				782	/*
				783	* Serializes updates of utilization clamp values
				784	*
				785	* The (slow-path) user-space triggers utilization clamp value updates which
				786	* can require updates on (fast-path) scheduler's data structures used to
				787	* support enqueue/dequeue operations.
				788	* While the per-CPU rq lock protects fast-path update operations, user-space
				789	* requests are serialized using a mutex to reduce the risk of conflicting
				790	* updates or API abuses.
				791	*/
				792	static DEFINE_MUTEX(uclamp_mutex);
				793
				794	/* Max allowed minimum utilization */
				795	unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
				796
				797	/* Max allowed maximum utilization */
				798	unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
				799
				800	/*
				801	* By default RT tasks run at the maximum performance point/capacity of the
				802	* system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
				803	* SCHED_CAPACITY_SCALE.
				804	*
				805	* This knob allows admins to change the default behavior when uclamp is being
				806	* used. In battery powered devices, particularly, running at the maximum
				807	* capacity and frequency will increase energy consumption and shorten the
				808	* battery life.
				809	*
				810	* This knob only affects RT tasks that their uclamp_se->user_defined == false.
				811	*
				812	* This knob will not override the system default sched_util_clamp_min defined
				813	* above.
				814	*/
				815	unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
				816
				817	/* All clamps are required to be less or equal than these values */
				818	static struct uclamp_se uclamp_default[UCLAMP_CNT];
				819
				820	/*
				821	* This static key is used to reduce the uclamp overhead in the fast path. It
				822	* primarily disables the call to uclamp_rq_{inc, dec}() in
				823	* enqueue/dequeue_task().
				824	*
				825	* This allows users to continue to enable uclamp in their kernel config with
				826	* minimum uclamp overhead in the fast path.
				827	*
				828	* As soon as userspace modifies any of the uclamp knobs, the static key is
				829	* enabled, since we have an actual users that make use of uclamp
				830	* functionality.
				831	*
				832	* The knobs that would enable this static key are:
				833	*
				834	* * A task modifying its uclamp value with sched_setattr().
				835	* * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
				836	* * An admin modifying the cgroup cpu.uclamp.{min, max}
				837	*/
				838	DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
				839
				840	/* Integer rounded range for each bucket */
				841	#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
				842
				843	#define for_each_clamp_id(clamp_id) \
				844	for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
				845
				846	static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
				847	{
				848	return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
				849	}
				850
				851	static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
				852	{
				853	return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
				854	}
				855
				856	static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
				857	{
				858	if (clamp_id == UCLAMP_MIN)
				859	return 0;
				860	return SCHED_CAPACITY_SCALE;
				861	}
				862
				863	static inline void uclamp_se_set(struct uclamp_se *uc_se,
				864	unsigned int value, bool user_defined)
				865	{
				866	uc_se->value = value;
				867	uc_se->bucket_id = uclamp_bucket_id(value);
				868	uc_se->user_defined = user_defined;
				869	}
				870
				871	static inline unsigned int
				872	uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
				873	unsigned int clamp_value)
				874	{
				875	/*
				876	* Avoid blocked utilization pushing up the frequency when we go
				877	* idle (which drops the max-clamp) by retaining the last known
				878	* max-clamp.
				879	*/
				880	if (clamp_id == UCLAMP_MAX) {
				881	rq->uclamp_flags \|= UCLAMP_FLAG_IDLE;
				882	return clamp_value;
				883	}
				884
				885	return uclamp_none(UCLAMP_MIN);
				886	}
				887
				888	static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
				889	unsigned int clamp_value)
				890	{
				891	/* Reset max-clamp retention only on idle exit */
				892	if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
				893	return;
				894
				895	WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
				896	}
				897
				898	static inline
				899	unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
				900	unsigned int clamp_value)
				901	{
				902	struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
				903	int bucket_id = UCLAMP_BUCKETS - 1;
				904
				905	/*
				906	* Since both min and max clamps are max aggregated, find the
				907	* top most bucket with tasks in.
				908	*/
				909	for ( ; bucket_id >= 0; bucket_id--) {
				910	if (!bucket[bucket_id].tasks)
				911	continue;
				912	return bucket[bucket_id].value;
				913	}
				914
				915	/* No tasks -- default clamp values */
				916	return uclamp_idle_value(rq, clamp_id, clamp_value);
				917	}
				918
				919	static void __uclamp_update_util_min_rt_default(struct task_struct *p)
				920	{
				921	unsigned int default_util_min;
				922	struct uclamp_se *uc_se;
				923
				924	lockdep_assert_held(&p->pi_lock);
				925
				926	uc_se = &p->uclamp_req[UCLAMP_MIN];
				927
				928	/* Only sync if user didn't override the default */
				929	if (uc_se->user_defined)
				930	return;
				931
				932	default_util_min = sysctl_sched_uclamp_util_min_rt_default;
				933	uclamp_se_set(uc_se, default_util_min, false);
				934	}
				935
				936	static void uclamp_update_util_min_rt_default(struct task_struct *p)
				937	{
				938	struct rq_flags rf;
				939	struct rq *rq;
				940
				941	if (!rt_task(p))
				942	return;
				943
				944	/* Protect updates to p->uclamp_* */
				945	rq = task_rq_lock(p, &rf);
				946	__uclamp_update_util_min_rt_default(p);
				947	task_rq_unlock(rq, p, &rf);
				948	}
				949
				950	static void uclamp_sync_util_min_rt_default(void)
				951	{
				952	struct task_struct g, p;
				953
				954	/*
				955	* copy_process() sysctl_uclamp
				956	* uclamp_min_rt = X;
				957	* write_lock(&tasklist_lock) read_lock(&tasklist_lock)
				958	* // link thread smp_mb__after_spinlock()
				959	* write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
				960	* sched_post_fork() for_each_process_thread()
				961	* __uclamp_sync_rt() __uclamp_sync_rt()
				962	*
				963	* Ensures that either sched_post_fork() will observe the new
				964	* uclamp_min_rt or for_each_process_thread() will observe the new
				965	* task.
				966	*/
				967	read_lock(&tasklist_lock);
				968	smp_mb__after_spinlock();
				969	read_unlock(&tasklist_lock);
				970
				971	rcu_read_lock();
				972	for_each_process_thread(g, p)
				973	uclamp_update_util_min_rt_default(p);
				974	rcu_read_unlock();
				975	}
				976
				977	static inline struct uclamp_se
				978	uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
				979	{
				980	/* Copy by value as we could modify it */
				981	struct uclamp_se uc_req = p->uclamp_req[clamp_id];
				982	#ifdef CONFIG_UCLAMP_TASK_GROUP
				983	unsigned int tg_min, tg_max, value;
				984
				985	/*
				986	* Tasks in autogroups or root task group will be
				987	* restricted by system defaults.
				988	*/
				989	if (task_group_is_autogroup(task_group(p)))
				990	return uc_req;
				991	if (task_group(p) == &root_task_group)
				992	return uc_req;
				993
				994	tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
				995	tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
				996	value = uc_req.value;
				997	value = clamp(value, tg_min, tg_max);
				998	uclamp_se_set(&uc_req, value, false);
				999	#endif
				1000
				1001	return uc_req;
				1002	}
				1003
				1004	/*
				1005	* The effective clamp bucket index of a task depends on, by increasing
				1006	* priority:
				1007	* - the task specific clamp value, when explicitly requested from userspace
				1008	* - the task group effective clamp value, for tasks not either in the root
				1009	* group or in an autogroup
				1010	* - the system default clamp value, defined by the sysadmin
				1011	*/
				1012	static inline struct uclamp_se
				1013	uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
				1014	{
				1015	struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
				1016	struct uclamp_se uc_max = uclamp_default[clamp_id];
				1017
				1018	/* System default restrictions always apply */
				1019	if (unlikely(uc_req.value > uc_max.value))
				1020	return uc_max;
				1021
				1022	return uc_req;
				1023	}
				1024
				1025	unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
				1026	{
				1027	struct uclamp_se uc_eff;
				1028
				1029	/* Task currently refcounted: use back-annotated (effective) value */
				1030	if (p->uclamp[clamp_id].active)
				1031	return (unsigned long)p->uclamp[clamp_id].value;
				1032
				1033	uc_eff = uclamp_eff_get(p, clamp_id);
				1034
				1035	return (unsigned long)uc_eff.value;
				1036	}
				1037
				1038	/*
				1039	* When a task is enqueued on a rq, the clamp bucket currently defined by the
				1040	* task's uclamp::bucket_id is refcounted on that rq. This also immediately
				1041	* updates the rq's clamp value if required.
				1042	*
				1043	* Tasks can have a task-specific value requested from user-space, track
				1044	* within each bucket the maximum value for tasks refcounted in it.
				1045	* This "local max aggregation" allows to track the exact "requested" value
				1046	* for each bucket when all its RUNNABLE tasks require the same clamp.
				1047	*/
				1048	static inline void uclamp_rq_inc_id(struct rq rq, struct task_struct p,
				1049	enum uclamp_id clamp_id)
				1050	{
				1051	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
				1052	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
				1053	struct uclamp_bucket *bucket;
				1054
				1055	lockdep_assert_held(&rq->lock);
				1056
				1057	/* Update task effective clamp */
				1058	p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
				1059
				1060	bucket = &uc_rq->bucket[uc_se->bucket_id];
				1061	bucket->tasks++;
				1062	uc_se->active = true;
				1063
				1064	uclamp_idle_reset(rq, clamp_id, uc_se->value);
				1065
				1066	/*
				1067	* Local max aggregation: rq buckets always track the max
				1068	* "requested" clamp value of its RUNNABLE tasks.
				1069	*/
				1070	if (bucket->tasks == 1 \|\| uc_se->value > bucket->value)
				1071	bucket->value = uc_se->value;
				1072
				1073	if (uc_se->value > READ_ONCE(uc_rq->value))
				1074	WRITE_ONCE(uc_rq->value, uc_se->value);
				1075	}
				1076
				1077	/*
				1078	* When a task is dequeued from a rq, the clamp bucket refcounted by the task
				1079	* is released. If this is the last task reference counting the rq's max
				1080	* active clamp value, then the rq's clamp value is updated.
				1081	*
				1082	* Both refcounted tasks and rq's cached clamp values are expected to be
				1083	* always valid. If it's detected they are not, as defensive programming,
				1084	* enforce the expected state and warn.
				1085	*/
				1086	static inline void uclamp_rq_dec_id(struct rq rq, struct task_struct p,
				1087	enum uclamp_id clamp_id)
				1088	{
				1089	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
				1090	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
				1091	struct uclamp_bucket *bucket;
				1092	unsigned int bkt_clamp;
				1093	unsigned int rq_clamp;
				1094
				1095	lockdep_assert_held(&rq->lock);
				1096
				1097	/*
				1098	* If sched_uclamp_used was enabled after task @p was enqueued,
				1099	* we could end up with unbalanced call to uclamp_rq_dec_id().
				1100	*
				1101	* In this case the uc_se->active flag should be false since no uclamp
				1102	* accounting was performed at enqueue time and we can just return
				1103	* here.
				1104	*
				1105	* Need to be careful of the following enqeueue/dequeue ordering
				1106	* problem too
				1107	*
				1108	* enqueue(taskA)
				1109	* // sched_uclamp_used gets enabled
				1110	* enqueue(taskB)
				1111	* dequeue(taskA)
				1112	* // Must not decrement bukcet->tasks here
				1113	* dequeue(taskB)
				1114	*
				1115	* where we could end up with stale data in uc_se and
				1116	* bucket[uc_se->bucket_id].
				1117	*
				1118	* The following check here eliminates the possibility of such race.
				1119	*/
				1120	if (unlikely(!uc_se->active))
				1121	return;
				1122
				1123	bucket = &uc_rq->bucket[uc_se->bucket_id];
				1124
				1125	SCHED_WARN_ON(!bucket->tasks);
				1126	if (likely(bucket->tasks))
				1127	bucket->tasks--;
				1128
				1129	uc_se->active = false;
				1130
				1131	/*
				1132	* Keep "local max aggregation" simple and accept to (possibly)
				1133	* overboost some RUNNABLE tasks in the same bucket.
				1134	* The rq clamp bucket value is reset to its base value whenever
				1135	* there are no more RUNNABLE tasks refcounting it.
				1136	*/
				1137	if (likely(bucket->tasks))
				1138	return;
				1139
				1140	rq_clamp = READ_ONCE(uc_rq->value);
				1141	/*
				1142	* Defensive programming: this should never happen. If it happens,
				1143	* e.g. due to future modification, warn and fixup the expected value.
				1144	*/
				1145	SCHED_WARN_ON(bucket->value > rq_clamp);
				1146	if (bucket->value >= rq_clamp) {
				1147	bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
				1148	WRITE_ONCE(uc_rq->value, bkt_clamp);
				1149	}
				1150	}
				1151
				1152	static inline void uclamp_rq_inc(struct rq rq, struct task_struct p)
				1153	{
				1154	enum uclamp_id clamp_id;
				1155
				1156	/*
				1157	* Avoid any overhead until uclamp is actually used by the userspace.
				1158	*
				1159	* The condition is constructed such that a NOP is generated when
				1160	* sched_uclamp_used is disabled.
				1161	*/
				1162	if (!static_branch_unlikely(&sched_uclamp_used))
				1163	return;
				1164
				1165	if (unlikely(!p->sched_class->uclamp_enabled))
				1166	return;
				1167
				1168	for_each_clamp_id(clamp_id)
				1169	uclamp_rq_inc_id(rq, p, clamp_id);
				1170
				1171	/* Reset clamp idle holding when there is one RUNNABLE task */
				1172	if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
				1173	rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
				1174	}
				1175
				1176	static inline void uclamp_rq_dec(struct rq rq, struct task_struct p)
				1177	{
				1178	enum uclamp_id clamp_id;
				1179
				1180	/*
				1181	* Avoid any overhead until uclamp is actually used by the userspace.
				1182	*
				1183	* The condition is constructed such that a NOP is generated when
				1184	* sched_uclamp_used is disabled.
				1185	*/
				1186	if (!static_branch_unlikely(&sched_uclamp_used))
				1187	return;
				1188
				1189	if (unlikely(!p->sched_class->uclamp_enabled))
				1190	return;
				1191
				1192	for_each_clamp_id(clamp_id)
				1193	uclamp_rq_dec_id(rq, p, clamp_id);
				1194	}
				1195
				1196	static inline void uclamp_rq_reinc_id(struct rq rq, struct task_struct p,
				1197	enum uclamp_id clamp_id)
				1198	{
				1199	if (!p->uclamp[clamp_id].active)
				1200	return;
				1201
				1202	uclamp_rq_dec_id(rq, p, clamp_id);
				1203	uclamp_rq_inc_id(rq, p, clamp_id);
				1204
				1205	/*
				1206	* Make sure to clear the idle flag if we've transiently reached 0
				1207	* active tasks on rq.
				1208	*/
				1209	if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
				1210	rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
				1211	}
				1212
				1213	static inline void
				1214	uclamp_update_active(struct task_struct *p)
				1215	{
				1216	enum uclamp_id clamp_id;
				1217	struct rq_flags rf;
				1218	struct rq *rq;
				1219
				1220	/*
				1221	* Lock the task and the rq where the task is (or was) queued.
				1222	*
				1223	* We might lock the (previous) rq of a !RUNNABLE task, but that's the
				1224	* price to pay to safely serialize util_{min,max} updates with
				1225	* enqueues, dequeues and migration operations.
				1226	* This is the same locking schema used by __set_cpus_allowed_ptr().
				1227	*/
				1228	rq = task_rq_lock(p, &rf);
				1229
				1230	/*
				1231	* Setting the clamp bucket is serialized by task_rq_lock().
				1232	* If the task is not yet RUNNABLE and its task_struct is not
				1233	* affecting a valid clamp bucket, the next time it's enqueued,
				1234	* it will already see the updated clamp bucket value.
				1235	*/
				1236	for_each_clamp_id(clamp_id)
				1237	uclamp_rq_reinc_id(rq, p, clamp_id);
				1238
				1239	task_rq_unlock(rq, p, &rf);
				1240	}
				1241
				1242	#ifdef CONFIG_UCLAMP_TASK_GROUP
				1243	static inline void
				1244	uclamp_update_active_tasks(struct cgroup_subsys_state *css)
				1245	{
				1246	struct css_task_iter it;
				1247	struct task_struct *p;
				1248
				1249	css_task_iter_start(css, 0, &it);
				1250	while ((p = css_task_iter_next(&it)))
				1251	uclamp_update_active(p);
				1252	css_task_iter_end(&it);
				1253	}
				1254
				1255	static void cpu_util_update_eff(struct cgroup_subsys_state *css);
				1256	static void uclamp_update_root_tg(void)
				1257	{
				1258	struct task_group *tg = &root_task_group;
				1259
				1260	uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
				1261	sysctl_sched_uclamp_util_min, false);
				1262	uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
				1263	sysctl_sched_uclamp_util_max, false);
				1264
				1265	rcu_read_lock();
				1266	cpu_util_update_eff(&root_task_group.css);
				1267	rcu_read_unlock();
				1268	}
				1269	#else
				1270	static void uclamp_update_root_tg(void) { }
				1271	#endif
				1272
				1273	int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
				1274	void __user buffer, size_t lenp,
				1275	loff_t *ppos)
				1276	{
				1277	bool update_root_tg = false;
				1278	int old_min, old_max, old_min_rt;
				1279	int result;
				1280
				1281	mutex_lock(&uclamp_mutex);
				1282	old_min = sysctl_sched_uclamp_util_min;
				1283	old_max = sysctl_sched_uclamp_util_max;
				1284	old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
				1285
				1286	result = proc_dointvec(table, write, buffer, lenp, ppos);
				1287	if (result)
				1288	goto undo;
				1289	if (!write)
				1290	goto done;
				1291
				1292	if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max \|\|
				1293	sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE \|\|
				1294	sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
				1295
				1296	result = -EINVAL;
				1297	goto undo;
				1298	}
				1299
				1300	if (old_min != sysctl_sched_uclamp_util_min) {
				1301	uclamp_se_set(&uclamp_default[UCLAMP_MIN],
				1302	sysctl_sched_uclamp_util_min, false);
				1303	update_root_tg = true;
				1304	}
				1305	if (old_max != sysctl_sched_uclamp_util_max) {
				1306	uclamp_se_set(&uclamp_default[UCLAMP_MAX],
				1307	sysctl_sched_uclamp_util_max, false);
				1308	update_root_tg = true;
				1309	}
				1310
				1311	if (update_root_tg) {
				1312	static_branch_enable(&sched_uclamp_used);
				1313	uclamp_update_root_tg();
				1314	}
				1315
				1316	if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
				1317	static_branch_enable(&sched_uclamp_used);
				1318	uclamp_sync_util_min_rt_default();
				1319	}
				1320
				1321	/*
				1322	* We update all RUNNABLE tasks only when task groups are in use.
				1323	* Otherwise, keep it simple and do just a lazy update at each next
				1324	* task enqueue time.
				1325	*/
				1326
				1327	goto done;
				1328
				1329	undo:
				1330	sysctl_sched_uclamp_util_min = old_min;
				1331	sysctl_sched_uclamp_util_max = old_max;
				1332	sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
				1333	done:
				1334	mutex_unlock(&uclamp_mutex);
				1335
				1336	return result;
				1337	}
				1338
				1339	static int uclamp_validate(struct task_struct *p,
				1340	const struct sched_attr *attr)
				1341	{
				1342	unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
				1343	unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
				1344
				1345	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
				1346	lower_bound = attr->sched_util_min;
				1347	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
				1348	upper_bound = attr->sched_util_max;
				1349
				1350	if (lower_bound > upper_bound)
				1351	return -EINVAL;
				1352	if (upper_bound > SCHED_CAPACITY_SCALE)
				1353	return -EINVAL;
				1354
				1355	/*
				1356	* We have valid uclamp attributes; make sure uclamp is enabled.
				1357	*
				1358	* We need to do that here, because enabling static branches is a
				1359	* blocking operation which obviously cannot be done while holding
				1360	* scheduler locks.
				1361	*/
				1362	static_branch_enable(&sched_uclamp_used);
				1363
				1364	return 0;
				1365	}
				1366
				1367	static void __setscheduler_uclamp(struct task_struct *p,
				1368	const struct sched_attr *attr)
				1369	{
				1370	enum uclamp_id clamp_id;
				1371
				1372	/*
				1373	* On scheduling class change, reset to default clamps for tasks
				1374	* without a task-specific value.
				1375	*/
				1376	for_each_clamp_id(clamp_id) {
				1377	struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
				1378
				1379	/* Keep using defined clamps across class changes */
				1380	if (uc_se->user_defined)
				1381	continue;
				1382
				1383	/*
				1384	* RT by default have a 100% boost value that could be modified
				1385	* at runtime.
				1386	*/
				1387	if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
				1388	__uclamp_update_util_min_rt_default(p);
				1389	else
				1390	uclamp_se_set(uc_se, uclamp_none(clamp_id), false);
				1391
				1392	}
				1393
				1394	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
				1395	return;
				1396
				1397	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
				1398	uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
				1399	attr->sched_util_min, true);
				1400	}
				1401
				1402	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
				1403	uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
				1404	attr->sched_util_max, true);
				1405	}
				1406	}
				1407
				1408	static void uclamp_fork(struct task_struct *p)
				1409	{
				1410	enum uclamp_id clamp_id;
				1411
				1412	/*
				1413	* We don't need to hold task_rq_lock() when updating p->uclamp_* here
				1414	* as the task is still at its early fork stages.
				1415	*/
				1416	for_each_clamp_id(clamp_id)
				1417	p->uclamp[clamp_id].active = false;
				1418
				1419	if (likely(!p->sched_reset_on_fork))
				1420	return;
				1421
				1422	for_each_clamp_id(clamp_id) {
				1423	uclamp_se_set(&p->uclamp_req[clamp_id],
				1424	uclamp_none(clamp_id), false);
				1425	}
				1426	}
				1427
				1428	static void uclamp_post_fork(struct task_struct *p)
				1429	{
				1430	uclamp_update_util_min_rt_default(p);
				1431	}
				1432
				1433	static void __init init_uclamp_rq(struct rq *rq)
				1434	{
				1435	enum uclamp_id clamp_id;
				1436	struct uclamp_rq *uc_rq = rq->uclamp;
				1437
				1438	for_each_clamp_id(clamp_id) {
				1439	uc_rq[clamp_id] = (struct uclamp_rq) {
				1440	.value = uclamp_none(clamp_id)
				1441	};
				1442	}
				1443
				1444	rq->uclamp_flags = UCLAMP_FLAG_IDLE;
				1445	}
				1446
				1447	static void __init init_uclamp(void)
				1448	{
				1449	struct uclamp_se uc_max = {};
				1450	enum uclamp_id clamp_id;
				1451	int cpu;
				1452
				1453	mutex_init(&uclamp_mutex);
				1454
				1455	for_each_possible_cpu(cpu)
				1456	init_uclamp_rq(cpu_rq(cpu));
				1457
				1458	for_each_clamp_id(clamp_id) {
				1459	uclamp_se_set(&init_task.uclamp_req[clamp_id],
				1460	uclamp_none(clamp_id), false);
				1461	}
				1462
				1463	/* System defaults allow max clamp values for both indexes */
				1464	uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
				1465	for_each_clamp_id(clamp_id) {
				1466	uclamp_default[clamp_id] = uc_max;
				1467	#ifdef CONFIG_UCLAMP_TASK_GROUP
				1468	root_task_group.uclamp_req[clamp_id] = uc_max;
				1469	root_task_group.uclamp[clamp_id] = uc_max;
				1470	#endif
				1471	}
				1472	}
				1473
				1474	#else /* CONFIG_UCLAMP_TASK */
				1475	static inline void uclamp_rq_inc(struct rq rq, struct task_struct p) { }
				1476	static inline void uclamp_rq_dec(struct rq rq, struct task_struct p) { }
				1477	static inline int uclamp_validate(struct task_struct *p,
				1478	const struct sched_attr *attr)
				1479	{
				1480	return -EOPNOTSUPP;
				1481	}
				1482	static void __setscheduler_uclamp(struct task_struct *p,
				1483	const struct sched_attr *attr) { }
				1484	static inline void uclamp_fork(struct task_struct *p) { }
				1485	static inline void uclamp_post_fork(struct task_struct *p) { }
				1486	static inline void init_uclamp(void) { }
				1487	#endif /* CONFIG_UCLAMP_TASK */
				1488
				1489	static inline void enqueue_task(struct rq rq, struct task_struct p, int flags)
				1490	{
				1491	if (!(flags & ENQUEUE_NOCLOCK))
				1492	update_rq_clock(rq);
				1493
				1494	if (!(flags & ENQUEUE_RESTORE)) {
				1495	sched_info_queued(rq, p);
				1496	psi_enqueue(p, flags & ENQUEUE_WAKEUP);
				1497	}
				1498
				1499	uclamp_rq_inc(rq, p);
				1500	p->sched_class->enqueue_task(rq, p, flags);
				1501
				1502	trace_android_rvh_enqueue_task(rq, p);
				1503	}
				1504
				1505	static inline void dequeue_task(struct rq rq, struct task_struct p, int flags)
				1506	{
				1507	if (!(flags & DEQUEUE_NOCLOCK))
				1508	update_rq_clock(rq);
				1509
				1510	if (!(flags & DEQUEUE_SAVE)) {
				1511	sched_info_dequeued(rq, p);
				1512	psi_dequeue(p, flags & DEQUEUE_SLEEP);
				1513	}
				1514
				1515	uclamp_rq_dec(rq, p);
				1516	p->sched_class->dequeue_task(rq, p, flags);
				1517
				1518	trace_android_rvh_dequeue_task(rq, p);
				1519	}
				1520
				1521	void activate_task(struct rq rq, struct task_struct p, int flags)
				1522	{
				1523	if (task_on_rq_migrating(p))
				1524	flags \|= ENQUEUE_MIGRATED;
				1525
				1526	if (task_contributes_to_load(p))
				1527	rq->nr_uninterruptible--;
				1528
				1529	enqueue_task(rq, p, flags);
				1530
				1531	p->on_rq = TASK_ON_RQ_QUEUED;
				1532	}
				1533
				1534	void deactivate_task(struct rq rq, struct task_struct p, int flags)
				1535	{
				1536	p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
				1537
				1538	if (task_contributes_to_load(p))
				1539	rq->nr_uninterruptible++;
				1540
				1541	dequeue_task(rq, p, flags);
				1542	}
				1543
				1544	/*
				1545	* __normal_prio - return the priority that is based on the static prio
				1546	*/
				1547	static inline int __normal_prio(struct task_struct *p)
				1548	{
				1549	return p->static_prio;
				1550	}
				1551
				1552	/*
				1553	* Calculate the expected normal priority: i.e. priority
				1554	* without taking RT-inheritance into account. Might be
				1555	* boosted by interactivity modifiers. Changes upon fork,
				1556	* setprio syscalls, and whenever the interactivity
				1557	* estimator recalculates.
				1558	*/
				1559	static inline int normal_prio(struct task_struct *p)
				1560	{
				1561	int prio;
				1562
				1563	if (task_has_dl_policy(p))
				1564	prio = MAX_DL_PRIO-1;
				1565	else if (task_has_rt_policy(p))
				1566	prio = MAX_RT_PRIO-1 - p->rt_priority;
				1567	else
				1568	prio = __normal_prio(p);
				1569	return prio;
				1570	}
				1571
				1572	/*
				1573	* Calculate the current priority, i.e. the priority
				1574	* taken into account by the scheduler. This value might
				1575	* be boosted by RT tasks, or might be boosted by
				1576	* interactivity modifiers. Will be RT if the task got
				1577	* RT-boosted. If not then it returns p->normal_prio.
				1578	*/
				1579	static int effective_prio(struct task_struct *p)
				1580	{
				1581	p->normal_prio = normal_prio(p);
				1582	/*
				1583	* If we are RT tasks or we were boosted to RT priority,
				1584	* keep the priority unchanged. Otherwise, update priority
				1585	* to the normal priority:
				1586	*/
				1587	if (!rt_prio(p->prio))
				1588	return p->normal_prio;
				1589	return p->prio;
				1590	}
				1591
				1592	/**
				1593	* task_curr - is this task currently executing on a CPU?
				1594	* @p: the task in question.
				1595	*
				1596	* Return: 1 if the task is currently executing. 0 otherwise.
				1597	*/
				1598	inline int task_curr(const struct task_struct *p)
				1599	{
				1600	return cpu_curr(task_cpu(p)) == p;
				1601	}
				1602
				1603	/*
				1604	* switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
				1605	* use the balance_callback list if you want balancing.
				1606	*
				1607	* this means any call to check_class_changed() must be followed by a call to
				1608	* balance_callback().
				1609	*/
				1610	static inline void check_class_changed(struct rq rq, struct task_struct p,
				1611	const struct sched_class *prev_class,
				1612	int oldprio)
				1613	{
				1614	if (prev_class != p->sched_class) {
				1615	if (prev_class->switched_from)
				1616	prev_class->switched_from(rq, p);
				1617
				1618	p->sched_class->switched_to(rq, p);
				1619	} else if (oldprio != p->prio \|\| dl_task(p))
				1620	p->sched_class->prio_changed(rq, p, oldprio);
				1621	}
				1622
				1623	void check_preempt_curr(struct rq rq, struct task_struct p, int flags)
				1624	{
				1625	const struct sched_class *class;
				1626
				1627	if (p->sched_class == rq->curr->sched_class) {
				1628	rq->curr->sched_class->check_preempt_curr(rq, p, flags);
				1629	} else {
				1630	for_each_class(class) {
				1631	if (class == rq->curr->sched_class)
				1632	break;
				1633	if (class == p->sched_class) {
				1634	resched_curr(rq);
				1635	break;
				1636	}
				1637	}
				1638	}
				1639
				1640	/*
				1641	* A queue event has occurred, and we're going to schedule. In
				1642	* this case, we can save a useless back to back clock update.
				1643	*/
				1644	if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
				1645	rq_clock_skip_update(rq);
				1646	}
				1647
				1648	#ifdef CONFIG_SMP
				1649
				1650	static inline bool is_per_cpu_kthread(struct task_struct *p)
				1651	{
				1652	if (!(p->flags & PF_KTHREAD))
				1653	return false;
				1654
				1655	if (p->nr_cpus_allowed != 1)
				1656	return false;
				1657
				1658	return true;
				1659	}
				1660
				1661	/*
				1662	* Per-CPU kthreads are allowed to run on !active && online CPUs, see
				1663	* __set_cpus_allowed_ptr() and select_fallback_rq().
				1664	*/
				1665	static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
				1666	{
				1667	if (!cpumask_test_cpu(cpu, p->cpus_ptr))
				1668	return false;
				1669
				1670	if (is_per_cpu_kthread(p))
				1671	return cpu_online(cpu);
				1672
				1673	return cpu_active(cpu);
				1674	}
				1675
				1676	/*
				1677	* This is how migration works:
				1678	*
				1679	* 1) we invoke migration_cpu_stop() on the target CPU using
				1680	* stop_one_cpu().
				1681	* 2) stopper starts to run (implicitly forcing the migrated thread
				1682	* off the CPU)
				1683	* 3) it checks whether the migrated task is still in the wrong runqueue.
				1684	* 4) if it's in the wrong runqueue then the migration thread removes
				1685	* it and puts it into the right queue.
				1686	* 5) stopper completes and stop_one_cpu() returns and the migration
				1687	* is done.
				1688	*/
				1689
				1690	/*
				1691	* move_queued_task - move a queued task to new rq.
				1692	*
				1693	* Returns (locked) new rq. Old rq's lock is released.
				1694	*/
				1695	static struct rq move_queued_task(struct rq rq, struct rq_flags *rf,
				1696	struct task_struct *p, int new_cpu)
				1697	{
				1698	lockdep_assert_held(&rq->lock);
				1699
				1700	WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
				1701	dequeue_task(rq, p, DEQUEUE_NOCLOCK);
				1702	set_task_cpu(p, new_cpu);
				1703	rq_unlock(rq, rf);
				1704
				1705	rq = cpu_rq(new_cpu);
				1706
				1707	rq_lock(rq, rf);
				1708	BUG_ON(task_cpu(p) != new_cpu);
				1709	enqueue_task(rq, p, 0);
				1710	p->on_rq = TASK_ON_RQ_QUEUED;
				1711	check_preempt_curr(rq, p, 0);
				1712
				1713	return rq;
				1714	}
				1715
				1716	struct migration_arg {
				1717	struct task_struct *task;
				1718	int dest_cpu;
				1719	};
				1720
				1721	/*
				1722	* Move (not current) task off this CPU, onto the destination CPU. We're doing
				1723	* this because either it can't run here any more (set_cpus_allowed()
				1724	* away from this CPU, or CPU going down), or because we're
				1725	* attempting to rebalance this task on exec (sched_exec).
				1726	*
				1727	* So we race with normal scheduler movements, but that's OK, as long
				1728	* as the task is no longer on this CPU.
				1729	*/
				1730	static struct rq __migrate_task(struct rq rq, struct rq_flags *rf,
				1731	struct task_struct *p, int dest_cpu)
				1732	{
				1733	/* Affinity changed (again). */
				1734	if (!is_cpu_allowed(p, dest_cpu))
				1735	return rq;
				1736
				1737	update_rq_clock(rq);
				1738	rq = move_queued_task(rq, rf, p, dest_cpu);
				1739
				1740	return rq;
				1741	}
				1742
				1743	/*
				1744	* migration_cpu_stop - this will be executed by a highprio stopper thread
				1745	* and performs thread migration by bumping thread off CPU then
				1746	* 'pushing' onto another runqueue.
				1747	*/
				1748	static int migration_cpu_stop(void *data)
				1749	{
				1750	struct migration_arg *arg = data;
				1751	struct task_struct *p = arg->task;
				1752	struct rq *rq = this_rq();
				1753	struct rq_flags rf;
				1754
				1755	/*
				1756	* The original target CPU might have gone down and we might
				1757	* be on another CPU but it doesn't matter.
				1758	*/
				1759	local_irq_disable();
				1760	/*
				1761	* We need to explicitly wake pending tasks before running
				1762	* __migrate_task() such that we will not miss enforcing cpus_ptr
				1763	* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
				1764	*/
				1765	sched_ttwu_pending();
				1766
				1767	raw_spin_lock(&p->pi_lock);
				1768	rq_lock(rq, &rf);
				1769	/*
				1770	* If task_rq(p) != rq, it cannot be migrated here, because we're
				1771	* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
				1772	* we're holding p->pi_lock.
				1773	*/
				1774	if (task_rq(p) == rq) {
				1775	if (task_on_rq_queued(p))
				1776	rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
				1777	else
				1778	p->wake_cpu = arg->dest_cpu;
				1779	}
				1780	rq_unlock(rq, &rf);
				1781	raw_spin_unlock(&p->pi_lock);
				1782
				1783	local_irq_enable();
				1784	return 0;
				1785	}
				1786
				1787	/*
				1788	* sched_class::set_cpus_allowed must do the below, but is not required to
				1789	* actually call this function.
				1790	*/
				1791	void set_cpus_allowed_common(struct task_struct p, const struct cpumask new_mask)
				1792	{
				1793	cpumask_copy(&p->cpus_mask, new_mask);
				1794	p->nr_cpus_allowed = cpumask_weight(new_mask);
				1795	}
				1796
				1797	void do_set_cpus_allowed(struct task_struct p, const struct cpumask new_mask)
				1798	{
				1799	struct rq *rq = task_rq(p);
				1800	bool queued, running;
				1801
				1802	lockdep_assert_held(&p->pi_lock);
				1803
				1804	queued = task_on_rq_queued(p);
				1805	running = task_current(rq, p);
				1806
				1807	if (queued) {
				1808	/*
				1809	* Because __kthread_bind() calls this on blocked tasks without
				1810	* holding rq->lock.
				1811	*/
				1812	lockdep_assert_held(&rq->lock);
				1813	dequeue_task(rq, p, DEQUEUE_SAVE \| DEQUEUE_NOCLOCK);
				1814	}
				1815	if (running)
				1816	put_prev_task(rq, p);
				1817
				1818	p->sched_class->set_cpus_allowed(p, new_mask);
				1819
				1820	if (queued)
				1821	enqueue_task(rq, p, ENQUEUE_RESTORE \| ENQUEUE_NOCLOCK);
				1822	if (running)
				1823	set_next_task(rq, p);
				1824	}
				1825
				1826	/*
				1827	* Change a given task's CPU affinity. Migrate the thread to a
				1828	* proper CPU and schedule it away if the CPU it's executing on
				1829	* is removed from the allowed bitmask.
				1830	*
				1831	* NOTE: the caller must have a valid reference to the task, the
				1832	* task must not exit() & deallocate itself prematurely. The
				1833	* call is not atomic; no spinlocks may be held.
				1834	*/
				1835	static int __set_cpus_allowed_ptr(struct task_struct *p,
				1836	const struct cpumask *new_mask, bool check)
				1837	{
				1838	const struct cpumask *cpu_valid_mask = cpu_active_mask;
				1839	unsigned int dest_cpu;
				1840	struct rq_flags rf;
				1841	struct rq *rq;
				1842	int ret = 0;
				1843
				1844	rq = task_rq_lock(p, &rf);
				1845	update_rq_clock(rq);
				1846
				1847	if (p->flags & PF_KTHREAD) {
				1848	/*
				1849	* Kernel threads are allowed on online && !active CPUs
				1850	*/
				1851	cpu_valid_mask = cpu_online_mask;
				1852	}
				1853
				1854	/*
				1855	* Must re-check here, to close a race against __kthread_bind(),
				1856	* sched_setaffinity() is not guaranteed to observe the flag.
				1857	*/
				1858	if (check && (p->flags & PF_NO_SETAFFINITY)) {
				1859	ret = -EINVAL;
				1860	goto out;
				1861	}
				1862
				1863	if (cpumask_equal(&p->cpus_mask, new_mask))
				1864	goto out;
				1865
				1866	dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
				1867	if (dest_cpu >= nr_cpu_ids) {
				1868	ret = -EINVAL;
				1869	goto out;
				1870	}
				1871
				1872	do_set_cpus_allowed(p, new_mask);
				1873
				1874	if (p->flags & PF_KTHREAD) {
				1875	/*
				1876	* For kernel threads that do indeed end up on online &&
				1877	* !active we want to ensure they are strict per-CPU threads.
				1878	*/
				1879	WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
				1880	!cpumask_intersects(new_mask, cpu_active_mask) &&
				1881	p->nr_cpus_allowed != 1);
				1882	}
				1883
				1884	/* Can the task run on the task's current CPU? If so, we're done */
				1885	if (cpumask_test_cpu(task_cpu(p), new_mask))
				1886	goto out;
				1887
				1888	if (task_running(rq, p) \|\| p->state == TASK_WAKING) {
				1889	struct migration_arg arg = { p, dest_cpu };
				1890	/* Need help from migration thread: drop lock and wait. */
				1891	task_rq_unlock(rq, p, &rf);
				1892	stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
				1893	return 0;
				1894	} else if (task_on_rq_queued(p)) {
				1895	/*
				1896	* OK, since we're going to drop the lock immediately
				1897	* afterwards anyway.
				1898	*/
				1899	rq = move_queued_task(rq, &rf, p, dest_cpu);
				1900	}
				1901	out:
				1902	task_rq_unlock(rq, p, &rf);
				1903
				1904	return ret;
				1905	}
				1906
				1907	int set_cpus_allowed_ptr(struct task_struct p, const struct cpumask new_mask)
				1908	{
				1909	return __set_cpus_allowed_ptr(p, new_mask, false);
				1910	}
				1911	EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
				1912
				1913	void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
				1914	{
				1915	#ifdef CONFIG_SCHED_DEBUG
				1916	/*
				1917	* We should never call set_task_cpu() on a blocked task,
				1918	* ttwu() will sort out the placement.
				1919	*/
				1920	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
				1921	!p->on_rq);
				1922
				1923	/*
				1924	* Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
				1925	* because schedstat_wait_{start,end} rebase migrating task's wait_start
				1926	* time relying on p->on_rq.
				1927	*/
				1928	WARN_ON_ONCE(p->state == TASK_RUNNING &&
				1929	p->sched_class == &fair_sched_class &&
				1930	(p->on_rq && !task_on_rq_migrating(p)));
				1931
				1932	#ifdef CONFIG_LOCKDEP
				1933	/*
				1934	* The caller should hold either p->pi_lock or rq->lock, when changing
				1935	* a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
				1936	*
				1937	* sched_move_task() holds both and thus holding either pins the cgroup,
				1938	* see task_group().
				1939	*
				1940	* Furthermore, all task_rq users should acquire both locks, see
				1941	* task_rq_lock().
				1942	*/
				1943	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) \|\|
				1944	lockdep_is_held(&task_rq(p)->lock)));
				1945	#endif
				1946	/*
				1947	* Clearly, migrating tasks to offline CPUs is a fairly daft thing.
				1948	*/
				1949	WARN_ON_ONCE(!cpu_online(new_cpu));
				1950	#endif
				1951
				1952	trace_sched_migrate_task(p, new_cpu);
				1953
				1954	if (task_cpu(p) != new_cpu) {
				1955	if (p->sched_class->migrate_task_rq)
				1956	p->sched_class->migrate_task_rq(p, new_cpu);
				1957	p->se.nr_migrations++;
				1958	rseq_migrate(p);
				1959	perf_event_task_migrate(p);
				1960	}
				1961
				1962	__set_task_cpu(p, new_cpu);
				1963	}
				1964
				1965	#ifdef CONFIG_NUMA_BALANCING
				1966	static void __migrate_swap_task(struct task_struct *p, int cpu)
				1967	{
				1968	if (task_on_rq_queued(p)) {
				1969	struct rq src_rq, dst_rq;
				1970	struct rq_flags srf, drf;
				1971
				1972	src_rq = task_rq(p);
				1973	dst_rq = cpu_rq(cpu);
				1974
				1975	rq_pin_lock(src_rq, &srf);
				1976	rq_pin_lock(dst_rq, &drf);
				1977
				1978	deactivate_task(src_rq, p, 0);
				1979	set_task_cpu(p, cpu);
				1980	activate_task(dst_rq, p, 0);
				1981	check_preempt_curr(dst_rq, p, 0);
				1982
				1983	rq_unpin_lock(dst_rq, &drf);
				1984	rq_unpin_lock(src_rq, &srf);
				1985
				1986	} else {
				1987	/*
				1988	* Task isn't running anymore; make it appear like we migrated
				1989	* it before it went to sleep. This means on wakeup we make the
				1990	* previous CPU our target instead of where it really is.
				1991	*/
				1992	p->wake_cpu = cpu;
				1993	}
				1994	}
				1995
				1996	struct migration_swap_arg {
				1997	struct task_struct src_task, dst_task;
				1998	int src_cpu, dst_cpu;
				1999	};
				2000
				2001	static int migrate_swap_stop(void *data)
				2002	{
				2003	struct migration_swap_arg *arg = data;
				2004	struct rq src_rq, dst_rq;
				2005	int ret = -EAGAIN;
				2006
				2007	if (!cpu_active(arg->src_cpu) \|\| !cpu_active(arg->dst_cpu))
				2008	return -EAGAIN;
				2009
				2010	src_rq = cpu_rq(arg->src_cpu);
				2011	dst_rq = cpu_rq(arg->dst_cpu);
				2012
				2013	double_raw_lock(&arg->src_task->pi_lock,
				2014	&arg->dst_task->pi_lock);
				2015	double_rq_lock(src_rq, dst_rq);
				2016
				2017	if (task_cpu(arg->dst_task) != arg->dst_cpu)
				2018	goto unlock;
				2019
				2020	if (task_cpu(arg->src_task) != arg->src_cpu)
				2021	goto unlock;
				2022
				2023	if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
				2024	goto unlock;
				2025
				2026	if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
				2027	goto unlock;
				2028
				2029	__migrate_swap_task(arg->src_task, arg->dst_cpu);
				2030	__migrate_swap_task(arg->dst_task, arg->src_cpu);
				2031
				2032	ret = 0;
				2033
				2034	unlock:
				2035	double_rq_unlock(src_rq, dst_rq);
				2036	raw_spin_unlock(&arg->dst_task->pi_lock);
				2037	raw_spin_unlock(&arg->src_task->pi_lock);
				2038
				2039	return ret;
				2040	}
				2041
				2042	/*
				2043	* Cross migrate two tasks
				2044	*/
				2045	int migrate_swap(struct task_struct cur, struct task_struct p,
				2046	int target_cpu, int curr_cpu)
				2047	{
				2048	struct migration_swap_arg arg;
				2049	int ret = -EINVAL;
				2050
				2051	arg = (struct migration_swap_arg){
				2052	.src_task = cur,
				2053	.src_cpu = curr_cpu,
				2054	.dst_task = p,
				2055	.dst_cpu = target_cpu,
				2056	};
				2057
				2058	if (arg.src_cpu == arg.dst_cpu)
				2059	goto out;
				2060
				2061	/*
				2062	* These three tests are all lockless; this is OK since all of them
				2063	* will be re-checked with proper locks held further down the line.
				2064	*/
				2065	if (!cpu_active(arg.src_cpu) \|\| !cpu_active(arg.dst_cpu))
				2066	goto out;
				2067
				2068	if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
				2069	goto out;
				2070
				2071	if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
				2072	goto out;
				2073
				2074	trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
				2075	ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
				2076
				2077	out:
				2078	return ret;
				2079	}
				2080	#endif /* CONFIG_NUMA_BALANCING */
				2081
				2082	/*
				2083	* wait_task_inactive - wait for a thread to unschedule.
				2084	*
				2085	* If @match_state is nonzero, it's the @p->state value just checked and
				2086	* not expected to change. If it changes, i.e. @p might have woken up,
				2087	* then return zero. When we succeed in waiting for @p to be off its CPU,
				2088	* we return a positive number (its total switch count). If a second call
				2089	* a short while later returns the same number, the caller can be sure that
				2090	* @p has remained unscheduled the whole time.
				2091	*
				2092	* The caller must ensure that the task will unschedule sometime soon,
				2093	* else this function might spin for a long time. This function can't
				2094	* be called with interrupts off, or it may introduce deadlock with
				2095	* smp_call_function() if an IPI is sent by the same process we are
				2096	* waiting to become inactive.
				2097	*/
				2098	unsigned long wait_task_inactive(struct task_struct *p, long match_state)
				2099	{
				2100	int running, queued;
				2101	struct rq_flags rf;
				2102	unsigned long ncsw;
				2103	struct rq *rq;
				2104
				2105	for (;;) {
				2106	/*
				2107	* We do the initial early heuristics without holding
				2108	* any task-queue locks at all. We'll only try to get
				2109	* the runqueue lock when things look like they will
				2110	* work out!
				2111	*/
				2112	rq = task_rq(p);
				2113
				2114	/*
				2115	* If the task is actively running on another CPU
				2116	* still, just relax and busy-wait without holding
				2117	* any locks.
				2118	*
				2119	* NOTE! Since we don't hold any locks, it's not
				2120	* even sure that "rq" stays as the right runqueue!
				2121	* But we don't care, since "task_running()" will
				2122	* return false if the runqueue has changed and p
				2123	* is actually now running somewhere else!
				2124	*/
				2125	while (task_running(rq, p)) {
				2126	if (match_state && unlikely(p->state != match_state))
				2127	return 0;
				2128	cpu_relax();
				2129	}
				2130
				2131	/*
				2132	* Ok, time to look more closely! We need the rq
				2133	* lock now, to be sure. If we're wrong, we'll
				2134	* just go back and repeat.
				2135	*/
				2136	rq = task_rq_lock(p, &rf);
				2137	trace_sched_wait_task(p);
				2138	running = task_running(rq, p);
				2139	queued = task_on_rq_queued(p);
				2140	ncsw = 0;
				2141	if (!match_state \|\| p->state == match_state)
				2142	ncsw = p->nvcsw \| LONG_MIN; /* sets MSB */
				2143	task_rq_unlock(rq, p, &rf);
				2144
				2145	/*
				2146	* If it changed from the expected state, bail out now.
				2147	*/
				2148	if (unlikely(!ncsw))
				2149	break;
				2150
				2151	/*
				2152	* Was it really running after all now that we
				2153	* checked with the proper locks actually held?
				2154	*
				2155	* Oops. Go back and try again..
				2156	*/
				2157	if (unlikely(running)) {
				2158	cpu_relax();
				2159	continue;
				2160	}
				2161
				2162	/*
				2163	* It's not enough that it's not actively running,
				2164	* it must be off the runqueue _entirely_, and not
				2165	* preempted!
				2166	*
				2167	* So if it was still runnable (but just not actively
				2168	* running right now), it's preempted, and we should
				2169	* yield - it could be a while.
				2170	*/
				2171	if (unlikely(queued)) {
				2172	ktime_t to = NSEC_PER_SEC / HZ;
				2173
				2174	set_current_state(TASK_UNINTERRUPTIBLE);
				2175	schedule_hrtimeout(&to, HRTIMER_MODE_REL);
				2176	continue;
				2177	}
				2178
				2179	/*
				2180	* Ahh, all good. It wasn't running, and it wasn't
				2181	* runnable, which means that it will never become
				2182	* running in the future either. We're all done!
				2183	*/
				2184	break;
				2185	}
				2186
				2187	return ncsw;
				2188	}
				2189
				2190	/***
				2191	* kick_process - kick a running thread to enter/exit the kernel
				2192	* @p: the to-be-kicked thread
				2193	*
				2194	* Cause a process which is running on another CPU to enter
				2195	* kernel-mode, without any delay. (to get signals handled.)
				2196	*
				2197	* NOTE: this function doesn't have to take the runqueue lock,
				2198	* because all it wants to ensure is that the remote task enters
				2199	* the kernel. If the IPI races and the task has been migrated
				2200	* to another CPU then no harm is done and the purpose has been
				2201	* achieved as well.
				2202	*/
				2203	void kick_process(struct task_struct *p)
				2204	{
				2205	int cpu;
				2206
				2207	preempt_disable();
				2208	cpu = task_cpu(p);
				2209	if ((cpu != smp_processor_id()) && task_curr(p))
				2210	smp_send_reschedule(cpu);
				2211	preempt_enable();
				2212	}
				2213	EXPORT_SYMBOL_GPL(kick_process);
				2214
				2215	/*
				2216	* ->cpus_ptr is protected by both rq->lock and p->pi_lock
				2217	*
				2218	* A few notes on cpu_active vs cpu_online:
				2219	*
				2220	* - cpu_active must be a subset of cpu_online
				2221	*
				2222	* - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
				2223	* see __set_cpus_allowed_ptr(). At this point the newly online
				2224	* CPU isn't yet part of the sched domains, and balancing will not
				2225	* see it.
				2226	*
				2227	* - on CPU-down we clear cpu_active() to mask the sched domains and
				2228	* avoid the load balancer to place new tasks on the to be removed
				2229	* CPU. Existing tasks will remain running there and will be taken
				2230	* off.
				2231	*
				2232	* This means that fallback selection must not select !active CPUs.
				2233	* And can assume that any active CPU must be online. Conversely
				2234	* select_task_rq() below may allow selection of !active CPUs in order
				2235	* to satisfy the above rules.
				2236	*/
				2237	static int select_fallback_rq(int cpu, struct task_struct *p)
				2238	{
				2239	int nid = cpu_to_node(cpu);
				2240	const struct cpumask *nodemask = NULL;
				2241	enum { cpuset, possible, fail } state = cpuset;
				2242	int dest_cpu = -1;
				2243
				2244	trace_android_rvh_select_fallback_rq(cpu, p, &dest_cpu);
				2245	if (dest_cpu >= 0)
				2246	return dest_cpu;
				2247
				2248	/*
				2249	* If the node that the CPU is on has been offlined, cpu_to_node()
				2250	* will return -1. There is no CPU on the node, and we should
				2251	* select the CPU on the other node.
				2252	*/
				2253	if (nid != -1) {
				2254	nodemask = cpumask_of_node(nid);
				2255
				2256	/* Look for allowed, online CPU in same node. */
				2257	for_each_cpu(dest_cpu, nodemask) {
				2258	if (!cpu_active(dest_cpu))
				2259	continue;
				2260	if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
				2261	return dest_cpu;
				2262	}
				2263	}
				2264
				2265	for (;;) {
				2266	/* Any allowed, online CPU? */
				2267	for_each_cpu(dest_cpu, p->cpus_ptr) {
				2268	if (!is_cpu_allowed(p, dest_cpu))
				2269	continue;
				2270
				2271	goto out;
				2272	}
				2273
				2274	/* No more Mr. Nice Guy. */
				2275	switch (state) {
				2276	case cpuset:
				2277	if (IS_ENABLED(CONFIG_CPUSETS)) {
				2278	cpuset_cpus_allowed_fallback(p);
				2279	state = possible;
				2280	break;
				2281	}
				2282	/* Fall-through */
				2283	case possible:
				2284	do_set_cpus_allowed(p, cpu_possible_mask);
				2285	state = fail;
				2286	break;
				2287
				2288	case fail:
				2289	BUG();
				2290	break;
				2291	}
				2292	}
				2293
				2294	out:
				2295	if (state != cpuset) {
				2296	/*
				2297	* Don't tell them about moving exiting tasks or
				2298	* kernel threads (both mm NULL), since they never
				2299	* leave kernel.
				2300	*/
				2301	if (p->mm && printk_ratelimit()) {
				2302	printk_deferred("process %d (%s) no longer affine to cpu%d\n",
				2303	task_pid_nr(p), p->comm, cpu);
				2304	}
				2305	}
				2306
				2307	return dest_cpu;
				2308	}
				2309
				2310	/*
				2311	* The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
				2312	*/
				2313	static inline
				2314	int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
				2315	{
				2316	lockdep_assert_held(&p->pi_lock);
				2317
				2318	if (p->nr_cpus_allowed > 1)
				2319	cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
				2320	else
				2321	cpu = cpumask_any(p->cpus_ptr);
				2322
				2323	/*
				2324	* In order not to call set_task_cpu() on a blocking task we need
				2325	* to rely on ttwu() to place the task on a valid ->cpus_ptr
				2326	* CPU.
				2327	*
				2328	* Since this is common to all placement strategies, this lives here.
				2329	*
				2330	* [ this allows ->select_task() to simply return task_cpu(p) and
				2331	* not worry about this generic constraint ]
				2332	*/
				2333	if (unlikely(!is_cpu_allowed(p, cpu)))
				2334	cpu = select_fallback_rq(task_cpu(p), p);
				2335
				2336	return cpu;
				2337	}
				2338
				2339	static void update_avg(u64 *avg, u64 sample)
				2340	{
				2341	s64 diff = sample - *avg;
				2342	*avg += diff >> 3;
				2343	}
				2344
				2345	void sched_set_stop_task(int cpu, struct task_struct *stop)
				2346	{
				2347	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
				2348	struct task_struct *old_stop = cpu_rq(cpu)->stop;
				2349
				2350	if (stop) {
				2351	/*
				2352	* Make it appear like a SCHED_FIFO task, its something
				2353	* userspace knows about and won't get confused about.
				2354	*
				2355	* Also, it will make PI more or less work without too
				2356	* much confusion -- but then, stop work should not
				2357	* rely on PI working anyway.
				2358	*/
				2359	sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
				2360
				2361	stop->sched_class = &stop_sched_class;
				2362	}
				2363
				2364	cpu_rq(cpu)->stop = stop;
				2365
				2366	if (old_stop) {
				2367	/*
				2368	* Reset it back to a normal scheduling class so that
				2369	* it can die in pieces.
				2370	*/
				2371	old_stop->sched_class = &rt_sched_class;
				2372	}
				2373	}
				2374
				2375	#else
				2376
				2377	static inline int __set_cpus_allowed_ptr(struct task_struct *p,
				2378	const struct cpumask *new_mask, bool check)
				2379	{
				2380	return set_cpus_allowed_ptr(p, new_mask);
				2381	}
				2382
				2383	#endif /* CONFIG_SMP */
				2384
				2385	static void
				2386	ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
				2387	{
				2388	struct rq *rq;
				2389
				2390	if (!schedstat_enabled())
				2391	return;
				2392
				2393	rq = this_rq();
				2394
				2395	#ifdef CONFIG_SMP
				2396	if (cpu == rq->cpu) {
				2397	__schedstat_inc(rq->ttwu_local);
				2398	__schedstat_inc(p->se.statistics.nr_wakeups_local);
				2399	} else {
				2400	struct sched_domain *sd;
				2401
				2402	__schedstat_inc(p->se.statistics.nr_wakeups_remote);
				2403	rcu_read_lock();
				2404	for_each_domain(rq->cpu, sd) {
				2405	if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
				2406	__schedstat_inc(sd->ttwu_wake_remote);
				2407	break;
				2408	}
				2409	}
				2410	rcu_read_unlock();
				2411	}
				2412
				2413	if (wake_flags & WF_MIGRATED)
				2414	__schedstat_inc(p->se.statistics.nr_wakeups_migrate);
				2415	#endif /* CONFIG_SMP */
				2416
				2417	__schedstat_inc(rq->ttwu_count);
				2418	__schedstat_inc(p->se.statistics.nr_wakeups);
				2419
				2420	if (wake_flags & WF_SYNC)
				2421	__schedstat_inc(p->se.statistics.nr_wakeups_sync);
				2422	}
				2423
				2424	/*
				2425	* Mark the task runnable and perform wakeup-preemption.
				2426	*/
				2427	static void ttwu_do_wakeup(struct rq rq, struct task_struct p, int wake_flags,
				2428	struct rq_flags *rf)
				2429	{
				2430	check_preempt_curr(rq, p, wake_flags);
				2431	p->state = TASK_RUNNING;
				2432	trace_sched_wakeup(p);
				2433
				2434	#ifdef CONFIG_SMP
				2435	if (p->sched_class->task_woken) {
				2436	/*
				2437	* Our task @p is fully woken up and running; so its safe to
				2438	* drop the rq->lock, hereafter rq is only used for statistics.
				2439	*/
				2440	rq_unpin_lock(rq, rf);
				2441	p->sched_class->task_woken(rq, p);
				2442	rq_repin_lock(rq, rf);
				2443	}
				2444
				2445	if (rq->idle_stamp) {
				2446	u64 delta = rq_clock(rq) - rq->idle_stamp;
				2447	u64 max = 2*rq->max_idle_balance_cost;
				2448
				2449	update_avg(&rq->avg_idle, delta);
				2450
				2451	if (rq->avg_idle > max)
				2452	rq->avg_idle = max;
				2453
				2454	rq->idle_stamp = 0;
				2455	}
				2456	#endif
				2457	}
				2458
				2459	static void
				2460	ttwu_do_activate(struct rq rq, struct task_struct p, int wake_flags,
				2461	struct rq_flags *rf)
				2462	{
				2463	int en_flags = ENQUEUE_WAKEUP \| ENQUEUE_NOCLOCK;
				2464
				2465	lockdep_assert_held(&rq->lock);
				2466
				2467	#ifdef CONFIG_SMP
				2468	if (p->sched_contributes_to_load)
				2469	rq->nr_uninterruptible--;
				2470
				2471	if (wake_flags & WF_MIGRATED)
				2472	en_flags \|= ENQUEUE_MIGRATED;
				2473	#endif
				2474
				2475	activate_task(rq, p, en_flags);
				2476	ttwu_do_wakeup(rq, p, wake_flags, rf);
				2477	}
				2478
				2479	/*
				2480	* Called in case the task @p isn't fully descheduled from its runqueue,
				2481	* in this case we must do a remote wakeup. Its a 'light' wakeup though,
				2482	* since all we need to do is flip p->state to TASK_RUNNING, since
				2483	* the task is still ->on_rq.
				2484	*/
				2485	static int ttwu_remote(struct task_struct *p, int wake_flags)
				2486	{
				2487	struct rq_flags rf;
				2488	struct rq *rq;
				2489	int ret = 0;
				2490
				2491	rq = __task_rq_lock(p, &rf);
				2492	if (task_on_rq_queued(p)) {
				2493	/* check_preempt_curr() may use rq clock */
				2494	update_rq_clock(rq);
				2495	ttwu_do_wakeup(rq, p, wake_flags, &rf);
				2496	ret = 1;
				2497	}
				2498	__task_rq_unlock(rq, &rf);
				2499
				2500	return ret;
				2501	}
				2502
				2503	#ifdef CONFIG_SMP
				2504	void sched_ttwu_pending(void)
				2505	{
				2506	struct rq *rq = this_rq();
				2507	struct llist_node *llist = llist_del_all(&rq->wake_list);
				2508	struct task_struct p, t;
				2509	struct rq_flags rf;
				2510
				2511	if (!llist)
				2512	return;
				2513
				2514	rq_lock_irqsave(rq, &rf);
				2515	update_rq_clock(rq);
				2516
				2517	llist_for_each_entry_safe(p, t, llist, wake_entry)
				2518	ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
				2519
				2520	rq_unlock_irqrestore(rq, &rf);
				2521	}
				2522
				2523	void scheduler_ipi(void)
				2524	{
				2525	/*
				2526	* Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
				2527	* TIF_NEED_RESCHED remotely (for the first time) will also send
				2528	* this IPI.
				2529	*/
				2530	preempt_fold_need_resched();
				2531
				2532	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
				2533	return;
				2534
				2535	/*
				2536	* Not all reschedule IPI handlers call irq_enter/irq_exit, since
				2537	* traditionally all their work was done from the interrupt return
				2538	* path. Now that we actually do some work, we need to make sure
				2539	* we do call them.
				2540	*
				2541	* Some archs already do call them, luckily irq_enter/exit nest
				2542	* properly.
				2543	*
				2544	* Arguably we should visit all archs and update all handlers,
				2545	* however a fair share of IPIs are still resched only so this would
				2546	* somewhat pessimize the simple resched case.
				2547	*/
				2548	irq_enter();
				2549	sched_ttwu_pending();
				2550
				2551	/*
				2552	* Check if someone kicked us for doing the nohz idle load balance.
				2553	*/
				2554	if (unlikely(got_nohz_idle_kick())) {
				2555	this_rq()->idle_balance = 1;
				2556	raise_softirq_irqoff(SCHED_SOFTIRQ);
				2557	}
				2558	irq_exit();
				2559	}
				2560
				2561	static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
				2562	{
				2563	struct rq *rq = cpu_rq(cpu);
				2564
				2565	p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
				2566
				2567	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
				2568	if (!set_nr_if_polling(rq->idle))
				2569	smp_send_reschedule(cpu);
				2570	else
				2571	trace_sched_wake_idle_without_ipi(cpu);
				2572	}
				2573	}
				2574
				2575	void wake_up_if_idle(int cpu)
				2576	{
				2577	struct rq *rq = cpu_rq(cpu);
				2578	struct rq_flags rf;
				2579
				2580	rcu_read_lock();
				2581
				2582	if (!is_idle_task(rcu_dereference(rq->curr)))
				2583	goto out;
				2584
				2585	if (set_nr_if_polling(rq->idle)) {
				2586	trace_sched_wake_idle_without_ipi(cpu);
				2587	} else {
				2588	rq_lock_irqsave(rq, &rf);
				2589	if (is_idle_task(rq->curr))
				2590	smp_send_reschedule(cpu);
				2591	/* Else CPU is not idle, do nothing here: */
				2592	rq_unlock_irqrestore(rq, &rf);
				2593	}
				2594
				2595	out:
				2596	rcu_read_unlock();
				2597	}
				2598
				2599	bool cpus_share_cache(int this_cpu, int that_cpu)
				2600	{
				2601	if (this_cpu == that_cpu)
				2602	return true;
				2603
				2604	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
				2605	}
				2606	#endif /* CONFIG_SMP */
				2607
				2608	static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
				2609	{
				2610	struct rq *rq = cpu_rq(cpu);
				2611	struct rq_flags rf;
				2612
				2613	#if defined(CONFIG_SMP)
				2614	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
				2615	sched_clock_cpu(cpu); /* Sync clocks across CPUs */
				2616	ttwu_queue_remote(p, cpu, wake_flags);
				2617	return;
				2618	}
				2619	#endif
				2620
				2621	rq_lock(rq, &rf);
				2622	update_rq_clock(rq);
				2623	ttwu_do_activate(rq, p, wake_flags, &rf);
				2624	rq_unlock(rq, &rf);
				2625	}
				2626
				2627	/*
				2628	* Notes on Program-Order guarantees on SMP systems.
				2629	*
				2630	* MIGRATION
				2631	*
				2632	* The basic program-order guarantee on SMP systems is that when a task [t]
				2633	* migrates, all its activity on its old CPU [c0] happens-before any subsequent
				2634	* execution on its new CPU [c1].
				2635	*
				2636	* For migration (of runnable tasks) this is provided by the following means:
				2637	*
				2638	* A) UNLOCK of the rq(c0)->lock scheduling out task t
				2639	* B) migration for t is required to synchronize both rq(c0)->lock and
				2640	* rq(c1)->lock (if not at the same time, then in that order).
				2641	* C) LOCK of the rq(c1)->lock scheduling in task
				2642	*
				2643	* Release/acquire chaining guarantees that B happens after A and C after B.
				2644	* Note: the CPU doing B need not be c0 or c1
				2645	*
				2646	* Example:
				2647	*
				2648	* CPU0 CPU1 CPU2
				2649	*
				2650	* LOCK rq(0)->lock
				2651	* sched-out X
				2652	* sched-in Y
				2653	* UNLOCK rq(0)->lock
				2654	*
				2655	* LOCK rq(0)->lock // orders against CPU0
				2656	* dequeue X
				2657	* UNLOCK rq(0)->lock
				2658	*
				2659	* LOCK rq(1)->lock
				2660	* enqueue X
				2661	* UNLOCK rq(1)->lock
				2662	*
				2663	* LOCK rq(1)->lock // orders against CPU2
				2664	* sched-out Z
				2665	* sched-in X
				2666	* UNLOCK rq(1)->lock
				2667	*
				2668	*
				2669	* BLOCKING -- aka. SLEEP + WAKEUP
				2670	*
				2671	* For blocking we (obviously) need to provide the same guarantee as for
				2672	* migration. However the means are completely different as there is no lock
				2673	* chain to provide order. Instead we do:
				2674	*
				2675	* 1) smp_store_release(X->on_cpu, 0)
				2676	* 2) smp_cond_load_acquire(!X->on_cpu)
				2677	*
				2678	* Example:
				2679	*
				2680	* CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule)
				2681	*
				2682	* LOCK rq(0)->lock LOCK X->pi_lock
				2683	* dequeue X
				2684	* sched-out X
				2685	* smp_store_release(X->on_cpu, 0);
				2686	*
				2687	* smp_cond_load_acquire(&X->on_cpu, !VAL);
				2688	* X->state = WAKING
				2689	* set_task_cpu(X,2)
				2690	*
				2691	* LOCK rq(2)->lock
				2692	* enqueue X
				2693	* X->state = RUNNING
				2694	* UNLOCK rq(2)->lock
				2695	*
				2696	* LOCK rq(2)->lock // orders against CPU1
				2697	* sched-out Z
				2698	* sched-in X
				2699	* UNLOCK rq(2)->lock
				2700	*
				2701	* UNLOCK X->pi_lock
				2702	* UNLOCK rq(0)->lock
				2703	*
				2704	*
				2705	* However, for wakeups there is a second guarantee we must provide, namely we
				2706	* must ensure that CONDITION=1 done by the caller can not be reordered with
				2707	* accesses to the task state; see try_to_wake_up() and set_current_state().
				2708	*/
				2709
				2710	/**
				2711	* try_to_wake_up - wake up a thread
				2712	* @p: the thread to be awakened
				2713	* @state: the mask of task states that can be woken
				2714	* @wake_flags: wake modifier flags (WF_*)
				2715	*
				2716	* If (@state & @p->state) @p->state = TASK_RUNNING.
				2717	*
				2718	* If the task was not queued/runnable, also place it back on a runqueue.
				2719	*
				2720	* Atomic against schedule() which would dequeue a task, also see
				2721	* set_current_state().
				2722	*
				2723	* This function executes a full memory barrier before accessing the task
				2724	* state; see set_current_state().
				2725	*
				2726	* Return: %true if @p->state changes (an actual wakeup was done),
				2727	* %false otherwise.
				2728	*/
				2729	static int
				2730	try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
				2731	{
				2732	unsigned long flags;
				2733	int cpu, success = 0;
				2734
				2735	preempt_disable();
				2736	if (p == current) {
				2737	/*
				2738	* We're waking current, this means 'p->on_rq' and 'task_cpu(p)
				2739	* == smp_processor_id()'. Together this means we can special
				2740	* case the whole 'p->on_rq && ttwu_remote()' case below
				2741	* without taking any locks.
				2742	*
				2743	* In particular:
				2744	* - we rely on Program-Order guarantees for all the ordering,
				2745	* - we're serialized against set_special_state() by virtue of
				2746	* it disabling IRQs (this allows not taking ->pi_lock).
				2747	*/
				2748	if (!(p->state & state))
				2749	goto out;
				2750
				2751	success = 1;
				2752	cpu = task_cpu(p);
				2753	trace_sched_waking(p);
				2754	p->state = TASK_RUNNING;
				2755	trace_sched_wakeup(p);
				2756	goto out;
				2757	}
				2758
				2759	/*
				2760	* If we are going to wake up a thread waiting for CONDITION we
				2761	* need to ensure that CONDITION=1 done by the caller can not be
				2762	* reordered with p->state check below. This pairs with mb() in
				2763	* set_current_state() the waiting thread does.
				2764	*/
				2765	raw_spin_lock_irqsave(&p->pi_lock, flags);
				2766	smp_mb__after_spinlock();
				2767	if (!(p->state & state))
				2768	goto unlock;
				2769
				2770	trace_sched_waking(p);
				2771
				2772	/* We're going to change ->state: */
				2773	success = 1;
				2774	cpu = task_cpu(p);
				2775
				2776	/*
				2777	* Ensure we load p->on_rq _after_ p->state, otherwise it would
				2778	* be possible to, falsely, observe p->on_rq == 0 and get stuck
				2779	* in smp_cond_load_acquire() below.
				2780	*
				2781	* sched_ttwu_pending() try_to_wake_up()
				2782	* STORE p->on_rq = 1 LOAD p->state
				2783	* UNLOCK rq->lock
				2784	*
				2785	* __schedule() (switch to task 'p')
				2786	* LOCK rq->lock smp_rmb();
				2787	* smp_mb__after_spinlock();
				2788	* UNLOCK rq->lock
				2789	*
				2790	* [task p]
				2791	* STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq
				2792	*
				2793	* Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
				2794	* __schedule(). See the comment for smp_mb__after_spinlock().
				2795	*/
				2796	smp_rmb();
				2797	if (p->on_rq && ttwu_remote(p, wake_flags))
				2798	goto unlock;
				2799
				2800	#ifdef CONFIG_SMP
				2801	/*
				2802	* Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
				2803	* possible to, falsely, observe p->on_cpu == 0.
				2804	*
				2805	* One must be running (->on_cpu == 1) in order to remove oneself
				2806	* from the runqueue.
				2807	*
				2808	* __schedule() (switch to task 'p') try_to_wake_up()
				2809	* STORE p->on_cpu = 1 LOAD p->on_rq
				2810	* UNLOCK rq->lock
				2811	*
				2812	* __schedule() (put 'p' to sleep)
				2813	* LOCK rq->lock smp_rmb();
				2814	* smp_mb__after_spinlock();
				2815	* STORE p->on_rq = 0 LOAD p->on_cpu
				2816	*
				2817	* Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
				2818	* __schedule(). See the comment for smp_mb__after_spinlock().
				2819	*/
				2820	smp_rmb();
				2821
				2822	/*
				2823	* If the owning (remote) CPU is still in the middle of schedule() with
				2824	* this task as prev, wait until its done referencing the task.
				2825	*
				2826	* Pairs with the smp_store_release() in finish_task().
				2827	*
				2828	* This ensures that tasks getting woken will be fully ordered against
				2829	* their previous state and preserve Program Order.
				2830	*/
				2831	smp_cond_load_acquire(&p->on_cpu, !VAL);
				2832
				2833	p->sched_contributes_to_load = !!task_contributes_to_load(p);
				2834	p->state = TASK_WAKING;
				2835
				2836	if (p->in_iowait) {
				2837	delayacct_blkio_end(p);
				2838	atomic_dec(&task_rq(p)->nr_iowait);
				2839	}
				2840
				2841	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
				2842	if (task_cpu(p) != cpu) {
				2843	wake_flags \|= WF_MIGRATED;
				2844	psi_ttwu_dequeue(p);
				2845	set_task_cpu(p, cpu);
				2846	}
				2847
				2848	#else /* CONFIG_SMP */
				2849
				2850	if (p->in_iowait) {
				2851	delayacct_blkio_end(p);
				2852	atomic_dec(&task_rq(p)->nr_iowait);
				2853	}
				2854
				2855	#endif /* CONFIG_SMP */
				2856
				2857	ttwu_queue(p, cpu, wake_flags);
				2858	unlock:
				2859	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
				2860	out:
				2861	if (success)
				2862	ttwu_stat(p, cpu, wake_flags);
				2863	preempt_enable();
				2864
				2865	return success;
				2866	}
				2867
				2868	/**
				2869	* wake_up_process - Wake up a specific process
				2870	* @p: The process to be woken up.
				2871	*
				2872	* Attempt to wake up the nominated process and move it to the set of runnable
				2873	* processes.
				2874	*
				2875	* Return: 1 if the process was woken up, 0 if it was already running.
				2876	*
				2877	* This function executes a full memory barrier before accessing the task state.
				2878	*/
				2879	int wake_up_process(struct task_struct *p)
				2880	{
				2881	return try_to_wake_up(p, TASK_NORMAL, 0);
				2882	}
				2883	EXPORT_SYMBOL(wake_up_process);
				2884
				2885	int wake_up_state(struct task_struct *p, unsigned int state)
				2886	{
				2887	return try_to_wake_up(p, state, 0);
				2888	}
				2889	EXPORT_SYMBOL_GPL(wake_up_state);
				2890
				2891	/*
				2892	* Perform scheduler related setup for a newly forked process p.
				2893	* p is forked by current.
				2894	*
				2895	* __sched_fork() is basic setup used by init_idle() too:
				2896	*/
				2897	static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
				2898	{
				2899	p->on_rq = 0;
				2900
				2901	p->se.on_rq = 0;
				2902	p->se.exec_start = 0;
				2903	p->se.sum_exec_runtime = 0;
				2904	p->se.prev_sum_exec_runtime = 0;
				2905	p->se.nr_migrations = 0;
				2906	p->se.vruntime = 0;
				2907	INIT_LIST_HEAD(&p->se.group_node);
				2908
				2909	#ifdef CONFIG_FAIR_GROUP_SCHED
				2910	p->se.cfs_rq = NULL;
				2911	#endif
				2912
				2913	#ifdef CONFIG_SCHEDSTATS
				2914	/* Even if schedstat is disabled, there should not be garbage */
				2915	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
				2916	#endif
				2917
				2918	RB_CLEAR_NODE(&p->dl.rb_node);
				2919	init_dl_task_timer(&p->dl);
				2920	init_dl_inactive_task_timer(&p->dl);
				2921	__dl_clear_params(p);
				2922
				2923	INIT_LIST_HEAD(&p->rt.run_list);
				2924	p->rt.timeout = 0;
				2925	p->rt.time_slice = sched_rr_timeslice;
				2926	p->rt.on_rq = 0;
				2927	p->rt.on_list = 0;
				2928
				2929	#ifdef CONFIG_PREEMPT_NOTIFIERS
				2930	INIT_HLIST_HEAD(&p->preempt_notifiers);
				2931	#endif
				2932
				2933	#ifdef CONFIG_COMPACTION
				2934	p->capture_control = NULL;
				2935	#endif
				2936	init_numa_balancing(clone_flags, p);
				2937	}
				2938
				2939	DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
				2940
				2941	#ifdef CONFIG_NUMA_BALANCING
				2942
				2943	void set_numabalancing_state(bool enabled)
				2944	{
				2945	if (enabled)
				2946	static_branch_enable(&sched_numa_balancing);
				2947	else
				2948	static_branch_disable(&sched_numa_balancing);
				2949	}
				2950
				2951	#ifdef CONFIG_PROC_SYSCTL
				2952	int sysctl_numa_balancing(struct ctl_table *table, int write,
				2953	void __user buffer, size_t lenp, loff_t *ppos)
				2954	{
				2955	struct ctl_table t;
				2956	int err;
				2957	int state = static_branch_likely(&sched_numa_balancing);
				2958
				2959	if (write && !capable(CAP_SYS_ADMIN))
				2960	return -EPERM;
				2961
				2962	t = *table;
				2963	t.data = &state;
				2964	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
				2965	if (err < 0)
				2966	return err;
				2967	if (write)
				2968	set_numabalancing_state(state);
				2969	return err;
				2970	}
				2971	#endif
				2972	#endif
				2973
				2974	#ifdef CONFIG_SCHEDSTATS
				2975
				2976	DEFINE_STATIC_KEY_FALSE(sched_schedstats);
				2977	static bool __initdata __sched_schedstats = false;
				2978
				2979	static void set_schedstats(bool enabled)
				2980	{
				2981	if (enabled)
				2982	static_branch_enable(&sched_schedstats);
				2983	else
				2984	static_branch_disable(&sched_schedstats);
				2985	}
				2986
				2987	void force_schedstat_enabled(void)
				2988	{
				2989	if (!schedstat_enabled()) {
				2990	pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
				2991	static_branch_enable(&sched_schedstats);
				2992	}
				2993	}
				2994
				2995	static int __init setup_schedstats(char *str)
				2996	{
				2997	int ret = 0;
				2998	if (!str)
				2999	goto out;
				3000
				3001	/*
				3002	* This code is called before jump labels have been set up, so we can't
				3003	* change the static branch directly just yet. Instead set a temporary
				3004	* variable so init_schedstats() can do it later.
				3005	*/
				3006	if (!strcmp(str, "enable")) {
				3007	__sched_schedstats = true;
				3008	ret = 1;
				3009	} else if (!strcmp(str, "disable")) {
				3010	__sched_schedstats = false;
				3011	ret = 1;
				3012	}
				3013	out:
				3014	if (!ret)
				3015	pr_warn("Unable to parse schedstats=\n");
				3016
				3017	return ret;
				3018	}
				3019	__setup("schedstats=", setup_schedstats);
				3020
				3021	static void __init init_schedstats(void)
				3022	{
				3023	set_schedstats(__sched_schedstats);
				3024	}
				3025
				3026	#ifdef CONFIG_PROC_SYSCTL
				3027	int sysctl_schedstats(struct ctl_table *table, int write,
				3028	void __user buffer, size_t lenp, loff_t *ppos)
				3029	{
				3030	struct ctl_table t;
				3031	int err;
				3032	int state = static_branch_likely(&sched_schedstats);
				3033
				3034	if (write && !capable(CAP_SYS_ADMIN))
				3035	return -EPERM;
				3036
				3037	t = *table;
				3038	t.data = &state;
				3039	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
				3040	if (err < 0)
				3041	return err;
				3042	if (write)
				3043	set_schedstats(state);
				3044	return err;
				3045	}
				3046	#endif /* CONFIG_PROC_SYSCTL */
				3047	#else /* !CONFIG_SCHEDSTATS */
				3048	static inline void init_schedstats(void) {}
				3049	#endif /* CONFIG_SCHEDSTATS */
				3050
				3051	/*
				3052	* fork()/clone()-time setup:
				3053	*/
				3054	int sched_fork(unsigned long clone_flags, struct task_struct *p)
				3055	{
				3056	unsigned long flags;
				3057
				3058	__sched_fork(clone_flags, p);
				3059	/*
				3060	* We mark the process as NEW here. This guarantees that
				3061	* nobody will actually run it, and a signal or other external
				3062	* event cannot wake it up and insert it on the runqueue either.
				3063	*/
				3064	p->state = TASK_NEW;
				3065
				3066	/*
				3067	* Make sure we do not leak PI boosting priority to the child.
				3068	*/
				3069	p->prio = current->normal_prio;
				3070	trace_android_rvh_prepare_prio_fork(p);
				3071
				3072	uclamp_fork(p);
				3073
				3074	/*
				3075	* Revert to default priority/policy on fork if requested.
				3076	*/
				3077	if (unlikely(p->sched_reset_on_fork)) {
				3078	if (task_has_dl_policy(p) \|\| task_has_rt_policy(p)) {
				3079	p->policy = SCHED_NORMAL;
				3080	p->static_prio = NICE_TO_PRIO(0);
				3081	p->rt_priority = 0;
				3082	} else if (PRIO_TO_NICE(p->static_prio) < 0)
				3083	p->static_prio = NICE_TO_PRIO(0);
				3084
				3085	p->prio = p->normal_prio = __normal_prio(p);
				3086	set_load_weight(p, false);
				3087
				3088	/*
				3089	* We don't need the reset flag anymore after the fork. It has
				3090	* fulfilled its duty:
				3091	*/
				3092	p->sched_reset_on_fork = 0;
				3093	}
				3094
				3095	if (dl_prio(p->prio))
				3096	return -EAGAIN;
				3097	else if (rt_prio(p->prio))
				3098	p->sched_class = &rt_sched_class;
				3099	else
				3100	p->sched_class = &fair_sched_class;
				3101
				3102	init_entity_runnable_average(&p->se);
				3103	trace_android_rvh_finish_prio_fork(p);
				3104
				3105	/*
				3106	* The child is not yet in the pid-hash so no cgroup attach races,
				3107	* and the cgroup is pinned to this child due to cgroup_fork()
				3108	* is ran before sched_fork().
				3109	*
				3110	* Silence PROVE_RCU.
				3111	*/
				3112	raw_spin_lock_irqsave(&p->pi_lock, flags);
				3113	rseq_migrate(p);
				3114	/*
				3115	* We're setting the CPU for the first time, we don't migrate,
				3116	* so use __set_task_cpu().
				3117	*/
				3118	__set_task_cpu(p, smp_processor_id());
				3119	if (p->sched_class->task_fork)
				3120	p->sched_class->task_fork(p);
				3121	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
				3122
				3123	#ifdef CONFIG_SCHED_INFO
				3124	if (likely(sched_info_on()))
				3125	memset(&p->sched_info, 0, sizeof(p->sched_info));
				3126	#endif
				3127	#if defined(CONFIG_SMP)
				3128	p->on_cpu = 0;
				3129	#endif
				3130	init_task_preempt_count(p);
				3131	#ifdef CONFIG_SMP
				3132	plist_node_init(&p->pushable_tasks, MAX_PRIO);
				3133	RB_CLEAR_NODE(&p->pushable_dl_tasks);
				3134	#endif
				3135	return 0;
				3136	}
				3137
				3138	void sched_post_fork(struct task_struct *p)
				3139	{
				3140	uclamp_post_fork(p);
				3141	}
				3142
				3143	unsigned long to_ratio(u64 period, u64 runtime)
				3144	{
				3145	if (runtime == RUNTIME_INF)
				3146	return BW_UNIT;
				3147
				3148	/*
				3149	* Doing this here saves a lot of checks in all
				3150	* the calling paths, and returning zero seems
				3151	* safe for them anyway.
				3152	*/
				3153	if (period == 0)
				3154	return 0;
				3155
				3156	return div64_u64(runtime << BW_SHIFT, period);
				3157	}
				3158
				3159	/*
				3160	* wake_up_new_task - wake up a newly created task for the first time.
				3161	*
				3162	* This function will do some initial scheduler statistics housekeeping
				3163	* that must be done for every newly created context, then puts the task
				3164	* on the runqueue and wakes it.
				3165	*/
				3166	void wake_up_new_task(struct task_struct *p)
				3167	{
				3168	struct rq_flags rf;
				3169	struct rq *rq;
				3170
				3171	raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
				3172	p->state = TASK_RUNNING;
				3173	#ifdef CONFIG_SMP
				3174	/*
				3175	* Fork balancing, do it here and not earlier because:
				3176	* - cpus_ptr can change in the fork path
				3177	* - any previously selected CPU might disappear through hotplug
				3178	*
				3179	* Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
				3180	* as we're not fully set-up yet.
				3181	*/
				3182	p->recent_used_cpu = task_cpu(p);
				3183	rseq_migrate(p);
				3184	__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
				3185	#endif
				3186	rq = __task_rq_lock(p, &rf);
				3187	update_rq_clock(rq);
				3188	post_init_entity_util_avg(p);
				3189
				3190	activate_task(rq, p, ENQUEUE_NOCLOCK);
				3191	trace_sched_wakeup_new(p);
				3192	check_preempt_curr(rq, p, WF_FORK);
				3193	#ifdef CONFIG_SMP
				3194	if (p->sched_class->task_woken) {
				3195	/*
				3196	* Nothing relies on rq->lock after this, so its fine to
				3197	* drop it.
				3198	*/
				3199	rq_unpin_lock(rq, &rf);
				3200	p->sched_class->task_woken(rq, p);
				3201	rq_repin_lock(rq, &rf);
				3202	}
				3203	#endif
				3204	task_rq_unlock(rq, p, &rf);
				3205	}
				3206
				3207	#ifdef CONFIG_PREEMPT_NOTIFIERS
				3208
				3209	static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
				3210
				3211	void preempt_notifier_inc(void)
				3212	{
				3213	static_branch_inc(&preempt_notifier_key);
				3214	}
				3215	EXPORT_SYMBOL_GPL(preempt_notifier_inc);
				3216
				3217	void preempt_notifier_dec(void)
				3218	{
				3219	static_branch_dec(&preempt_notifier_key);
				3220	}
				3221	EXPORT_SYMBOL_GPL(preempt_notifier_dec);
				3222
				3223	/**
				3224	* preempt_notifier_register - tell me when current is being preempted & rescheduled
				3225	* @notifier: notifier struct to register
				3226	*/
				3227	void preempt_notifier_register(struct preempt_notifier *notifier)
				3228	{
				3229	if (!static_branch_unlikely(&preempt_notifier_key))
				3230	WARN(1, "registering preempt_notifier while notifiers disabled\n");
				3231
				3232	hlist_add_head(&notifier->link, &current->preempt_notifiers);
				3233	}
				3234	EXPORT_SYMBOL_GPL(preempt_notifier_register);
				3235
				3236	/**
				3237	* preempt_notifier_unregister - no longer interested in preemption notifications
				3238	* @notifier: notifier struct to unregister
				3239	*
				3240	* This is not safe to call from within a preemption notifier.
				3241	*/
				3242	void preempt_notifier_unregister(struct preempt_notifier *notifier)
				3243	{
				3244	hlist_del(&notifier->link);
				3245	}
				3246	EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
				3247
				3248	static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
				3249	{
				3250	struct preempt_notifier *notifier;
				3251
				3252	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
				3253	notifier->ops->sched_in(notifier, raw_smp_processor_id());
				3254	}
				3255
				3256	static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
				3257	{
				3258	if (static_branch_unlikely(&preempt_notifier_key))
				3259	__fire_sched_in_preempt_notifiers(curr);
				3260	}
				3261
				3262	static void
				3263	__fire_sched_out_preempt_notifiers(struct task_struct *curr,
				3264	struct task_struct *next)
				3265	{
				3266	struct preempt_notifier *notifier;
				3267
				3268	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
				3269	notifier->ops->sched_out(notifier, next);
				3270	}
				3271
				3272	static __always_inline void
				3273	fire_sched_out_preempt_notifiers(struct task_struct *curr,
				3274	struct task_struct *next)
				3275	{
				3276	if (static_branch_unlikely(&preempt_notifier_key))
				3277	__fire_sched_out_preempt_notifiers(curr, next);
				3278	}
				3279
				3280	#else /* !CONFIG_PREEMPT_NOTIFIERS */
				3281
				3282	static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
				3283	{
				3284	}
				3285
				3286	static inline void
				3287	fire_sched_out_preempt_notifiers(struct task_struct *curr,
				3288	struct task_struct *next)
				3289	{
				3290	}
				3291
				3292	#endif /* CONFIG_PREEMPT_NOTIFIERS */
				3293
				3294	static inline void prepare_task(struct task_struct *next)
				3295	{
				3296	#ifdef CONFIG_SMP
				3297	/*
				3298	* Claim the task as running, we do this before switching to it
				3299	* such that any running task will have this set.
				3300	*/
				3301	next->on_cpu = 1;
				3302	#endif
				3303	}
				3304
				3305	static inline void finish_task(struct task_struct *prev)
				3306	{
				3307	#ifdef CONFIG_SMP
				3308	/*
				3309	* After ->on_cpu is cleared, the task can be moved to a different CPU.
				3310	* We must ensure this doesn't happen until the switch is completely
				3311	* finished.
				3312	*
				3313	* In particular, the load of prev->state in finish_task_switch() must
				3314	* happen before this.
				3315	*
				3316	* Pairs with the smp_cond_load_acquire() in try_to_wake_up().
				3317	*/
				3318	smp_store_release(&prev->on_cpu, 0);
				3319	#endif
				3320	}
				3321
				3322	static inline void
				3323	prepare_lock_switch(struct rq rq, struct task_struct next, struct rq_flags *rf)
				3324	{
				3325	/*
				3326	* Since the runqueue lock will be released by the next
				3327	* task (which is an invalid locking op but in the case
				3328	* of the scheduler it's an obvious special-case), so we
				3329	* do an early lockdep release here:
				3330	*/
				3331	rq_unpin_lock(rq, rf);
				3332	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
				3333	#ifdef CONFIG_DEBUG_SPINLOCK
				3334	/* this is a valid case when another task releases the spinlock */
				3335	rq->lock.owner = next;
				3336	#endif
				3337	}
				3338
				3339	static inline void finish_lock_switch(struct rq *rq)
				3340	{
				3341	/*
				3342	* If we are tracking spinlock dependencies then we have to
				3343	* fix up the runqueue lock - which gets 'carried over' from
				3344	* prev into current:
				3345	*/
				3346	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
				3347	raw_spin_unlock_irq(&rq->lock);
				3348	}
				3349
				3350	/*
				3351	* NOP if the arch has not defined these:
				3352	*/
				3353
				3354	#ifndef prepare_arch_switch
				3355	# define prepare_arch_switch(next) do { } while (0)
				3356	#endif
				3357
				3358	#ifndef finish_arch_post_lock_switch
				3359	# define finish_arch_post_lock_switch() do { } while (0)
				3360	#endif
				3361
				3362	/**
				3363	* prepare_task_switch - prepare to switch tasks
				3364	* @rq: the runqueue preparing to switch
				3365	* @prev: the current task that is being switched out
				3366	* @next: the task we are going to switch to.
				3367	*
				3368	* This is called with the rq lock held and interrupts off. It must
				3369	* be paired with a subsequent finish_task_switch after the context
				3370	* switch.
				3371	*
				3372	* prepare_task_switch sets up locking and calls architecture specific
				3373	* hooks.
				3374	*/
				3375	static inline void
				3376	prepare_task_switch(struct rq rq, struct task_struct prev,
				3377	struct task_struct *next)
				3378	{
				3379	kcov_prepare_switch(prev);
				3380	sched_info_switch(rq, prev, next);
				3381	perf_event_task_sched_out(prev, next);
				3382	rseq_preempt(prev);
				3383	fire_sched_out_preempt_notifiers(prev, next);
				3384	prepare_task(next);
				3385	prepare_arch_switch(next);
				3386	}
				3387
				3388	/**
				3389	* finish_task_switch - clean up after a task-switch
				3390	* @prev: the thread we just switched away from.
				3391	*
				3392	* finish_task_switch must be called after the context switch, paired
				3393	* with a prepare_task_switch call before the context switch.
				3394	* finish_task_switch will reconcile locking set up by prepare_task_switch,
				3395	* and do any other architecture-specific cleanup actions.
				3396	*
				3397	* Note that we may have delayed dropping an mm in context_switch(). If
				3398	* so, we finish that here outside of the runqueue lock. (Doing it
				3399	* with the lock held can cause deadlocks; see schedule() for
				3400	* details.)
				3401	*
				3402	* The context switch have flipped the stack from under us and restored the
				3403	* local variables which were saved when this task called schedule() in the
				3404	* past. prev == current is still correct but we need to recalculate this_rq
				3405	* because prev may have moved to another CPU.
				3406	*/
				3407	static struct rq finish_task_switch(struct task_struct prev)
				3408	__releases(rq->lock)
				3409	{
				3410	struct rq *rq = this_rq();
				3411	struct mm_struct *mm = rq->prev_mm;
				3412	long prev_state;
				3413
				3414	/*
				3415	* The previous task will have left us with a preempt_count of 2
				3416	* because it left us after:
				3417	*
				3418	* schedule()
				3419	* preempt_disable(); // 1
				3420	* __schedule()
				3421	* raw_spin_lock_irq(&rq->lock) // 2
				3422	*
				3423	* Also, see FORK_PREEMPT_COUNT.
				3424	*/
				3425	if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
				3426	"corrupted preempt_count: %s/%d/0x%x\n",
				3427	current->comm, current->pid, preempt_count()))
				3428	preempt_count_set(FORK_PREEMPT_COUNT);
				3429
				3430	rq->prev_mm = NULL;
				3431
				3432	/*
				3433	* A task struct has one reference for the use as "current".
				3434	* If a task dies, then it sets TASK_DEAD in tsk->state and calls
				3435	* schedule one last time. The schedule call will never return, and
				3436	* the scheduled task must drop that reference.
				3437	*
				3438	* We must observe prev->state before clearing prev->on_cpu (in
				3439	* finish_task), otherwise a concurrent wakeup can get prev
				3440	* running on another CPU and we could rave with its RUNNING -> DEAD
				3441	* transition, resulting in a double drop.
				3442	*/
				3443	prev_state = prev->state;
				3444	vtime_task_switch(prev);
				3445	perf_event_task_sched_in(prev, current);
				3446	finish_task(prev);
				3447	finish_lock_switch(rq);
				3448	finish_arch_post_lock_switch();
				3449	kcov_finish_switch(current);
				3450
				3451	fire_sched_in_preempt_notifiers(current);
				3452	/*
				3453	* When switching through a kernel thread, the loop in
				3454	* membarrier_{private,global}_expedited() may have observed that
				3455	* kernel thread and not issued an IPI. It is therefore possible to
				3456	* schedule between user->kernel->user threads without passing though
				3457	* switch_mm(). Membarrier requires a barrier after storing to
				3458	* rq->curr, before returning to userspace, so provide them here:
				3459	*
				3460	* - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
				3461	* provided by mmdrop(),
				3462	* - a sync_core for SYNC_CORE.
				3463	*/
				3464	if (mm) {
				3465	membarrier_mm_sync_core_before_usermode(mm);
				3466	mmdrop(mm);
				3467	}
				3468	if (unlikely(prev_state == TASK_DEAD)) {
				3469	if (prev->sched_class->task_dead)
				3470	prev->sched_class->task_dead(prev);
				3471
				3472	/*
				3473	* Remove function-return probe instances associated with this
				3474	* task and put them back on the free list.
				3475	*/
				3476	kprobe_flush_task(prev);
				3477
				3478	/* Task is done with its stack. */
				3479	put_task_stack(prev);
				3480
				3481	put_task_struct_rcu_user(prev);
				3482	}
				3483
				3484	tick_nohz_task_switch();
				3485	return rq;
				3486	}
				3487
				3488	#ifdef CONFIG_SMP
				3489
				3490	/* rq->lock is NOT held, but preemption is disabled */
				3491	static void __balance_callback(struct rq *rq)
				3492	{
				3493	struct callback_head head, next;
				3494	void (func)(struct rq rq);
				3495	unsigned long flags;
				3496
				3497	raw_spin_lock_irqsave(&rq->lock, flags);
				3498	head = rq->balance_callback;
				3499	rq->balance_callback = NULL;
				3500	while (head) {
				3501	func = (void ()(struct rq ))head->func;
				3502	next = head->next;
				3503	head->next = NULL;
				3504	head = next;
				3505
				3506	func(rq);
				3507	}
				3508	raw_spin_unlock_irqrestore(&rq->lock, flags);
				3509	}
				3510
				3511	static inline void balance_callback(struct rq *rq)
				3512	{
				3513	if (unlikely(rq->balance_callback))
				3514	__balance_callback(rq);
				3515	}
				3516
				3517	#else
				3518
				3519	static inline void balance_callback(struct rq *rq)
				3520	{
				3521	}
				3522
				3523	#endif
				3524
				3525	/**
				3526	* schedule_tail - first thing a freshly forked thread must call.
				3527	* @prev: the thread we just switched away from.
				3528	*/
				3529	asmlinkage __visible void schedule_tail(struct task_struct *prev)
				3530	__releases(rq->lock)
				3531	{
				3532	struct rq *rq;
				3533
				3534	/*
				3535	* New tasks start with FORK_PREEMPT_COUNT, see there and
				3536	* finish_task_switch() for details.
				3537	*
				3538	* finish_task_switch() will drop rq->lock() and lower preempt_count
				3539	* and the preempt_enable() will end up enabling preemption (on
				3540	* PREEMPT_COUNT kernels).
				3541	*/
				3542
				3543	rq = finish_task_switch(prev);
				3544	balance_callback(rq);
				3545	preempt_enable();
				3546
				3547	if (current->set_child_tid)
				3548	put_user(task_pid_vnr(current), current->set_child_tid);
				3549
				3550	calculate_sigpending();
				3551	}
				3552
				3553	/*
				3554	* context_switch - switch to the new MM and the new thread's register state.
				3555	*/
				3556	static __always_inline struct rq *
				3557	context_switch(struct rq rq, struct task_struct prev,
				3558	struct task_struct next, struct rq_flags rf)
				3559	{
				3560	prepare_task_switch(rq, prev, next);
				3561
				3562	/*
				3563	* For paravirt, this is coupled with an exit in switch_to to
				3564	* combine the page table reload and the switch backend into
				3565	* one hypercall.
				3566	*/
				3567	arch_start_context_switch(prev);
				3568
				3569	/*
				3570	* kernel -> kernel lazy + transfer active
				3571	* user -> kernel lazy + mmgrab() active
				3572	*
				3573	* kernel -> user switch + mmdrop() active
				3574	* user -> user switch
				3575	*/
				3576	if (!next->mm) { // to kernel
				3577	enter_lazy_tlb(prev->active_mm, next);
				3578
				3579	next->active_mm = prev->active_mm;
				3580	if (prev->mm) // from user
				3581	mmgrab(prev->active_mm);
				3582	else
				3583	prev->active_mm = NULL;
				3584	} else { // to user
				3585	membarrier_switch_mm(rq, prev->active_mm, next->mm);
				3586	/*
				3587	* sys_membarrier() requires an smp_mb() between setting
				3588	* rq->curr / membarrier_switch_mm() and returning to userspace.
				3589	*
				3590	* The below provides this either through switch_mm(), or in
				3591	* case 'prev->active_mm == next->mm' through
				3592	* finish_task_switch()'s mmdrop().
				3593	*/
				3594	switch_mm_irqs_off(prev->active_mm, next->mm, next);
				3595
				3596	if (!prev->mm) { // from kernel
				3597	/* will mmdrop() in finish_task_switch(). */
				3598	rq->prev_mm = prev->active_mm;
				3599	prev->active_mm = NULL;
				3600	}
				3601	}
				3602
				3603	rq->clock_update_flags &= ~(RQCF_ACT_SKIP\|RQCF_REQ_SKIP);
				3604
				3605	prepare_lock_switch(rq, next, rf);
				3606
				3607	/* Here we just switch the register state and the stack. */
				3608	switch_to(prev, next, prev);
				3609	barrier();
				3610
				3611	#ifdef CONFIG_CPU_ASR1901
				3612	prev->last_irq_trace = prev->cur_irq_trace;
				3613	prev->cur_irq_trace = 0;
				3614	#endif
				3615	return finish_task_switch(prev);
				3616	}
				3617
				3618	#ifdef CONFIG_CPU_ASR1901
				3619	void write_one_irq_trace(u32 fun_ptr)
				3620	{
				3621	struct task_struct *task = current;
				3622	task->irq_trace[task->cur_irq_trace % NR_IRQ_TRACE] = fun_ptr;
				3623	task->cur_irq_trace++;
				3624	}
				3625	#endif
				3626
				3627	/*
				3628	* nr_running and nr_context_switches:
				3629	*
				3630	* externally visible scheduler statistics: current number of runnable
				3631	* threads, total number of context switches performed since bootup.
				3632	*/
				3633	unsigned long nr_running(void)
				3634	{
				3635	unsigned long i, sum = 0;
				3636
				3637	for_each_online_cpu(i)
				3638	sum += cpu_rq(i)->nr_running;
				3639
				3640	return sum;
				3641	}
				3642
				3643	/*
				3644	* Check if only the current task is running on the CPU.
				3645	*
				3646	* Caution: this function does not check that the caller has disabled
				3647	* preemption, thus the result might have a time-of-check-to-time-of-use
				3648	* race. The caller is responsible to use it correctly, for example:
				3649	*
				3650	* - from a non-preemptible section (of course)
				3651	*
				3652	* - from a thread that is bound to a single CPU
				3653	*
				3654	* - in a loop with very short iterations (e.g. a polling loop)
				3655	*/
				3656	bool single_task_running(void)
				3657	{
				3658	return raw_rq()->nr_running == 1;
				3659	}
				3660	EXPORT_SYMBOL(single_task_running);
				3661
				3662	unsigned long long nr_context_switches(void)
				3663	{
				3664	int i;
				3665	unsigned long long sum = 0;
				3666
				3667	for_each_possible_cpu(i)
				3668	sum += cpu_rq(i)->nr_switches;
				3669
				3670	return sum;
				3671	}
				3672
				3673	/*
				3674	* Consumers of these two interfaces, like for example the cpuidle menu
				3675	* governor, are using nonsensical data. Preferring shallow idle state selection
				3676	* for a CPU that has IO-wait which might not even end up running the task when
				3677	* it does become runnable.
				3678	*/
				3679
				3680	unsigned long nr_iowait_cpu(int cpu)
				3681	{
				3682	return atomic_read(&cpu_rq(cpu)->nr_iowait);
				3683	}
				3684
				3685	/*
				3686	* IO-wait accounting, and how its mostly bollocks (on SMP).
				3687	*
				3688	* The idea behind IO-wait account is to account the idle time that we could
				3689	* have spend running if it were not for IO. That is, if we were to improve the
				3690	* storage performance, we'd have a proportional reduction in IO-wait time.
				3691	*
				3692	* This all works nicely on UP, where, when a task blocks on IO, we account
				3693	* idle time as IO-wait, because if the storage were faster, it could've been
				3694	* running and we'd not be idle.
				3695	*
				3696	* This has been extended to SMP, by doing the same for each CPU. This however
				3697	* is broken.
				3698	*
				3699	* Imagine for instance the case where two tasks block on one CPU, only the one
				3700	* CPU will have IO-wait accounted, while the other has regular idle. Even
				3701	* though, if the storage were faster, both could've ran at the same time,
				3702	* utilising both CPUs.
				3703	*
				3704	* This means, that when looking globally, the current IO-wait accounting on
				3705	* SMP is a lower bound, by reason of under accounting.
				3706	*
				3707	* Worse, since the numbers are provided per CPU, they are sometimes
				3708	* interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
				3709	* associated with any one particular CPU, it can wake to another CPU than it
				3710	* blocked on. This means the per CPU IO-wait number is meaningless.
				3711	*
				3712	* Task CPU affinities can make all that even more 'interesting'.
				3713	*/
				3714
				3715	unsigned long nr_iowait(void)
				3716	{
				3717	unsigned long i, sum = 0;
				3718
				3719	for_each_possible_cpu(i)
				3720	sum += nr_iowait_cpu(i);
				3721
				3722	return sum;
				3723	}
				3724
				3725	#ifdef CONFIG_SMP
				3726
				3727	/*
				3728	* sched_exec - execve() is a valuable balancing opportunity, because at
				3729	* this point the task has the smallest effective memory and cache footprint.
				3730	*/
				3731	void sched_exec(void)
				3732	{
				3733	struct task_struct *p = current;
				3734	unsigned long flags;
				3735	int dest_cpu;
				3736
				3737	raw_spin_lock_irqsave(&p->pi_lock, flags);
				3738	dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
				3739	if (dest_cpu == smp_processor_id())
				3740	goto unlock;
				3741
				3742	if (likely(cpu_active(dest_cpu))) {
				3743	struct migration_arg arg = { p, dest_cpu };
				3744
				3745	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
				3746	stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
				3747	return;
				3748	}
				3749	unlock:
				3750	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
				3751	}
				3752
				3753	#endif
				3754
				3755	DEFINE_PER_CPU(struct kernel_stat, kstat);
				3756	DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
				3757
				3758	EXPORT_PER_CPU_SYMBOL(kstat);
				3759	EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
				3760
				3761	/*
				3762	* The function fair_sched_class.update_curr accesses the struct curr
				3763	* and its field curr->exec_start; when called from task_sched_runtime(),
				3764	* we observe a high rate of cache misses in practice.
				3765	* Prefetching this data results in improved performance.
				3766	*/
				3767	static inline void prefetch_curr_exec_start(struct task_struct *p)
				3768	{
				3769	#ifdef CONFIG_FAIR_GROUP_SCHED
				3770	struct sched_entity *curr = (&p->se)->cfs_rq->curr;
				3771	#else
				3772	struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
				3773	#endif
				3774	prefetch(curr);
				3775	prefetch(&curr->exec_start);
				3776	}
				3777
				3778	/*
				3779	* Return accounted runtime for the task.
				3780	* In case the task is currently running, return the runtime plus current's
				3781	* pending runtime that have not been accounted yet.
				3782	*/
				3783	unsigned long long task_sched_runtime(struct task_struct *p)
				3784	{
				3785	struct rq_flags rf;
				3786	struct rq *rq;
				3787	u64 ns;
				3788
				3789	#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
				3790	/*
				3791	* 64-bit doesn't need locks to atomically read a 64-bit value.
				3792	* So we have a optimization chance when the task's delta_exec is 0.
				3793	* Reading ->on_cpu is racy, but this is ok.
				3794	*
				3795	* If we race with it leaving CPU, we'll take a lock. So we're correct.
				3796	* If we race with it entering CPU, unaccounted time is 0. This is
				3797	* indistinguishable from the read occurring a few cycles earlier.
				3798	* If we see ->on_cpu without ->on_rq, the task is leaving, and has
				3799	* been accounted, so we're correct here as well.
				3800	*/
				3801	if (!p->on_cpu \|\| !task_on_rq_queued(p))
				3802	return p->se.sum_exec_runtime;
				3803	#endif
				3804
				3805	rq = task_rq_lock(p, &rf);
				3806	/*
				3807	* Must be ->curr _and_ ->on_rq. If dequeued, we would
				3808	* project cycles that may never be accounted to this
				3809	* thread, breaking clock_gettime().
				3810	*/
				3811	if (task_current(rq, p) && task_on_rq_queued(p)) {
				3812	prefetch_curr_exec_start(p);
				3813	update_rq_clock(rq);
				3814	p->sched_class->update_curr(rq);
				3815	}
				3816	ns = p->se.sum_exec_runtime;
				3817	task_rq_unlock(rq, p, &rf);
				3818
				3819	return ns;
				3820	}
				3821	EXPORT_SYMBOL_GPL(task_sched_runtime);
				3822
				3823	/*
				3824	* This function gets called by the timer code, with HZ frequency.
				3825	* We call it with interrupts disabled.
				3826	*/
				3827	void scheduler_tick(void)
				3828	{
				3829	int cpu = smp_processor_id();
				3830	struct rq *rq = cpu_rq(cpu);
				3831	struct task_struct *curr = rq->curr;
				3832	struct rq_flags rf;
				3833
				3834	sched_clock_tick();
				3835
				3836	rq_lock(rq, &rf);
				3837
				3838	update_rq_clock(rq);
				3839	curr->sched_class->task_tick(rq, curr, 0);
				3840	calc_global_load_tick(rq);
				3841	psi_task_tick(rq);
				3842
				3843	rq_unlock(rq, &rf);
				3844
				3845	perf_event_task_tick();
				3846
				3847	#ifdef CONFIG_SMP
				3848	rq->idle_balance = idle_cpu(cpu);
				3849	trigger_load_balance(rq);
				3850	#endif
				3851
				3852	trace_android_vh_scheduler_tick(rq);
				3853	}
				3854
				3855	#ifdef CONFIG_NO_HZ_FULL
				3856
				3857	struct tick_work {
				3858	int cpu;
				3859	atomic_t state;
				3860	struct delayed_work work;
				3861	};
				3862	/* Values for ->state, see diagram below. */
				3863	#define TICK_SCHED_REMOTE_OFFLINE 0
				3864	#define TICK_SCHED_REMOTE_OFFLINING 1
				3865	#define TICK_SCHED_REMOTE_RUNNING 2
				3866
				3867	/*
				3868	* State diagram for ->state:
				3869	*
				3870	*
				3871	* TICK_SCHED_REMOTE_OFFLINE
				3872	* \| ^
				3873	* \| \|
				3874	* \| \| sched_tick_remote()
				3875	* \| \|
				3876	* \| \|
				3877	* +--TICK_SCHED_REMOTE_OFFLINING
				3878	* \| ^
				3879	* \| \|
				3880	* sched_tick_start() \| \| sched_tick_stop()
				3881	* \| \|
				3882	* V \|
				3883	* TICK_SCHED_REMOTE_RUNNING
				3884	*
				3885	*
				3886	* Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
				3887	* and sched_tick_start() are happy to leave the state in RUNNING.
				3888	*/
				3889
				3890	static struct tick_work __percpu *tick_work_cpu;
				3891
				3892	static void sched_tick_remote(struct work_struct *work)
				3893	{
				3894	struct delayed_work *dwork = to_delayed_work(work);
				3895	struct tick_work *twork = container_of(dwork, struct tick_work, work);
				3896	int cpu = twork->cpu;
				3897	struct rq *rq = cpu_rq(cpu);
				3898	struct task_struct *curr;
				3899	struct rq_flags rf;
				3900	u64 delta;
				3901	int os;
				3902
				3903	/*
				3904	* Handle the tick only if it appears the remote CPU is running in full
				3905	* dynticks mode. The check is racy by nature, but missing a tick or
				3906	* having one too much is no big deal because the scheduler tick updates
				3907	* statistics and checks timeslices in a time-independent way, regardless
				3908	* of when exactly it is running.
				3909	*/
				3910	if (!tick_nohz_tick_stopped_cpu(cpu))
				3911	goto out_requeue;
				3912
				3913	rq_lock_irq(rq, &rf);
				3914	curr = rq->curr;
				3915	if (cpu_is_offline(cpu))
				3916	goto out_unlock;
				3917
				3918	update_rq_clock(rq);
				3919
				3920	if (!is_idle_task(curr)) {
				3921	/*
				3922	* Make sure the next tick runs within a reasonable
				3923	* amount of time.
				3924	*/
				3925	delta = rq_clock_task(rq) - curr->se.exec_start;
				3926	WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
				3927	}
				3928	curr->sched_class->task_tick(rq, curr, 0);
				3929
				3930	calc_load_nohz_remote(rq);
				3931	out_unlock:
				3932	rq_unlock_irq(rq, &rf);
				3933	out_requeue:
				3934
				3935	/*
				3936	* Run the remote tick once per second (1Hz). This arbitrary
				3937	* frequency is large enough to avoid overload but short enough
				3938	* to keep scheduler internal stats reasonably up to date. But
				3939	* first update state to reflect hotplug activity if required.
				3940	*/
				3941	os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
				3942	WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
				3943	if (os == TICK_SCHED_REMOTE_RUNNING)
				3944	queue_delayed_work(system_unbound_wq, dwork, HZ);
				3945	}
				3946
				3947	static void sched_tick_start(int cpu)
				3948	{
				3949	int os;
				3950	struct tick_work *twork;
				3951
				3952	if (housekeeping_cpu(cpu, HK_FLAG_TICK))
				3953	return;
				3954
				3955	WARN_ON_ONCE(!tick_work_cpu);
				3956
				3957	twork = per_cpu_ptr(tick_work_cpu, cpu);
				3958	os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
				3959	WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
				3960	if (os == TICK_SCHED_REMOTE_OFFLINE) {
				3961	twork->cpu = cpu;
				3962	INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
				3963	queue_delayed_work(system_unbound_wq, &twork->work, HZ);
				3964	}
				3965	}
				3966
				3967	#ifdef CONFIG_HOTPLUG_CPU
				3968	static void sched_tick_stop(int cpu)
				3969	{
				3970	struct tick_work *twork;
				3971	int os;
				3972
				3973	if (housekeeping_cpu(cpu, HK_FLAG_TICK))
				3974	return;
				3975
				3976	WARN_ON_ONCE(!tick_work_cpu);
				3977
				3978	twork = per_cpu_ptr(tick_work_cpu, cpu);
				3979	/* There cannot be competing actions, but don't rely on stop-machine. */
				3980	os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
				3981	WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
				3982	/* Don't cancel, as this would mess up the state machine. */
				3983	}
				3984	#endif /* CONFIG_HOTPLUG_CPU */
				3985
				3986	int __init sched_tick_offload_init(void)
				3987	{
				3988	tick_work_cpu = alloc_percpu(struct tick_work);
				3989	BUG_ON(!tick_work_cpu);
				3990	return 0;
				3991	}
				3992
				3993	#else /* !CONFIG_NO_HZ_FULL */
				3994	static inline void sched_tick_start(int cpu) { }
				3995	static inline void sched_tick_stop(int cpu) { }
				3996	#endif
				3997
				3998	#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) \|\| \
				3999	defined(CONFIG_TRACE_PREEMPT_TOGGLE))
				4000	/*
				4001	* If the value passed in is equal to the current preempt count
				4002	* then we just disabled preemption. Start timing the latency.
				4003	*/
				4004	static inline void preempt_latency_start(int val)
				4005	{
				4006	if (preempt_count() == val) {
				4007	unsigned long ip = get_lock_parent_ip();
				4008	#ifdef CONFIG_DEBUG_PREEMPT
				4009	current->preempt_disable_ip = ip;
				4010	#endif
				4011	trace_preempt_off(CALLER_ADDR0, ip);
				4012	}
				4013	}
				4014
				4015	void preempt_count_add(int val)
				4016	{
				4017	#ifdef CONFIG_DEBUG_PREEMPT
				4018	/*
				4019	* Underflow?
				4020	*/
				4021	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
				4022	return;
				4023	#endif
				4024	__preempt_count_add(val);
				4025	#ifdef CONFIG_DEBUG_PREEMPT
				4026	/*
				4027	* Spinlock count overflowing soon?
				4028	*/
				4029	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
				4030	PREEMPT_MASK - 10);
				4031	#endif
				4032	preempt_latency_start(val);
				4033	}
				4034	EXPORT_SYMBOL(preempt_count_add);
				4035	NOKPROBE_SYMBOL(preempt_count_add);
				4036
				4037	/*
				4038	* If the value passed in equals to the current preempt count
				4039	* then we just enabled preemption. Stop timing the latency.
				4040	*/
				4041	static inline void preempt_latency_stop(int val)
				4042	{
				4043	if (preempt_count() == val)
				4044	trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
				4045	}
				4046
				4047	void preempt_count_sub(int val)
				4048	{
				4049	#ifdef CONFIG_DEBUG_PREEMPT
				4050	/*
				4051	* Underflow?
				4052	*/
				4053	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
				4054	return;
				4055	/*
				4056	* Is the spinlock portion underflowing?
				4057	*/
				4058	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
				4059	!(preempt_count() & PREEMPT_MASK)))
				4060	return;
				4061	#endif
				4062
				4063	preempt_latency_stop(val);
				4064	__preempt_count_sub(val);
				4065	}
				4066	EXPORT_SYMBOL(preempt_count_sub);
				4067	NOKPROBE_SYMBOL(preempt_count_sub);
				4068
				4069	#else
				4070	static inline void preempt_latency_start(int val) { }
				4071	static inline void preempt_latency_stop(int val) { }
				4072	#endif
				4073
				4074	static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
				4075	{
				4076	#ifdef CONFIG_DEBUG_PREEMPT
				4077	return p->preempt_disable_ip;
				4078	#else
				4079	return 0;
				4080	#endif
				4081	}
				4082
				4083	/*
				4084	* Print scheduling while atomic bug:
				4085	*/
				4086	static noinline void __schedule_bug(struct task_struct *prev)
				4087	{
				4088	/* Save this before calling printk(), since that will clobber it */
				4089	unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
				4090
				4091	if (oops_in_progress)
				4092	return;
				4093
				4094	printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
				4095	prev->comm, prev->pid, preempt_count());
				4096
				4097	debug_show_held_locks(prev);
				4098	print_modules();
				4099	if (irqs_disabled())
				4100	print_irqtrace_events(prev);
				4101	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
				4102	&& in_atomic_preempt_off()) {
				4103	pr_err("Preemption disabled at:");
				4104	print_ip_sym(preempt_disable_ip);
				4105	pr_cont("\n");
				4106	}
				4107	check_panic_on_warn("scheduling while atomic");
				4108
				4109	dump_stack();
				4110	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
				4111	}
				4112
				4113	/*
				4114	* Various schedule()-time debugging checks and statistics:
				4115	*/
				4116	static inline void schedule_debug(struct task_struct *prev, bool preempt)
				4117	{
				4118	#ifdef CONFIG_SCHED_STACK_END_CHECK
				4119	if (task_stack_end_corrupted(prev))
				4120	panic("corrupted stack end detected inside scheduler\n");
				4121	#endif
				4122
				4123	#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
				4124	if (!preempt && prev->state && prev->non_block_count) {
				4125	printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
				4126	prev->comm, prev->pid, prev->non_block_count);
				4127	dump_stack();
				4128	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
				4129	}
				4130	#endif
				4131
				4132	if (unlikely(in_atomic_preempt_off())) {
				4133	__schedule_bug(prev);
				4134	preempt_count_set(PREEMPT_DISABLED);
				4135	}
				4136	rcu_sleep_check();
				4137
				4138	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
				4139
				4140	schedstat_inc(this_rq()->sched_count);
				4141	}
				4142
				4143	/*
				4144	* Pick up the highest-prio task:
				4145	*/
				4146	static inline struct task_struct *
				4147	pick_next_task(struct rq rq, struct task_struct prev, struct rq_flags *rf)
				4148	{
				4149	const struct sched_class *class;
				4150	struct task_struct *p;
				4151
				4152	/*
				4153	* Optimization: we know that if all tasks are in the fair class we can
				4154	* call that function directly, but only if the @prev task wasn't of a
				4155	* higher scheduling class, because otherwise those loose the
				4156	* opportunity to pull in more work from other CPUs.
				4157	*/
				4158	if (likely((prev->sched_class == &idle_sched_class \|\|
				4159	prev->sched_class == &fair_sched_class) &&
				4160	rq->nr_running == rq->cfs.h_nr_running)) {
				4161
				4162	p = fair_sched_class.pick_next_task(rq, prev, rf);
				4163	if (unlikely(p == RETRY_TASK))
				4164	goto restart;
				4165
				4166	/* Assumes fair_sched_class->next == idle_sched_class */
				4167	if (unlikely(!p))
				4168	p = idle_sched_class.pick_next_task(rq, prev, rf);
				4169
				4170	return p;
				4171	}
				4172
				4173	restart:
				4174	#ifdef CONFIG_SMP
				4175	/*
				4176	* We must do the balancing pass before put_next_task(), such
				4177	* that when we release the rq->lock the task is in the same
				4178	* state as before we took rq->lock.
				4179	*
				4180	* We can terminate the balance pass as soon as we know there is
				4181	* a runnable task of @class priority or higher.
				4182	*/
				4183	for_class_range(class, prev->sched_class, &idle_sched_class) {
				4184	if (class->balance(rq, prev, rf))
				4185	break;
				4186	}
				4187	#endif
				4188
				4189	put_prev_task(rq, prev);
				4190
				4191	for_each_class(class) {
				4192	p = class->pick_next_task(rq, NULL, NULL);
				4193	if (p)
				4194	return p;
				4195	}
				4196
				4197	/* The idle class should always have a runnable task: */
				4198	BUG();
				4199	}
				4200
				4201	/*
				4202	* __schedule() is the main scheduler function.
				4203	*
				4204	* The main means of driving the scheduler and thus entering this function are:
				4205	*
				4206	* 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
				4207	*
				4208	* 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
				4209	* paths. For example, see arch/x86/entry_64.S.
				4210	*
				4211	* To drive preemption between tasks, the scheduler sets the flag in timer
				4212	* interrupt handler scheduler_tick().
				4213	*
				4214	* 3. Wakeups don't really cause entry into schedule(). They add a
				4215	* task to the run-queue and that's it.
				4216	*
				4217	* Now, if the new task added to the run-queue preempts the current
				4218	* task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
				4219	* called on the nearest possible occasion:
				4220	*
				4221	* - If the kernel is preemptible (CONFIG_PREEMPTION=y):
				4222	*
				4223	* - in syscall or exception context, at the next outmost
				4224	* preempt_enable(). (this might be as soon as the wake_up()'s
				4225	* spin_unlock()!)
				4226	*
				4227	* - in IRQ context, return from interrupt-handler to
				4228	* preemptible context
				4229	*
				4230	* - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
				4231	* then at the next:
				4232	*
				4233	* - cond_resched() call
				4234	* - explicit schedule() call
				4235	* - return from syscall or exception to user-space
				4236	* - return from interrupt-handler to user-space
				4237	*
				4238	* WARNING: must be called with preemption disabled!
				4239	*/
				4240	static void __sched notrace __schedule(bool preempt)
				4241	{
				4242	struct task_struct prev, next;
				4243	unsigned long *switch_count;
				4244	struct rq_flags rf;
				4245	struct rq *rq;
				4246	int cpu;
				4247
				4248	cpu = smp_processor_id();
				4249	rq = cpu_rq(cpu);
				4250	prev = rq->curr;
				4251
				4252	schedule_debug(prev, preempt);
				4253
				4254	if (sched_feat(HRTICK))
				4255	hrtick_clear(rq);
				4256
				4257	local_irq_disable();
				4258	rcu_note_context_switch(preempt);
				4259
				4260	/*
				4261	* Make sure that signal_pending_state()->signal_pending() below
				4262	* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
				4263	* done by the caller to avoid the race with signal_wake_up().
				4264	*
				4265	* The membarrier system call requires a full memory barrier
				4266	* after coming from user-space, before storing to rq->curr.
				4267	*/
				4268	rq_lock(rq, &rf);
				4269	smp_mb__after_spinlock();
				4270
				4271	/* Promote REQ to ACT */
				4272	rq->clock_update_flags <<= 1;
				4273	update_rq_clock(rq);
				4274
				4275	switch_count = &prev->nivcsw;
				4276	if (!preempt && prev->state) {
				4277	if (signal_pending_state(prev->state, prev)) {
				4278	prev->state = TASK_RUNNING;
				4279	} else {
				4280	deactivate_task(rq, prev, DEQUEUE_SLEEP \| DEQUEUE_NOCLOCK);
				4281
				4282	if (prev->in_iowait) {
				4283	atomic_inc(&rq->nr_iowait);
				4284	delayacct_blkio_start();
				4285	}
				4286	}
				4287	switch_count = &prev->nvcsw;
				4288	}
				4289
				4290	next = pick_next_task(rq, prev, &rf);
				4291	clear_tsk_need_resched(prev);
				4292	clear_preempt_need_resched();
				4293
				4294	if (likely(prev != next)) {
				4295	rq->nr_switches++;
				4296	/*
				4297	* RCU users of rcu_dereference(rq->curr) may not see
				4298	* changes to task_struct made by pick_next_task().
				4299	*/
				4300	RCU_INIT_POINTER(rq->curr, next);
				4301	/*
				4302	* The membarrier system call requires each architecture
				4303	* to have a full memory barrier after updating
				4304	* rq->curr, before returning to user-space.
				4305	*
				4306	* Here are the schemes providing that barrier on the
				4307	* various architectures:
				4308	* - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
				4309	* switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
				4310	* - finish_lock_switch() for weakly-ordered
				4311	* architectures where spin_unlock is a full barrier,
				4312	* - switch_to() for arm64 (weakly-ordered, spin_unlock
				4313	* is a RELEASE barrier),
				4314	*/
				4315	++*switch_count;
				4316
				4317	trace_sched_switch(preempt, prev, next);
				4318
				4319	/* Also unlocks the rq: */
				4320	rq = context_switch(rq, prev, next, &rf);
				4321	} else {
				4322	rq->clock_update_flags &= ~(RQCF_ACT_SKIP\|RQCF_REQ_SKIP);
				4323	rq_unlock_irq(rq, &rf);
				4324	}
				4325
				4326	balance_callback(rq);
				4327	}
				4328
				4329	void __noreturn do_task_dead(void)
				4330	{
				4331	/* Causes final put_task_struct in finish_task_switch(): */
				4332	set_special_state(TASK_DEAD);
				4333
				4334	/* Tell freezer to ignore us: */
				4335	current->flags \|= PF_NOFREEZE;
				4336
				4337	__schedule(false);
				4338	BUG();
				4339
				4340	/* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
				4341	for (;;)
				4342	cpu_relax();
				4343	}
				4344
				4345	static inline void sched_submit_work(struct task_struct *tsk)
				4346	{
				4347	if (!tsk->state)
				4348	return;
				4349
				4350	/*
				4351	* If a worker went to sleep, notify and ask workqueue whether
				4352	* it wants to wake up a task to maintain concurrency.
				4353	* As this function is called inside the schedule() context,
				4354	* we disable preemption to avoid it calling schedule() again
				4355	* in the possible wakeup of a kworker and because wq_worker_sleeping()
				4356	* requires it.
				4357	*/
				4358	if (tsk->flags & PF_WQ_WORKER) {
				4359	preempt_disable();
				4360	wq_worker_sleeping(tsk);
				4361	preempt_enable_no_resched();
				4362	}
				4363
				4364	if (tsk_is_pi_blocked(tsk))
				4365	return;
				4366
				4367	/*
				4368	* If we are going to sleep and we have plugged IO queued,
				4369	* make sure to submit it to avoid deadlocks.
				4370	*/
				4371	if (blk_needs_flush_plug(tsk))
				4372	blk_schedule_flush_plug(tsk);
				4373	}
				4374
				4375	static void sched_update_worker(struct task_struct *tsk)
				4376	{
				4377	if (tsk->flags & PF_WQ_WORKER)
				4378	wq_worker_running(tsk);
				4379	}
				4380
				4381	asmlinkage __visible void __sched schedule(void)
				4382	{
				4383	struct task_struct *tsk = current;
				4384
				4385	sched_submit_work(tsk);
				4386	do {
				4387	preempt_disable();
				4388	__schedule(false);
				4389	sched_preempt_enable_no_resched();
				4390	} while (need_resched());
				4391	sched_update_worker(tsk);
				4392	}
				4393	EXPORT_SYMBOL(schedule);
				4394
				4395	/*
				4396	* synchronize_rcu_tasks() makes sure that no task is stuck in preempted
				4397	* state (have scheduled out non-voluntarily) by making sure that all
				4398	* tasks have either left the run queue or have gone into user space.
				4399	* As idle tasks do not do either, they must not ever be preempted
				4400	* (schedule out non-voluntarily).
				4401	*
				4402	* schedule_idle() is similar to schedule_preempt_disable() except that it
				4403	* never enables preemption because it does not call sched_submit_work().
				4404	*/
				4405	void __sched schedule_idle(void)
				4406	{
				4407	/*
				4408	* As this skips calling sched_submit_work(), which the idle task does
				4409	* regardless because that function is a nop when the task is in a
				4410	* TASK_RUNNING state, make sure this isn't used someplace that the
				4411	* current task can be in any other state. Note, idle is always in the
				4412	* TASK_RUNNING state.
				4413	*/
				4414	WARN_ON_ONCE(current->state);
				4415	do {
				4416	__schedule(false);
				4417	} while (need_resched());
				4418	}
				4419
				4420	#ifdef CONFIG_CONTEXT_TRACKING
				4421	asmlinkage __visible void __sched schedule_user(void)
				4422	{
				4423	/*
				4424	* If we come here after a random call to set_need_resched(),
				4425	* or we have been woken up remotely but the IPI has not yet arrived,
				4426	* we haven't yet exited the RCU idle mode. Do it here manually until
				4427	* we find a better solution.
				4428	*
				4429	* NB: There are buggy callers of this function. Ideally we
				4430	* should warn if prev_state != CONTEXT_USER, but that will trigger
				4431	* too frequently to make sense yet.
				4432	*/
				4433	enum ctx_state prev_state = exception_enter();
				4434	schedule();
				4435	exception_exit(prev_state);
				4436	}
				4437	#endif
				4438
				4439	/**
				4440	* schedule_preempt_disabled - called with preemption disabled
				4441	*
				4442	* Returns with preemption disabled. Note: preempt_count must be 1
				4443	*/
				4444	void __sched schedule_preempt_disabled(void)
				4445	{
				4446	sched_preempt_enable_no_resched();
				4447	schedule();
				4448	preempt_disable();
				4449	}
				4450
				4451	static void __sched notrace preempt_schedule_common(void)
				4452	{
				4453	do {
				4454	/*
				4455	* Because the function tracer can trace preempt_count_sub()
				4456	* and it also uses preempt_enable/disable_notrace(), if
				4457	* NEED_RESCHED is set, the preempt_enable_notrace() called
				4458	* by the function tracer will call this function again and
				4459	* cause infinite recursion.
				4460	*
				4461	* Preemption must be disabled here before the function
				4462	* tracer can trace. Break up preempt_disable() into two
				4463	* calls. One to disable preemption without fear of being
				4464	* traced. The other to still record the preemption latency,
				4465	* which can also be traced by the function tracer.
				4466	*/
				4467	preempt_disable_notrace();
				4468	preempt_latency_start(1);
				4469	__schedule(true);
				4470	preempt_latency_stop(1);
				4471	preempt_enable_no_resched_notrace();
				4472
				4473	/*
				4474	* Check again in case we missed a preemption opportunity
				4475	* between schedule and now.
				4476	*/
				4477	} while (need_resched());
				4478	}
				4479
				4480	#ifdef CONFIG_PREEMPTION
				4481	/*
				4482	* This is the entry point to schedule() from in-kernel preemption
				4483	* off of preempt_enable.
				4484	*/
				4485	asmlinkage __visible void __sched notrace preempt_schedule(void)
				4486	{
				4487	/*
				4488	* If there is a non-zero preempt_count or interrupts are disabled,
				4489	* we do not want to preempt the current task. Just return..
				4490	*/
				4491	if (likely(!preemptible()))
				4492	return;
				4493
				4494	preempt_schedule_common();
				4495	}
				4496	NOKPROBE_SYMBOL(preempt_schedule);
				4497	EXPORT_SYMBOL(preempt_schedule);
				4498
				4499	/**
				4500	* preempt_schedule_notrace - preempt_schedule called by tracing
				4501	*
				4502	* The tracing infrastructure uses preempt_enable_notrace to prevent
				4503	* recursion and tracing preempt enabling caused by the tracing
				4504	* infrastructure itself. But as tracing can happen in areas coming
				4505	* from userspace or just about to enter userspace, a preempt enable
				4506	* can occur before user_exit() is called. This will cause the scheduler
				4507	* to be called when the system is still in usermode.
				4508	*
				4509	* To prevent this, the preempt_enable_notrace will use this function
				4510	* instead of preempt_schedule() to exit user context if needed before
				4511	* calling the scheduler.
				4512	*/
				4513	asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
				4514	{
				4515	enum ctx_state prev_ctx;
				4516
				4517	if (likely(!preemptible()))
				4518	return;
				4519
				4520	do {
				4521	/*
				4522	* Because the function tracer can trace preempt_count_sub()
				4523	* and it also uses preempt_enable/disable_notrace(), if
				4524	* NEED_RESCHED is set, the preempt_enable_notrace() called
				4525	* by the function tracer will call this function again and
				4526	* cause infinite recursion.
				4527	*
				4528	* Preemption must be disabled here before the function
				4529	* tracer can trace. Break up preempt_disable() into two
				4530	* calls. One to disable preemption without fear of being
				4531	* traced. The other to still record the preemption latency,
				4532	* which can also be traced by the function tracer.
				4533	*/
				4534	preempt_disable_notrace();
				4535	preempt_latency_start(1);
				4536	/*
				4537	* Needs preempt disabled in case user_exit() is traced
				4538	* and the tracer calls preempt_enable_notrace() causing
				4539	* an infinite recursion.
				4540	*/
				4541	prev_ctx = exception_enter();
				4542	__schedule(true);
				4543	exception_exit(prev_ctx);
				4544
				4545	preempt_latency_stop(1);
				4546	preempt_enable_no_resched_notrace();
				4547	} while (need_resched());
				4548	}
				4549	EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
				4550
				4551	#endif /* CONFIG_PREEMPTION */
				4552
				4553	/*
				4554	* This is the entry point to schedule() from kernel preemption
				4555	* off of irq context.
				4556	* Note, that this is called and return with irqs disabled. This will
				4557	* protect us against recursive calling from irq.
				4558	*/
				4559	asmlinkage __visible void __sched preempt_schedule_irq(void)
				4560	{
				4561	enum ctx_state prev_state;
				4562
				4563	/* Catch callers which need to be fixed */
				4564	BUG_ON(preempt_count() \|\| !irqs_disabled());
				4565
				4566	prev_state = exception_enter();
				4567
				4568	do {
				4569	preempt_disable();
				4570	local_irq_enable();
				4571	__schedule(true);
				4572	local_irq_disable();
				4573	sched_preempt_enable_no_resched();
				4574	} while (need_resched());
				4575
				4576	exception_exit(prev_state);
				4577	}
				4578
				4579	int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
				4580	void *key)
				4581	{
				4582	return try_to_wake_up(curr->private, mode, wake_flags);
				4583	}
				4584	EXPORT_SYMBOL(default_wake_function);
				4585
				4586	#ifdef CONFIG_RT_MUTEXES
				4587
				4588	static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
				4589	{
				4590	if (pi_task)
				4591	prio = min(prio, pi_task->prio);
				4592
				4593	return prio;
				4594	}
				4595
				4596	static inline int rt_effective_prio(struct task_struct *p, int prio)
				4597	{
				4598	struct task_struct *pi_task = rt_mutex_get_top_task(p);
				4599
				4600	return __rt_effective_prio(pi_task, prio);
				4601	}
				4602
				4603	/*
				4604	* rt_mutex_setprio - set the current priority of a task
				4605	* @p: task to boost
				4606	* @pi_task: donor task
				4607	*
				4608	* This function changes the 'effective' priority of a task. It does
				4609	* not touch ->normal_prio like __setscheduler().
				4610	*
				4611	* Used by the rt_mutex code to implement priority inheritance
				4612	* logic. Call site only calls if the priority of the task changed.
				4613	*/
				4614	void rt_mutex_setprio(struct task_struct p, struct task_struct pi_task)
				4615	{
				4616	int prio, oldprio, queued, running, queue_flag =
				4617	DEQUEUE_SAVE \| DEQUEUE_MOVE \| DEQUEUE_NOCLOCK;
				4618	const struct sched_class *prev_class;
				4619	struct rq_flags rf;
				4620	struct rq *rq;
				4621
				4622	trace_android_rvh_rtmutex_prepare_setprio(p, pi_task);
				4623	/* XXX used to be waiter->prio, not waiter->task->prio */
				4624	prio = __rt_effective_prio(pi_task, p->normal_prio);
				4625
				4626	/*
				4627	* If nothing changed; bail early.
				4628	*/
				4629	if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
				4630	return;
				4631
				4632	rq = __task_rq_lock(p, &rf);
				4633	update_rq_clock(rq);
				4634	/*
				4635	* Set under pi_lock && rq->lock, such that the value can be used under
				4636	* either lock.
				4637	*
				4638	* Note that there is loads of tricky to make this pointer cache work
				4639	* right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
				4640	* ensure a task is de-boosted (pi_task is set to NULL) before the
				4641	* task is allowed to run again (and can exit). This ensures the pointer
				4642	* points to a blocked task -- which guaratees the task is present.
				4643	*/
				4644	p->pi_top_task = pi_task;
				4645
				4646	/*
				4647	* For FIFO/RR we only need to set prio, if that matches we're done.
				4648	*/
				4649	if (prio == p->prio && !dl_prio(prio))
				4650	goto out_unlock;
				4651
				4652	/*
				4653	* Idle task boosting is a nono in general. There is one
				4654	* exception, when PREEMPT_RT and NOHZ is active:
				4655	*
				4656	* The idle task calls get_next_timer_interrupt() and holds
				4657	* the timer wheel base->lock on the CPU and another CPU wants
				4658	* to access the timer (probably to cancel it). We can safely
				4659	* ignore the boosting request, as the idle CPU runs this code
				4660	* with interrupts disabled and will complete the lock
				4661	* protected section without being interrupted. So there is no
				4662	* real need to boost.
				4663	*/
				4664	if (unlikely(p == rq->idle)) {
				4665	WARN_ON(p != rq->curr);
				4666	WARN_ON(p->pi_blocked_on);
				4667	goto out_unlock;
				4668	}
				4669
				4670	trace_sched_pi_setprio(p, pi_task);
				4671	oldprio = p->prio;
				4672
				4673	if (oldprio == prio)
				4674	queue_flag &= ~DEQUEUE_MOVE;
				4675
				4676	prev_class = p->sched_class;
				4677	queued = task_on_rq_queued(p);
				4678	running = task_current(rq, p);
				4679	if (queued)
				4680	dequeue_task(rq, p, queue_flag);
				4681	if (running)
				4682	put_prev_task(rq, p);
				4683
				4684	/*
				4685	* Boosting condition are:
				4686	* 1. -rt task is running and holds mutex A
				4687	* --> -dl task blocks on mutex A
				4688	*
				4689	* 2. -dl task is running and holds mutex A
				4690	* --> -dl task blocks on mutex A and could preempt the
				4691	* running task
				4692	*/
				4693	if (dl_prio(prio)) {
				4694	if (!dl_prio(p->normal_prio) \|\|
				4695	(pi_task && dl_prio(pi_task->prio) &&
				4696	dl_entity_preempt(&pi_task->dl, &p->dl))) {
				4697	p->dl.pi_se = pi_task->dl.pi_se;
				4698	queue_flag \|= ENQUEUE_REPLENISH;
				4699	} else {
				4700	p->dl.pi_se = &p->dl;
				4701	}
				4702	p->sched_class = &dl_sched_class;
				4703	} else if (rt_prio(prio)) {
				4704	if (dl_prio(oldprio))
				4705	p->dl.pi_se = &p->dl;
				4706	if (oldprio < prio)
				4707	queue_flag \|= ENQUEUE_HEAD;
				4708	p->sched_class = &rt_sched_class;
				4709	} else {
				4710	if (dl_prio(oldprio))
				4711	p->dl.pi_se = &p->dl;
				4712	if (rt_prio(oldprio))
				4713	p->rt.timeout = 0;
				4714	p->sched_class = &fair_sched_class;
				4715	}
				4716
				4717	p->prio = prio;
				4718
				4719	if (queued)
				4720	enqueue_task(rq, p, queue_flag);
				4721	if (running)
				4722	set_next_task(rq, p);
				4723
				4724	check_class_changed(rq, p, prev_class, oldprio);
				4725	out_unlock:
				4726	/* Avoid rq from going away on us: */
				4727	preempt_disable();
				4728	__task_rq_unlock(rq, &rf);
				4729
				4730	balance_callback(rq);
				4731	preempt_enable();
				4732	}
				4733	#else
				4734	static inline int rt_effective_prio(struct task_struct *p, int prio)
				4735	{
				4736	return prio;
				4737	}
				4738	#endif
				4739
				4740	void set_user_nice(struct task_struct *p, long nice)
				4741	{
				4742	bool queued, running, allowed = false;
				4743	int old_prio, delta;
				4744	struct rq_flags rf;
				4745	struct rq *rq;
				4746
				4747	trace_android_rvh_set_user_nice(p, &nice, &allowed);
				4748	if ((task_nice(p) == nice \|\| nice < MIN_NICE \|\| nice > MAX_NICE) && !allowed)
				4749	return;
				4750	/*
				4751	* We have to be careful, if called from sys_setpriority(),
				4752	* the task might be in the middle of scheduling on another CPU.
				4753	*/
				4754	rq = task_rq_lock(p, &rf);
				4755	update_rq_clock(rq);
				4756
				4757	/*
				4758	* The RT priorities are set via sched_setscheduler(), but we still
				4759	* allow the 'normal' nice value to be set - but as expected
				4760	* it wont have any effect on scheduling until the task is
				4761	* SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
				4762	*/
				4763	if (task_has_dl_policy(p) \|\| task_has_rt_policy(p)) {
				4764	p->static_prio = NICE_TO_PRIO(nice);
				4765	goto out_unlock;
				4766	}
				4767	queued = task_on_rq_queued(p);
				4768	running = task_current(rq, p);
				4769	if (queued)
				4770	dequeue_task(rq, p, DEQUEUE_SAVE \| DEQUEUE_NOCLOCK);
				4771	if (running)
				4772	put_prev_task(rq, p);
				4773
				4774	p->static_prio = NICE_TO_PRIO(nice);
				4775	set_load_weight(p, true);
				4776	old_prio = p->prio;
				4777	p->prio = effective_prio(p);
				4778	delta = p->prio - old_prio;
				4779
				4780	if (queued) {
				4781	enqueue_task(rq, p, ENQUEUE_RESTORE \| ENQUEUE_NOCLOCK);
				4782	/*
				4783	* If the task increased its priority or is running and
				4784	* lowered its priority, then reschedule its CPU:
				4785	*/
				4786	if (delta < 0 \|\| (delta > 0 && task_running(rq, p)))
				4787	resched_curr(rq);
				4788	}
				4789	if (running)
				4790	set_next_task(rq, p);
				4791	out_unlock:
				4792	task_rq_unlock(rq, p, &rf);
				4793	}
				4794	EXPORT_SYMBOL(set_user_nice);
				4795
				4796	/*
				4797	* can_nice - check if a task can reduce its nice value
				4798	* @p: task
				4799	* @nice: nice value
				4800	*/
				4801	int can_nice(const struct task_struct *p, const int nice)
				4802	{
				4803	/* Convert nice value [19,-20] to rlimit style value [1,40]: */
				4804	int nice_rlim = nice_to_rlimit(nice);
				4805
				4806	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) \|\|
				4807	capable(CAP_SYS_NICE));
				4808	}
				4809
				4810	#ifdef __ARCH_WANT_SYS_NICE
				4811
				4812	/*
				4813	* sys_nice - change the priority of the current process.
				4814	* @increment: priority increment
				4815	*
				4816	* sys_setpriority is a more generic, but much slower function that
				4817	* does similar things.
				4818	*/
				4819	SYSCALL_DEFINE1(nice, int, increment)
				4820	{
				4821	long nice, retval;
				4822
				4823	/*
				4824	* Setpriority might change our priority at the same moment.
				4825	* We don't have to worry. Conceptually one call occurs first
				4826	* and we have a single winner.
				4827	*/
				4828	increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
				4829	nice = task_nice(current) + increment;
				4830
				4831	nice = clamp_val(nice, MIN_NICE, MAX_NICE);
				4832	if (increment < 0 && !can_nice(current, nice))
				4833	return -EPERM;
				4834
				4835	retval = security_task_setnice(current, nice);
				4836	if (retval)
				4837	return retval;
				4838
				4839	set_user_nice(current, nice);
				4840	return 0;
				4841	}
				4842
				4843	#endif
				4844
				4845	/**
				4846	* task_prio - return the priority value of a given task.
				4847	* @p: the task in question.
				4848	*
				4849	* Return: The priority value as seen by users in /proc.
				4850	* RT tasks are offset by -200. Normal tasks are centered
				4851	* around 0, value goes from -16 to +15.
				4852	*/
				4853	int task_prio(const struct task_struct *p)
				4854	{
				4855	return p->prio - MAX_RT_PRIO;
				4856	}
				4857
				4858	/**
				4859	* idle_cpu - is a given CPU idle currently?
				4860	* @cpu: the processor in question.
				4861	*
				4862	* Return: 1 if the CPU is currently idle. 0 otherwise.
				4863	*/
				4864	int idle_cpu(int cpu)
				4865	{
				4866	struct rq *rq = cpu_rq(cpu);
				4867
				4868	if (rq->curr != rq->idle)
				4869	return 0;
				4870
				4871	if (rq->nr_running)
				4872	return 0;
				4873
				4874	#ifdef CONFIG_SMP
				4875	if (!llist_empty(&rq->wake_list))
				4876	return 0;
				4877	#endif
				4878
				4879	return 1;
				4880	}
				4881
				4882	/**
				4883	* available_idle_cpu - is a given CPU idle for enqueuing work.
				4884	* @cpu: the CPU in question.
				4885	*
				4886	* Return: 1 if the CPU is currently idle. 0 otherwise.
				4887	*/
				4888	int available_idle_cpu(int cpu)
				4889	{
				4890	if (!idle_cpu(cpu))
				4891	return 0;
				4892
				4893	if (vcpu_is_preempted(cpu))
				4894	return 0;
				4895
				4896	return 1;
				4897	}
				4898
				4899	/**
				4900	* idle_task - return the idle task for a given CPU.
				4901	* @cpu: the processor in question.
				4902	*
				4903	* Return: The idle task for the CPU @cpu.
				4904	*/
				4905	struct task_struct *idle_task(int cpu)
				4906	{
				4907	return cpu_rq(cpu)->idle;
				4908	}
				4909
				4910	/**
				4911	* find_process_by_pid - find a process with a matching PID value.
				4912	* @pid: the pid in question.
				4913	*
				4914	* The task of @pid, if found. %NULL otherwise.
				4915	*/
				4916	static struct task_struct *find_process_by_pid(pid_t pid)
				4917	{
				4918	return pid ? find_task_by_vpid(pid) : current;
				4919	}
				4920
				4921	/*
				4922	* sched_setparam() passes in -1 for its policy, to let the functions
				4923	* it calls know not to change it.
				4924	*/
				4925	#define SETPARAM_POLICY -1
				4926
				4927	static void __setscheduler_params(struct task_struct *p,
				4928	const struct sched_attr *attr)
				4929	{
				4930	int policy = attr->sched_policy;
				4931
				4932	if (policy == SETPARAM_POLICY)
				4933	policy = p->policy;
				4934
				4935	p->policy = policy;
				4936
				4937	if (dl_policy(policy))
				4938	__setparam_dl(p, attr);
				4939	else if (fair_policy(policy))
				4940	p->static_prio = NICE_TO_PRIO(attr->sched_nice);
				4941
				4942	/*
				4943	* __sched_setscheduler() ensures attr->sched_priority == 0 when
				4944	* !rt_policy. Always setting this ensures that things like
				4945	* getparam()/getattr() don't report silly values for !rt tasks.
				4946	*/
				4947	p->rt_priority = attr->sched_priority;
				4948	p->normal_prio = normal_prio(p);
				4949	set_load_weight(p, true);
				4950	}
				4951
				4952	/* Actually do priority change: must hold pi & rq lock. */
				4953	static void __setscheduler(struct rq rq, struct task_struct p,
				4954	const struct sched_attr *attr, bool keep_boost)
				4955	{
				4956	/*
				4957	* If params can't change scheduling class changes aren't allowed
				4958	* either.
				4959	*/
				4960	if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
				4961	return;
				4962
				4963	__setscheduler_params(p, attr);
				4964
				4965	/*
				4966	* Keep a potential priority boosting if called from
				4967	* sched_setscheduler().
				4968	*/
				4969	p->prio = normal_prio(p);
				4970	if (keep_boost)
				4971	p->prio = rt_effective_prio(p, p->prio);
				4972
				4973	if (dl_prio(p->prio))
				4974	p->sched_class = &dl_sched_class;
				4975	else if (rt_prio(p->prio))
				4976	p->sched_class = &rt_sched_class;
				4977	else
				4978	p->sched_class = &fair_sched_class;
				4979
				4980	trace_android_rvh_setscheduler(p);
				4981	}
				4982
				4983	/*
				4984	* Check the target process has a UID that matches the current process's:
				4985	*/
				4986	static bool check_same_owner(struct task_struct *p)
				4987	{
				4988	const struct cred cred = current_cred(), pcred;
				4989	bool match;
				4990
				4991	rcu_read_lock();
				4992	pcred = __task_cred(p);
				4993	match = (uid_eq(cred->euid, pcred->euid) \|\|
				4994	uid_eq(cred->euid, pcred->uid));
				4995	rcu_read_unlock();
				4996	return match;
				4997	}
				4998
				4999	static int __sched_setscheduler(struct task_struct *p,
				5000	const struct sched_attr *attr,
				5001	bool user, bool pi)
				5002	{
				5003	int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
				5004	MAX_RT_PRIO - 1 - attr->sched_priority;
				5005	int retval, oldprio, oldpolicy = -1, queued, running;
				5006	int new_effective_prio, policy = attr->sched_policy;
				5007	const struct sched_class *prev_class;
				5008	struct rq_flags rf;
				5009	int reset_on_fork;
				5010	int queue_flags = DEQUEUE_SAVE \| DEQUEUE_MOVE \| DEQUEUE_NOCLOCK;
				5011	struct rq *rq;
				5012
				5013	/* The pi code expects interrupts enabled */
				5014	BUG_ON(pi && in_interrupt());
				5015	recheck:
				5016	/* Double check policy once rq lock held: */
				5017	if (policy < 0) {
				5018	reset_on_fork = p->sched_reset_on_fork;
				5019	policy = oldpolicy = p->policy;
				5020	} else {
				5021	reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
				5022
				5023	if (!valid_policy(policy))
				5024	return -EINVAL;
				5025	}
				5026
				5027	if (attr->sched_flags & ~(SCHED_FLAG_ALL \| SCHED_FLAG_SUGOV))
				5028	return -EINVAL;
				5029
				5030	/*
				5031	* Valid priorities for SCHED_FIFO and SCHED_RR are
				5032	* 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
				5033	* SCHED_BATCH and SCHED_IDLE is 0.
				5034	*/
				5035	if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) \|\|
				5036	(!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
				5037	return -EINVAL;
				5038	if ((dl_policy(policy) && !__checkparam_dl(attr)) \|\|
				5039	(rt_policy(policy) != (attr->sched_priority != 0)))
				5040	return -EINVAL;
				5041
				5042	/*
				5043	* Allow unprivileged RT tasks to decrease priority:
				5044	*/
				5045	if (user && !capable(CAP_SYS_NICE)) {
				5046	if (fair_policy(policy)) {
				5047	if (attr->sched_nice < task_nice(p) &&
				5048	!can_nice(p, attr->sched_nice))
				5049	return -EPERM;
				5050	}
				5051
				5052	if (rt_policy(policy)) {
				5053	unsigned long rlim_rtprio =
				5054	task_rlimit(p, RLIMIT_RTPRIO);
				5055
				5056	/* Can't set/change the rt policy: */
				5057	if (policy != p->policy && !rlim_rtprio)
				5058	return -EPERM;
				5059
				5060	/* Can't increase priority: */
				5061	if (attr->sched_priority > p->rt_priority &&
				5062	attr->sched_priority > rlim_rtprio)
				5063	return -EPERM;
				5064	}
				5065
				5066	/*
				5067	* Can't set/change SCHED_DEADLINE policy at all for now
				5068	* (safest behavior); in the future we would like to allow
				5069	* unprivileged DL tasks to increase their relative deadline
				5070	* or reduce their runtime (both ways reducing utilization)
				5071	*/
				5072	if (dl_policy(policy))
				5073	return -EPERM;
				5074
				5075	/*
				5076	* Treat SCHED_IDLE as nice 20. Only allow a switch to
				5077	* SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
				5078	*/
				5079	if (task_has_idle_policy(p) && !idle_policy(policy)) {
				5080	if (!can_nice(p, task_nice(p)))
				5081	return -EPERM;
				5082	}
				5083
				5084	/* Can't change other user's priorities: */
				5085	if (!check_same_owner(p))
				5086	return -EPERM;
				5087
				5088	/* Normal users shall not reset the sched_reset_on_fork flag: */
				5089	if (p->sched_reset_on_fork && !reset_on_fork)
				5090	return -EPERM;
				5091	}
				5092
				5093	if (user) {
				5094	if (attr->sched_flags & SCHED_FLAG_SUGOV)
				5095	return -EINVAL;
				5096
				5097	retval = security_task_setscheduler(p);
				5098	if (retval)
				5099	return retval;
				5100	}
				5101
				5102	/* Update task specific "requested" clamps */
				5103	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
				5104	retval = uclamp_validate(p, attr);
				5105	if (retval)
				5106	return retval;
				5107	}
				5108
				5109	/*
				5110	* Make sure no PI-waiters arrive (or leave) while we are
				5111	* changing the priority of the task:
				5112	*
				5113	* To be able to change p->policy safely, the appropriate
				5114	* runqueue lock must be held.
				5115	*/
				5116	rq = task_rq_lock(p, &rf);
				5117	update_rq_clock(rq);
				5118
				5119	/*
				5120	* Changing the policy of the stop threads its a very bad idea:
				5121	*/
				5122	if (p == rq->stop) {
				5123	retval = -EINVAL;
				5124	goto unlock;
				5125	}
				5126
				5127	/*
				5128	* If not changing anything there's no need to proceed further,
				5129	* but store a possible modification of reset_on_fork.
				5130	*/
				5131	if (unlikely(policy == p->policy)) {
				5132	if (fair_policy(policy) && attr->sched_nice != task_nice(p))
				5133	goto change;
				5134	if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
				5135	goto change;
				5136	if (dl_policy(policy) && dl_param_changed(p, attr))
				5137	goto change;
				5138	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
				5139	goto change;
				5140
				5141	p->sched_reset_on_fork = reset_on_fork;
				5142	retval = 0;
				5143	goto unlock;
				5144	}
				5145	change:
				5146
				5147	if (user) {
				5148	#ifdef CONFIG_RT_GROUP_SCHED
				5149	/*
				5150	* Do not allow realtime tasks into groups that have no runtime
				5151	* assigned.
				5152	*/
				5153	if (rt_bandwidth_enabled() && rt_policy(policy) &&
				5154	task_group(p)->rt_bandwidth.rt_runtime == 0 &&
				5155	!task_group_is_autogroup(task_group(p))) {
				5156	retval = -EPERM;
				5157	goto unlock;
				5158	}
				5159	#endif
				5160	#ifdef CONFIG_SMP
				5161	if (dl_bandwidth_enabled() && dl_policy(policy) &&
				5162	!(attr->sched_flags & SCHED_FLAG_SUGOV)) {
				5163	cpumask_t *span = rq->rd->span;
				5164
				5165	/*
				5166	* Don't allow tasks with an affinity mask smaller than
				5167	* the entire root_domain to become SCHED_DEADLINE. We
				5168	* will also fail if there's no bandwidth available.
				5169	*/
				5170	if (!cpumask_subset(span, p->cpus_ptr) \|\|
				5171	rq->rd->dl_bw.bw == 0) {
				5172	retval = -EPERM;
				5173	goto unlock;
				5174	}
				5175	}
				5176	#endif
				5177	}
				5178
				5179	/* Re-check policy now with rq lock held: */
				5180	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
				5181	policy = oldpolicy = -1;
				5182	task_rq_unlock(rq, p, &rf);
				5183	goto recheck;
				5184	}
				5185
				5186	/*
				5187	* If setscheduling to SCHED_DEADLINE (or changing the parameters
				5188	* of a SCHED_DEADLINE task) we need to check if enough bandwidth
				5189	* is available.
				5190	*/
				5191	if ((dl_policy(policy) \|\| dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
				5192	retval = -EBUSY;
				5193	goto unlock;
				5194	}
				5195
				5196	p->sched_reset_on_fork = reset_on_fork;
				5197	oldprio = p->prio;
				5198
				5199	if (pi) {
				5200	/*
				5201	* Take priority boosted tasks into account. If the new
				5202	* effective priority is unchanged, we just store the new
				5203	* normal parameters and do not touch the scheduler class and
				5204	* the runqueue. This will be done when the task deboost
				5205	* itself.
				5206	*/
				5207	new_effective_prio = rt_effective_prio(p, newprio);
				5208	if (new_effective_prio == oldprio)
				5209	queue_flags &= ~DEQUEUE_MOVE;
				5210	}
				5211
				5212	queued = task_on_rq_queued(p);
				5213	running = task_current(rq, p);
				5214	if (queued)
				5215	dequeue_task(rq, p, queue_flags);
				5216	if (running)
				5217	put_prev_task(rq, p);
				5218
				5219	prev_class = p->sched_class;
				5220
				5221	__setscheduler(rq, p, attr, pi);
				5222	__setscheduler_uclamp(p, attr);
				5223
				5224	if (queued) {
				5225	/*
				5226	* We enqueue to tail when the priority of a task is
				5227	* increased (user space view).
				5228	*/
				5229	if (oldprio < p->prio)
				5230	queue_flags \|= ENQUEUE_HEAD;
				5231
				5232	enqueue_task(rq, p, queue_flags);
				5233	}
				5234	if (running)
				5235	set_next_task(rq, p);
				5236
				5237	check_class_changed(rq, p, prev_class, oldprio);
				5238
				5239	/* Avoid rq from going away on us: */
				5240	preempt_disable();
				5241	task_rq_unlock(rq, p, &rf);
				5242
				5243	if (pi)
				5244	rt_mutex_adjust_pi(p);
				5245
				5246	/* Run balance callbacks after we've adjusted the PI chain: */
				5247	balance_callback(rq);
				5248	preempt_enable();
				5249
				5250	return 0;
				5251
				5252	unlock:
				5253	task_rq_unlock(rq, p, &rf);
				5254	return retval;
				5255	}
				5256
				5257	static int _sched_setscheduler(struct task_struct *p, int policy,
				5258	const struct sched_param *param, bool check)
				5259	{
				5260	struct sched_attr attr = {
				5261	.sched_policy = policy,
				5262	.sched_priority = param->sched_priority,
				5263	.sched_nice = PRIO_TO_NICE(p->static_prio),
				5264	};
				5265
				5266	/* Fixup the legacy SCHED_RESET_ON_FORK hack. */
				5267	if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
				5268	attr.sched_flags \|= SCHED_FLAG_RESET_ON_FORK;
				5269	policy &= ~SCHED_RESET_ON_FORK;
				5270	attr.sched_policy = policy;
				5271	}
				5272
				5273	return __sched_setscheduler(p, &attr, check, true);
				5274	}
				5275	/**
				5276	* sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
				5277	* @p: the task in question.
				5278	* @policy: new policy.
				5279	* @param: structure containing the new RT priority.
				5280	*
				5281	* Return: 0 on success. An error code otherwise.
				5282	*
				5283	* NOTE that the task may be already dead.
				5284	*/
				5285	int sched_setscheduler(struct task_struct *p, int policy,
				5286	const struct sched_param *param)
				5287	{
				5288	return _sched_setscheduler(p, policy, param, true);
				5289	}
				5290	EXPORT_SYMBOL_GPL(sched_setscheduler);
				5291
				5292	int sched_setattr(struct task_struct p, const struct sched_attr attr)
				5293	{
				5294	return __sched_setscheduler(p, attr, true, true);
				5295	}
				5296	EXPORT_SYMBOL_GPL(sched_setattr);
				5297
				5298	int sched_setattr_nocheck(struct task_struct p, const struct sched_attr attr)
				5299	{
				5300	return __sched_setscheduler(p, attr, false, true);
				5301	}
				5302	EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
				5303
				5304	/**
				5305	* sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
				5306	* @p: the task in question.
				5307	* @policy: new policy.
				5308	* @param: structure containing the new RT priority.
				5309	*
				5310	* Just like sched_setscheduler, only don't bother checking if the
				5311	* current context has permission. For example, this is needed in
				5312	* stop_machine(): we create temporary high priority worker threads,
				5313	* but our caller might not have that capability.
				5314	*
				5315	* Return: 0 on success. An error code otherwise.
				5316	*/
				5317	int sched_setscheduler_nocheck(struct task_struct *p, int policy,
				5318	const struct sched_param *param)
				5319	{
				5320	return _sched_setscheduler(p, policy, param, false);
				5321	}
				5322	EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
				5323
				5324	static int
				5325	do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
				5326	{
				5327	struct sched_param lparam;
				5328	struct task_struct *p;
				5329	int retval;
				5330
				5331	if (!param \|\| pid < 0)
				5332	return -EINVAL;
				5333	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
				5334	return -EFAULT;
				5335
				5336	rcu_read_lock();
				5337	retval = -ESRCH;
				5338	p = find_process_by_pid(pid);
				5339	if (p != NULL)
				5340	retval = sched_setscheduler(p, policy, &lparam);
				5341	rcu_read_unlock();
				5342
				5343	return retval;
				5344	}
				5345
				5346	/*
				5347	* Mimics kernel/events/core.c perf_copy_attr().
				5348	*/
				5349	static int sched_copy_attr(struct sched_attr __user uattr, struct sched_attr attr)
				5350	{
				5351	u32 size;
				5352	int ret;
				5353
				5354	/* Zero the full structure, so that a short copy will be nice: */
				5355	memset(attr, 0, sizeof(*attr));
				5356
				5357	ret = get_user(size, &uattr->size);
				5358	if (ret)
				5359	return ret;
				5360
				5361	/* ABI compatibility quirk: */
				5362	if (!size)
				5363	size = SCHED_ATTR_SIZE_VER0;
				5364	if (size < SCHED_ATTR_SIZE_VER0 \|\| size > PAGE_SIZE)
				5365	goto err_size;
				5366
				5367	ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
				5368	if (ret) {
				5369	if (ret == -E2BIG)
				5370	goto err_size;
				5371	return ret;
				5372	}
				5373
				5374	if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
				5375	size < SCHED_ATTR_SIZE_VER1)
				5376	return -EINVAL;
				5377
				5378	/*
				5379	* XXX: Do we want to be lenient like existing syscalls; or do we want
				5380	* to be strict and return an error on out-of-bounds values?
				5381	*/
				5382	attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
				5383
				5384	return 0;
				5385
				5386	err_size:
				5387	put_user(sizeof(*attr), &uattr->size);
				5388	return -E2BIG;
				5389	}
				5390
				5391	/**
				5392	* sys_sched_setscheduler - set/change the scheduler policy and RT priority
				5393	* @pid: the pid in question.
				5394	* @policy: new policy.
				5395	* @param: structure containing the new RT priority.
				5396	*
				5397	* Return: 0 on success. An error code otherwise.
				5398	*/
				5399	SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
				5400	{
				5401	if (policy < 0)
				5402	return -EINVAL;
				5403
				5404	return do_sched_setscheduler(pid, policy, param);
				5405	}
				5406
				5407	/**
				5408	* sys_sched_setparam - set/change the RT priority of a thread
				5409	* @pid: the pid in question.
				5410	* @param: structure containing the new RT priority.
				5411	*
				5412	* Return: 0 on success. An error code otherwise.
				5413	*/
				5414	SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
				5415	{
				5416	return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
				5417	}
				5418
				5419	/**
				5420	* sys_sched_setattr - same as above, but with extended sched_attr
				5421	* @pid: the pid in question.
				5422	* @uattr: structure containing the extended parameters.
				5423	* @flags: for future extension.
				5424	*/
				5425	SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
				5426	unsigned int, flags)
				5427	{
				5428	struct sched_attr attr;
				5429	struct task_struct *p;
				5430	int retval;
				5431
				5432	if (!uattr \|\| pid < 0 \|\| flags)
				5433	return -EINVAL;
				5434
				5435	retval = sched_copy_attr(uattr, &attr);
				5436	if (retval)
				5437	return retval;
				5438
				5439	if ((int)attr.sched_policy < 0)
				5440	return -EINVAL;
				5441	if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
				5442	attr.sched_policy = SETPARAM_POLICY;
				5443
				5444	rcu_read_lock();
				5445	retval = -ESRCH;
				5446	p = find_process_by_pid(pid);
				5447	if (likely(p))
				5448	get_task_struct(p);
				5449	rcu_read_unlock();
				5450
				5451	if (likely(p)) {
				5452	retval = sched_setattr(p, &attr);
				5453	put_task_struct(p);
				5454	}
				5455
				5456	return retval;
				5457	}
				5458
				5459	/**
				5460	* sys_sched_getscheduler - get the policy (scheduling class) of a thread
				5461	* @pid: the pid in question.
				5462	*
				5463	* Return: On success, the policy of the thread. Otherwise, a negative error
				5464	* code.
				5465	*/
				5466	SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
				5467	{
				5468	struct task_struct *p;
				5469	int retval;
				5470
				5471	if (pid < 0)
				5472	return -EINVAL;
				5473
				5474	retval = -ESRCH;
				5475	rcu_read_lock();
				5476	p = find_process_by_pid(pid);
				5477	if (p) {
				5478	retval = security_task_getscheduler(p);
				5479	if (!retval)
				5480	retval = p->policy
				5481	\| (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
				5482	}
				5483	rcu_read_unlock();
				5484	return retval;
				5485	}
				5486
				5487	/**
				5488	* sys_sched_getparam - get the RT priority of a thread
				5489	* @pid: the pid in question.
				5490	* @param: structure containing the RT priority.
				5491	*
				5492	* Return: On success, 0 and the RT priority is in @param. Otherwise, an error
				5493	* code.
				5494	*/
				5495	SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
				5496	{
				5497	struct sched_param lp = { .sched_priority = 0 };
				5498	struct task_struct *p;
				5499	int retval;
				5500
				5501	if (!param \|\| pid < 0)
				5502	return -EINVAL;
				5503
				5504	rcu_read_lock();
				5505	p = find_process_by_pid(pid);
				5506	retval = -ESRCH;
				5507	if (!p)
				5508	goto out_unlock;
				5509
				5510	retval = security_task_getscheduler(p);
				5511	if (retval)
				5512	goto out_unlock;
				5513
				5514	if (task_has_rt_policy(p))
				5515	lp.sched_priority = p->rt_priority;
				5516	rcu_read_unlock();
				5517
				5518	/*
				5519	* This one might sleep, we cannot do it with a spinlock held ...
				5520	*/
				5521	retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
				5522
				5523	return retval;
				5524
				5525	out_unlock:
				5526	rcu_read_unlock();
				5527	return retval;
				5528	}
				5529
				5530	/*
				5531	* Copy the kernel size attribute structure (which might be larger
				5532	* than what user-space knows about) to user-space.
				5533	*
				5534	* Note that all cases are valid: user-space buffer can be larger or
				5535	* smaller than the kernel-space buffer. The usual case is that both
				5536	* have the same size.
				5537	*/
				5538	static int
				5539	sched_attr_copy_to_user(struct sched_attr __user *uattr,
				5540	struct sched_attr *kattr,
				5541	unsigned int usize)
				5542	{
				5543	unsigned int ksize = sizeof(*kattr);
				5544
				5545	if (!access_ok(uattr, usize))
				5546	return -EFAULT;
				5547
				5548	/*
				5549	* sched_getattr() ABI forwards and backwards compatibility:
				5550	*
				5551	* If usize == ksize then we just copy everything to user-space and all is good.
				5552	*
				5553	* If usize < ksize then we only copy as much as user-space has space for,
				5554	* this keeps ABI compatibility as well. We skip the rest.
				5555	*
				5556	* If usize > ksize then user-space is using a newer version of the ABI,
				5557	* which part the kernel doesn't know about. Just ignore it - tooling can
				5558	* detect the kernel's knowledge of attributes from the attr->size value
				5559	* which is set to ksize in this case.
				5560	*/
				5561	kattr->size = min(usize, ksize);
				5562
				5563	if (copy_to_user(uattr, kattr, kattr->size))
				5564	return -EFAULT;
				5565
				5566	return 0;
				5567	}
				5568
				5569	/**
				5570	* sys_sched_getattr - similar to sched_getparam, but with sched_attr
				5571	* @pid: the pid in question.
				5572	* @uattr: structure containing the extended parameters.
				5573	* @usize: sizeof(attr) for fwd/bwd comp.
				5574	* @flags: for future extension.
				5575	*/
				5576	SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
				5577	unsigned int, usize, unsigned int, flags)
				5578	{
				5579	struct sched_attr kattr = { };
				5580	struct task_struct *p;
				5581	int retval;
				5582
				5583	if (!uattr \|\| pid < 0 \|\| usize > PAGE_SIZE \|\|
				5584	usize < SCHED_ATTR_SIZE_VER0 \|\| flags)
				5585	return -EINVAL;
				5586
				5587	rcu_read_lock();
				5588	p = find_process_by_pid(pid);
				5589	retval = -ESRCH;
				5590	if (!p)
				5591	goto out_unlock;
				5592
				5593	retval = security_task_getscheduler(p);
				5594	if (retval)
				5595	goto out_unlock;
				5596
				5597	kattr.sched_policy = p->policy;
				5598	if (p->sched_reset_on_fork)
				5599	kattr.sched_flags \|= SCHED_FLAG_RESET_ON_FORK;
				5600	if (task_has_dl_policy(p))
				5601	__getparam_dl(p, &kattr);
				5602	else if (task_has_rt_policy(p))
				5603	kattr.sched_priority = p->rt_priority;
				5604	else
				5605	kattr.sched_nice = task_nice(p);
				5606
				5607	#ifdef CONFIG_UCLAMP_TASK
				5608	/*
				5609	* This could race with another potential updater, but this is fine
				5610	* because it'll correctly read the old or the new value. We don't need
				5611	* to guarantee who wins the race as long as it doesn't return garbage.
				5612	*/
				5613	kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
				5614	kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
				5615	#endif
				5616
				5617	rcu_read_unlock();
				5618
				5619	return sched_attr_copy_to_user(uattr, &kattr, usize);
				5620
				5621	out_unlock:
				5622	rcu_read_unlock();
				5623	return retval;
				5624	}
				5625
				5626	long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
				5627	{
				5628	cpumask_var_t cpus_allowed, new_mask;
				5629	struct task_struct *p;
				5630	int retval;
				5631
				5632	rcu_read_lock();
				5633
				5634	p = find_process_by_pid(pid);
				5635	if (!p) {
				5636	rcu_read_unlock();
				5637	return -ESRCH;
				5638	}
				5639
				5640	/* Prevent p going away */
				5641	get_task_struct(p);
				5642	rcu_read_unlock();
				5643
				5644	if (p->flags & PF_NO_SETAFFINITY) {
				5645	retval = -EINVAL;
				5646	goto out_put_task;
				5647	}
				5648	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
				5649	retval = -ENOMEM;
				5650	goto out_put_task;
				5651	}
				5652	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
				5653	retval = -ENOMEM;
				5654	goto out_free_cpus_allowed;
				5655	}
				5656	retval = -EPERM;
				5657	if (!check_same_owner(p)) {
				5658	rcu_read_lock();
				5659	if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
				5660	rcu_read_unlock();
				5661	goto out_free_new_mask;
				5662	}
				5663	rcu_read_unlock();
				5664	}
				5665
				5666	retval = security_task_setscheduler(p);
				5667	if (retval)
				5668	goto out_free_new_mask;
				5669
				5670
				5671	cpuset_cpus_allowed(p, cpus_allowed);
				5672	cpumask_and(new_mask, in_mask, cpus_allowed);
				5673
				5674	/*
				5675	* Since bandwidth control happens on root_domain basis,
				5676	* if admission test is enabled, we only admit -deadline
				5677	* tasks allowed to run on all the CPUs in the task's
				5678	* root_domain.
				5679	*/
				5680	#ifdef CONFIG_SMP
				5681	if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
				5682	rcu_read_lock();
				5683	if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
				5684	retval = -EBUSY;
				5685	rcu_read_unlock();
				5686	goto out_free_new_mask;
				5687	}
				5688	rcu_read_unlock();
				5689	}
				5690	#endif
				5691	again:
				5692	retval = __set_cpus_allowed_ptr(p, new_mask, true);
				5693
				5694	if (!retval) {
				5695	cpuset_cpus_allowed(p, cpus_allowed);
				5696	if (!cpumask_subset(new_mask, cpus_allowed)) {
				5697	/*
				5698	* We must have raced with a concurrent cpuset
				5699	* update. Just reset the cpus_allowed to the
				5700	* cpuset's cpus_allowed
				5701	*/
				5702	cpumask_copy(new_mask, cpus_allowed);
				5703	goto again;
				5704	}
				5705	}
				5706	out_free_new_mask:
				5707	free_cpumask_var(new_mask);
				5708	out_free_cpus_allowed:
				5709	free_cpumask_var(cpus_allowed);
				5710	out_put_task:
				5711	put_task_struct(p);
				5712	return retval;
				5713	}
				5714
				5715	static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
				5716	struct cpumask *new_mask)
				5717	{
				5718	if (len < cpumask_size())
				5719	cpumask_clear(new_mask);
				5720	else if (len > cpumask_size())
				5721	len = cpumask_size();
				5722
				5723	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
				5724	}
				5725
				5726	/**
				5727	* sys_sched_setaffinity - set the CPU affinity of a process
				5728	* @pid: pid of the process
				5729	* @len: length in bytes of the bitmask pointed to by user_mask_ptr
				5730	* @user_mask_ptr: user-space pointer to the new CPU mask
				5731	*
				5732	* Return: 0 on success. An error code otherwise.
				5733	*/
				5734	SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
				5735	unsigned long __user *, user_mask_ptr)
				5736	{
				5737	cpumask_var_t new_mask;
				5738	int retval;
				5739
				5740	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
				5741	return -ENOMEM;
				5742
				5743	retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
				5744	if (retval == 0)
				5745	retval = sched_setaffinity(pid, new_mask);
				5746	free_cpumask_var(new_mask);
				5747	return retval;
				5748	}
				5749
				5750	long sched_getaffinity(pid_t pid, struct cpumask *mask)
				5751	{
				5752	struct task_struct *p;
				5753	unsigned long flags;
				5754	int retval;
				5755
				5756	rcu_read_lock();
				5757
				5758	retval = -ESRCH;
				5759	p = find_process_by_pid(pid);
				5760	if (!p)
				5761	goto out_unlock;
				5762
				5763	retval = security_task_getscheduler(p);
				5764	if (retval)
				5765	goto out_unlock;
				5766
				5767	raw_spin_lock_irqsave(&p->pi_lock, flags);
				5768	cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
				5769	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
				5770
				5771	out_unlock:
				5772	rcu_read_unlock();
				5773
				5774	return retval;
				5775	}
				5776
				5777	/**
				5778	* sys_sched_getaffinity - get the CPU affinity of a process
				5779	* @pid: pid of the process
				5780	* @len: length in bytes of the bitmask pointed to by user_mask_ptr
				5781	* @user_mask_ptr: user-space pointer to hold the current CPU mask
				5782	*
				5783	* Return: size of CPU mask copied to user_mask_ptr on success. An
				5784	* error code otherwise.
				5785	*/
				5786	SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
				5787	unsigned long __user *, user_mask_ptr)
				5788	{
				5789	int ret;
				5790	cpumask_var_t mask;
				5791
				5792	if ((len * BITS_PER_BYTE) < nr_cpu_ids)
				5793	return -EINVAL;
				5794	if (len & (sizeof(unsigned long)-1))
				5795	return -EINVAL;
				5796
				5797	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
				5798	return -ENOMEM;
				5799
				5800	ret = sched_getaffinity(pid, mask);
				5801	if (ret == 0) {
				5802	unsigned int retlen = min(len, cpumask_size());
				5803
				5804	if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen))
				5805	ret = -EFAULT;
				5806	else
				5807	ret = retlen;
				5808	}
				5809	free_cpumask_var(mask);
				5810
				5811	return ret;
				5812	}
				5813
				5814	/**
				5815	* sys_sched_yield - yield the current processor to other threads.
				5816	*
				5817	* This function yields the current CPU to other tasks. If there are no
				5818	* other threads running on this CPU then this function will return.
				5819	*
				5820	* Return: 0.
				5821	*/
				5822	static void do_sched_yield(void)
				5823	{
				5824	struct rq_flags rf;
				5825	struct rq *rq;
				5826
				5827	rq = this_rq_lock_irq(&rf);
				5828
				5829	schedstat_inc(rq->yld_count);
				5830	current->sched_class->yield_task(rq);
				5831
				5832	preempt_disable();
				5833	rq_unlock_irq(rq, &rf);
				5834	sched_preempt_enable_no_resched();
				5835
				5836	schedule();
				5837	}
				5838
				5839	SYSCALL_DEFINE0(sched_yield)
				5840	{
				5841	do_sched_yield();
				5842	return 0;
				5843	}
				5844
				5845	#ifndef CONFIG_PREEMPTION
				5846	int __sched _cond_resched(void)
				5847	{
				5848	if (should_resched(0)) {
				5849	preempt_schedule_common();
				5850	return 1;
				5851	}
				5852	rcu_all_qs();
				5853	return 0;
				5854	}
				5855	EXPORT_SYMBOL(_cond_resched);
				5856	#endif
				5857
				5858	/*
				5859	* __cond_resched_lock() - if a reschedule is pending, drop the given lock,
				5860	* call schedule, and on return reacquire the lock.
				5861	*
				5862	* This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
				5863	* operations here to prevent schedule() from being called twice (once via
				5864	* spin_unlock(), once by hand).
				5865	*/
				5866	int __cond_resched_lock(spinlock_t *lock)
				5867	{
				5868	int resched = should_resched(PREEMPT_LOCK_OFFSET);
				5869	int ret = 0;
				5870
				5871	lockdep_assert_held(lock);
				5872
				5873	if (spin_needbreak(lock) \|\| resched) {
				5874	spin_unlock(lock);
				5875	if (resched)
				5876	preempt_schedule_common();
				5877	else
				5878	cpu_relax();
				5879	ret = 1;
				5880	spin_lock(lock);
				5881	}
				5882	return ret;
				5883	}
				5884	EXPORT_SYMBOL(__cond_resched_lock);
				5885
				5886	/**
				5887	* yield - yield the current processor to other threads.
				5888	*
				5889	* Do not ever use this function, there's a 99% chance you're doing it wrong.
				5890	*
				5891	* The scheduler is at all times free to pick the calling task as the most
				5892	* eligible task to run, if removing the yield() call from your code breaks
				5893	* it, its already broken.
				5894	*
				5895	* Typical broken usage is:
				5896	*
				5897	* while (!event)
				5898	* yield();
				5899	*
				5900	* where one assumes that yield() will let 'the other' process run that will
				5901	* make event true. If the current task is a SCHED_FIFO task that will never
				5902	* happen. Never use yield() as a progress guarantee!!
				5903	*
				5904	* If you want to use yield() to wait for something, use wait_event().
				5905	* If you want to use yield() to be 'nice' for others, use cond_resched().
				5906	* If you still want to use yield(), do not!
				5907	*/
				5908	void __sched yield(void)
				5909	{
				5910	set_current_state(TASK_RUNNING);
				5911	do_sched_yield();
				5912	}
				5913	EXPORT_SYMBOL(yield);
				5914
				5915	/**
				5916	* yield_to - yield the current processor to another thread in
				5917	* your thread group, or accelerate that thread toward the
				5918	* processor it's on.
				5919	* @p: target task
				5920	* @preempt: whether task preemption is allowed or not
				5921	*
				5922	* It's the caller's job to ensure that the target task struct
				5923	* can't go away on us before we can do any checks.
				5924	*
				5925	* Return:
				5926	* true (>0) if we indeed boosted the target task.
				5927	* false (0) if we failed to boost the target.
				5928	* -ESRCH if there's no task to yield to.
				5929	*/
				5930	int __sched yield_to(struct task_struct *p, bool preempt)
				5931	{
				5932	struct task_struct *curr = current;
				5933	struct rq rq, p_rq;
				5934	unsigned long flags;
				5935	int yielded = 0;
				5936
				5937	local_irq_save(flags);
				5938	rq = this_rq();
				5939
				5940	again:
				5941	p_rq = task_rq(p);
				5942	/*
				5943	* If we're the only runnable task on the rq and target rq also
				5944	* has only one task, there's absolutely no point in yielding.
				5945	*/
				5946	if (rq->nr_running == 1 && p_rq->nr_running == 1) {
				5947	yielded = -ESRCH;
				5948	goto out_irq;
				5949	}
				5950
				5951	double_rq_lock(rq, p_rq);
				5952	if (task_rq(p) != p_rq) {
				5953	double_rq_unlock(rq, p_rq);
				5954	goto again;
				5955	}
				5956
				5957	if (!curr->sched_class->yield_to_task)
				5958	goto out_unlock;
				5959
				5960	if (curr->sched_class != p->sched_class)
				5961	goto out_unlock;
				5962
				5963	if (task_running(p_rq, p) \|\| p->state)
				5964	goto out_unlock;
				5965
				5966	yielded = curr->sched_class->yield_to_task(rq, p, preempt);
				5967	if (yielded) {
				5968	schedstat_inc(rq->yld_count);
				5969	/*
				5970	* Make p's CPU reschedule; pick_next_entity takes care of
				5971	* fairness.
				5972	*/
				5973	if (preempt && rq != p_rq)
				5974	resched_curr(p_rq);
				5975	}
				5976
				5977	out_unlock:
				5978	double_rq_unlock(rq, p_rq);
				5979	out_irq:
				5980	local_irq_restore(flags);
				5981
				5982	if (yielded > 0)
				5983	schedule();
				5984
				5985	return yielded;
				5986	}
				5987	EXPORT_SYMBOL_GPL(yield_to);
				5988
				5989	int io_schedule_prepare(void)
				5990	{
				5991	int old_iowait = current->in_iowait;
				5992
				5993	current->in_iowait = 1;
				5994	blk_schedule_flush_plug(current);
				5995
				5996	return old_iowait;
				5997	}
				5998
				5999	void io_schedule_finish(int token)
				6000	{
				6001	current->in_iowait = token;
				6002	}
				6003
				6004	/*
				6005	* This task is about to go to sleep on IO. Increment rq->nr_iowait so
				6006	* that process accounting knows that this is a task in IO wait state.
				6007	*/
				6008	long __sched io_schedule_timeout(long timeout)
				6009	{
				6010	int token;
				6011	long ret;
				6012
				6013	token = io_schedule_prepare();
				6014	ret = schedule_timeout(timeout);
				6015	io_schedule_finish(token);
				6016
				6017	return ret;
				6018	}
				6019	EXPORT_SYMBOL(io_schedule_timeout);
				6020
				6021	void __sched io_schedule(void)
				6022	{
				6023	int token;
				6024
				6025	token = io_schedule_prepare();
				6026	schedule();
				6027	io_schedule_finish(token);
				6028	}
				6029	EXPORT_SYMBOL(io_schedule);
				6030
				6031	/**
				6032	* sys_sched_get_priority_max - return maximum RT priority.
				6033	* @policy: scheduling class.
				6034	*
				6035	* Return: On success, this syscall returns the maximum
				6036	* rt_priority that can be used by a given scheduling class.
				6037	* On failure, a negative error code is returned.
				6038	*/
				6039	SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
				6040	{
				6041	int ret = -EINVAL;
				6042
				6043	switch (policy) {
				6044	case SCHED_FIFO:
				6045	case SCHED_RR:
				6046	ret = MAX_USER_RT_PRIO-1;
				6047	break;
				6048	case SCHED_DEADLINE:
				6049	case SCHED_NORMAL:
				6050	case SCHED_BATCH:
				6051	case SCHED_IDLE:
				6052	ret = 0;
				6053	break;
				6054	}
				6055	return ret;
				6056	}
				6057
				6058	/**
				6059	* sys_sched_get_priority_min - return minimum RT priority.
				6060	* @policy: scheduling class.
				6061	*
				6062	* Return: On success, this syscall returns the minimum
				6063	* rt_priority that can be used by a given scheduling class.
				6064	* On failure, a negative error code is returned.
				6065	*/
				6066	SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
				6067	{
				6068	int ret = -EINVAL;
				6069
				6070	switch (policy) {
				6071	case SCHED_FIFO:
				6072	case SCHED_RR:
				6073	ret = 1;
				6074	break;
				6075	case SCHED_DEADLINE:
				6076	case SCHED_NORMAL:
				6077	case SCHED_BATCH:
				6078	case SCHED_IDLE:
				6079	ret = 0;
				6080	}
				6081	return ret;
				6082	}
				6083
				6084	static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
				6085	{
				6086	struct task_struct *p;
				6087	unsigned int time_slice;
				6088	struct rq_flags rf;
				6089	struct rq *rq;
				6090	int retval;
				6091
				6092	if (pid < 0)
				6093	return -EINVAL;
				6094
				6095	retval = -ESRCH;
				6096	rcu_read_lock();
				6097	p = find_process_by_pid(pid);
				6098	if (!p)
				6099	goto out_unlock;
				6100
				6101	retval = security_task_getscheduler(p);
				6102	if (retval)
				6103	goto out_unlock;
				6104
				6105	rq = task_rq_lock(p, &rf);
				6106	time_slice = 0;
				6107	if (p->sched_class->get_rr_interval)
				6108	time_slice = p->sched_class->get_rr_interval(rq, p);
				6109	task_rq_unlock(rq, p, &rf);
				6110
				6111	rcu_read_unlock();
				6112	jiffies_to_timespec64(time_slice, t);
				6113	return 0;
				6114
				6115	out_unlock:
				6116	rcu_read_unlock();
				6117	return retval;
				6118	}
				6119
				6120	/**
				6121	* sys_sched_rr_get_interval - return the default timeslice of a process.
				6122	* @pid: pid of the process.
				6123	* @interval: userspace pointer to the timeslice value.
				6124	*
				6125	* this syscall writes the default timeslice value of a given process
				6126	* into the user-space timespec buffer. A value of '0' means infinity.
				6127	*
				6128	* Return: On success, 0 and the timeslice is in @interval. Otherwise,
				6129	* an error code.
				6130	*/
				6131	SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
				6132	struct __kernel_timespec __user *, interval)
				6133	{
				6134	struct timespec64 t;
				6135	int retval = sched_rr_get_interval(pid, &t);
				6136
				6137	if (retval == 0)
				6138	retval = put_timespec64(&t, interval);
				6139
				6140	return retval;
				6141	}
				6142
				6143	#ifdef CONFIG_COMPAT_32BIT_TIME
				6144	SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
				6145	struct old_timespec32 __user *, interval)
				6146	{
				6147	struct timespec64 t;
				6148	int retval = sched_rr_get_interval(pid, &t);
				6149
				6150	if (retval == 0)
				6151	retval = put_old_timespec32(&t, interval);
				6152	return retval;
				6153	}
				6154	#endif
				6155
				6156	void sched_show_task(struct task_struct *p)
				6157	{
				6158	unsigned long free = 0;
				6159	int ppid;
				6160
				6161	if (!try_get_task_stack(p))
				6162	return;
				6163
				6164	printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
				6165
				6166	if (p->state == TASK_RUNNING)
				6167	printk(KERN_CONT " running task ");
				6168	#ifdef CONFIG_DEBUG_STACK_USAGE
				6169	free = stack_not_used(p);
				6170	#endif
				6171	ppid = 0;
				6172	rcu_read_lock();
				6173	if (pid_alive(p))
				6174	ppid = task_pid_nr(rcu_dereference(p->real_parent));
				6175	rcu_read_unlock();
				6176	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
				6177	task_pid_nr(p), ppid,
				6178	(unsigned long)task_thread_info(p)->flags);
				6179
				6180	print_worker_info(KERN_INFO, p);
				6181	trace_android_vh_sched_show_task(p);
				6182	show_stack(p, NULL);
				6183	put_task_stack(p);
				6184	}
				6185	EXPORT_SYMBOL_GPL(sched_show_task);
				6186
				6187	static inline bool
				6188	state_filter_match(unsigned long state_filter, struct task_struct *p)
				6189	{
				6190	/* no filter, everything matches */
				6191	if (!state_filter)
				6192	return true;
				6193
				6194	/* filter, but doesn't match */
				6195	if (!(p->state & state_filter))
				6196	return false;
				6197
				6198	/*
				6199	* When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
				6200	* TASK_KILLABLE).
				6201	*/
				6202	if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
				6203	return false;
				6204
				6205	return true;
				6206	}
				6207
				6208
				6209	void show_state_filter(unsigned long state_filter)
				6210	{
				6211	struct task_struct g, p;
				6212
				6213	#if BITS_PER_LONG == 32
				6214	printk(KERN_INFO
				6215	" task PC stack pid father\n");
				6216	#else
				6217	printk(KERN_INFO
				6218	" task PC stack pid father\n");
				6219	#endif
				6220	rcu_read_lock();
				6221	for_each_process_thread(g, p) {
				6222	/*
				6223	* reset the NMI-timeout, listing all files on a slow
				6224	* console might take a lot of time:
				6225	* Also, reset softlockup watchdogs on all CPUs, because
				6226	* another CPU might be blocked waiting for us to process
				6227	* an IPI.
				6228	*/
				6229	touch_nmi_watchdog();
				6230	touch_all_softlockup_watchdogs();
				6231	if (state_filter_match(state_filter, p))
				6232	sched_show_task(p);
				6233	}
				6234
				6235	#ifdef CONFIG_SCHED_DEBUG
				6236	if (!state_filter)
				6237	sysrq_sched_debug_show();
				6238	#endif
				6239	rcu_read_unlock();
				6240	/*
				6241	* Only show locks if all tasks are dumped:
				6242	*/
				6243	if (!state_filter)
				6244	debug_show_all_locks();
				6245	}
				6246
				6247	/**
				6248	* init_idle - set up an idle thread for a given CPU
				6249	* @idle: task in question
				6250	* @cpu: CPU the idle task belongs to
				6251	*
				6252	* NOTE: this function does not set the idle thread's NEED_RESCHED
				6253	* flag, to make booting more robust.
				6254	*/
				6255	void init_idle(struct task_struct *idle, int cpu)
				6256	{
				6257	struct rq *rq = cpu_rq(cpu);
				6258	unsigned long flags;
				6259
				6260	__sched_fork(0, idle);
				6261
				6262	raw_spin_lock_irqsave(&idle->pi_lock, flags);
				6263	raw_spin_lock(&rq->lock);
				6264
				6265	idle->state = TASK_RUNNING;
				6266	idle->se.exec_start = sched_clock();
				6267	idle->flags \|= PF_IDLE;
				6268
				6269	scs_task_reset(idle);
				6270	kasan_unpoison_task_stack(idle);
				6271
				6272	#ifdef CONFIG_SMP
				6273	/*
				6274	* Its possible that init_idle() gets called multiple times on a task,
				6275	* in that case do_set_cpus_allowed() will not do the right thing.
				6276	*
				6277	* And since this is boot we can forgo the serialization.
				6278	*/
				6279	set_cpus_allowed_common(idle, cpumask_of(cpu));
				6280	#endif
				6281	/*
				6282	* We're having a chicken and egg problem, even though we are
				6283	* holding rq->lock, the CPU isn't yet set to this CPU so the
				6284	* lockdep check in task_group() will fail.
				6285	*
				6286	* Similar case to sched_fork(). / Alternatively we could
				6287	* use task_rq_lock() here and obtain the other rq->lock.
				6288	*
				6289	* Silence PROVE_RCU
				6290	*/
				6291	rcu_read_lock();
				6292	__set_task_cpu(idle, cpu);
				6293	rcu_read_unlock();
				6294
				6295	rq->idle = idle;
				6296	rcu_assign_pointer(rq->curr, idle);
				6297	idle->on_rq = TASK_ON_RQ_QUEUED;
				6298	#ifdef CONFIG_SMP
				6299	idle->on_cpu = 1;
				6300	#endif
				6301	raw_spin_unlock(&rq->lock);
				6302	raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
				6303
				6304	/* Set the preempt count _outside_ the spinlocks! */
				6305	init_idle_preempt_count(idle, cpu);
				6306
				6307	/*
				6308	* The idle tasks have their own, simple scheduling class:
				6309	*/
				6310	idle->sched_class = &idle_sched_class;
				6311	ftrace_graph_init_idle_task(idle, cpu);
				6312	vtime_init_idle(idle, cpu);
				6313	#ifdef CONFIG_SMP
				6314	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
				6315	#endif
				6316	}
				6317
				6318	#ifdef CONFIG_SMP
				6319
				6320	int cpuset_cpumask_can_shrink(const struct cpumask *cur,
				6321	const struct cpumask *trial)
				6322	{
				6323	int ret = 1;
				6324
				6325	if (!cpumask_weight(cur))
				6326	return ret;
				6327
				6328	ret = dl_cpuset_cpumask_can_shrink(cur, trial);
				6329
				6330	return ret;
				6331	}
				6332
				6333	int task_can_attach(struct task_struct *p,
				6334	const struct cpumask *cs_cpus_allowed)
				6335	{
				6336	int ret = 0;
				6337
				6338	/*
				6339	* Kthreads which disallow setaffinity shouldn't be moved
				6340	* to a new cpuset; we don't want to change their CPU
				6341	* affinity and isolating such threads by their set of
				6342	* allowed nodes is unnecessary. Thus, cpusets are not
				6343	* applicable for such threads. This prevents checking for
				6344	* success of set_cpus_allowed_ptr() on all attached tasks
				6345	* before cpus_mask may be changed.
				6346	*/
				6347	if (p->flags & PF_NO_SETAFFINITY) {
				6348	ret = -EINVAL;
				6349	goto out;
				6350	}
				6351
				6352	if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
				6353	cs_cpus_allowed))
				6354	ret = dl_task_can_attach(p, cs_cpus_allowed);
				6355
				6356	out:
				6357	return ret;
				6358	}
				6359
				6360	bool sched_smp_initialized __read_mostly;
				6361
				6362	#ifdef CONFIG_NUMA_BALANCING
				6363	/* Migrate current task p to target_cpu */
				6364	int migrate_task_to(struct task_struct *p, int target_cpu)
				6365	{
				6366	struct migration_arg arg = { p, target_cpu };
				6367	int curr_cpu = task_cpu(p);
				6368
				6369	if (curr_cpu == target_cpu)
				6370	return 0;
				6371
				6372	if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
				6373	return -EINVAL;
				6374
				6375	/* TODO: This is not properly updating schedstats */
				6376
				6377	trace_sched_move_numa(p, curr_cpu, target_cpu);
				6378	return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
				6379	}
				6380
				6381	/*
				6382	* Requeue a task on a given node and accurately track the number of NUMA
				6383	* tasks on the runqueues
				6384	*/
				6385	void sched_setnuma(struct task_struct *p, int nid)
				6386	{
				6387	bool queued, running;
				6388	struct rq_flags rf;
				6389	struct rq *rq;
				6390
				6391	rq = task_rq_lock(p, &rf);
				6392	queued = task_on_rq_queued(p);
				6393	running = task_current(rq, p);
				6394
				6395	if (queued)
				6396	dequeue_task(rq, p, DEQUEUE_SAVE);
				6397	if (running)
				6398	put_prev_task(rq, p);
				6399
				6400	p->numa_preferred_nid = nid;
				6401
				6402	if (queued)
				6403	enqueue_task(rq, p, ENQUEUE_RESTORE \| ENQUEUE_NOCLOCK);
				6404	if (running)
				6405	set_next_task(rq, p);
				6406	task_rq_unlock(rq, p, &rf);
				6407	}
				6408	#endif /* CONFIG_NUMA_BALANCING */
				6409
				6410	#ifdef CONFIG_HOTPLUG_CPU
				6411	/*
				6412	* Ensure that the idle task is using init_mm right before its CPU goes
				6413	* offline.
				6414	*/
				6415	void idle_task_exit(void)
				6416	{
				6417	struct mm_struct *mm = current->active_mm;
				6418
				6419	BUG_ON(cpu_online(smp_processor_id()));
				6420	BUG_ON(current != this_rq()->idle);
				6421
				6422	if (mm != &init_mm) {
				6423	switch_mm(mm, &init_mm, current);
				6424	finish_arch_post_lock_switch();
				6425	}
				6426
				6427	/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
				6428	}
				6429
				6430	/*
				6431	* Since this CPU is going 'away' for a while, fold any nr_active delta
				6432	* we might have. Assumes we're called after migrate_tasks() so that the
				6433	* nr_active count is stable. We need to take the teardown thread which
				6434	* is calling this into account, so we hand in adjust = 1 to the load
				6435	* calculation.
				6436	*
				6437	* Also see the comment "Global load-average calculations".
				6438	*/
				6439	static void calc_load_migrate(struct rq *rq)
				6440	{
				6441	long delta = calc_load_fold_active(rq, 1);
				6442	if (delta)
				6443	atomic_long_add(delta, &calc_load_tasks);
				6444	}
				6445
				6446	static struct task_struct __pick_migrate_task(struct rq rq)
				6447	{
				6448	const struct sched_class *class;
				6449	struct task_struct *next;
				6450
				6451	for_each_class(class) {
				6452	next = class->pick_next_task(rq, NULL, NULL);
				6453	if (next) {
				6454	next->sched_class->put_prev_task(rq, next);
				6455	return next;
				6456	}
				6457	}
				6458
				6459	/* The idle class should always have a runnable task */
				6460	BUG();
				6461	}
				6462
				6463	/*
				6464	* Migrate all tasks from the rq, sleeping tasks will be migrated by
				6465	* try_to_wake_up()->select_task_rq().
				6466	*
				6467	* Called with rq->lock held even though we'er in stop_machine() and
				6468	* there's no concurrency possible, we hold the required locks anyway
				6469	* because of lock validation efforts.
				6470	*/
				6471	static void migrate_tasks(struct rq dead_rq, struct rq_flags rf)
				6472	{
				6473	struct rq *rq = dead_rq;
				6474	struct task_struct next, stop = rq->stop;
				6475	struct rq_flags orf = *rf;
				6476	int dest_cpu;
				6477
				6478	/*
				6479	* Fudge the rq selection such that the below task selection loop
				6480	* doesn't get stuck on the currently eligible stop task.
				6481	*
				6482	* We're currently inside stop_machine() and the rq is either stuck
				6483	* in the stop_machine_cpu_stop() loop, or we're executing this code,
				6484	* either way we should never end up calling schedule() until we're
				6485	* done here.
				6486	*/
				6487	rq->stop = NULL;
				6488
				6489	/*
				6490	* put_prev_task() and pick_next_task() sched
				6491	* class method both need to have an up-to-date
				6492	* value of rq->clock[_task]
				6493	*/
				6494	update_rq_clock(rq);
				6495
				6496	for (;;) {
				6497	/*
				6498	* There's this thread running, bail when that's the only
				6499	* remaining thread:
				6500	*/
				6501	if (rq->nr_running == 1)
				6502	break;
				6503
				6504	next = __pick_migrate_task(rq);
				6505
				6506	/*
				6507	* Rules for changing task_struct::cpus_mask are holding
				6508	* both pi_lock and rq->lock, such that holding either
				6509	* stabilizes the mask.
				6510	*
				6511	* Drop rq->lock is not quite as disastrous as it usually is
				6512	* because !cpu_active at this point, which means load-balance
				6513	* will not interfere. Also, stop-machine.
				6514	*/
				6515	rq_unlock(rq, rf);
				6516	raw_spin_lock(&next->pi_lock);
				6517	rq_relock(rq, rf);
				6518
				6519	/*
				6520	* Since we're inside stop-machine, _nothing_ should have
				6521	* changed the task, WARN if weird stuff happened, because in
				6522	* that case the above rq->lock drop is a fail too.
				6523	*/
				6524	if (WARN_ON(task_rq(next) != rq \|\| !task_on_rq_queued(next))) {
				6525	raw_spin_unlock(&next->pi_lock);
				6526	continue;
				6527	}
				6528
				6529	/* Find suitable destination for @next, with force if needed. */
				6530	dest_cpu = select_fallback_rq(dead_rq->cpu, next);
				6531	rq = __migrate_task(rq, rf, next, dest_cpu);
				6532	if (rq != dead_rq) {
				6533	rq_unlock(rq, rf);
				6534	rq = dead_rq;
				6535	*rf = orf;
				6536	rq_relock(rq, rf);
				6537	}
				6538	raw_spin_unlock(&next->pi_lock);
				6539	}
				6540
				6541	rq->stop = stop;
				6542	}
				6543	#endif /* CONFIG_HOTPLUG_CPU */
				6544
				6545	void set_rq_online(struct rq *rq)
				6546	{
				6547	if (!rq->online) {
				6548	const struct sched_class *class;
				6549
				6550	cpumask_set_cpu(rq->cpu, rq->rd->online);
				6551	rq->online = 1;
				6552
				6553	for_each_class(class) {
				6554	if (class->rq_online)
				6555	class->rq_online(rq);
				6556	}
				6557	}
				6558	}
				6559
				6560	void set_rq_offline(struct rq *rq)
				6561	{
				6562	if (rq->online) {
				6563	const struct sched_class *class;
				6564
				6565	for_each_class(class) {
				6566	if (class->rq_offline)
				6567	class->rq_offline(rq);
				6568	}
				6569
				6570	cpumask_clear_cpu(rq->cpu, rq->rd->online);
				6571	rq->online = 0;
				6572	}
				6573	}
				6574
				6575	/*
				6576	* used to mark begin/end of suspend/resume:
				6577	*/
				6578	static int num_cpus_frozen;
				6579
				6580	/*
				6581	* Update cpusets according to cpu_active mask. If cpusets are
				6582	* disabled, cpuset_update_active_cpus() becomes a simple wrapper
				6583	* around partition_sched_domains().
				6584	*
				6585	* If we come here as part of a suspend/resume, don't touch cpusets because we
				6586	* want to restore it back to its original state upon resume anyway.
				6587	*/
				6588	static void cpuset_cpu_active(void)
				6589	{
				6590	if (cpuhp_tasks_frozen) {
				6591	/*
				6592	* num_cpus_frozen tracks how many CPUs are involved in suspend
				6593	* resume sequence. As long as this is not the last online
				6594	* operation in the resume sequence, just build a single sched
				6595	* domain, ignoring cpusets.
				6596	*/
				6597	partition_sched_domains(1, NULL, NULL);
				6598	if (--num_cpus_frozen)
				6599	return;
				6600	/*
				6601	* This is the last CPU online operation. So fall through and
				6602	* restore the original sched domains by considering the
				6603	* cpuset configurations.
				6604	*/
				6605	cpuset_force_rebuild();
				6606	}
				6607	cpuset_update_active_cpus();
				6608	}
				6609
				6610	static int cpuset_cpu_inactive(unsigned int cpu)
				6611	{
				6612	if (!cpuhp_tasks_frozen) {
				6613	if (dl_cpu_busy(cpu))
				6614	return -EBUSY;
				6615	cpuset_update_active_cpus();
				6616	} else {
				6617	num_cpus_frozen++;
				6618	partition_sched_domains(1, NULL, NULL);
				6619	}
				6620	return 0;
				6621	}
				6622
				6623	int sched_cpu_activate(unsigned int cpu)
				6624	{
				6625	struct rq *rq = cpu_rq(cpu);
				6626	struct rq_flags rf;
				6627
				6628	#ifdef CONFIG_SCHED_SMT
				6629	/*
				6630	* When going up, increment the number of cores with SMT present.
				6631	*/
				6632	if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
				6633	static_branch_inc_cpuslocked(&sched_smt_present);
				6634	#endif
				6635	set_cpu_active(cpu, true);
				6636
				6637	if (sched_smp_initialized) {
				6638	sched_domains_numa_masks_set(cpu);
				6639	cpuset_cpu_active();
				6640	}
				6641
				6642	/*
				6643	* Put the rq online, if not already. This happens:
				6644	*
				6645	* 1) In the early boot process, because we build the real domains
				6646	* after all CPUs have been brought up.
				6647	*
				6648	* 2) At runtime, if cpuset_cpu_active() fails to rebuild the
				6649	* domains.
				6650	*/
				6651	rq_lock_irqsave(rq, &rf);
				6652	if (rq->rd) {
				6653	BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
				6654	set_rq_online(rq);
				6655	}
				6656	rq_unlock_irqrestore(rq, &rf);
				6657
				6658	return 0;
				6659	}
				6660
				6661	int sched_cpu_deactivate(unsigned int cpu)
				6662	{
				6663	int ret;
				6664
				6665	set_cpu_active(cpu, false);
				6666	/*
				6667	* We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
				6668	* users of this state to go away such that all new such users will
				6669	* observe it.
				6670	*
				6671	* Do sync before park smpboot threads to take care the rcu boost case.
				6672	*/
				6673	synchronize_rcu();
				6674
				6675	#ifdef CONFIG_SCHED_SMT
				6676	/*
				6677	* When going down, decrement the number of cores with SMT present.
				6678	*/
				6679	if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
				6680	static_branch_dec_cpuslocked(&sched_smt_present);
				6681	#endif
				6682
				6683	if (!sched_smp_initialized)
				6684	return 0;
				6685
				6686	ret = cpuset_cpu_inactive(cpu);
				6687	if (ret) {
				6688	set_cpu_active(cpu, true);
				6689	return ret;
				6690	}
				6691	sched_domains_numa_masks_clear(cpu);
				6692	return 0;
				6693	}
				6694
				6695	static void sched_rq_cpu_starting(unsigned int cpu)
				6696	{
				6697	struct rq *rq = cpu_rq(cpu);
				6698
				6699	rq->calc_load_update = calc_load_update;
				6700	update_max_interval();
				6701	}
				6702
				6703	int sched_cpu_starting(unsigned int cpu)
				6704	{
				6705	sched_rq_cpu_starting(cpu);
				6706	sched_tick_start(cpu);
				6707	return 0;
				6708	}
				6709
				6710	#ifdef CONFIG_HOTPLUG_CPU
				6711	int sched_cpu_dying(unsigned int cpu)
				6712	{
				6713	struct rq *rq = cpu_rq(cpu);
				6714	struct rq_flags rf;
				6715
				6716	/* Handle pending wakeups and then migrate everything off */
				6717	sched_ttwu_pending();
				6718	sched_tick_stop(cpu);
				6719
				6720	rq_lock_irqsave(rq, &rf);
				6721	if (rq->rd) {
				6722	BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
				6723	set_rq_offline(rq);
				6724	}
				6725	migrate_tasks(rq, &rf);
				6726	BUG_ON(rq->nr_running != 1);
				6727	rq_unlock_irqrestore(rq, &rf);
				6728
				6729	calc_load_migrate(rq);
				6730	update_max_interval();
				6731	nohz_balance_exit_idle(rq);
				6732	hrtick_clear(rq);
				6733	return 0;
				6734	}
				6735	#endif
				6736
				6737	void __init sched_init_smp(void)
				6738	{
				6739	sched_init_numa();
				6740
				6741	/*
				6742	* There's no userspace yet to cause hotplug operations; hence all the
				6743	* CPU masks are stable and all blatant races in the below code cannot
				6744	* happen.
				6745	*/
				6746	mutex_lock(&sched_domains_mutex);
				6747	sched_init_domains(cpu_active_mask);
				6748	mutex_unlock(&sched_domains_mutex);
				6749
				6750	/* Move init over to a non-isolated CPU */
				6751	if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
				6752	BUG();
				6753	sched_init_granularity();
				6754
				6755	init_sched_rt_class();
				6756	init_sched_dl_class();
				6757
				6758	sched_smp_initialized = true;
				6759	}
				6760
				6761	static int __init migration_init(void)
				6762	{
				6763	sched_cpu_starting(smp_processor_id());
				6764	return 0;
				6765	}
				6766	early_initcall(migration_init);
				6767
				6768	#else
				6769	void __init sched_init_smp(void)
				6770	{
				6771	sched_init_granularity();
				6772	}
				6773	#endif /* CONFIG_SMP */
				6774
				6775	int in_sched_functions(unsigned long addr)
				6776	{
				6777	return in_lock_functions(addr) \|\|
				6778	(addr >= (unsigned long)__sched_text_start
				6779	&& addr < (unsigned long)__sched_text_end);
				6780	}
				6781
				6782	#ifdef CONFIG_CGROUP_SCHED
				6783	/*
				6784	* Default task group.
				6785	* Every task in system belongs to this group at bootup.
				6786	*/
				6787	struct task_group root_task_group;
				6788	LIST_HEAD(task_groups);
				6789
				6790	/* Cacheline aligned slab cache for task_group */
				6791	static struct kmem_cache *task_group_cache __read_mostly;
				6792	#endif
				6793
				6794	DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
				6795	DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
				6796
				6797	void __init sched_init(void)
				6798	{
				6799	unsigned long ptr = 0;
				6800	int i;
				6801
				6802	wait_bit_init();
				6803
				6804	#ifdef CONFIG_FAIR_GROUP_SCHED
				6805	ptr += 2 * nr_cpu_ids * sizeof(void **);
				6806	#endif
				6807	#ifdef CONFIG_RT_GROUP_SCHED
				6808	ptr += 2 * nr_cpu_ids * sizeof(void **);
				6809	#endif
				6810	if (ptr) {
				6811	ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
				6812
				6813	#ifdef CONFIG_FAIR_GROUP_SCHED
				6814	root_task_group.se = (struct sched_entity **)ptr;
				6815	ptr += nr_cpu_ids * sizeof(void **);
				6816
				6817	root_task_group.cfs_rq = (struct cfs_rq **)ptr;
				6818	ptr += nr_cpu_ids * sizeof(void **);
				6819
				6820	#endif /* CONFIG_FAIR_GROUP_SCHED */
				6821	#ifdef CONFIG_RT_GROUP_SCHED
				6822	root_task_group.rt_se = (struct sched_rt_entity **)ptr;
				6823	ptr += nr_cpu_ids * sizeof(void **);
				6824
				6825	root_task_group.rt_rq = (struct rt_rq **)ptr;
				6826	ptr += nr_cpu_ids * sizeof(void **);
				6827
				6828	#endif /* CONFIG_RT_GROUP_SCHED */
				6829	}
				6830	#ifdef CONFIG_CPUMASK_OFFSTACK
				6831	for_each_possible_cpu(i) {
				6832	per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
				6833	cpumask_size(), GFP_KERNEL, cpu_to_node(i));
				6834	per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
				6835	cpumask_size(), GFP_KERNEL, cpu_to_node(i));
				6836	}
				6837	#endif /* CONFIG_CPUMASK_OFFSTACK */
				6838
				6839	init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
				6840	init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());
				6841
				6842	#ifdef CONFIG_SMP
				6843	init_defrootdomain();
				6844	#endif
				6845
				6846	#ifdef CONFIG_RT_GROUP_SCHED
				6847	init_rt_bandwidth(&root_task_group.rt_bandwidth,
				6848	global_rt_period(), global_rt_runtime());
				6849	#endif /* CONFIG_RT_GROUP_SCHED */
				6850
				6851	#ifdef CONFIG_CGROUP_SCHED
				6852	task_group_cache = KMEM_CACHE(task_group, 0);
				6853
				6854	list_add(&root_task_group.list, &task_groups);
				6855	INIT_LIST_HEAD(&root_task_group.children);
				6856	INIT_LIST_HEAD(&root_task_group.siblings);
				6857	autogroup_init(&init_task);
				6858	#endif /* CONFIG_CGROUP_SCHED */
				6859
				6860	for_each_possible_cpu(i) {
				6861	struct rq *rq;
				6862
				6863	rq = cpu_rq(i);
				6864	raw_spin_lock_init(&rq->lock);
				6865	rq->nr_running = 0;
				6866	rq->calc_load_active = 0;
				6867	rq->calc_load_update = jiffies + LOAD_FREQ;
				6868	init_cfs_rq(&rq->cfs);
				6869	init_rt_rq(&rq->rt);
				6870	init_dl_rq(&rq->dl);
				6871	#ifdef CONFIG_FAIR_GROUP_SCHED
				6872	root_task_group.shares = ROOT_TASK_GROUP_LOAD;
				6873	INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
				6874	rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
				6875	/*
				6876	* How much CPU bandwidth does root_task_group get?
				6877	*
				6878	* In case of task-groups formed thr' the cgroup filesystem, it
				6879	* gets 100% of the CPU resources in the system. This overall
				6880	* system CPU resource is divided among the tasks of
				6881	* root_task_group and its child task-groups in a fair manner,
				6882	* based on each entity's (task or task-group's) weight
				6883	* (se->load.weight).
				6884	*
				6885	* In other words, if root_task_group has 10 tasks of weight
				6886	* 1024) and two child groups A0 and A1 (of weight 1024 each),
				6887	* then A0's share of the CPU resource is:
				6888	*
				6889	* A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
				6890	*
				6891	* We achieve this by letting root_task_group's tasks sit
				6892	* directly in rq->cfs (i.e root_task_group->se[] = NULL).
				6893	*/
				6894	init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
				6895	init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
				6896	#endif /* CONFIG_FAIR_GROUP_SCHED */
				6897
				6898	rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
				6899	#ifdef CONFIG_RT_GROUP_SCHED
				6900	init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
				6901	#endif
				6902	#ifdef CONFIG_SMP
				6903	rq->sd = NULL;
				6904	rq->rd = NULL;
				6905	rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
				6906	rq->balance_callback = NULL;
				6907	rq->active_balance = 0;
				6908	rq->next_balance = jiffies;
				6909	rq->push_cpu = 0;
				6910	rq->cpu = i;
				6911	rq->online = 0;
				6912	rq->idle_stamp = 0;
				6913	rq->avg_idle = 2*sysctl_sched_migration_cost;
				6914	rq->max_idle_balance_cost = sysctl_sched_migration_cost;
				6915
				6916	INIT_LIST_HEAD(&rq->cfs_tasks);
				6917
				6918	rq_attach_root(rq, &def_root_domain);
				6919	#ifdef CONFIG_NO_HZ_COMMON
				6920	rq->last_load_update_tick = jiffies;
				6921	rq->last_blocked_load_update_tick = jiffies;
				6922	atomic_set(&rq->nohz_flags, 0);
				6923	#endif
				6924	#endif /* CONFIG_SMP */
				6925	hrtick_rq_init(rq);
				6926	atomic_set(&rq->nr_iowait, 0);
				6927	}
				6928
				6929	set_load_weight(&init_task, false);
				6930
				6931	/*
				6932	* The boot idle thread does lazy MMU switching as well:
				6933	*/
				6934	mmgrab(&init_mm);
				6935	enter_lazy_tlb(&init_mm, current);
				6936
				6937	/*
				6938	* Make us the idle thread. Technically, schedule() should not be
				6939	* called from this thread, however somewhere below it might be,
				6940	* but because we are the idle thread, we just pick up running again
				6941	* when this runqueue becomes "idle".
				6942	*/
				6943	init_idle(current, smp_processor_id());
				6944
				6945	calc_load_update = jiffies + LOAD_FREQ;
				6946
				6947	#ifdef CONFIG_SMP
				6948	idle_thread_set_boot_cpu();
				6949	#endif
				6950	init_sched_fair_class();
				6951
				6952	init_schedstats();
				6953
				6954	psi_init();
				6955
				6956	init_uclamp();
				6957
				6958	scheduler_running = 1;
				6959	}
				6960
				6961	#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
				6962	static inline int preempt_count_equals(int preempt_offset)
				6963	{
				6964	int nested = preempt_count() + rcu_preempt_depth();
				6965
				6966	return (nested == preempt_offset);
				6967	}
				6968
				6969	void __might_sleep(const char *file, int line, int preempt_offset)
				6970	{
				6971	/*
				6972	* Blocking primitives will set (and therefore destroy) current->state,
				6973	* since we will exit with TASK_RUNNING make sure we enter with it,
				6974	* otherwise we will destroy state.
				6975	*/
				6976	WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
				6977	"do not call blocking ops when !TASK_RUNNING; "
				6978	"state=%lx set at [<%p>] %pS\n",
				6979	current->state,
				6980	(void *)current->task_state_change,
				6981	(void *)current->task_state_change);
				6982
				6983	___might_sleep(file, line, preempt_offset);
				6984	}
				6985	EXPORT_SYMBOL(__might_sleep);
				6986
				6987	void ___might_sleep(const char *file, int line, int preempt_offset)
				6988	{
				6989	/* Ratelimiting timestamp: */
				6990	static unsigned long prev_jiffy;
				6991
				6992	unsigned long preempt_disable_ip;
				6993
				6994	/* WARN_ON_ONCE() by default, no rate limit required: */
				6995	rcu_sleep_check();
				6996
				6997	if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
				6998	!is_idle_task(current) && !current->non_block_count) \|\|
				6999	system_state == SYSTEM_BOOTING \|\| system_state > SYSTEM_RUNNING \|\|
				7000	oops_in_progress)
				7001	return;
				7002
				7003	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
				7004	return;
				7005	prev_jiffy = jiffies;
				7006
				7007	/* Save this before calling printk(), since that will clobber it: */
				7008	preempt_disable_ip = get_preempt_disable_ip(current);
				7009
				7010	printk(KERN_ERR
				7011	"BUG: sleeping function called from invalid context at %s:%d\n",
				7012	file, line);
				7013	printk(KERN_ERR
				7014	"in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
				7015	in_atomic(), irqs_disabled(), current->non_block_count,
				7016	current->pid, current->comm);
				7017
				7018	if (task_stack_end_corrupted(current))
				7019	printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
				7020
				7021	debug_show_held_locks(current);
				7022	if (irqs_disabled())
				7023	print_irqtrace_events(current);
				7024	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
				7025	&& !preempt_count_equals(preempt_offset)) {
				7026	pr_err("Preemption disabled at:");
				7027	print_ip_sym(preempt_disable_ip);
				7028	pr_cont("\n");
				7029	}
				7030	dump_stack();
				7031	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
				7032	}
				7033	EXPORT_SYMBOL(___might_sleep);
				7034
				7035	void __cant_sleep(const char *file, int line, int preempt_offset)
				7036	{
				7037	static unsigned long prev_jiffy;
				7038
				7039	if (irqs_disabled())
				7040	return;
				7041
				7042	if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
				7043	return;
				7044
				7045	if (preempt_count() > preempt_offset)
				7046	return;
				7047
				7048	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
				7049	return;
				7050	prev_jiffy = jiffies;
				7051
				7052	printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
				7053	printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
				7054	in_atomic(), irqs_disabled(),
				7055	current->pid, current->comm);
				7056
				7057	debug_show_held_locks(current);
				7058	dump_stack();
				7059	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
				7060	}
				7061	EXPORT_SYMBOL_GPL(__cant_sleep);
				7062	#endif
				7063
				7064	#ifdef CONFIG_MAGIC_SYSRQ
				7065	void normalize_rt_tasks(void)
				7066	{
				7067	struct task_struct g, p;
				7068	struct sched_attr attr = {
				7069	.sched_policy = SCHED_NORMAL,
				7070	};
				7071
				7072	read_lock(&tasklist_lock);
				7073	for_each_process_thread(g, p) {
				7074	/*
				7075	* Only normalize user tasks:
				7076	*/
				7077	if (p->flags & PF_KTHREAD)
				7078	continue;
				7079
				7080	p->se.exec_start = 0;
				7081	schedstat_set(p->se.statistics.wait_start, 0);
				7082	schedstat_set(p->se.statistics.sleep_start, 0);
				7083	schedstat_set(p->se.statistics.block_start, 0);
				7084
				7085	if (!dl_task(p) && !rt_task(p)) {
				7086	/*
				7087	* Renice negative nice level userspace
				7088	* tasks back to 0:
				7089	*/
				7090	if (task_nice(p) < 0)
				7091	set_user_nice(p, 0);
				7092	continue;
				7093	}
				7094
				7095	__sched_setscheduler(p, &attr, false, false);
				7096	}
				7097	read_unlock(&tasklist_lock);
				7098	}
				7099
				7100	#endif /* CONFIG_MAGIC_SYSRQ */
				7101
				7102	#if defined(CONFIG_IA64) \|\| defined(CONFIG_KGDB_KDB)
				7103	/*
				7104	* These functions are only useful for the IA64 MCA handling, or kdb.
				7105	*
				7106	* They can only be called when the whole system has been
				7107	* stopped - every CPU needs to be quiescent, and no scheduling
				7108	* activity can take place. Using them for anything else would
				7109	* be a serious bug, and as a result, they aren't even visible
				7110	* under any other configuration.
				7111	*/
				7112
				7113	/**
				7114	* curr_task - return the current task for a given CPU.
				7115	* @cpu: the processor in question.
				7116	*
				7117	* ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
				7118	*
				7119	* Return: The current task for @cpu.
				7120	*/
				7121	struct task_struct *curr_task(int cpu)
				7122	{
				7123	return cpu_curr(cpu);
				7124	}
				7125
				7126	#endif /* defined(CONFIG_IA64) \|\| defined(CONFIG_KGDB_KDB) */
				7127
				7128	#ifdef CONFIG_IA64
				7129	/**
				7130	* ia64_set_curr_task - set the current task for a given CPU.
				7131	* @cpu: the processor in question.
				7132	* @p: the task pointer to set.
				7133	*
				7134	* Description: This function must only be used when non-maskable interrupts
				7135	* are serviced on a separate stack. It allows the architecture to switch the
				7136	* notion of the current task on a CPU in a non-blocking manner. This function
				7137	* must be called with all CPU's synchronized, and interrupts disabled, the
				7138	* and caller must save the original value of the current task (see
				7139	* curr_task() above) and restore that value before reenabling interrupts and
				7140	* re-starting the system.
				7141	*
				7142	* ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
				7143	*/
				7144	void ia64_set_curr_task(int cpu, struct task_struct *p)
				7145	{
				7146	cpu_curr(cpu) = p;
				7147	}
				7148
				7149	#endif
				7150
				7151	#ifdef CONFIG_CGROUP_SCHED
				7152	/* task_group_lock serializes the addition/removal of task groups */
				7153	static DEFINE_SPINLOCK(task_group_lock);
				7154
				7155	static inline void alloc_uclamp_sched_group(struct task_group *tg,
				7156	struct task_group *parent)
				7157	{
				7158	#ifdef CONFIG_UCLAMP_TASK_GROUP
				7159	enum uclamp_id clamp_id;
				7160
				7161	for_each_clamp_id(clamp_id) {
				7162	uclamp_se_set(&tg->uclamp_req[clamp_id],
				7163	uclamp_none(clamp_id), false);
				7164	tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
				7165	}
				7166	#endif
				7167	}
				7168
				7169	static void sched_free_group(struct task_group *tg)
				7170	{
				7171	free_fair_sched_group(tg);
				7172	free_rt_sched_group(tg);
				7173	autogroup_free(tg);
				7174	kmem_cache_free(task_group_cache, tg);
				7175	}
				7176
				7177	/* allocate runqueue etc for a new task group */
				7178	struct task_group sched_create_group(struct task_group parent)
				7179	{
				7180	struct task_group *tg;
				7181
				7182	tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL \| __GFP_ZERO);
				7183	if (!tg)
				7184	return ERR_PTR(-ENOMEM);
				7185
				7186	if (!alloc_fair_sched_group(tg, parent))
				7187	goto err;
				7188
				7189	if (!alloc_rt_sched_group(tg, parent))
				7190	goto err;
				7191
				7192	alloc_uclamp_sched_group(tg, parent);
				7193
				7194	return tg;
				7195
				7196	err:
				7197	sched_free_group(tg);
				7198	return ERR_PTR(-ENOMEM);
				7199	}
				7200
				7201	void sched_online_group(struct task_group tg, struct task_group parent)
				7202	{
				7203	unsigned long flags;
				7204
				7205	spin_lock_irqsave(&task_group_lock, flags);
				7206	list_add_rcu(&tg->list, &task_groups);
				7207
				7208	/* Root should already exist: */
				7209	WARN_ON(!parent);
				7210
				7211	tg->parent = parent;
				7212	INIT_LIST_HEAD(&tg->children);
				7213	list_add_rcu(&tg->siblings, &parent->children);
				7214	spin_unlock_irqrestore(&task_group_lock, flags);
				7215
				7216	online_fair_sched_group(tg);
				7217	}
				7218
				7219	/* rcu callback to free various structures associated with a task group */
				7220	static void sched_free_group_rcu(struct rcu_head *rhp)
				7221	{
				7222	/* Now it should be safe to free those cfs_rqs: */
				7223	sched_free_group(container_of(rhp, struct task_group, rcu));
				7224	}
				7225
				7226	void sched_destroy_group(struct task_group *tg)
				7227	{
				7228	/* Wait for possible concurrent references to cfs_rqs complete: */
				7229	call_rcu(&tg->rcu, sched_free_group_rcu);
				7230	}
				7231
				7232	void sched_offline_group(struct task_group *tg)
				7233	{
				7234	unsigned long flags;
				7235
				7236	/* End participation in shares distribution: */
				7237	unregister_fair_sched_group(tg);
				7238
				7239	spin_lock_irqsave(&task_group_lock, flags);
				7240	list_del_rcu(&tg->list);
				7241	list_del_rcu(&tg->siblings);
				7242	spin_unlock_irqrestore(&task_group_lock, flags);
				7243	}
				7244
				7245	static void sched_change_group(struct task_struct *tsk, int type)
				7246	{
				7247	struct task_group *tg;
				7248
				7249	/*
				7250	* All callers are synchronized by task_rq_lock(); we do not use RCU
				7251	* which is pointless here. Thus, we pass "true" to task_css_check()
				7252	* to prevent lockdep warnings.
				7253	*/
				7254	tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
				7255	struct task_group, css);
				7256	tg = autogroup_task_group(tsk, tg);
				7257	tsk->sched_task_group = tg;
				7258
				7259	#ifdef CONFIG_FAIR_GROUP_SCHED
				7260	if (tsk->sched_class->task_change_group)
				7261	tsk->sched_class->task_change_group(tsk, type);
				7262	else
				7263	#endif
				7264	set_task_rq(tsk, task_cpu(tsk));
				7265	}
				7266
				7267	/*
				7268	* Change task's runqueue when it moves between groups.
				7269	*
				7270	* The caller of this function should have put the task in its new group by
				7271	* now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
				7272	* its new group.
				7273	*/
				7274	void sched_move_task(struct task_struct *tsk)
				7275	{
				7276	int queued, running, queue_flags =
				7277	DEQUEUE_SAVE \| DEQUEUE_MOVE \| DEQUEUE_NOCLOCK;
				7278	struct rq_flags rf;
				7279	struct rq *rq;
				7280
				7281	rq = task_rq_lock(tsk, &rf);
				7282	update_rq_clock(rq);
				7283
				7284	running = task_current(rq, tsk);
				7285	queued = task_on_rq_queued(tsk);
				7286
				7287	if (queued)
				7288	dequeue_task(rq, tsk, queue_flags);
				7289	if (running)
				7290	put_prev_task(rq, tsk);
				7291
				7292	sched_change_group(tsk, TASK_MOVE_GROUP);
				7293
				7294	if (queued)
				7295	enqueue_task(rq, tsk, queue_flags);
				7296	if (running) {
				7297	set_next_task(rq, tsk);
				7298	/*
				7299	* After changing group, the running task may have joined a
				7300	* throttled one but it's still the running task. Trigger a
				7301	* resched to make sure that task can still run.
				7302	*/
				7303	resched_curr(rq);
				7304	}
				7305
				7306	task_rq_unlock(rq, tsk, &rf);
				7307	}
				7308
				7309	static inline struct task_group css_tg(struct cgroup_subsys_state css)
				7310	{
				7311	return css ? container_of(css, struct task_group, css) : NULL;
				7312	}
				7313
				7314	static struct cgroup_subsys_state *
				7315	cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
				7316	{
				7317	struct task_group *parent = css_tg(parent_css);
				7318	struct task_group *tg;
				7319
				7320	if (!parent) {
				7321	/* This is early initialization for the top cgroup */
				7322	return &root_task_group.css;
				7323	}
				7324
				7325	tg = sched_create_group(parent);
				7326	if (IS_ERR(tg))
				7327	return ERR_PTR(-ENOMEM);
				7328
				7329	return &tg->css;
				7330	}
				7331
				7332	/* Expose task group only after completing cgroup initialization */
				7333	static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
				7334	{
				7335	struct task_group *tg = css_tg(css);
				7336	struct task_group *parent = css_tg(css->parent);
				7337
				7338	if (parent)
				7339	sched_online_group(tg, parent);
				7340
				7341	#ifdef CONFIG_UCLAMP_TASK_GROUP
				7342	/* Propagate the effective uclamp value for the new group */
				7343	mutex_lock(&uclamp_mutex);
				7344	rcu_read_lock();
				7345	cpu_util_update_eff(css);
				7346	rcu_read_unlock();
				7347	mutex_unlock(&uclamp_mutex);
				7348	#endif
				7349
				7350	return 0;
				7351	}
				7352
				7353	static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
				7354	{
				7355	struct task_group *tg = css_tg(css);
				7356
				7357	sched_offline_group(tg);
				7358	}
				7359
				7360	static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
				7361	{
				7362	struct task_group *tg = css_tg(css);
				7363
				7364	/*
				7365	* Relies on the RCU grace period between css_released() and this.
				7366	*/
				7367	sched_free_group(tg);
				7368	}
				7369
				7370	/*
				7371	* This is called before wake_up_new_task(), therefore we really only
				7372	* have to set its group bits, all the other stuff does not apply.
				7373	*/
				7374	static void cpu_cgroup_fork(struct task_struct *task)
				7375	{
				7376	struct rq_flags rf;
				7377	struct rq *rq;
				7378
				7379	rq = task_rq_lock(task, &rf);
				7380
				7381	update_rq_clock(rq);
				7382	sched_change_group(task, TASK_SET_GROUP);
				7383
				7384	task_rq_unlock(rq, task, &rf);
				7385	}
				7386
				7387	static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
				7388	{
				7389	struct task_struct *task;
				7390	struct cgroup_subsys_state *css;
				7391	int ret = 0;
				7392
				7393	cgroup_taskset_for_each(task, css, tset) {
				7394	#ifdef CONFIG_RT_GROUP_SCHED
				7395	if (!sched_rt_can_attach(css_tg(css), task))
				7396	return -EINVAL;
				7397	#endif
				7398	/*
				7399	* Serialize against wake_up_new_task() such that if its
				7400	* running, we're sure to observe its full state.
				7401	*/
				7402	raw_spin_lock_irq(&task->pi_lock);
				7403	/*
				7404	* Avoid calling sched_move_task() before wake_up_new_task()
				7405	* has happened. This would lead to problems with PELT, due to
				7406	* move wanting to detach+attach while we're not attached yet.
				7407	*/
				7408	if (task->state == TASK_NEW)
				7409	ret = -EINVAL;
				7410	raw_spin_unlock_irq(&task->pi_lock);
				7411
				7412	if (ret)
				7413	break;
				7414	}
				7415	return ret;
				7416	}
				7417
				7418	static void cpu_cgroup_attach(struct cgroup_taskset *tset)
				7419	{
				7420	struct task_struct *task;
				7421	struct cgroup_subsys_state *css;
				7422
				7423	cgroup_taskset_for_each(task, css, tset)
				7424	sched_move_task(task);
				7425	}
				7426
				7427	#ifdef CONFIG_UCLAMP_TASK_GROUP
				7428	static void cpu_util_update_eff(struct cgroup_subsys_state *css)
				7429	{
				7430	struct cgroup_subsys_state *top_css = css;
				7431	struct uclamp_se *uc_parent = NULL;
				7432	struct uclamp_se *uc_se = NULL;
				7433	unsigned int eff[UCLAMP_CNT];
				7434	enum uclamp_id clamp_id;
				7435	unsigned int clamps;
				7436
				7437	lockdep_assert_held(&uclamp_mutex);
				7438	SCHED_WARN_ON(!rcu_read_lock_held());
				7439
				7440	css_for_each_descendant_pre(css, top_css) {
				7441	uc_parent = css_tg(css)->parent
				7442	? css_tg(css)->parent->uclamp : NULL;
				7443
				7444	for_each_clamp_id(clamp_id) {
				7445	/* Assume effective clamps matches requested clamps */
				7446	eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
				7447	/* Cap effective clamps with parent's effective clamps */
				7448	if (uc_parent &&
				7449	eff[clamp_id] > uc_parent[clamp_id].value) {
				7450	eff[clamp_id] = uc_parent[clamp_id].value;
				7451	}
				7452	}
				7453	/* Ensure protection is always capped by limit */
				7454	eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
				7455
				7456	/* Propagate most restrictive effective clamps */
				7457	clamps = 0x0;
				7458	uc_se = css_tg(css)->uclamp;
				7459	for_each_clamp_id(clamp_id) {
				7460	if (eff[clamp_id] == uc_se[clamp_id].value)
				7461	continue;
				7462	uc_se[clamp_id].value = eff[clamp_id];
				7463	uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
				7464	clamps \|= (0x1 << clamp_id);
				7465	}
				7466	if (!clamps) {
				7467	css = css_rightmost_descendant(css);
				7468	continue;
				7469	}
				7470
				7471	/* Immediately update descendants RUNNABLE tasks */
				7472	uclamp_update_active_tasks(css);
				7473	}
				7474	}
				7475
				7476	/*
				7477	* Integer 10^N with a given N exponent by casting to integer the literal "1eN"
				7478	* C expression. Since there is no way to convert a macro argument (N) into a
				7479	* character constant, use two levels of macros.
				7480	*/
				7481	#define _POW10(exp) ((unsigned int)1e##exp)
				7482	#define POW10(exp) _POW10(exp)
				7483
				7484	struct uclamp_request {
				7485	#define UCLAMP_PERCENT_SHIFT 2
				7486	#define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_SHIFT))
				7487	s64 percent;
				7488	u64 util;
				7489	int ret;
				7490	};
				7491
				7492	static inline struct uclamp_request
				7493	capacity_from_percent(char *buf)
				7494	{
				7495	struct uclamp_request req = {
				7496	.percent = UCLAMP_PERCENT_SCALE,
				7497	.util = SCHED_CAPACITY_SCALE,
				7498	.ret = 0,
				7499	};
				7500
				7501	buf = strim(buf);
				7502	if (strcmp(buf, "max")) {
				7503	req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
				7504	&req.percent);
				7505	if (req.ret)
				7506	return req;
				7507	if ((u64)req.percent > UCLAMP_PERCENT_SCALE) {
				7508	req.ret = -ERANGE;
				7509	return req;
				7510	}
				7511
				7512	req.util = req.percent << SCHED_CAPACITY_SHIFT;
				7513	req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
				7514	}
				7515
				7516	return req;
				7517	}
				7518
				7519	static ssize_t cpu_uclamp_write(struct kernfs_open_file of, char buf,
				7520	size_t nbytes, loff_t off,
				7521	enum uclamp_id clamp_id)
				7522	{
				7523	struct uclamp_request req;
				7524	struct task_group *tg;
				7525
				7526	req = capacity_from_percent(buf);
				7527	if (req.ret)
				7528	return req.ret;
				7529
				7530	static_branch_enable(&sched_uclamp_used);
				7531
				7532	mutex_lock(&uclamp_mutex);
				7533	rcu_read_lock();
				7534
				7535	tg = css_tg(of_css(of));
				7536	if (tg->uclamp_req[clamp_id].value != req.util)
				7537	uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
				7538
				7539	/*
				7540	* Because of not recoverable conversion rounding we keep track of the
				7541	* exact requested value
				7542	*/
				7543	tg->uclamp_pct[clamp_id] = req.percent;
				7544
				7545	/* Update effective clamps to track the most restrictive value */
				7546	cpu_util_update_eff(of_css(of));
				7547
				7548	rcu_read_unlock();
				7549	mutex_unlock(&uclamp_mutex);
				7550
				7551	return nbytes;
				7552	}
				7553
				7554	static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
				7555	char *buf, size_t nbytes,
				7556	loff_t off)
				7557	{
				7558	return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
				7559	}
				7560
				7561	static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
				7562	char *buf, size_t nbytes,
				7563	loff_t off)
				7564	{
				7565	return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
				7566	}
				7567
				7568	static inline void cpu_uclamp_print(struct seq_file *sf,
				7569	enum uclamp_id clamp_id)
				7570	{
				7571	struct task_group *tg;
				7572	u64 util_clamp;
				7573	u64 percent;
				7574	u32 rem;
				7575
				7576	rcu_read_lock();
				7577	tg = css_tg(seq_css(sf));
				7578	util_clamp = tg->uclamp_req[clamp_id].value;
				7579	rcu_read_unlock();
				7580
				7581	if (util_clamp == SCHED_CAPACITY_SCALE) {
				7582	seq_puts(sf, "max\n");
				7583	return;
				7584	}
				7585
				7586	percent = tg->uclamp_pct[clamp_id];
				7587	percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
				7588	seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
				7589	}
				7590
				7591	static int cpu_uclamp_min_show(struct seq_file sf, void v)
				7592	{
				7593	cpu_uclamp_print(sf, UCLAMP_MIN);
				7594	return 0;
				7595	}
				7596
				7597	static int cpu_uclamp_max_show(struct seq_file sf, void v)
				7598	{
				7599	cpu_uclamp_print(sf, UCLAMP_MAX);
				7600	return 0;
				7601	}
				7602
				7603	static int cpu_uclamp_ls_write_u64(struct cgroup_subsys_state *css,
				7604	struct cftype *cftype, u64 ls)
				7605	{
				7606	struct task_group *tg;
				7607
				7608	if (ls > 1)
				7609	return -EINVAL;
				7610	tg = css_tg(css);
				7611	tg->latency_sensitive = (unsigned int) ls;
				7612
				7613	return 0;
				7614	}
				7615
				7616	static u64 cpu_uclamp_ls_read_u64(struct cgroup_subsys_state *css,
				7617	struct cftype *cft)
				7618	{
				7619	struct task_group *tg = css_tg(css);
				7620
				7621	return (u64) tg->latency_sensitive;
				7622	}
				7623	#endif /* CONFIG_UCLAMP_TASK_GROUP */
				7624
				7625	#ifdef CONFIG_FAIR_GROUP_SCHED
				7626	static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
				7627	struct cftype *cftype, u64 shareval)
				7628	{
				7629	if (shareval > scale_load_down(ULONG_MAX))
				7630	shareval = MAX_SHARES;
				7631	return sched_group_set_shares(css_tg(css), scale_load(shareval));
				7632	}
				7633
				7634	static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
				7635	struct cftype *cft)
				7636	{
				7637	struct task_group *tg = css_tg(css);
				7638
				7639	return (u64) scale_load_down(tg->shares);
				7640	}
				7641
				7642	#ifdef CONFIG_CFS_BANDWIDTH
				7643	static DEFINE_MUTEX(cfs_constraints_mutex);
				7644
				7645	const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
				7646	static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
				7647	/* More than 203 days if BW_SHIFT equals 20. */
				7648	static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
				7649
				7650	static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
				7651
				7652	static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
				7653	{
				7654	int i, ret = 0, runtime_enabled, runtime_was_enabled;
				7655	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
				7656
				7657	if (tg == &root_task_group)
				7658	return -EINVAL;
				7659
				7660	/*
				7661	* Ensure we have at some amount of bandwidth every period. This is
				7662	* to prevent reaching a state of large arrears when throttled via
				7663	* entity_tick() resulting in prolonged exit starvation.
				7664	*/
				7665	if (quota < min_cfs_quota_period \|\| period < min_cfs_quota_period)
				7666	return -EINVAL;
				7667
				7668	/*
				7669	* Likewise, bound things on the otherside by preventing insane quota
				7670	* periods. This also allows us to normalize in computing quota
				7671	* feasibility.
				7672	*/
				7673	if (period > max_cfs_quota_period)
				7674	return -EINVAL;
				7675
				7676	/*
				7677	* Bound quota to defend quota against overflow during bandwidth shift.
				7678	*/
				7679	if (quota != RUNTIME_INF && quota > max_cfs_runtime)
				7680	return -EINVAL;
				7681
				7682	/*
				7683	* Prevent race between setting of cfs_rq->runtime_enabled and
				7684	* unthrottle_offline_cfs_rqs().
				7685	*/
				7686	get_online_cpus();
				7687	mutex_lock(&cfs_constraints_mutex);
				7688	ret = __cfs_schedulable(tg, period, quota);
				7689	if (ret)
				7690	goto out_unlock;
				7691
				7692	runtime_enabled = quota != RUNTIME_INF;
				7693	runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
				7694	/*
				7695	* If we need to toggle cfs_bandwidth_used, off->on must occur
				7696	* before making related changes, and on->off must occur afterwards
				7697	*/
				7698	if (runtime_enabled && !runtime_was_enabled)
				7699	cfs_bandwidth_usage_inc();
				7700	raw_spin_lock_irq(&cfs_b->lock);
				7701	cfs_b->period = ns_to_ktime(period);
				7702	cfs_b->quota = quota;
				7703
				7704	__refill_cfs_bandwidth_runtime(cfs_b);
				7705
				7706	/* Restart the period timer (if active) to handle new period expiry: */
				7707	if (runtime_enabled)
				7708	start_cfs_bandwidth(cfs_b);
				7709
				7710	raw_spin_unlock_irq(&cfs_b->lock);
				7711
				7712	for_each_online_cpu(i) {
				7713	struct cfs_rq *cfs_rq = tg->cfs_rq[i];
				7714	struct rq *rq = cfs_rq->rq;
				7715	struct rq_flags rf;
				7716
				7717	rq_lock_irq(rq, &rf);
				7718	cfs_rq->runtime_enabled = runtime_enabled;
				7719	cfs_rq->runtime_remaining = 0;
				7720
				7721	if (cfs_rq->throttled)
				7722	unthrottle_cfs_rq(cfs_rq);
				7723	rq_unlock_irq(rq, &rf);
				7724	}
				7725	if (runtime_was_enabled && !runtime_enabled)
				7726	cfs_bandwidth_usage_dec();
				7727	out_unlock:
				7728	mutex_unlock(&cfs_constraints_mutex);
				7729	put_online_cpus();
				7730
				7731	return ret;
				7732	}
				7733
				7734	static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
				7735	{
				7736	u64 quota, period;
				7737
				7738	period = ktime_to_ns(tg->cfs_bandwidth.period);
				7739	if (cfs_quota_us < 0)
				7740	quota = RUNTIME_INF;
				7741	else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
				7742	quota = (u64)cfs_quota_us * NSEC_PER_USEC;
				7743	else
				7744	return -EINVAL;
				7745
				7746	return tg_set_cfs_bandwidth(tg, period, quota);
				7747	}
				7748
				7749	static long tg_get_cfs_quota(struct task_group *tg)
				7750	{
				7751	u64 quota_us;
				7752
				7753	if (tg->cfs_bandwidth.quota == RUNTIME_INF)
				7754	return -1;
				7755
				7756	quota_us = tg->cfs_bandwidth.quota;
				7757	do_div(quota_us, NSEC_PER_USEC);
				7758
				7759	return quota_us;
				7760	}
				7761
				7762	static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
				7763	{
				7764	u64 quota, period;
				7765
				7766	if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
				7767	return -EINVAL;
				7768
				7769	period = (u64)cfs_period_us * NSEC_PER_USEC;
				7770	quota = tg->cfs_bandwidth.quota;
				7771
				7772	return tg_set_cfs_bandwidth(tg, period, quota);
				7773	}
				7774
				7775	static long tg_get_cfs_period(struct task_group *tg)
				7776	{
				7777	u64 cfs_period_us;
				7778
				7779	cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
				7780	do_div(cfs_period_us, NSEC_PER_USEC);
				7781
				7782	return cfs_period_us;
				7783	}
				7784
				7785	static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
				7786	struct cftype *cft)
				7787	{
				7788	return tg_get_cfs_quota(css_tg(css));
				7789	}
				7790
				7791	static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
				7792	struct cftype *cftype, s64 cfs_quota_us)
				7793	{
				7794	return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
				7795	}
				7796
				7797	static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
				7798	struct cftype *cft)
				7799	{
				7800	return tg_get_cfs_period(css_tg(css));
				7801	}
				7802
				7803	static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
				7804	struct cftype *cftype, u64 cfs_period_us)
				7805	{
				7806	return tg_set_cfs_period(css_tg(css), cfs_period_us);
				7807	}
				7808
				7809	struct cfs_schedulable_data {
				7810	struct task_group *tg;
				7811	u64 period, quota;
				7812	};
				7813
				7814	/*
				7815	* normalize group quota/period to be quota/max_period
				7816	* note: units are usecs
				7817	*/
				7818	static u64 normalize_cfs_quota(struct task_group *tg,
				7819	struct cfs_schedulable_data *d)
				7820	{
				7821	u64 quota, period;
				7822
				7823	if (tg == d->tg) {
				7824	period = d->period;
				7825	quota = d->quota;
				7826	} else {
				7827	period = tg_get_cfs_period(tg);
				7828	quota = tg_get_cfs_quota(tg);
				7829	}
				7830
				7831	/* note: these should typically be equivalent */
				7832	if (quota == RUNTIME_INF \|\| quota == -1)
				7833	return RUNTIME_INF;
				7834
				7835	return to_ratio(period, quota);
				7836	}
				7837
				7838	static int tg_cfs_schedulable_down(struct task_group tg, void data)
				7839	{
				7840	struct cfs_schedulable_data *d = data;
				7841	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
				7842	s64 quota = 0, parent_quota = -1;
				7843
				7844	if (!tg->parent) {
				7845	quota = RUNTIME_INF;
				7846	} else {
				7847	struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
				7848
				7849	quota = normalize_cfs_quota(tg, d);
				7850	parent_quota = parent_b->hierarchical_quota;
				7851
				7852	/*
				7853	* Ensure max(child_quota) <= parent_quota. On cgroup2,
				7854	* always take the min. On cgroup1, only inherit when no
				7855	* limit is set:
				7856	*/
				7857	if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
				7858	quota = min(quota, parent_quota);
				7859	} else {
				7860	if (quota == RUNTIME_INF)
				7861	quota = parent_quota;
				7862	else if (parent_quota != RUNTIME_INF && quota > parent_quota)
				7863	return -EINVAL;
				7864	}
				7865	}
				7866	cfs_b->hierarchical_quota = quota;
				7867
				7868	return 0;
				7869	}
				7870
				7871	static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
				7872	{
				7873	int ret;
				7874	struct cfs_schedulable_data data = {
				7875	.tg = tg,
				7876	.period = period,
				7877	.quota = quota,
				7878	};
				7879
				7880	if (quota != RUNTIME_INF) {
				7881	do_div(data.period, NSEC_PER_USEC);
				7882	do_div(data.quota, NSEC_PER_USEC);
				7883	}
				7884
				7885	rcu_read_lock();
				7886	ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
				7887	rcu_read_unlock();
				7888
				7889	return ret;
				7890	}
				7891
				7892	static int cpu_cfs_stat_show(struct seq_file sf, void v)
				7893	{
				7894	struct task_group *tg = css_tg(seq_css(sf));
				7895	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
				7896
				7897	seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
				7898	seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
				7899	seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
				7900
				7901	if (schedstat_enabled() && tg != &root_task_group) {
				7902	u64 ws = 0;
				7903	int i;
				7904
				7905	for_each_possible_cpu(i)
				7906	ws += schedstat_val(tg->se[i]->statistics.wait_sum);
				7907
				7908	seq_printf(sf, "wait_sum %llu\n", ws);
				7909	}
				7910
				7911	return 0;
				7912	}
				7913	#endif /* CONFIG_CFS_BANDWIDTH */
				7914	#endif /* CONFIG_FAIR_GROUP_SCHED */
				7915
				7916	#ifdef CONFIG_RT_GROUP_SCHED
				7917	static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
				7918	struct cftype *cft, s64 val)
				7919	{
				7920	return sched_group_set_rt_runtime(css_tg(css), val);
				7921	}
				7922
				7923	static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
				7924	struct cftype *cft)
				7925	{
				7926	return sched_group_rt_runtime(css_tg(css));
				7927	}
				7928
				7929	static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
				7930	struct cftype *cftype, u64 rt_period_us)
				7931	{
				7932	return sched_group_set_rt_period(css_tg(css), rt_period_us);
				7933	}
				7934
				7935	static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
				7936	struct cftype *cft)
				7937	{
				7938	return sched_group_rt_period(css_tg(css));
				7939	}
				7940	#endif /* CONFIG_RT_GROUP_SCHED */
				7941
				7942	static struct cftype cpu_legacy_files[] = {
				7943	#ifdef CONFIG_FAIR_GROUP_SCHED
				7944	{
				7945	.name = "shares",
				7946	.read_u64 = cpu_shares_read_u64,
				7947	.write_u64 = cpu_shares_write_u64,
				7948	},
				7949	#endif
				7950	#ifdef CONFIG_CFS_BANDWIDTH
				7951	{
				7952	.name = "cfs_quota_us",
				7953	.read_s64 = cpu_cfs_quota_read_s64,
				7954	.write_s64 = cpu_cfs_quota_write_s64,
				7955	},
				7956	{
				7957	.name = "cfs_period_us",
				7958	.read_u64 = cpu_cfs_period_read_u64,
				7959	.write_u64 = cpu_cfs_period_write_u64,
				7960	},
				7961	{
				7962	.name = "stat",
				7963	.seq_show = cpu_cfs_stat_show,
				7964	},
				7965	#endif
				7966	#ifdef CONFIG_RT_GROUP_SCHED
				7967	{
				7968	.name = "rt_runtime_us",
				7969	.read_s64 = cpu_rt_runtime_read,
				7970	.write_s64 = cpu_rt_runtime_write,
				7971	},
				7972	{
				7973	.name = "rt_period_us",
				7974	.read_u64 = cpu_rt_period_read_uint,
				7975	.write_u64 = cpu_rt_period_write_uint,
				7976	},
				7977	#endif
				7978	#ifdef CONFIG_UCLAMP_TASK_GROUP
				7979	{
				7980	.name = "uclamp.min",
				7981	.flags = CFTYPE_NOT_ON_ROOT,
				7982	.seq_show = cpu_uclamp_min_show,
				7983	.write = cpu_uclamp_min_write,
				7984	},
				7985	{
				7986	.name = "uclamp.max",
				7987	.flags = CFTYPE_NOT_ON_ROOT,
				7988	.seq_show = cpu_uclamp_max_show,
				7989	.write = cpu_uclamp_max_write,
				7990	},
				7991	{
				7992	.name = "uclamp.latency_sensitive",
				7993	.flags = CFTYPE_NOT_ON_ROOT,
				7994	.read_u64 = cpu_uclamp_ls_read_u64,
				7995	.write_u64 = cpu_uclamp_ls_write_u64,
				7996	},
				7997	#endif
				7998	{ } /* Terminate */
				7999	};
				8000
				8001	static int cpu_extra_stat_show(struct seq_file *sf,
				8002	struct cgroup_subsys_state *css)
				8003	{
				8004	#ifdef CONFIG_CFS_BANDWIDTH
				8005	{
				8006	struct task_group *tg = css_tg(css);
				8007	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
				8008	u64 throttled_usec;
				8009
				8010	throttled_usec = cfs_b->throttled_time;
				8011	do_div(throttled_usec, NSEC_PER_USEC);
				8012
				8013	seq_printf(sf, "nr_periods %d\n"
				8014	"nr_throttled %d\n"
				8015	"throttled_usec %llu\n",
				8016	cfs_b->nr_periods, cfs_b->nr_throttled,
				8017	throttled_usec);
				8018	}
				8019	#endif
				8020	return 0;
				8021	}
				8022
				8023	#ifdef CONFIG_FAIR_GROUP_SCHED
				8024	static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
				8025	struct cftype *cft)
				8026	{
				8027	struct task_group *tg = css_tg(css);
				8028	u64 weight = scale_load_down(tg->shares);
				8029
				8030	return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
				8031	}
				8032
				8033	static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
				8034	struct cftype *cft, u64 weight)
				8035	{
				8036	/*
				8037	* cgroup weight knobs should use the common MIN, DFL and MAX
				8038	* values which are 1, 100 and 10000 respectively. While it loses
				8039	* a bit of range on both ends, it maps pretty well onto the shares
				8040	* value used by scheduler and the round-trip conversions preserve
				8041	* the original value over the entire range.
				8042	*/
				8043	if (weight < CGROUP_WEIGHT_MIN \|\| weight > CGROUP_WEIGHT_MAX)
				8044	return -ERANGE;
				8045
				8046	weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
				8047
				8048	return sched_group_set_shares(css_tg(css), scale_load(weight));
				8049	}
				8050
				8051	static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
				8052	struct cftype *cft)
				8053	{
				8054	unsigned long weight = scale_load_down(css_tg(css)->shares);
				8055	int last_delta = INT_MAX;
				8056	int prio, delta;
				8057
				8058	/* find the closest nice value to the current weight */
				8059	for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
				8060	delta = abs(sched_prio_to_weight[prio] - weight);
				8061	if (delta >= last_delta)
				8062	break;
				8063	last_delta = delta;
				8064	}
				8065
				8066	return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
				8067	}
				8068
				8069	static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
				8070	struct cftype *cft, s64 nice)
				8071	{
				8072	unsigned long weight;
				8073	int idx;
				8074
				8075	if (nice < MIN_NICE \|\| nice > MAX_NICE)
				8076	return -ERANGE;
				8077
				8078	idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
				8079	idx = array_index_nospec(idx, 40);
				8080	weight = sched_prio_to_weight[idx];
				8081
				8082	return sched_group_set_shares(css_tg(css), scale_load(weight));
				8083	}
				8084	#endif
				8085
				8086	static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
				8087	long period, long quota)
				8088	{
				8089	if (quota < 0)
				8090	seq_puts(sf, "max");
				8091	else
				8092	seq_printf(sf, "%ld", quota);
				8093
				8094	seq_printf(sf, " %ld\n", period);
				8095	}
				8096
				8097	/* caller should put the current value in @periodp before calling /
				8098	static int __maybe_unused cpu_period_quota_parse(char *buf,
				8099	u64 periodp, u64 quotap)
				8100	{
				8101	char tok[21]; /* U64_MAX */
				8102
				8103	if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
				8104	return -EINVAL;
				8105
				8106	periodp = NSEC_PER_USEC;
				8107
				8108	if (sscanf(tok, "%llu", quotap))
				8109	quotap = NSEC_PER_USEC;
				8110	else if (!strcmp(tok, "max"))
				8111	*quotap = RUNTIME_INF;
				8112	else
				8113	return -EINVAL;
				8114
				8115	return 0;
				8116	}
				8117
				8118	#ifdef CONFIG_CFS_BANDWIDTH
				8119	static int cpu_max_show(struct seq_file sf, void v)
				8120	{
				8121	struct task_group *tg = css_tg(seq_css(sf));
				8122
				8123	cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
				8124	return 0;
				8125	}
				8126
				8127	static ssize_t cpu_max_write(struct kernfs_open_file *of,
				8128	char *buf, size_t nbytes, loff_t off)
				8129	{
				8130	struct task_group *tg = css_tg(of_css(of));
				8131	u64 period = tg_get_cfs_period(tg);
				8132	u64 quota;
				8133	int ret;
				8134
				8135	ret = cpu_period_quota_parse(buf, &period, &quota);
				8136	if (!ret)
				8137	ret = tg_set_cfs_bandwidth(tg, period, quota);
				8138	return ret ?: nbytes;
				8139	}
				8140	#endif
				8141
				8142	static struct cftype cpu_files[] = {
				8143	#ifdef CONFIG_FAIR_GROUP_SCHED
				8144	{
				8145	.name = "weight",
				8146	.flags = CFTYPE_NOT_ON_ROOT,
				8147	.read_u64 = cpu_weight_read_u64,
				8148	.write_u64 = cpu_weight_write_u64,
				8149	},
				8150	{
				8151	.name = "weight.nice",
				8152	.flags = CFTYPE_NOT_ON_ROOT,
				8153	.read_s64 = cpu_weight_nice_read_s64,
				8154	.write_s64 = cpu_weight_nice_write_s64,
				8155	},
				8156	#endif
				8157	#ifdef CONFIG_CFS_BANDWIDTH
				8158	{
				8159	.name = "max",
				8160	.flags = CFTYPE_NOT_ON_ROOT,
				8161	.seq_show = cpu_max_show,
				8162	.write = cpu_max_write,
				8163	},
				8164	#endif
				8165	#ifdef CONFIG_UCLAMP_TASK_GROUP
				8166	{
				8167	.name = "uclamp.min",
				8168	.flags = CFTYPE_NOT_ON_ROOT,
				8169	.seq_show = cpu_uclamp_min_show,
				8170	.write = cpu_uclamp_min_write,
				8171	},
				8172	{
				8173	.name = "uclamp.max",
				8174	.flags = CFTYPE_NOT_ON_ROOT,
				8175	.seq_show = cpu_uclamp_max_show,
				8176	.write = cpu_uclamp_max_write,
				8177	},
				8178	{
				8179	.name = "uclamp.latency_sensitive",
				8180	.flags = CFTYPE_NOT_ON_ROOT,
				8181	.read_u64 = cpu_uclamp_ls_read_u64,
				8182	.write_u64 = cpu_uclamp_ls_write_u64,
				8183	},
				8184	#endif
				8185	{ } /* terminate */
				8186	};
				8187
				8188	struct cgroup_subsys cpu_cgrp_subsys = {
				8189	.css_alloc = cpu_cgroup_css_alloc,
				8190	.css_online = cpu_cgroup_css_online,
				8191	.css_released = cpu_cgroup_css_released,
				8192	.css_free = cpu_cgroup_css_free,
				8193	.css_extra_stat_show = cpu_extra_stat_show,
				8194	.fork = cpu_cgroup_fork,
				8195	.can_attach = cpu_cgroup_can_attach,
				8196	.attach = cpu_cgroup_attach,
				8197	.legacy_cftypes = cpu_legacy_files,
				8198	.dfl_cftypes = cpu_files,
				8199	.early_init = true,
				8200	.threaded = true,
				8201	};
				8202
				8203	#endif /* CONFIG_CGROUP_SCHED */
				8204
				8205	void dump_cpu_task(int cpu)
				8206	{
				8207	pr_info("Task dump for CPU %d:\n", cpu);
				8208	sched_show_task(cpu_curr(cpu));
				8209	}
				8210
				8211	/*
				8212	* Nice levels are multiplicative, with a gentle 10% change for every
				8213	* nice level changed. I.e. when a CPU-bound task goes from nice 0 to
				8214	* nice 1, it will get ~10% less CPU time than another CPU-bound task
				8215	* that remained on nice 0.
				8216	*
				8217	* The "10% effect" is relative and cumulative: from _any_ nice level,
				8218	* if you go up 1 level, it's -10% CPU usage, if you go down 1 level
				8219	* it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
				8220	* If a task goes up by ~10% and another task goes down by ~10% then
				8221	* the relative distance between them is ~25%.)
				8222	*/
				8223	const int sched_prio_to_weight[40] = {
				8224	/* -20 */ 88761, 71755, 56483, 46273, 36291,
				8225	/* -15 */ 29154, 23254, 18705, 14949, 11916,
				8226	/* -10 */ 9548, 7620, 6100, 4904, 3906,
				8227	/* -5 */ 3121, 2501, 1991, 1586, 1277,
				8228	/* 0 */ 1024, 820, 655, 526, 423,
				8229	/* 5 */ 335, 272, 215, 172, 137,
				8230	/* 10 */ 110, 87, 70, 56, 45,
				8231	/* 15 */ 36, 29, 23, 18, 15,
				8232	};
				8233
				8234	/*
				8235	* Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
				8236	*
				8237	* In cases where the weight does not change often, we can use the
				8238	* precalculated inverse to speed up arithmetics by turning divisions
				8239	* into multiplications:
				8240	*/
				8241	const u32 sched_prio_to_wmult[40] = {
				8242	/* -20 */ 48388, 59856, 76040, 92818, 118348,
				8243	/* -15 */ 147320, 184698, 229616, 287308, 360437,
				8244	/* -10 */ 449829, 563644, 704093, 875809, 1099582,
				8245	/* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
				8246	/* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
				8247	/* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
				8248	/* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
				8249	/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
				8250	};
				8251
				8252	#undef CREATE_TRACE_POINTS