Blame - marvell/linux/kernel/sched/rt.c - T108

blob: ff7f29d210b2f509acedb0c1c19b923486b0802e [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
				4	* policies)
				5	*/
				6	#include "sched.h"
				7
				8	#include "pelt.h"
				9
				10	#include <trace/hooks/sched.h>
				11
				/* Round-robin timeslice, initialised from RR_TIMESLICE. */
				12	int sched_rr_timeslice = RR_TIMESLICE;
				/* Same default scaled by MSEC_PER_SEC / HZ (presumably the millisecond value exposed via sysctl - confirm). */
				13	int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ;
				14	/* More than 4 hours if BW_SHIFT equals 20. */
				15	static const u64 max_rt_runtime = MAX_BW;
				16
				17	static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
				18
				/*
				 * Default RT bandwidth. Its rt_period seeds newly allocated task groups,
				 * and without CONFIG_RT_GROUP_SCHED it is the bandwidth used for every rt_rq.
				 */
				19	struct rt_bandwidth def_rt_bandwidth;
				20
				21	static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
				22	{
				23	struct rt_bandwidth *rt_b =
				24	container_of(timer, struct rt_bandwidth, rt_period_timer);
				25	int idle = 0;
				26	int overrun;
				27
				28	raw_spin_lock(&rt_b->rt_runtime_lock);
				29	for (;;) {
				30	overrun = hrtimer_forward_now(timer, rt_b->rt_period);
				31	if (!overrun)
				32	break;
				33
				34	raw_spin_unlock(&rt_b->rt_runtime_lock);
				35	idle = do_sched_rt_period_timer(rt_b, overrun);
				36	raw_spin_lock(&rt_b->rt_runtime_lock);
				37	}
				38	if (idle)
				39	rt_b->rt_period_active = 0;
				40	raw_spin_unlock(&rt_b->rt_runtime_lock);
				41
				42	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
				43	}
				44
				45	void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
				46	{
				47	rt_b->rt_period = ns_to_ktime(period);
				48	rt_b->rt_runtime = runtime;
				49
				50	raw_spin_lock_init(&rt_b->rt_runtime_lock);
				51
				52	hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
				53	HRTIMER_MODE_REL_HARD);
				54	rt_b->rt_period_timer.function = sched_rt_period_timer;
				55	}
				56
				57	static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
				58	{
				59	raw_spin_lock(&rt_b->rt_runtime_lock);
				60	if (!rt_b->rt_period_active) {
				61	rt_b->rt_period_active = 1;
				62	/*
				63	* SCHED_DEADLINE updates the bandwidth, as a run away
				64	* RT task with a DL task could hog a CPU. But DL does
				65	* not reset the period. If a deadline task was running
				66	* without an RT task running, it can cause RT tasks to
				67	* throttle when they start up. Kick the timer right away
				68	* to update the period.
				69	*/
				70	hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
				71	hrtimer_start_expires(&rt_b->rt_period_timer,
				72	HRTIMER_MODE_ABS_PINNED_HARD);
				73	}
				74	raw_spin_unlock(&rt_b->rt_runtime_lock);
				75	}
				76
				77	static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
				78	{
				79	if (!rt_bandwidth_enabled() \|\| rt_b->rt_runtime == RUNTIME_INF)
				80	return;
				81
				82	do_start_rt_bandwidth(rt_b);
				83	}
				84
				85	void init_rt_rq(struct rt_rq *rt_rq)
				86	{
				87	struct rt_prio_array *array;
				88	int i;
				89
				90	array = &rt_rq->active;
				91	for (i = 0; i < MAX_RT_PRIO; i++) {
				92	INIT_LIST_HEAD(array->queue + i);
				93	__clear_bit(i, array->bitmap);
				94	}
				95	/* delimiter for bitsearch: */
				96	__set_bit(MAX_RT_PRIO, array->bitmap);
				97
				98	#if defined CONFIG_SMP
				99	rt_rq->highest_prio.curr = MAX_RT_PRIO;
				100	rt_rq->highest_prio.next = MAX_RT_PRIO;
				101	rt_rq->rt_nr_migratory = 0;
				102	rt_rq->overloaded = 0;
				103	plist_head_init(&rt_rq->pushable_tasks);
				104	#endif /* CONFIG_SMP */
				105	/* We start is dequeued state, because no RT tasks are queued */
				106	rt_rq->rt_queued = 0;
				107
				108	rt_rq->rt_time = 0;
				109	rt_rq->rt_throttled = 0;
				110	rt_rq->rt_runtime = 0;
				111	raw_spin_lock_init(&rt_rq->rt_runtime_lock);
				112	}
				113
				114	#ifdef CONFIG_RT_GROUP_SCHED
				115	static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
				116	{
				117	hrtimer_cancel(&rt_b->rt_period_timer);
				118	}
				119
				/* An entity is a task when it owns no group runqueue, i.e. its my_q is NULL. */
				120	#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
				121
				122	static inline struct task_struct rt_task_of(struct sched_rt_entity rt_se)
				123	{
				124	#ifdef CONFIG_SCHED_DEBUG
				125	WARN_ON_ONCE(!rt_entity_is_task(rt_se));
				126	#endif
				127	return container_of(rt_se, struct task_struct, rt);
				128	}
				129
				130	static inline struct rq rq_of_rt_rq(struct rt_rq rt_rq)
				131	{
				132	return rt_rq->rq;
				133	}
				134
				135	static inline struct rt_rq rt_rq_of_se(struct sched_rt_entity rt_se)
				136	{
				137	return rt_se->rt_rq;
				138	}
				139
				140	static inline struct rq rq_of_rt_se(struct sched_rt_entity rt_se)
				141	{
				142	struct rt_rq *rt_rq = rt_se->rt_rq;
				143
				144	return rt_rq->rq;
				145	}
				146
				147	void free_rt_sched_group(struct task_group *tg)
				148	{
				149	int i;
				150
				151	if (tg->rt_se)
				152	destroy_rt_bandwidth(&tg->rt_bandwidth);
				153
				154	for_each_possible_cpu(i) {
				155	if (tg->rt_rq)
				156	kfree(tg->rt_rq[i]);
				157	if (tg->rt_se)
				158	kfree(tg->rt_se[i]);
				159	}
				160
				161	kfree(tg->rt_rq);
				162	kfree(tg->rt_se);
				163	}
				164
				165	void init_tg_rt_entry(struct task_group tg, struct rt_rq rt_rq,
				166	struct sched_rt_entity *rt_se, int cpu,
				167	struct sched_rt_entity *parent)
				168	{
				169	struct rq *rq = cpu_rq(cpu);
				170
				171	rt_rq->highest_prio.curr = MAX_RT_PRIO;
				172	rt_rq->rt_nr_boosted = 0;
				173	rt_rq->rq = rq;
				174	rt_rq->tg = tg;
				175
				176	tg->rt_rq[cpu] = rt_rq;
				177	tg->rt_se[cpu] = rt_se;
				178
				179	if (!rt_se)
				180	return;
				181
				182	if (!parent)
				183	rt_se->rt_rq = &rq->rt;
				184	else
				185	rt_se->rt_rq = parent->my_q;
				186
				187	rt_se->my_q = rt_rq;
				188	rt_se->parent = parent;
				189	INIT_LIST_HEAD(&rt_se->run_list);
				190	}
				191
				192	int alloc_rt_sched_group(struct task_group tg, struct task_group parent)
				193	{
				194	struct rt_rq *rt_rq;
				195	struct sched_rt_entity *rt_se;
				196	int i;
				197
				198	tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
				199	if (!tg->rt_rq)
				200	goto err;
				201	tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL);
				202	if (!tg->rt_se)
				203	goto err;
				204
				205	init_rt_bandwidth(&tg->rt_bandwidth,
				206	ktime_to_ns(def_rt_bandwidth.rt_period), 0);
				207
				208	for_each_possible_cpu(i) {
				209	rt_rq = kzalloc_node(sizeof(struct rt_rq),
				210	GFP_KERNEL, cpu_to_node(i));
				211	if (!rt_rq)
				212	goto err;
				213
				214	rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
				215	GFP_KERNEL, cpu_to_node(i));
				216	if (!rt_se)
				217	goto err_free_rq;
				218
				219	init_rt_rq(rt_rq);
				220	rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
				221	init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
				222	}
				223
				224	return 1;
				225
				226	err_free_rq:
				227	kfree(rt_rq);
				228	err:
				229	return 0;
				230	}
				231
				232	#else /* CONFIG_RT_GROUP_SCHED */
				233
				/* Without CONFIG_RT_GROUP_SCHED this always evaluates to 1. */
				234	#define rt_entity_is_task(rt_se) (1)
				235
				236	static inline struct task_struct rt_task_of(struct sched_rt_entity rt_se)
				237	{
				238	return container_of(rt_se, struct task_struct, rt);
				239	}
				240
				241	static inline struct rq rq_of_rt_rq(struct rt_rq rt_rq)
				242	{
				243	return container_of(rt_rq, struct rq, rt);
				244	}
				245
				246	static inline struct rq rq_of_rt_se(struct sched_rt_entity rt_se)
				247	{
				248	struct task_struct *p = rt_task_of(rt_se);
				249
				250	return task_rq(p);
				251	}
				252
				253	static inline struct rt_rq rt_rq_of_se(struct sched_rt_entity rt_se)
				254	{
				255	struct rq *rq = rq_of_rt_se(rt_se);
				256
				257	return &rq->rt;
				258	}
				259
				260	void free_rt_sched_group(struct task_group *tg) { }
				261
				262	int alloc_rt_sched_group(struct task_group tg, struct task_group parent)
				263	{
				264	return 1;
				265	}
				266	#endif /* CONFIG_RT_GROUP_SCHED */
				267
				268	#ifdef CONFIG_SMP
				269
				270	static void pull_rt_task(struct rq *this_rq);
				271
				272	static inline bool need_pull_rt_task(struct rq rq, struct task_struct prev)
				273	{
				274	/* Try to pull RT tasks here if we lower this rq's prio */
				275	return rq->rt.highest_prio.curr > prev->prio;
				276	}
				277
				278	static inline int rt_overloaded(struct rq *rq)
				279	{
				280	return atomic_read(&rq->rd->rto_count);
				281	}
				282
				283	static inline void rt_set_overload(struct rq *rq)
				284	{
				285	if (!rq->online)
				286	return;
				287
				288	cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
				289	/*
				290	* Make sure the mask is visible before we set
				291	* the overload count. That is checked to determine
				292	* if we should look at the mask. It would be a shame
				293	* if we looked at the mask, but the mask was not
				294	* updated yet.
				295	*
				296	* Matched by the barrier in pull_rt_task().
				297	*/
				298	smp_wmb();
				299	atomic_inc(&rq->rd->rto_count);
				300	}
				301
				302	static inline void rt_clear_overload(struct rq *rq)
				303	{
				304	if (!rq->online)
				305	return;
				306
				307	/* the order here really doesn't matter */
				308	atomic_dec(&rq->rd->rto_count);
				309	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
				310	}
				311
				312	static void update_rt_migration(struct rt_rq *rt_rq)
				313	{
				314	if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
				315	if (!rt_rq->overloaded) {
				316	rt_set_overload(rq_of_rt_rq(rt_rq));
				317	rt_rq->overloaded = 1;
				318	}
				319	} else if (rt_rq->overloaded) {
				320	rt_clear_overload(rq_of_rt_rq(rt_rq));
				321	rt_rq->overloaded = 0;
				322	}
				323	}
				324
				325	static void inc_rt_migration(struct sched_rt_entity rt_se, struct rt_rq rt_rq)
				326	{
				327	struct task_struct *p;
				328
				329	if (!rt_entity_is_task(rt_se))
				330	return;
				331
				332	p = rt_task_of(rt_se);
				333	rt_rq = &rq_of_rt_rq(rt_rq)->rt;
				334
				335	rt_rq->rt_nr_total++;
				336	if (p->nr_cpus_allowed > 1)
				337	rt_rq->rt_nr_migratory++;
				338
				339	update_rt_migration(rt_rq);
				340	}
				341
				342	static void dec_rt_migration(struct sched_rt_entity rt_se, struct rt_rq rt_rq)
				343	{
				344	struct task_struct *p;
				345
				346	if (!rt_entity_is_task(rt_se))
				347	return;
				348
				349	p = rt_task_of(rt_se);
				350	rt_rq = &rq_of_rt_rq(rt_rq)->rt;
				351
				352	rt_rq->rt_nr_total--;
				353	if (p->nr_cpus_allowed > 1)
				354	rt_rq->rt_nr_migratory--;
				355
				356	update_rt_migration(rt_rq);
				357	}
				358
				359	static inline int has_pushable_tasks(struct rq *rq)
				360	{
				361	return !plist_head_empty(&rq->rt.pushable_tasks);
				362	}
				363
				/*
				 * Per-CPU callback heads handed to queue_balance_callback() by
				 * rt_queue_push_tasks() and rt_queue_pull_task() respectively.
				 */
				364	static DEFINE_PER_CPU(struct callback_head, rt_push_head);
				365	static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
				366
				367	static void push_rt_tasks(struct rq *);
				368	static void pull_rt_task(struct rq *);
				369
				370	static inline void rt_queue_push_tasks(struct rq *rq)
				371	{
				372	if (!has_pushable_tasks(rq))
				373	return;
				374
				375	queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
				376	}
				377
				378	static inline void rt_queue_pull_task(struct rq *rq)
				379	{
				380	queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
				381	}
				382
				383	static void enqueue_pushable_task(struct rq rq, struct task_struct p)
				384	{
				385	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
				386	plist_node_init(&p->pushable_tasks, p->prio);
				387	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
				388
				389	/* Update the highest prio pushable task */
				390	if (p->prio < rq->rt.highest_prio.next)
				391	rq->rt.highest_prio.next = p->prio;
				392	}
				393
				394	static void dequeue_pushable_task(struct rq rq, struct task_struct p)
				395	{
				396	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
				397
				398	/* Update the new highest prio pushable task */
				399	if (has_pushable_tasks(rq)) {
				400	p = plist_first_entry(&rq->rt.pushable_tasks,
				401	struct task_struct, pushable_tasks);
				402	rq->rt.highest_prio.next = p->prio;
				403	} else
				404	rq->rt.highest_prio.next = MAX_RT_PRIO;
				405	}
				406
				407	#else
				408
				409	static inline void enqueue_pushable_task(struct rq rq, struct task_struct p)
				410	{
				411	}
				412
				413	static inline void dequeue_pushable_task(struct rq rq, struct task_struct p)
				414	{
				415	}
				416
				417	static inline
				418	void inc_rt_migration(struct sched_rt_entity rt_se, struct rt_rq rt_rq)
				419	{
				420	}
				421
				422	static inline
				423	void dec_rt_migration(struct sched_rt_entity rt_se, struct rt_rq rt_rq)
				424	{
				425	}
				426
				427	static inline bool need_pull_rt_task(struct rq rq, struct task_struct prev)
				428	{
				429	return false;
				430	}
				431
				432	static inline void pull_rt_task(struct rq *this_rq)
				433	{
				434	}
				435
				436	static inline void rt_queue_push_tasks(struct rq *rq)
				437	{
				438	}
				439	#endif /* CONFIG_SMP */
				440
				441	static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
				442	static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count);
				443
				444	static inline int on_rt_rq(struct sched_rt_entity *rt_se)
				445	{
				446	return rt_se->on_rq;
				447	}
				448
				449	#ifdef CONFIG_UCLAMP_TASK
				450	/*
				451	* Verify the fitness of task @p to run on @cpu taking into account the uclamp
				452	* settings.
				453	*
				454	* This check is only important for heterogeneous systems where uclamp_min value
				455	* is higher than the capacity of a @cpu. For non-heterogeneous system this
				456	* function will always return true.
				457	*
				458	* The function will return true if the capacity of the @cpu is >= the
				459	* uclamp_min and false otherwise.
				460	*
				461	* Note that uclamp_min will be clamped to uclamp_max if uclamp_min
				462	* > uclamp_max.
				463	*/
				464	static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
				465	{
				466	unsigned int min_cap;
				467	unsigned int max_cap;
				468	unsigned int cpu_cap;
				469
				470	/* Only heterogeneous systems can benefit from this check */
				471	if (!static_branch_unlikely(&sched_asym_cpucapacity))
				472	return true;
				473
				474	min_cap = uclamp_eff_value(p, UCLAMP_MIN);
				475	max_cap = uclamp_eff_value(p, UCLAMP_MAX);
				476
				477	cpu_cap = capacity_orig_of(cpu);
				478
				479	return cpu_cap >= min(min_cap, max_cap);
				480	}
				481	#else
				482	static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
				483	{
				484	return true;
				485	}
				486	#endif
				487
				488	#ifdef CONFIG_RT_GROUP_SCHED
				489
				490	static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
				491	{
				492	if (!rt_rq->tg)
				493	return RUNTIME_INF;
				494
				495	return rt_rq->rt_runtime;
				496	}
				497
				498	static inline u64 sched_rt_period(struct rt_rq *rt_rq)
				499	{
				500	return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
				501	}
				502
				/* Iterator type for for_each_rt_rq(): a task_group pointer in this configuration. */
				503	typedef struct task_group *rt_rq_iter_t;
				504
				505	static inline struct task_group next_task_group(struct task_group tg)
				506	{
				507	do {
				508	tg = list_entry_rcu(tg->list.next,
				509	typeof(struct task_group), list);
				510	} while (&tg->list != &task_groups && task_group_is_autogroup(tg));
				511
				512	if (&tg->list == &task_groups)
				513	tg = NULL;
				514
				515	return tg;
				516	}
				517
				/*
				 * Walk the task groups with next_task_group(), setting rt_rq to each
				 * group's rt_rq for rq's CPU. The loop ends at the end of the list or at
				 * the first group whose rt_rq pointer for that CPU is NULL.
				 */
				518	#define for_each_rt_rq(rt_rq, iter, rq) \
				519	for (iter = container_of(&task_groups, typeof(*iter), list); \
				520	(iter = next_task_group(iter)) && \
				521	(rt_rq = iter->rt_rq[cpu_of(rq)]);)
				522
				/* Walk from rt_se up through ->parent until NULL; rt_se itself is advanced. */
				523	#define for_each_sched_rt_entity(rt_se) \
				524	for (; rt_se; rt_se = rt_se->parent)
				525
				526	static inline struct rt_rq group_rt_rq(struct sched_rt_entity rt_se)
				527	{
				528	return rt_se->my_q;
				529	}
				530
				531	static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
				532	static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
				533
				534	static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
				535	{
				536	struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
				537	struct rq *rq = rq_of_rt_rq(rt_rq);
				538	struct sched_rt_entity *rt_se;
				539
				540	int cpu = cpu_of(rq);
				541
				542	rt_se = rt_rq->tg->rt_se[cpu];
				543
				544	if (rt_rq->rt_nr_running) {
				545	if (!rt_se)
				546	enqueue_top_rt_rq(rt_rq);
				547	else if (!on_rt_rq(rt_se))
				548	enqueue_rt_entity(rt_se, 0);
				549
				550	if (rt_rq->highest_prio.curr < curr->prio)
				551	resched_curr(rq);
				552	}
				553	}
				554
				555	static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
				556	{
				557	struct sched_rt_entity *rt_se;
				558	int cpu = cpu_of(rq_of_rt_rq(rt_rq));
				559
				560	rt_se = rt_rq->tg->rt_se[cpu];
				561
				562	if (!rt_se) {
				563	dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
				564	/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
				565	cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
				566	}
				567	else if (on_rt_rq(rt_se))
				568	dequeue_rt_entity(rt_se, 0);
				569	}
				570
				571	static inline int rt_rq_throttled(struct rt_rq *rt_rq)
				572	{
				573	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
				574	}
				575
				576	static int rt_se_boosted(struct sched_rt_entity *rt_se)
				577	{
				578	struct rt_rq *rt_rq = group_rt_rq(rt_se);
				579	struct task_struct *p;
				580
				581	if (rt_rq)
				582	return !!rt_rq->rt_nr_boosted;
				583
				584	p = rt_task_of(rt_se);
				585	return p->prio != p->normal_prio;
				586	}
				587
				588	#ifdef CONFIG_SMP
				589	static inline const struct cpumask *sched_rt_period_mask(void)
				590	{
				591	return this_rq()->rd->span;
				592	}
				593	#else
				594	static inline const struct cpumask *sched_rt_period_mask(void)
				595	{
				596	return cpu_online_mask;
				597	}
				598	#endif
				599
				600	static inline
				601	struct rt_rq sched_rt_period_rt_rq(struct rt_bandwidth rt_b, int cpu)
				602	{
				603	return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
				604	}
				605
				606	static inline struct rt_bandwidth sched_rt_bandwidth(struct rt_rq rt_rq)
				607	{
				608	return &rt_rq->tg->rt_bandwidth;
				609	}
				610
				611	#else /* !CONFIG_RT_GROUP_SCHED */
				612
				613	static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
				614	{
				615	return rt_rq->rt_runtime;
				616	}
				617
				618	static inline u64 sched_rt_period(struct rt_rq *rt_rq)
				619	{
				620	return ktime_to_ns(def_rt_bandwidth.rt_period);
				621	}
				622
				/* Iterator type for for_each_rt_rq(); the macro only evaluates it in this configuration. */
				623	typedef struct rt_rq *rt_rq_iter_t;
				624
				/* Single pass over &rq->rt; iter is evaluated and discarded (presumably to silence unused warnings). */
				625	#define for_each_rt_rq(rt_rq, iter, rq) \
				626	for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
				627
				/* Visit rt_se once if it is non-NULL; the loop then sets rt_se to NULL. */
				628	#define for_each_sched_rt_entity(rt_se) \
				629	for (; rt_se; rt_se = NULL)
				630
				631	static inline struct rt_rq group_rt_rq(struct sched_rt_entity rt_se)
				632	{
				633	return NULL;
				634	}
				635
				636	static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
				637	{
				638	struct rq *rq = rq_of_rt_rq(rt_rq);
				639
				640	if (!rt_rq->rt_nr_running)
				641	return;
				642
				643	enqueue_top_rt_rq(rt_rq);
				644	resched_curr(rq);
				645	}
				646
				647	static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
				648	{
				649	dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
				650	}
				651
				652	static inline int rt_rq_throttled(struct rt_rq *rt_rq)
				653	{
				654	return rt_rq->rt_throttled;
				655	}
				656
				657	static inline const struct cpumask *sched_rt_period_mask(void)
				658	{
				659	return cpu_online_mask;
				660	}
				661
				662	static inline
				663	struct rt_rq sched_rt_period_rt_rq(struct rt_bandwidth rt_b, int cpu)
				664	{
				665	return &cpu_rq(cpu)->rt;
				666	}
				667
				668	static inline struct rt_bandwidth sched_rt_bandwidth(struct rt_rq rt_rq)
				669	{
				670	return &def_rt_bandwidth;
				671	}
				672
				673	#endif /* CONFIG_RT_GROUP_SCHED */
				674
				675	bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
				676	{
				677	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
				678
				679	return (hrtimer_active(&rt_b->rt_period_timer) \|\|
				680	rt_rq->rt_time < rt_b->rt_runtime);
				681	}
				682
				683	#ifdef CONFIG_SMP
				684	/*
				685	* We ran out of runtime, see if we can borrow some from our neighbours.
				686	*/
				687	static void do_balance_runtime(struct rt_rq *rt_rq)
				688	{
				689	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
				690	struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
				691	int i, weight;
				692	u64 rt_period;
				693
				694	weight = cpumask_weight(rd->span);
				695
				696	raw_spin_lock(&rt_b->rt_runtime_lock);
				697	rt_period = ktime_to_ns(rt_b->rt_period);
				698	for_each_cpu(i, rd->span) {
				699	struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
				700	s64 diff;
				701
				702	if (iter == rt_rq)
				703	continue;
				704
				705	raw_spin_lock(&iter->rt_runtime_lock);
				706	/*
				707	* Either all rqs have inf runtime and there's nothing to steal
				708	* or __disable_runtime() below sets a specific rq to inf to
				709	* indicate its been disabled and disalow stealing.
				710	*/
				711	if (iter->rt_runtime == RUNTIME_INF)
				712	goto next;
				713
				714	/*
				715	* From runqueues with spare time, take 1/n part of their
				716	* spare time, but no more than our period.
				717	*/
				718	diff = iter->rt_runtime - iter->rt_time;
				719	if (diff > 0) {
				720	diff = div_u64((u64)diff, weight);
				721	if (rt_rq->rt_runtime + diff > rt_period)
				722	diff = rt_period - rt_rq->rt_runtime;
				723	iter->rt_runtime -= diff;
				724	rt_rq->rt_runtime += diff;
				725	if (rt_rq->rt_runtime == rt_period) {
				726	raw_spin_unlock(&iter->rt_runtime_lock);
				727	break;
				728	}
				729	}
				730	next:
				731	raw_spin_unlock(&iter->rt_runtime_lock);
				732	}
				733	raw_spin_unlock(&rt_b->rt_runtime_lock);
				734	}
				735
				736	/*
				737	* Ensure this RQ takes back all the runtime it lend to its neighbours.
				738	*/
				739	static void __disable_runtime(struct rq *rq)
				740	{
				741	struct root_domain *rd = rq->rd;
				742	rt_rq_iter_t iter;
				743	struct rt_rq *rt_rq;
				744
				745	if (unlikely(!scheduler_running))
				746	return;
				747
				748	for_each_rt_rq(rt_rq, iter, rq) {
				749	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
				750	s64 want;
				751	int i;
				752
				753	raw_spin_lock(&rt_b->rt_runtime_lock);
				754	raw_spin_lock(&rt_rq->rt_runtime_lock);
				755	/*
				756	* Either we're all inf and nobody needs to borrow, or we're
				757	* already disabled and thus have nothing to do, or we have
				758	* exactly the right amount of runtime to take out.
				759	*/
				760	if (rt_rq->rt_runtime == RUNTIME_INF \|\|
				761	rt_rq->rt_runtime == rt_b->rt_runtime)
				762	goto balanced;
				763	raw_spin_unlock(&rt_rq->rt_runtime_lock);
				764
				765	/*
				766	* Calculate the difference between what we started out with
				767	* and what we current have, that's the amount of runtime
				768	* we lend and now have to reclaim.
				769	*/
				770	want = rt_b->rt_runtime - rt_rq->rt_runtime;
				771
				772	/*
				773	* Greedy reclaim, take back as much as we can.
				774	*/
				775	for_each_cpu(i, rd->span) {
				776	struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
				777	s64 diff;
				778
				779	/*
				780	* Can't reclaim from ourselves or disabled runqueues.
				781	*/
				782	if (iter == rt_rq \|\| iter->rt_runtime == RUNTIME_INF)
				783	continue;
				784
				785	raw_spin_lock(&iter->rt_runtime_lock);
				786	if (want > 0) {
				787	diff = min_t(s64, iter->rt_runtime, want);
				788	iter->rt_runtime -= diff;
				789	want -= diff;
				790	} else {
				791	iter->rt_runtime -= want;
				792	want -= want;
				793	}
				794	raw_spin_unlock(&iter->rt_runtime_lock);
				795
				796	if (!want)
				797	break;
				798	}
				799
				800	raw_spin_lock(&rt_rq->rt_runtime_lock);
				801	/*
				802	* We cannot be left wanting - that would mean some runtime
				803	* leaked out of the system.
				804	*/
				805	BUG_ON(want);
				806	balanced:
				807	/*
				808	* Disable all the borrow logic by pretending we have inf
				809	* runtime - in which case borrowing doesn't make sense.
				810	*/
				811	rt_rq->rt_runtime = RUNTIME_INF;
				812	rt_rq->rt_throttled = 0;
				813	raw_spin_unlock(&rt_rq->rt_runtime_lock);
				814	raw_spin_unlock(&rt_b->rt_runtime_lock);
				815
				816	/* Make rt_rq available for pick_next_task() */
				817	sched_rt_rq_enqueue(rt_rq);
				818	}
				819	}
				820
				821	static void __enable_runtime(struct rq *rq)
				822	{
				823	rt_rq_iter_t iter;
				824	struct rt_rq *rt_rq;
				825
				826	if (unlikely(!scheduler_running))
				827	return;
				828
				829	/*
				830	* Reset each runqueue's bandwidth settings
				831	*/
				832	for_each_rt_rq(rt_rq, iter, rq) {
				833	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
				834
				835	raw_spin_lock(&rt_b->rt_runtime_lock);
				836	raw_spin_lock(&rt_rq->rt_runtime_lock);
				837	rt_rq->rt_runtime = rt_b->rt_runtime;
				838	rt_rq->rt_time = 0;
				839	rt_rq->rt_throttled = 0;
				840	raw_spin_unlock(&rt_rq->rt_runtime_lock);
				841	raw_spin_unlock(&rt_b->rt_runtime_lock);
				842	}
				843	}
				844
				845	static void balance_runtime(struct rt_rq *rt_rq)
				846	{
				847	if (!sched_feat(RT_RUNTIME_SHARE))
				848	return;
				849
				850	if (rt_rq->rt_time > rt_rq->rt_runtime) {
				851	raw_spin_unlock(&rt_rq->rt_runtime_lock);
				852	do_balance_runtime(rt_rq);
				853	raw_spin_lock(&rt_rq->rt_runtime_lock);
				854	}
				855	}
				856	#else /* !CONFIG_SMP */
				857	static inline void balance_runtime(struct rt_rq *rt_rq) {}
				858	#endif /* CONFIG_SMP */
				859
				860	static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
				861	{
				862	int i, idle = 1, throttled = 0;
				863	const struct cpumask *span;
				864
				865	span = sched_rt_period_mask();
				866	#ifdef CONFIG_RT_GROUP_SCHED
				867	/*
				868	* FIXME: isolated CPUs should really leave the root task group,
				869	* whether they are isolcpus or were isolated via cpusets, lest
				870	* the timer run on a CPU which does not service all runqueues,
				871	* potentially leaving other CPUs indefinitely throttled. If
				872	* isolation is really required, the user will turn the throttle
				873	* off to kill the perturbations it causes anyway. Meanwhile,
				874	* this maintains functionality for boot and/or troubleshooting.
				875	*/
				876	if (rt_b == &root_task_group.rt_bandwidth)
				877	span = cpu_online_mask;
				878	#endif
				879	for_each_cpu(i, span) {
				880	int enqueue = 0;
				881	struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
				882	struct rq *rq = rq_of_rt_rq(rt_rq);
				883	int skip;
				884
				885	/*
				886	* When span == cpu_online_mask, taking each rq->lock
				887	* can be time-consuming. Try to avoid it when possible.
				888	*/
				889	raw_spin_lock(&rt_rq->rt_runtime_lock);
				890	if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF)
				891	rt_rq->rt_runtime = rt_b->rt_runtime;
				892	skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
				893	raw_spin_unlock(&rt_rq->rt_runtime_lock);
				894	if (skip)
				895	continue;
				896
				897	raw_spin_lock(&rq->lock);
				898	update_rq_clock(rq);
				899
				900	if (rt_rq->rt_time) {
				901	u64 runtime;
				902
				903	raw_spin_lock(&rt_rq->rt_runtime_lock);
				904	if (rt_rq->rt_throttled)
				905	balance_runtime(rt_rq);
				906	runtime = rt_rq->rt_runtime;
				907	rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
				908	if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
				909	rt_rq->rt_throttled = 0;
				910	enqueue = 1;
				911
				912	/*
				913	* When we're idle and a woken (rt) task is
				914	* throttled check_preempt_curr() will set
				915	* skip_update and the time between the wakeup
				916	* and this unthrottle will get accounted as
				917	* 'runtime'.
				918	*/
				919	if (rt_rq->rt_nr_running && rq->curr == rq->idle)
				920	rq_clock_cancel_skipupdate(rq);
				921	}
				922	if (rt_rq->rt_time \|\| rt_rq->rt_nr_running)
				923	idle = 0;
				924	raw_spin_unlock(&rt_rq->rt_runtime_lock);
				925	} else if (rt_rq->rt_nr_running) {
				926	idle = 0;
				927	if (!rt_rq_throttled(rt_rq))
				928	enqueue = 1;
				929	}
				930	if (rt_rq->rt_throttled)
				931	throttled = 1;
				932
				933	if (enqueue)
				934	sched_rt_rq_enqueue(rt_rq);
				935	raw_spin_unlock(&rq->lock);
				936	}
				937
				938	if (!throttled && (!rt_bandwidth_enabled() \|\| rt_b->rt_runtime == RUNTIME_INF))
				939	return 1;
				940
				941	return idle;
				942	}
				943
				944	static inline int rt_se_prio(struct sched_rt_entity *rt_se)
				945	{
				946	#ifdef CONFIG_RT_GROUP_SCHED
				947	struct rt_rq *rt_rq = group_rt_rq(rt_se);
				948
				949	if (rt_rq)
				950	return rt_rq->highest_prio.curr;
				951	#endif
				952
				953	return rt_task_of(rt_se)->prio;
				954	}
				955
				956	static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
				957	{
				958	u64 runtime = sched_rt_runtime(rt_rq);
				959
				960	if (rt_rq->rt_throttled)
				961	return rt_rq_throttled(rt_rq);
				962
				963	if (runtime >= sched_rt_period(rt_rq))
				964	return 0;
				965
				966	balance_runtime(rt_rq);
				967	runtime = sched_rt_runtime(rt_rq);
				968	if (runtime == RUNTIME_INF)
				969	return 0;
				970
				971	if (rt_rq->rt_time > runtime) {
				972	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
				973
				974	/*
				975	* Don't actually throttle groups that have no runtime assigned
				976	* but accrue some time due to boosting.
				977	*/
				978	if (likely(rt_b->rt_runtime)) {
				979	rt_rq->rt_throttled = 1;
				980	printk_deferred_once("sched: RT throttling activated\n");
				981	} else {
				982	/*
				983	* In case we did anyway, make it go away,
				984	* replenishment is a joke, since it will replenish us
				985	* with exactly 0 ns.
				986	*/
				987	rt_rq->rt_time = 0;
				988	}
				989
				990	if (rt_rq_throttled(rt_rq)) {
				991	sched_rt_rq_dequeue(rt_rq);
				992	return 1;
				993	}
				994	}
				995
				996	return 0;
				997	}
				998
				999	/*
				1000	* Update the current task's runtime statistics. Skip current tasks that
				1001	* are not in our scheduling class.
				1002	*/
				1003	static void update_curr_rt(struct rq *rq)
				1004	{
				1005	struct task_struct *curr = rq->curr;
				1006	struct sched_rt_entity *rt_se = &curr->rt;
				1007	u64 delta_exec;
				1008	u64 now;
				1009
				1010	if (curr->sched_class != &rt_sched_class)
				1011	return;
				1012
				1013	now = rq_clock_task(rq);
				1014	delta_exec = now - curr->se.exec_start;
				1015	if (unlikely((s64)delta_exec <= 0))
				1016	return;
				1017
				1018	schedstat_set(curr->se.statistics.exec_max,
				1019	max(curr->se.statistics.exec_max, delta_exec));
				1020
				1021	curr->se.sum_exec_runtime += delta_exec;
				1022	account_group_exec_runtime(curr, delta_exec);
				1023
				1024	curr->se.exec_start = now;
				1025	cgroup_account_cputime(curr, delta_exec);
				1026
				1027	if (!rt_bandwidth_enabled())
				1028	return;
				1029
				1030	for_each_sched_rt_entity(rt_se) {
				1031	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
				1032	int exceeded;
				1033
				1034	if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
				1035	raw_spin_lock(&rt_rq->rt_runtime_lock);
				1036	rt_rq->rt_time += delta_exec;
				1037	exceeded = sched_rt_runtime_exceeded(rt_rq);
				1038	if (exceeded)
				1039	resched_curr(rq);
				1040	raw_spin_unlock(&rt_rq->rt_runtime_lock);
				1041	if (exceeded)
				1042	do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
				1043	}
				1044	}
				1045	}
				1046
				1047	static void
				1048	dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count)
				1049	{
				1050	struct rq *rq = rq_of_rt_rq(rt_rq);
				1051
				1052	BUG_ON(&rq->rt != rt_rq);
				1053
				1054	if (!rt_rq->rt_queued)
				1055	return;
				1056
				1057	BUG_ON(!rq->nr_running);
				1058
				1059	sub_nr_running(rq, count);
				1060	rt_rq->rt_queued = 0;
				1061
				1062	}
				1063
				1064	static void
				1065	enqueue_top_rt_rq(struct rt_rq *rt_rq)
				1066	{
				1067	struct rq *rq = rq_of_rt_rq(rt_rq);
				1068
				1069	BUG_ON(&rq->rt != rt_rq);
				1070
				1071	if (rt_rq->rt_queued)
				1072	return;
				1073
				1074	if (rt_rq_throttled(rt_rq))
				1075	return;
				1076
				1077	if (rt_rq->rt_nr_running) {
				1078	add_nr_running(rq, rt_rq->rt_nr_running);
				1079	rt_rq->rt_queued = 1;
				1080	}
				1081
				1082	/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
				1083	cpufreq_update_util(rq, 0);
				1084	}
				1085
				1086	#if defined CONFIG_SMP
				1087
				1088	static void
				1089	inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
				1090	{
				1091	struct rq *rq = rq_of_rt_rq(rt_rq);
				1092
				1093	#ifdef CONFIG_RT_GROUP_SCHED
				1094	/*
				1095	* Change rq's cpupri only if rt_rq is the top queue.
				1096	*/
				1097	if (&rq->rt != rt_rq)
				1098	return;
				1099	#endif
				1100	if (rq->online && prio < prev_prio)
				1101	cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
				1102	}
				1103
				1104	static void
				1105	dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
				1106	{
				1107	struct rq *rq = rq_of_rt_rq(rt_rq);
				1108
				1109	#ifdef CONFIG_RT_GROUP_SCHED
				1110	/*
				1111	* Change rq's cpupri only if rt_rq is the top queue.
				1112	*/
				1113	if (&rq->rt != rt_rq)
				1114	return;
				1115	#endif
				1116	if (rq->online && rt_rq->highest_prio.curr != prev_prio)
				1117	cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
				1118	}
				1119
				1120	#else /* CONFIG_SMP */
				1121
				/* !CONFIG_SMP: the cpupri hooks compile to nothing. */
				static inline void
				inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
				{
				}

				static inline void
				dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
				{
				}
				1126
				1127	#endif /* CONFIG_SMP */
				1128
				1129	#if defined CONFIG_SMP \|\| defined CONFIG_RT_GROUP_SCHED
				1130	static void
				1131	inc_rt_prio(struct rt_rq *rt_rq, int prio)
				1132	{
				1133	int prev_prio = rt_rq->highest_prio.curr;
				1134
				1135	if (prio < prev_prio)
				1136	rt_rq->highest_prio.curr = prio;
				1137
				1138	inc_rt_prio_smp(rt_rq, prio, prev_prio);
				1139	}
				1140
				1141	static void
				1142	dec_rt_prio(struct rt_rq *rt_rq, int prio)
				1143	{
				1144	int prev_prio = rt_rq->highest_prio.curr;
				1145
				1146	if (rt_rq->rt_nr_running) {
				1147
				1148	WARN_ON(prio < prev_prio);
				1149
				1150	/*
				1151	* This may have been our highest task, and therefore
				1152	* we may have some recomputation to do
				1153	*/
				1154	if (prio == prev_prio) {
				1155	struct rt_prio_array *array = &rt_rq->active;
				1156
				1157	rt_rq->highest_prio.curr =
				1158	sched_find_first_bit(array->bitmap);
				1159	}
				1160
				1161	} else
				1162	rt_rq->highest_prio.curr = MAX_RT_PRIO;
				1163
				1164	dec_rt_prio_smp(rt_rq, prio, prev_prio);
				1165	}
				1166
				1167	#else
				1168
				1169	static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
				1170	static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
				1171
				1172	#endif /* CONFIG_SMP \|\| CONFIG_RT_GROUP_SCHED */
				1173
				1174	#ifdef CONFIG_RT_GROUP_SCHED
				1175
				1176	static void
				1177	inc_rt_group(struct sched_rt_entity rt_se, struct rt_rq rt_rq)
				1178	{
				1179	if (rt_se_boosted(rt_se))
				1180	rt_rq->rt_nr_boosted++;
				1181
				1182	if (rt_rq->tg)
				1183	start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
				1184	}
				1185
				1186	static void
				1187	dec_rt_group(struct sched_rt_entity rt_se, struct rt_rq rt_rq)
				1188	{
				1189	if (rt_se_boosted(rt_se))
				1190	rt_rq->rt_nr_boosted--;
				1191
				1192	WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
				1193	}
				1194
				1195	#else /* CONFIG_RT_GROUP_SCHED */
				1196
				1197	static void
				1198	inc_rt_group(struct sched_rt_entity rt_se, struct rt_rq rt_rq)
				1199	{
				1200	start_rt_bandwidth(&def_rt_bandwidth);
				1201	}
				1202
				/* Without RT group scheduling there is nothing to undo on dequeue. */
				static inline
				void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
				1205
				1206	#endif /* CONFIG_RT_GROUP_SCHED */
				1207
				1208	static inline
				1209	unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
				1210	{
				1211	struct rt_rq *group_rq = group_rt_rq(rt_se);
				1212
				1213	if (group_rq)
				1214	return group_rq->rt_nr_running;
				1215	else
				1216	return 1;
				1217	}
				1218
				1219	static inline
				1220	unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
				1221	{
				1222	struct rt_rq *group_rq = group_rt_rq(rt_se);
				1223	struct task_struct *tsk;
				1224
				1225	if (group_rq)
				1226	return group_rq->rr_nr_running;
				1227
				1228	tsk = rt_task_of(rt_se);
				1229
				1230	return (tsk->policy == SCHED_RR) ? 1 : 0;
				1231	}
				1232
				1233	static inline
				1234	void inc_rt_tasks(struct sched_rt_entity rt_se, struct rt_rq rt_rq)
				1235	{
				1236	int prio = rt_se_prio(rt_se);
				1237
				1238	WARN_ON(!rt_prio(prio));
				1239	rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
				1240	rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
				1241
				1242	inc_rt_prio(rt_rq, prio);
				1243	inc_rt_migration(rt_se, rt_rq);
				1244	inc_rt_group(rt_se, rt_rq);
				1245	}
				1246
				1247	static inline
				1248	void dec_rt_tasks(struct sched_rt_entity rt_se, struct rt_rq rt_rq)
				1249	{
				1250	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
				1251	WARN_ON(!rt_rq->rt_nr_running);
				1252	rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
				1253	rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
				1254
				1255	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
				1256	dec_rt_migration(rt_se, rt_rq);
				1257	dec_rt_group(rt_se, rt_rq);
				1258	}
				1259
				1260	/*
				1261	* Change rt_se->run_list location unless SAVE && !MOVE
				1262	*
				1263	* assumes ENQUEUE/DEQUEUE flags match
				1264	*/
				1265	static inline bool move_entity(unsigned int flags)
				1266	{
				1267	if ((flags & (DEQUEUE_SAVE \| DEQUEUE_MOVE)) == DEQUEUE_SAVE)
				1268	return false;
				1269
				1270	return true;
				1271	}
				1272
				1273	static void __delist_rt_entity(struct sched_rt_entity rt_se, struct rt_prio_array array)
				1274	{
				1275	list_del_init(&rt_se->run_list);
				1276
				1277	if (list_empty(array->queue + rt_se_prio(rt_se)))
				1278	__clear_bit(rt_se_prio(rt_se), array->bitmap);
				1279
				1280	rt_se->on_list = 0;
				1281	}
				1282
				1283	static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
				1284	{
				1285	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
				1286	struct rt_prio_array *array = &rt_rq->active;
				1287	struct rt_rq *group_rq = group_rt_rq(rt_se);
				1288	struct list_head *queue = array->queue + rt_se_prio(rt_se);
				1289
				1290	/*
				1291	* Don't enqueue the group if its throttled, or when empty.
				1292	* The latter is a consequence of the former when a child group
				1293	* get throttled and the current group doesn't have any other
				1294	* active members.
				1295	*/
				1296	if (group_rq && (rt_rq_throttled(group_rq) \|\| !group_rq->rt_nr_running)) {
				1297	if (rt_se->on_list)
				1298	__delist_rt_entity(rt_se, array);
				1299	return;
				1300	}
				1301
				1302	if (move_entity(flags)) {
				1303	WARN_ON_ONCE(rt_se->on_list);
				1304	if (flags & ENQUEUE_HEAD)
				1305	list_add(&rt_se->run_list, queue);
				1306	else
				1307	list_add_tail(&rt_se->run_list, queue);
				1308
				1309	__set_bit(rt_se_prio(rt_se), array->bitmap);
				1310	rt_se->on_list = 1;
				1311	}
				1312	rt_se->on_rq = 1;
				1313
				1314	inc_rt_tasks(rt_se, rt_rq);
				1315	}
				1316
				1317	static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
				1318	{
				1319	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
				1320	struct rt_prio_array *array = &rt_rq->active;
				1321
				1322	if (move_entity(flags)) {
				1323	WARN_ON_ONCE(!rt_se->on_list);
				1324	__delist_rt_entity(rt_se, array);
				1325	}
				1326	rt_se->on_rq = 0;
				1327
				1328	dec_rt_tasks(rt_se, rt_rq);
				1329	}
				1330
				1331	/*
				1332	* Because the prio of an upper entry depends on the lower
				1333	* entries, we must remove entries top - down.
				1334	*/
				1335	static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
				1336	{
				1337	struct sched_rt_entity *back = NULL;
				1338	unsigned int rt_nr_running;
				1339
				1340	for_each_sched_rt_entity(rt_se) {
				1341	rt_se->back = back;
				1342	back = rt_se;
				1343	}
				1344
				1345	rt_nr_running = rt_rq_of_se(back)->rt_nr_running;
				1346
				1347	for (rt_se = back; rt_se; rt_se = rt_se->back) {
				1348	if (on_rt_rq(rt_se))
				1349	__dequeue_rt_entity(rt_se, flags);
				1350	}
				1351
				1352	dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
				1353	}
				1354
				1355	static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
				1356	{
				1357	struct rq *rq = rq_of_rt_se(rt_se);
				1358
				1359	dequeue_rt_stack(rt_se, flags);
				1360	for_each_sched_rt_entity(rt_se)
				1361	__enqueue_rt_entity(rt_se, flags);
				1362	enqueue_top_rt_rq(&rq->rt);
				1363	}
				1364
				1365	static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
				1366	{
				1367	struct rq *rq = rq_of_rt_se(rt_se);
				1368
				1369	dequeue_rt_stack(rt_se, flags);
				1370
				1371	for_each_sched_rt_entity(rt_se) {
				1372	struct rt_rq *rt_rq = group_rt_rq(rt_se);
				1373
				1374	if (rt_rq && rt_rq->rt_nr_running)
				1375	__enqueue_rt_entity(rt_se, flags);
				1376	}
				1377	enqueue_top_rt_rq(&rq->rt);
				1378	}
				1379
				1380	/*
				1381	* Adding/removing a task to/from a priority array:
				1382	*/
				1383	static void
				1384	enqueue_task_rt(struct rq rq, struct task_struct p, int flags)
				1385	{
				1386	struct sched_rt_entity *rt_se = &p->rt;
				1387
				1388	if (flags & ENQUEUE_WAKEUP)
				1389	rt_se->timeout = 0;
				1390
				1391	enqueue_rt_entity(rt_se, flags);
				1392
				1393	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
				1394	enqueue_pushable_task(rq, p);
				1395	}
				1396
				1397	static void dequeue_task_rt(struct rq rq, struct task_struct p, int flags)
				1398	{
				1399	struct sched_rt_entity *rt_se = &p->rt;
				1400
				1401	update_curr_rt(rq);
				1402	dequeue_rt_entity(rt_se, flags);
				1403
				1404	dequeue_pushable_task(rq, p);
				1405	}
				1406
				1407	/*
				1408	* Put task to the head or the end of the run list without the overhead of
				1409	* dequeue followed by enqueue.
				1410	*/
				1411	static void
				1412	requeue_rt_entity(struct rt_rq rt_rq, struct sched_rt_entity rt_se, int head)
				1413	{
				1414	if (on_rt_rq(rt_se)) {
				1415	struct rt_prio_array *array = &rt_rq->active;
				1416	struct list_head *queue = array->queue + rt_se_prio(rt_se);
				1417
				1418	if (head)
				1419	list_move(&rt_se->run_list, queue);
				1420	else
				1421	list_move_tail(&rt_se->run_list, queue);
				1422	}
				1423	}
				1424
				1425	static void requeue_task_rt(struct rq rq, struct task_struct p, int head)
				1426	{
				1427	struct sched_rt_entity *rt_se = &p->rt;
				1428	struct rt_rq *rt_rq;
				1429
				1430	for_each_sched_rt_entity(rt_se) {
				1431	rt_rq = rt_rq_of_se(rt_se);
				1432	requeue_rt_entity(rt_rq, rt_se, head);
				1433	}
				1434	}
				1435
				1436	static void yield_task_rt(struct rq *rq)
				1437	{
				1438	requeue_task_rt(rq, rq->curr, 0);
				1439	}
				1440
				1441	#ifdef CONFIG_SMP
				1442	static int find_lowest_rq(struct task_struct *task);
				1443
				1444	static int
				1445	select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
				1446	{
				1447	struct task_struct *curr;
				1448	struct rq *rq;
				1449	bool test;
				1450	int target_cpu = -1;
				1451
				1452	trace_android_rvh_select_task_rq_rt(p, cpu, sd_flag,
				1453	flags, &target_cpu);
				1454	if (target_cpu >= 0)
				1455	return target_cpu;
				1456
				1457	/* For anything but wake ups, just return the task_cpu */
				1458	if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
				1459	goto out;
				1460
				1461	rq = cpu_rq(cpu);
				1462
				1463	rcu_read_lock();
				1464	curr = READ_ONCE(rq->curr); /* unlocked access */
				1465
				1466	/*
				1467	* If the current task on @p's runqueue is an RT task, then
				1468	* try to see if we can wake this RT task up on another
				1469	* runqueue. Otherwise simply start this RT task
				1470	* on its current runqueue.
				1471	*
				1472	* We want to avoid overloading runqueues. If the woken
				1473	* task is a higher priority, then it will stay on this CPU
				1474	* and the lower prio task should be moved to another CPU.
				1475	* Even though this will probably make the lower prio task
				1476	* lose its cache, we do not want to bounce a higher task
				1477	* around just because it gave up its CPU, perhaps for a
				1478	* lock?
				1479	*
				1480	* For equal prio tasks, we just let the scheduler sort it out.
				1481	*
				1482	* Otherwise, just let it ride on the affined RQ and the
				1483	* post-schedule router will push the preempted task away
				1484	*
				1485	* This test is optimistic, if we get it wrong the load-balancer
				1486	* will have to sort it out.
				1487	*
				1488	* We take into account the capacity of the CPU to ensure it fits the
				1489	* requirement of the task - which is only important on heterogeneous
				1490	* systems like big.LITTLE.
				1491	*/
				1492	test = curr &&
				1493	unlikely(rt_task(curr)) &&
				1494	(curr->nr_cpus_allowed < 2 \|\| curr->prio <= p->prio);
				1495
				1496	if (test \|\| !rt_task_fits_capacity(p, cpu)) {
				1497	int target = find_lowest_rq(p);
				1498
				1499	/*
				1500	* Bail out if we were forcing a migration to find a better
				1501	* fitting CPU but our search failed.
				1502	*/
				1503	if (!test && target != -1 && !rt_task_fits_capacity(p, target))
				1504	goto out_unlock;
				1505
				1506	/*
				1507	* Don't bother moving it if the destination CPU is
				1508	* not running a lower priority task.
				1509	*/
				1510	if (target != -1 &&
				1511	p->prio < cpu_rq(target)->rt.highest_prio.curr)
				1512	cpu = target;
				1513	}
				1514
				1515	out_unlock:
				1516	rcu_read_unlock();
				1517
				1518	out:
				1519	return cpu;
				1520	}
				1521
				1522	static void check_preempt_equal_prio(struct rq rq, struct task_struct p)
				1523	{
				1524	/*
				1525	* Current can't be migrated, useless to reschedule,
				1526	* let's hope p can move out.
				1527	*/
				1528	if (rq->curr->nr_cpus_allowed == 1 \|\|
				1529	!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
				1530	return;
				1531
				1532	/*
				1533	* p is migratable, so let's not schedule it and
				1534	* see if it is pushed or pulled somewhere else.
				1535	*/
				1536	if (p->nr_cpus_allowed != 1 &&
				1537	cpupri_find(&rq->rd->cpupri, p, NULL))
				1538	return;
				1539
				1540	/*
				1541	* There appear to be other CPUs that can accept
				1542	* the current task but none can run 'p', so lets reschedule
				1543	* to try and push the current task away:
				1544	*/
				1545	requeue_task_rt(rq, p, 1);
				1546	resched_curr(rq);
				1547	}
				1548
				1549	static int balance_rt(struct rq rq, struct task_struct p, struct rq_flags *rf)
				1550	{
				1551	if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
				1552	/*
				1553	* This is OK, because current is on_cpu, which avoids it being
				1554	* picked for load-balance and preemption/IRQs are still
				1555	* disabled avoiding further scheduler activity on it and we've
				1556	* not yet started the picking loop.
				1557	*/
				1558	rq_unpin_lock(rq, rf);
				1559	pull_rt_task(rq);
				1560	rq_repin_lock(rq, rf);
				1561	}
				1562
				1563	return sched_stop_runnable(rq) \|\| sched_dl_runnable(rq) \|\| sched_rt_runnable(rq);
				1564	}
				1565	#endif /* CONFIG_SMP */
				1566
				1567	/*
				1568	* Preempt the current task with a newly woken task if needed:
				1569	*/
				1570	static void check_preempt_curr_rt(struct rq rq, struct task_struct p, int flags)
				1571	{
				1572	if (p->prio < rq->curr->prio) {
				1573	resched_curr(rq);
				1574	return;
				1575	}
				1576
				1577	#ifdef CONFIG_SMP
				1578	/*
				1579	* If:
				1580	*
				1581	* - the newly woken task is of equal priority to the current task
				1582	* - the newly woken task is non-migratable while current is migratable
				1583	* - current will be preempted on the next reschedule
				1584	*
				1585	* we should check to see if current can readily move to a different
				1586	* cpu. If so, we will reschedule to allow the push logic to try
				1587	* to move current somewhere else, making room for our non-migratable
				1588	* task.
				1589	*/
				1590	if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
				1591	check_preempt_equal_prio(rq, p);
				1592	#endif
				1593	}
				1594
				1595	static inline void set_next_task_rt(struct rq rq, struct task_struct p, bool first)
				1596	{
				1597	p->se.exec_start = rq_clock_task(rq);
				1598
				1599	/* The running task is never eligible for pushing */
				1600	dequeue_pushable_task(rq, p);
				1601
				1602	if (!first)
				1603	return;
				1604
				1605	/*
				1606	* If prev task was rt, put_prev_task() has already updated the
				1607	* utilization. We only care of the case where we start to schedule a
				1608	* rt task
				1609	*/
				1610	if (rq->curr->sched_class != &rt_sched_class)
				1611	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
				1612
				1613	rt_queue_push_tasks(rq);
				1614	}
				1615
				1616	static struct sched_rt_entity pick_next_rt_entity(struct rt_rq rt_rq)
				1617	{
				1618	struct rt_prio_array *array = &rt_rq->active;
				1619	struct sched_rt_entity *next = NULL;
				1620	struct list_head *queue;
				1621	int idx;
				1622
				1623	idx = sched_find_first_bit(array->bitmap);
				1624	BUG_ON(idx >= MAX_RT_PRIO);
				1625
				1626	queue = array->queue + idx;
				1627	if (SCHED_WARN_ON(list_empty(queue)))
				1628	return NULL;
				1629	next = list_entry(queue->next, struct sched_rt_entity, run_list);
				1630
				1631	return next;
				1632	}
				1633
				1634	static struct task_struct _pick_next_task_rt(struct rq rq)
				1635	{
				1636	struct sched_rt_entity *rt_se;
				1637	struct rt_rq *rt_rq = &rq->rt;
				1638
				1639	do {
				1640	rt_se = pick_next_rt_entity(rt_rq);
				1641	if (unlikely(!rt_se))
				1642	return NULL;
				1643	rt_rq = group_rt_rq(rt_se);
				1644	} while (rt_rq);
				1645
				1646	return rt_task_of(rt_se);
				1647	}
				1648
				1649	static struct task_struct *
				1650	pick_next_task_rt(struct rq rq, struct task_struct prev, struct rq_flags *rf)
				1651	{
				1652	struct task_struct *p;
				1653
				1654	WARN_ON_ONCE(prev \|\| rf);
				1655
				1656	if (!sched_rt_runnable(rq))
				1657	return NULL;
				1658
				1659	p = _pick_next_task_rt(rq);
				1660	set_next_task_rt(rq, p, true);
				1661	return p;
				1662	}
				1663
				1664	static void put_prev_task_rt(struct rq rq, struct task_struct p)
				1665	{
				1666	update_curr_rt(rq);
				1667
				1668	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
				1669
				1670	/*
				1671	* The previous task needs to be made eligible for pushing
				1672	* if it is still active
				1673	*/
				1674	if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
				1675	enqueue_pushable_task(rq, p);
				1676	}
				1677
				1678	#ifdef CONFIG_SMP
				1679
				1680	/* Only try algorithms three times */
				1681	#define RT_MAX_TRIES 3
				1682
				1683	static int pick_rt_task(struct rq rq, struct task_struct p, int cpu)
				1684	{
				1685	if (!task_running(rq, p) &&
				1686	cpumask_test_cpu(cpu, p->cpus_ptr))
				1687	return 1;
				1688
				1689	return 0;
				1690	}
				1691
				1692	/*
				1693	* Return the highest pushable rq's task, which is suitable to be executed
				1694	* on the CPU, NULL otherwise
				1695	*/
				1696	static struct task_struct pick_highest_pushable_task(struct rq rq, int cpu)
				1697	{
				1698	struct plist_head *head = &rq->rt.pushable_tasks;
				1699	struct task_struct *p;
				1700
				1701	if (!has_pushable_tasks(rq))
				1702	return NULL;
				1703
				1704	plist_for_each_entry(p, head, pushable_tasks) {
				1705	if (pick_rt_task(rq, p, cpu))
				1706	return p;
				1707	}
				1708
				1709	return NULL;
				1710	}
				1711
				1712	static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
				1713
				1714	static int find_lowest_rq(struct task_struct *task)
				1715	{
				1716	struct sched_domain *sd;
				1717	struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
				1718	int this_cpu = smp_processor_id();
				1719	int cpu = task_cpu(task);
				1720	int ret;
				1721	int lowest_cpu = -1;
				1722
				1723	trace_android_rvh_find_lowest_rq(task, lowest_mask, &lowest_cpu);
				1724	if (lowest_cpu >= 0)
				1725	return lowest_cpu;
				1726
				1727	/* Make sure the mask is initialized first */
				1728	if (unlikely(!lowest_mask))
				1729	return -1;
				1730
				1731	if (task->nr_cpus_allowed == 1)
				1732	return -1; /* No other targets possible */
				1733
				1734	/*
				1735	* If we're on asym system ensure we consider the different capacities
				1736	* of the CPUs when searching for the lowest_mask.
				1737	*/
				1738	if (static_branch_unlikely(&sched_asym_cpucapacity)) {
				1739
				1740	ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri,
				1741	task, lowest_mask,
				1742	rt_task_fits_capacity);
				1743	} else {
				1744
				1745	ret = cpupri_find(&task_rq(task)->rd->cpupri,
				1746	task, lowest_mask);
				1747	}
				1748
				1749	if (!ret)
				1750	return -1; /* No targets found */
				1751
				1752	/*
				1753	* At this point we have built a mask of CPUs representing the
				1754	* lowest priority tasks in the system. Now we want to elect
				1755	* the best one based on our affinity and topology.
				1756	*
				1757	* We prioritize the last CPU that the task executed on since
				1758	* it is most likely cache-hot in that location.
				1759	*/
				1760	if (cpumask_test_cpu(cpu, lowest_mask))
				1761	return cpu;
				1762
				1763	/*
				1764	* Otherwise, we consult the sched_domains span maps to figure
				1765	* out which CPU is logically closest to our hot cache data.
				1766	*/
				1767	if (!cpumask_test_cpu(this_cpu, lowest_mask))
				1768	this_cpu = -1; /* Skip this_cpu opt if not among lowest */
				1769
				1770	rcu_read_lock();
				1771	for_each_domain(cpu, sd) {
				1772	if (sd->flags & SD_WAKE_AFFINE) {
				1773	int best_cpu;
				1774
				1775	/*
				1776	* "this_cpu" is cheaper to preempt than a
				1777	* remote processor.
				1778	*/
				1779	if (this_cpu != -1 &&
				1780	cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
				1781	rcu_read_unlock();
				1782	return this_cpu;
				1783	}
				1784
				1785	best_cpu = cpumask_first_and(lowest_mask,
				1786	sched_domain_span(sd));
				1787	if (best_cpu < nr_cpu_ids) {
				1788	rcu_read_unlock();
				1789	return best_cpu;
				1790	}
				1791	}
				1792	}
				1793	rcu_read_unlock();
				1794
				1795	/*
				1796	* And finally, if there were no matches within the domains
				1797	* just give the caller something to work with from the compatible
				1798	* locations.
				1799	*/
				1800	if (this_cpu != -1)
				1801	return this_cpu;
				1802
				1803	cpu = cpumask_any(lowest_mask);
				1804	if (cpu < nr_cpu_ids)
				1805	return cpu;
				1806
				1807	return -1;
				1808	}
				1809
				1810	/* Will lock the rq it finds */
				1811	static struct rq find_lock_lowest_rq(struct task_struct task, struct rq *rq)
				1812	{
				1813	struct rq *lowest_rq = NULL;
				1814	int tries;
				1815	int cpu;
				1816
				1817	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
				1818	cpu = find_lowest_rq(task);
				1819
				1820	if ((cpu == -1) \|\| (cpu == rq->cpu))
				1821	break;
				1822
				1823	lowest_rq = cpu_rq(cpu);
				1824
				1825	if (lowest_rq->rt.highest_prio.curr <= task->prio) {
				1826	/*
				1827	* Target rq has tasks of equal or higher priority,
				1828	* retrying does not release any lock and is unlikely
				1829	* to yield a different result.
				1830	*/
				1831	lowest_rq = NULL;
				1832	break;
				1833	}
				1834
				1835	/* if the prio of this runqueue changed, try again */
				1836	if (double_lock_balance(rq, lowest_rq)) {
				1837	/*
				1838	* We had to unlock the run queue. In
				1839	* the mean time, task could have
				1840	* migrated already or had its affinity changed.
				1841	* Also make sure that it wasn't scheduled on its rq.
				1842	*/
				1843	if (unlikely(task_rq(task) != rq \|\|
				1844	!cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) \|\|
				1845	task_running(rq, task) \|\|
				1846	!rt_task(task) \|\|
				1847	!task_on_rq_queued(task))) {
				1848
				1849	double_unlock_balance(rq, lowest_rq);
				1850	lowest_rq = NULL;
				1851	break;
				1852	}
				1853	}
				1854
				1855	/* If this rq is still suitable use it. */
				1856	if (lowest_rq->rt.highest_prio.curr > task->prio)
				1857	break;
				1858
				1859	/* try again */
				1860	double_unlock_balance(rq, lowest_rq);
				1861	lowest_rq = NULL;
				1862	}
				1863
				1864	return lowest_rq;
				1865	}
				1866
				1867	static struct task_struct pick_next_pushable_task(struct rq rq)
				1868	{
				1869	struct task_struct *p;
				1870
				1871	if (!has_pushable_tasks(rq))
				1872	return NULL;
				1873
				1874	p = plist_first_entry(&rq->rt.pushable_tasks,
				1875	struct task_struct, pushable_tasks);
				1876
				1877	BUG_ON(rq->cpu != task_cpu(p));
				1878	BUG_ON(task_current(rq, p));
				1879	BUG_ON(p->nr_cpus_allowed <= 1);
				1880
				1881	BUG_ON(!task_on_rq_queued(p));
				1882	BUG_ON(!rt_task(p));
				1883
				1884	return p;
				1885	}
				1886
				1887	/*
				1888	* If the current CPU has more than one RT task, see if the non
				1889	* running task can migrate over to a CPU that is running a task
				1890	* of lesser priority.
				1891	*/
				1892	static int push_rt_task(struct rq *rq)
				1893	{
				1894	struct task_struct *next_task;
				1895	struct rq *lowest_rq;
				1896	int ret = 0;
				1897
				1898	if (!rq->rt.overloaded)
				1899	return 0;
				1900
				1901	next_task = pick_next_pushable_task(rq);
				1902	if (!next_task)
				1903	return 0;
				1904
				1905	retry:
				1906	if (WARN_ON(next_task == rq->curr))
				1907	return 0;
				1908
				1909	/*
				1910	* It's possible that the next_task slipped in of
				1911	* higher priority than current. If that's the case
				1912	* just reschedule current.
				1913	*/
				1914	if (unlikely(next_task->prio < rq->curr->prio)) {
				1915	resched_curr(rq);
				1916	return 0;
				1917	}
				1918
				1919	/* We might release rq lock */
				1920	get_task_struct(next_task);
				1921
				1922	/* find_lock_lowest_rq locks the rq if found */
				1923	lowest_rq = find_lock_lowest_rq(next_task, rq);
				1924	if (!lowest_rq) {
				1925	struct task_struct *task;
				1926	/*
				1927	* find_lock_lowest_rq releases rq->lock
				1928	* so it is possible that next_task has migrated.
				1929	*
				1930	* We need to make sure that the task is still on the same
				1931	* run-queue and is also still the next task eligible for
				1932	* pushing.
				1933	*/
				1934	task = pick_next_pushable_task(rq);
				1935	if (task == next_task) {
				1936	/*
				1937	* The task hasn't migrated, and is still the next
				1938	* eligible task, but we failed to find a run-queue
				1939	* to push it to. Do not retry in this case, since
				1940	* other CPUs will pull from us when ready.
				1941	*/
				1942	goto out;
				1943	}
				1944
				1945	if (!task)
				1946	/* No more tasks, just exit */
				1947	goto out;
				1948
				1949	/*
				1950	* Something has shifted, try again.
				1951	*/
				1952	put_task_struct(next_task);
				1953	next_task = task;
				1954	goto retry;
				1955	}
				1956
				1957	deactivate_task(rq, next_task, 0);
				1958	set_task_cpu(next_task, lowest_rq->cpu);
				1959	activate_task(lowest_rq, next_task, 0);
				1960	ret = 1;
				1961
				1962	resched_curr(lowest_rq);
				1963
				1964	double_unlock_balance(rq, lowest_rq);
				1965
				1966	out:
				1967	put_task_struct(next_task);
				1968
				1969	return ret;
				1970	}
				1971
				/* Keep pushing until push_rt_task() reports that nothing was moved. */
				static void push_rt_tasks(struct rq *rq)
				{
					for (;;) {
						/* push_rt_task will return true if it moved an RT */
						if (!push_rt_task(rq))
							break;
					}
				}
				1978
				1979	#ifdef HAVE_RT_PUSH_IPI
				1980
				1981	/*
				1982	* When a high priority task schedules out from a CPU and a lower priority
				1983	* task is scheduled in, a check is made to see if there's any RT tasks
				1984	* on other CPUs that are waiting to run because a higher priority RT task
				1985	* is currently running on its CPU. In this case, the CPU with multiple RT
				1986	* tasks queued on it (overloaded) needs to be notified that a CPU has opened
				1987	* up that may be able to run one of its non-running queued RT tasks.
				1988	*
				1989	* All CPUs with overloaded RT tasks need to be notified as there is currently
				1990	* no way to know which of these CPUs have the highest priority task waiting
				1991	* to run. Instead of trying to take a spinlock on each of these CPUs,
				1992	* which has shown to cause large latency when done on machines with many
				1993	* CPUs, sending an IPI to the CPUs to have them push off the overloaded
				1994	* RT tasks waiting to run.
				1995	*
				1996	* Just sending an IPI to each of the CPUs is also an issue, as on large
				1997	* count CPU machines, this can cause an IPI storm on a CPU, especially
				1998	* if its the only CPU with multiple RT tasks queued, and a large number
				1999	* of CPUs scheduling a lower priority task at the same time.
				2000	*
				2001	* Each root domain has its own irq work function that can iterate over
				2002	* all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
				2003	* tasks must be checked if there's one or many CPUs that are lowering
				2004	* their priority, there's a single irq work iterator that will try to
				2005	* push off RT tasks that are waiting to run.
				2006	*
				2007	* When a CPU schedules a lower priority task, it will kick off the
				2008	* irq work iterator that will jump to each CPU with overloaded RT tasks.
				2009	* As it only takes the first CPU that schedules a lower priority task
				2010	* to start the process, the rto_loop_start variable is incremented and if
				2011	* the atomic result is one, then that CPU will try to take the rto_lock.
				2012	* This prevents high contention on the lock as the process handles all
				2013	* CPUs scheduling lower priority tasks.
				2014	*
				2015	* All CPUs that are scheduling a lower priority task will increment the
				2016	* rto_loop_next variable. This will make sure that the irq work iterator
				2017	* checks all RT overloaded CPUs whenever a CPU schedules a new lower
				2018	* priority task, even if the iterator is in the middle of a scan. Incrementing
				2019	* the rto_loop_next will cause the iterator to perform another scan.
				2020	*
				2021	*/
				2022	static int rto_next_cpu(struct root_domain *rd)
				2023	{
				2024	int next;
				2025	int cpu;
				2026
				2027	/*
				2028	* When starting the IPI RT pushing, the rto_cpu is set to -1,
				2029	* rt_next_cpu() will simply return the first CPU found in
				2030	* the rto_mask.
				2031	*
				2032	* If rto_next_cpu() is called with rto_cpu is a valid CPU, it
				2033	* will return the next CPU found in the rto_mask.
				2034	*
				2035	* If there are no more CPUs left in the rto_mask, then a check is made
				2036	* against rto_loop and rto_loop_next. rto_loop is only updated with
				2037	* the rto_lock held, but any CPU may increment the rto_loop_next
				2038	* without any locking.
				2039	*/
				2040	for (;;) {
				2041
				2042	/* When rto_cpu is -1 this acts like cpumask_first() */
				2043	cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
				2044
				2045	rd->rto_cpu = cpu;
				2046
				2047	if (cpu < nr_cpu_ids)
				2048	return cpu;
				2049
				2050	rd->rto_cpu = -1;
				2051
				2052	/*
				2053	* ACQUIRE ensures we see the @rto_mask changes
				2054	* made prior to the @next value observed.
				2055	*
				2056	* Matches WMB in rt_set_overload().
				2057	*/
				2058	next = atomic_read_acquire(&rd->rto_loop_next);
				2059
				2060	if (rd->rto_loop == next)
				2061	break;
				2062
				2063	rd->rto_loop = next;
				2064	}
				2065
				2066	return -1;
				2067	}
				2068
				2069	static inline bool rto_start_trylock(atomic_t *v)
				2070	{
				2071	return !atomic_cmpxchg_acquire(v, 0, 1);
				2072	}
				2073
				2074	static inline void rto_start_unlock(atomic_t *v)
				2075	{
				2076	atomic_set_release(v, 0);
				2077	}
				2078
				2079	static void tell_cpu_to_push(struct rq *rq)
				2080	{
				2081	int cpu = -1;
				2082
				2083	/* Keep the loop going if the IPI is currently active */
				2084	atomic_inc(&rq->rd->rto_loop_next);
				2085
				2086	/* Only one CPU can initiate a loop at a time */
				2087	if (!rto_start_trylock(&rq->rd->rto_loop_start))
				2088	return;
				2089
				2090	raw_spin_lock(&rq->rd->rto_lock);
				2091
				2092	/*
				2093	* The rto_cpu is updated under the lock, if it has a valid CPU
				2094	* then the IPI is still running and will continue due to the
				2095	* update to loop_next, and nothing needs to be done here.
				2096	* Otherwise it is finishing up and an ipi needs to be sent.
				2097	*/
				2098	if (rq->rd->rto_cpu < 0)
				2099	cpu = rto_next_cpu(rq->rd);
				2100
				2101	raw_spin_unlock(&rq->rd->rto_lock);
				2102
				2103	rto_start_unlock(&rq->rd->rto_loop_start);
				2104
				2105	if (cpu >= 0) {
				2106	/* Make sure the rd does not get freed while pushing */
				2107	sched_get_rd(rq->rd);
				2108	irq_work_queue_on(&rq->rd->rto_push_work, cpu);
				2109	}
				2110	}
				2111
				2112	/* Called from hardirq context */
				2113	void rto_push_irq_work_func(struct irq_work *work)
				2114	{
				2115	struct root_domain *rd =
				2116	container_of(work, struct root_domain, rto_push_work);
				2117	struct rq *rq;
				2118	int cpu;
				2119
				2120	rq = this_rq();
				2121
				2122	/*
				2123	* We do not need to grab the lock to check for has_pushable_tasks.
				2124	* When it gets updated, a check is made if a push is possible.
				2125	*/
				2126	if (has_pushable_tasks(rq)) {
				2127	raw_spin_lock(&rq->lock);
				2128	push_rt_tasks(rq);
				2129	raw_spin_unlock(&rq->lock);
				2130	}
				2131
				2132	raw_spin_lock(&rd->rto_lock);
				2133
				2134	/* Pass the IPI to the next rt overloaded queue */
				2135	cpu = rto_next_cpu(rd);
				2136
				2137	raw_spin_unlock(&rd->rto_lock);
				2138
				2139	if (cpu < 0) {
				2140	sched_put_rd(rd);
				2141	return;
				2142	}
				2143
				2144	/* Try the next RT overloaded CPU */
				2145	irq_work_queue_on(&rd->rto_push_work, cpu);
				2146	}
				2147	#endif /* HAVE_RT_PUSH_IPI */
				2148
				2149	static void pull_rt_task(struct rq *this_rq)
				2150	{
				2151	int this_cpu = this_rq->cpu, cpu;
				2152	bool resched = false;
				2153	struct task_struct *p;
				2154	struct rq *src_rq;
				2155	int rt_overload_count = rt_overloaded(this_rq);
				2156
				2157	if (likely(!rt_overload_count))
				2158	return;
				2159
				2160	/*
				2161	* Match the barrier from rt_set_overloaded; this guarantees that if we
				2162	* see overloaded we must also see the rto_mask bit.
				2163	*/
				2164	smp_rmb();
				2165
				2166	/* If we are the only overloaded CPU do nothing */
				2167	if (rt_overload_count == 1 &&
				2168	cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
				2169	return;
				2170
				2171	#ifdef HAVE_RT_PUSH_IPI
				2172	if (sched_feat(RT_PUSH_IPI)) {
				2173	tell_cpu_to_push(this_rq);
				2174	return;
				2175	}
				2176	#endif
				2177
				2178	for_each_cpu(cpu, this_rq->rd->rto_mask) {
				2179	if (this_cpu == cpu)
				2180	continue;
				2181
				2182	src_rq = cpu_rq(cpu);
				2183
				2184	/*
				2185	* Don't bother taking the src_rq->lock if the next highest
				2186	* task is known to be lower-priority than our current task.
				2187	* This may look racy, but if this value is about to go
				2188	* logically higher, the src_rq will push this task away.
				2189	* And if its going logically lower, we do not care
				2190	*/
				2191	if (src_rq->rt.highest_prio.next >=
				2192	this_rq->rt.highest_prio.curr)
				2193	continue;
				2194
				2195	/*
				2196	* We can potentially drop this_rq's lock in
				2197	* double_lock_balance, and another CPU could
				2198	* alter this_rq
				2199	*/
				2200	double_lock_balance(this_rq, src_rq);
				2201
				2202	/*
				2203	* We can pull only a task, which is pushable
				2204	* on its rq, and no others.
				2205	*/
				2206	p = pick_highest_pushable_task(src_rq, this_cpu);
				2207
				2208	/*
				2209	* Do we have an RT task that preempts
				2210	* the to-be-scheduled task?
				2211	*/
				2212	if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
				2213	WARN_ON(p == src_rq->curr);
				2214	WARN_ON(!task_on_rq_queued(p));
				2215
				2216	/*
				2217	* There's a chance that p is higher in priority
				2218	* than what's currently running on its CPU.
				2219	* This is just that p is wakeing up and hasn't
				2220	* had a chance to schedule. We only pull
				2221	* p if it is lower in priority than the
				2222	* current task on the run queue
				2223	*/
				2224	if (p->prio < src_rq->curr->prio)
				2225	goto skip;
				2226
				2227	resched = true;
				2228
				2229	deactivate_task(src_rq, p, 0);
				2230	set_task_cpu(p, this_cpu);
				2231	activate_task(this_rq, p, 0);
				2232	/*
				2233	* We continue with the search, just in
				2234	* case there's an even higher prio task
				2235	* in another runqueue. (low likelihood
				2236	* but possible)
				2237	*/
				2238	}
				2239	skip:
				2240	double_unlock_balance(this_rq, src_rq);
				2241	}
				2242
				2243	if (resched)
				2244	resched_curr(this_rq);
				2245	}
				2246
				2247	/*
				2248	* If we are not running and we are not going to reschedule soon, we should
				2249	* try to push tasks away now
				2250	*/
				2251	static void task_woken_rt(struct rq rq, struct task_struct p)
				2252	{
				2253	bool need_to_push = !task_running(rq, p) &&
				2254	!test_tsk_need_resched(rq->curr) &&
				2255	p->nr_cpus_allowed > 1 &&
				2256	(dl_task(rq->curr) \|\| rt_task(rq->curr)) &&
				2257	(rq->curr->nr_cpus_allowed < 2 \|\|
				2258	rq->curr->prio <= p->prio);
				2259
				2260	if (need_to_push)
				2261	push_rt_tasks(rq);
				2262	}
				2263
				2264	/* Assumes rq->lock is held */
				2265	static void rq_online_rt(struct rq *rq)
				2266	{
				2267	if (rq->rt.overloaded)
				2268	rt_set_overload(rq);
				2269
				2270	__enable_runtime(rq);
				2271
				2272	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
				2273	}
				2274
				2275	/* Assumes rq->lock is held */
				2276	static void rq_offline_rt(struct rq *rq)
				2277	{
				2278	if (rq->rt.overloaded)
				2279	rt_clear_overload(rq);
				2280
				2281	__disable_runtime(rq);
				2282
				2283	cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
				2284	}
				2285
				2286	/*
				2287	* When switch from the rt queue, we bring ourselves to a position
				2288	* that we might want to pull RT tasks from other runqueues.
				2289	*/
				2290	static void switched_from_rt(struct rq rq, struct task_struct p)
				2291	{
				2292	/*
				2293	* If there are other RT tasks then we will reschedule
				2294	* and the scheduling of the other RT tasks will handle
				2295	* the balancing. But if we are the last RT task
				2296	* we may need to handle the pulling of RT tasks
				2297	* now.
				2298	*/
				2299	if (!task_on_rq_queued(p) \|\| rq->rt.rt_nr_running)
				2300	return;
				2301
				2302	rt_queue_pull_task(rq);
				2303	}
				2304
				2305	void __init init_sched_rt_class(void)
				2306	{
				2307	unsigned int i;
				2308
				2309	for_each_possible_cpu(i) {
				2310	zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
				2311	GFP_KERNEL, cpu_to_node(i));
				2312	}
				2313	}
				2314	#endif /* CONFIG_SMP */
				2315
				2316	/*
				2317	* When switching a task to RT, we may overload the runqueue
				2318	* with RT tasks. In this case we try to push them off to
				2319	* other runqueues.
				2320	*/
				2321	static void switched_to_rt(struct rq rq, struct task_struct p)
				2322	{
				2323	/*
				2324	* If we are running, update the avg_rt tracking, as the running time
				2325	* will now on be accounted into the latter.
				2326	*/
				2327	if (task_current(rq, p)) {
				2328	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
				2329	return;
				2330	}
				2331
				2332	/*
				2333	* If we are not running we may need to preempt the current
				2334	* running task. If that current running task is also an RT task
				2335	* then see if we can move to another run queue.
				2336	*/
				2337	if (task_on_rq_queued(p)) {
				2338	#ifdef CONFIG_SMP
				2339	if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
				2340	rt_queue_push_tasks(rq);
				2341	#endif /* CONFIG_SMP */
				2342	if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
				2343	resched_curr(rq);
				2344	}
				2345	}
				2346
				2347	/*
				2348	* Priority of the task has changed. This may cause
				2349	* us to initiate a push or pull.
				2350	*/
				2351	static void
				2352	prio_changed_rt(struct rq rq, struct task_struct p, int oldprio)
				2353	{
				2354	if (!task_on_rq_queued(p))
				2355	return;
				2356
				2357	if (rq->curr == p) {
				2358	#ifdef CONFIG_SMP
				2359	/*
				2360	* If our priority decreases while running, we
				2361	* may need to pull tasks to this runqueue.
				2362	*/
				2363	if (oldprio < p->prio)
				2364	rt_queue_pull_task(rq);
				2365
				2366	/*
				2367	* If there's a higher priority task waiting to run
				2368	* then reschedule.
				2369	*/
				2370	if (p->prio > rq->rt.highest_prio.curr)
				2371	resched_curr(rq);
				2372	#else
				2373	/* For UP simply resched on drop of prio */
				2374	if (oldprio < p->prio)
				2375	resched_curr(rq);
				2376	#endif /* CONFIG_SMP */
				2377	} else {
				2378	/*
				2379	* This task is not running, but if it is
				2380	* greater than the current running task
				2381	* then reschedule.
				2382	*/
				2383	if (p->prio < rq->curr->prio)
				2384	resched_curr(rq);
				2385	}
				2386	}
				2387
				#ifdef CONFIG_POSIX_TIMERS
				/*
				 * RLIMIT_RTTIME watchdog, called for @p from the RT tick.
				 *
				 * When the soft limit is not RLIM_INFINITY, p->rt.timeout is advanced at
				 * most once per jiffy. Once it exceeds min(soft, hard) converted to ticks
				 * (rounded up), the POSIX CPU timer code is notified with the task's
				 * accumulated execution time.
				 *
				 * @rq is not used.
				 */
				static void watchdog(struct rq *rq, struct task_struct *p)
				{
					unsigned long soft, hard;

					/* max may change after cur was read, this will be fixed next tick */
					soft = task_rlimit(p, RLIMIT_RTTIME);
					hard = task_rlimit_max(p, RLIMIT_RTTIME);

					if (soft != RLIM_INFINITY) {
						unsigned long next;

						/* Count each jiffy only once */
						if (p->rt.watchdog_stamp != jiffies) {
							p->rt.timeout++;
							p->rt.watchdog_stamp = jiffies;
						}

						/* The limits are in microseconds; USEC_PER_SEC/HZ is one tick */
						next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
						if (p->rt.timeout > next) {
							posix_cputimers_rt_watchdog(&p->posix_cputimers,
										    p->se.sum_exec_runtime);
						}
					}
				}
				#else
				static inline void watchdog(struct rq *rq, struct task_struct *p) { }
				#endif
				2415
				2416	/*
				2417	* scheduler tick hitting a task of our scheduling class.
				2418	*
				2419	* NOTE: This function can be called remotely by the tick offload that
				2420	* goes along full dynticks. Therefore no local assumption can be made
				2421	* and everything must be accessed through the @rq and @curr passed in
				2422	* parameters.
				2423	*/
				2424	static void task_tick_rt(struct rq rq, struct task_struct p, int queued)
				2425	{
				2426	struct sched_rt_entity *rt_se = &p->rt;
				2427
				2428	update_curr_rt(rq);
				2429	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
				2430
				2431	watchdog(rq, p);
				2432
				2433	/*
				2434	* RR tasks need a special form of timeslice management.
				2435	* FIFO tasks have no timeslices.
				2436	*/
				2437	if (p->policy != SCHED_RR)
				2438	return;
				2439
				2440	if (--p->rt.time_slice)
				2441	return;
				2442
				2443	p->rt.time_slice = sched_rr_timeslice;
				2444
				2445	/*
				2446	* Requeue to the end of queue if we (and all of our ancestors) are not
				2447	* the only element on the queue
				2448	*/
				2449	for_each_sched_rt_entity(rt_se) {
				2450	if (rt_se->run_list.prev != rt_se->run_list.next) {
				2451	requeue_task_rt(rq, p, 0);
				2452	resched_curr(rq);
				2453	return;
				2454	}
				2455	}
				2456	}
				2457
				2458	static unsigned int get_rr_interval_rt(struct rq rq, struct task_struct task)
				2459	{
				2460	/*
				2461	* Time slice is 0 for SCHED_FIFO tasks
				2462	*/
				2463	if (task->policy == SCHED_RR)
				2464	return sched_rr_timeslice;
				2465	else
				2466	return 0;
				2467	}
				2468
				2469	const struct sched_class rt_sched_class = {
				2470	.next = &fair_sched_class,
				2471	.enqueue_task = enqueue_task_rt,
				2472	.dequeue_task = dequeue_task_rt,
				2473	.yield_task = yield_task_rt,
				2474
				2475	.check_preempt_curr = check_preempt_curr_rt,
				2476
				2477	.pick_next_task = pick_next_task_rt,
				2478	.put_prev_task = put_prev_task_rt,
				2479	.set_next_task = set_next_task_rt,
				2480
				2481	#ifdef CONFIG_SMP
				2482	.balance = balance_rt,
				2483	.select_task_rq = select_task_rq_rt,
				2484	.set_cpus_allowed = set_cpus_allowed_common,
				2485	.rq_online = rq_online_rt,
				2486	.rq_offline = rq_offline_rt,
				2487	.task_woken = task_woken_rt,
				2488	.switched_from = switched_from_rt,
				2489	#endif
				2490
				2491	.task_tick = task_tick_rt,
				2492
				2493	.get_rr_interval = get_rr_interval_rt,
				2494
				2495	.prio_changed = prio_changed_rt,
				2496	.switched_to = switched_to_rt,
				2497
				2498	.update_curr = update_curr_rt,
				2499
				2500	#ifdef CONFIG_UCLAMP_TASK
				2501	.uclamp_enabled = 1,
				2502	#endif
				2503	};
				2504
				2505	#ifdef CONFIG_RT_GROUP_SCHED
				/*
				 * Ensure that the real time constraints are schedulable.
				 *
				 * Held around the schedulability check, and around the bandwidth update
				 * that follows it in tg_set_rt_bandwidth().
				 */
				static DEFINE_MUTEX(rt_constraints_mutex);
				2510
				2511	/* Must be called with tasklist_lock held */
				2512	static inline int tg_has_rt_tasks(struct task_group *tg)
				2513	{
				2514	struct task_struct g, p;
				2515
				2516	/*
				2517	* Autogroups do not have RT tasks; see autogroup_create().
				2518	*/
				2519	if (task_group_is_autogroup(tg))
				2520	return 0;
				2521
				2522	for_each_process_thread(g, p) {
				2523	if (rt_task(p) && task_group(p) == tg)
				2524	return 1;
				2525	}
				2526
				2527	return 0;
				2528	}
				2529
				2530	struct rt_schedulable_data {
				2531	struct task_group *tg;
				2532	u64 rt_period;
				2533	u64 rt_runtime;
				2534	};
				2535
				2536	static int tg_rt_schedulable(struct task_group tg, void data)
				2537	{
				2538	struct rt_schedulable_data *d = data;
				2539	struct task_group *child;
				2540	unsigned long total, sum = 0;
				2541	u64 period, runtime;
				2542
				2543	period = ktime_to_ns(tg->rt_bandwidth.rt_period);
				2544	runtime = tg->rt_bandwidth.rt_runtime;
				2545
				2546	if (tg == d->tg) {
				2547	period = d->rt_period;
				2548	runtime = d->rt_runtime;
				2549	}
				2550
				2551	/*
				2552	* Cannot have more runtime than the period.
				2553	*/
				2554	if (runtime > period && runtime != RUNTIME_INF)
				2555	return -EINVAL;
				2556
				2557	/*
				2558	* Ensure we don't starve existing RT tasks.
				2559	*/
				2560	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
				2561	return -EBUSY;
				2562
				2563	total = to_ratio(period, runtime);
				2564
				2565	/*
				2566	* Nobody can have more than the global setting allows.
				2567	*/
				2568	if (total > to_ratio(global_rt_period(), global_rt_runtime()))
				2569	return -EINVAL;
				2570
				2571	/*
				2572	* The sum of our children's runtime should not exceed our own.
				2573	*/
				2574	list_for_each_entry_rcu(child, &tg->children, siblings) {
				2575	period = ktime_to_ns(child->rt_bandwidth.rt_period);
				2576	runtime = child->rt_bandwidth.rt_runtime;
				2577
				2578	if (child == d->tg) {
				2579	period = d->rt_period;
				2580	runtime = d->rt_runtime;
				2581	}
				2582
				2583	sum += to_ratio(period, runtime);
				2584	}
				2585
				2586	if (sum > total)
				2587	return -EINVAL;
				2588
				2589	return 0;
				2590	}
				2591
				2592	static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
				2593	{
				2594	int ret;
				2595
				2596	struct rt_schedulable_data data = {
				2597	.tg = tg,
				2598	.rt_period = period,
				2599	.rt_runtime = runtime,
				2600	};
				2601
				2602	rcu_read_lock();
				2603	ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
				2604	rcu_read_unlock();
				2605
				2606	return ret;
				2607	}
				2608
				2609	static int tg_set_rt_bandwidth(struct task_group *tg,
				2610	u64 rt_period, u64 rt_runtime)
				2611	{
				2612	int i, err = 0;
				2613
				2614	/*
				2615	* Disallowing the root group RT runtime is BAD, it would disallow the
				2616	* kernel creating (and or operating) RT threads.
				2617	*/
				2618	if (tg == &root_task_group && rt_runtime == 0)
				2619	return -EINVAL;
				2620
				2621	/* No period doesn't make any sense. */
				2622	if (rt_period == 0)
				2623	return -EINVAL;
				2624
				2625	/*
				2626	* Bound quota to defend quota against overflow during bandwidth shift.
				2627	*/
				2628	if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
				2629	return -EINVAL;
				2630
				2631	mutex_lock(&rt_constraints_mutex);
				2632	read_lock(&tasklist_lock);
				2633	err = __rt_schedulable(tg, rt_period, rt_runtime);
				2634	if (err)
				2635	goto unlock;
				2636
				2637	raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
				2638	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
				2639	tg->rt_bandwidth.rt_runtime = rt_runtime;
				2640
				2641	for_each_possible_cpu(i) {
				2642	struct rt_rq *rt_rq = tg->rt_rq[i];
				2643
				2644	raw_spin_lock(&rt_rq->rt_runtime_lock);
				2645	rt_rq->rt_runtime = rt_runtime;
				2646	raw_spin_unlock(&rt_rq->rt_runtime_lock);
				2647	}
				2648	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
				2649	unlock:
				2650	read_unlock(&tasklist_lock);
				2651	mutex_unlock(&rt_constraints_mutex);
				2652
				2653	return err;
				2654	}
				2655
				2656	int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
				2657	{
				2658	u64 rt_runtime, rt_period;
				2659
				2660	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
				2661	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
				2662	if (rt_runtime_us < 0)
				2663	rt_runtime = RUNTIME_INF;
				2664	else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
				2665	return -EINVAL;
				2666
				2667	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
				2668	}
				2669
				2670	long sched_group_rt_runtime(struct task_group *tg)
				2671	{
				2672	u64 rt_runtime_us;
				2673
				2674	if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
				2675	return -1;
				2676
				2677	rt_runtime_us = tg->rt_bandwidth.rt_runtime;
				2678	do_div(rt_runtime_us, NSEC_PER_USEC);
				2679	return rt_runtime_us;
				2680	}
				2681
				2682	int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
				2683	{
				2684	u64 rt_runtime, rt_period;
				2685
				2686	if (rt_period_us > U64_MAX / NSEC_PER_USEC)
				2687	return -EINVAL;
				2688
				2689	rt_period = rt_period_us * NSEC_PER_USEC;
				2690	rt_runtime = tg->rt_bandwidth.rt_runtime;
				2691
				2692	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
				2693	}
				2694
				2695	long sched_group_rt_period(struct task_group *tg)
				2696	{
				2697	u64 rt_period_us;
				2698
				2699	rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
				2700	do_div(rt_period_us, NSEC_PER_USEC);
				2701	return rt_period_us;
				2702	}
				2703
				2704	static int sched_rt_global_constraints(void)
				2705	{
				2706	int ret = 0;
				2707
				2708	mutex_lock(&rt_constraints_mutex);
				2709	read_lock(&tasklist_lock);
				2710	ret = __rt_schedulable(NULL, 0, 0);
				2711	read_unlock(&tasklist_lock);
				2712	mutex_unlock(&rt_constraints_mutex);
				2713
				2714	return ret;
				2715	}
				2716
				2717	int sched_rt_can_attach(struct task_group tg, struct task_struct tsk)
				2718	{
				2719	/* Don't accept realtime tasks when there is no way for them to run */
				2720	if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
				2721	return 0;
				2722
				2723	return 1;
				2724	}
				2725
				2726	#else /* !CONFIG_RT_GROUP_SCHED */
				2727	static int sched_rt_global_constraints(void)
				2728	{
				2729	unsigned long flags;
				2730	int i;
				2731
				2732	raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
				2733	for_each_possible_cpu(i) {
				2734	struct rt_rq *rt_rq = &cpu_rq(i)->rt;
				2735
				2736	raw_spin_lock(&rt_rq->rt_runtime_lock);
				2737	rt_rq->rt_runtime = global_rt_runtime();
				2738	raw_spin_unlock(&rt_rq->rt_runtime_lock);
				2739	}
				2740	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
				2741
				2742	return 0;
				2743	}
				2744	#endif /* CONFIG_RT_GROUP_SCHED */
				2745
				2746	static int sched_rt_global_validate(void)
				2747	{
				2748	if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
				2749	((sysctl_sched_rt_runtime > sysctl_sched_rt_period) \|\|
				2750	((u64)sysctl_sched_rt_runtime *
				2751	NSEC_PER_USEC > max_rt_runtime)))
				2752	return -EINVAL;
				2753
				2754	return 0;
				2755	}
				2756
				2757	static void sched_rt_do_global(void)
				2758	{
				2759	unsigned long flags;
				2760
				2761	raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
				2762	def_rt_bandwidth.rt_runtime = global_rt_runtime();
				2763	def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
				2764	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
				2765	}
				2766
				2767	int sched_rt_handler(struct ctl_table *table, int write,
				2768	void __user buffer, size_t lenp,
				2769	loff_t *ppos)
				2770	{
				2771	int old_period, old_runtime;
				2772	static DEFINE_MUTEX(mutex);
				2773	int ret;
				2774
				2775	mutex_lock(&mutex);
				2776	old_period = sysctl_sched_rt_period;
				2777	old_runtime = sysctl_sched_rt_runtime;
				2778
				2779	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
				2780
				2781	if (!ret && write) {
				2782	ret = sched_rt_global_validate();
				2783	if (ret)
				2784	goto undo;
				2785
				2786	ret = sched_dl_global_validate();
				2787	if (ret)
				2788	goto undo;
				2789
				2790	ret = sched_rt_global_constraints();
				2791	if (ret)
				2792	goto undo;
				2793
				2794	sched_rt_do_global();
				2795	sched_dl_do_global();
				2796	}
				2797	if (0) {
				2798	undo:
				2799	sysctl_sched_rt_period = old_period;
				2800	sysctl_sched_rt_runtime = old_runtime;
				2801	}
				2802	mutex_unlock(&mutex);
				2803
				2804	return ret;
				2805	}
				2806
				2807	int sched_rr_handler(struct ctl_table *table, int write,
				2808	void __user buffer, size_t lenp,
				2809	loff_t *ppos)
				2810	{
				2811	int ret;
				2812	static DEFINE_MUTEX(mutex);
				2813
				2814	mutex_lock(&mutex);
				2815	ret = proc_dointvec(table, write, buffer, lenp, ppos);
				2816	/*
				2817	* Make sure that internally we keep jiffies.
				2818	* Also, writing zero resets the timeslice to default:
				2819	*/
				2820	if (!ret && write) {
				2821	sched_rr_timeslice =
				2822	sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
				2823	msecs_to_jiffies(sysctl_sched_rr_timeslice);
				2824
				2825	if (sysctl_sched_rr_timeslice <= 0)
				2826	sysctl_sched_rr_timeslice = jiffies_to_msecs(RR_TIMESLICE);
				2827	}
				2828	mutex_unlock(&mutex);
				2829
				2830	return ret;
				2831	}
				2832
				2833	#ifdef CONFIG_SCHED_DEBUG
				2834	void print_rt_stats(struct seq_file *m, int cpu)
				2835	{
				2836	rt_rq_iter_t iter;
				2837	struct rt_rq *rt_rq;
				2838
				2839	rcu_read_lock();
				2840	for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
				2841	print_rt_rq(m, cpu, rt_rq);
				2842	rcu_read_unlock();
				2843	}
				2844	#endif /* CONFIG_SCHED_DEBUG */