Blame - marvell/linux/kernel/sched/cputime.c - T108

blob: d77f652aedbc7828fa5d1f1ffc7f6b1c12b92918 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0-only
				2	/*
				3	* Simple CPU accounting cgroup controller
				4	*/
				5	#include <linux/cpufreq_times.h>
				6	#include "sched.h"
				7
				8	#ifdef CONFIG_IRQ_TIME_ACCOUNTING
				9
				10	/*
				11	* There are no locks covering percpu hardirq/softirq time.
				12	* They are only modified in vtime_account, on corresponding CPU
				13	* with interrupts disabled. So, writes are safe.
				14	* They are read and saved off onto struct rq in update_rq_clock().
				15	* This may result in other CPU reading this CPU's irq time and can
				16	* race with irq/vtime_account on this CPU. We would either get old
				17	* or new value with a side effect of accounting a slice of irq time to wrong
				18	* task when irq is in progress while we read rq->clock. That is a worthy
				19	* compromise in place of having locks on each irq in account_system_time.
				20	*/
				21	DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
				22
				23	static int sched_clock_irqtime;
				24
				25	void enable_sched_clock_irqtime(void)
				26	{
				27	sched_clock_irqtime = 1;
				28	}
				29
				30	void disable_sched_clock_irqtime(void)
				31	{
				32	sched_clock_irqtime = 0;
				33	}
				34
				35	static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
				36	enum cpu_usage_stat idx)
				37	{
				38	u64 *cpustat = kcpustat_this_cpu->cpustat;
				39
				40	u64_stats_update_begin(&irqtime->sync);
				41	cpustat[idx] += delta;
				42	irqtime->total += delta;
				43	irqtime->tick_delta += delta;
				44	u64_stats_update_end(&irqtime->sync);
				45	}
				46
				47	/*
				48	* Called before incrementing preempt_count on {soft,}irq_enter
				49	* and before decrementing preempt_count on {soft,}irq_exit.
				50	*/
				51	void irqtime_account_irq(struct task_struct *curr)
				52	{
				53	struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
				54	s64 delta;
				55	int cpu;
				56
				57	if (!sched_clock_irqtime)
				58	return;
				59
				60	cpu = smp_processor_id();
				61	delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
				62	irqtime->irq_start_time += delta;
				63
				64	/*
				65	* We do not account for softirq time from ksoftirqd here.
				66	* We want to continue accounting softirq time to ksoftirqd thread
				67	* in that case, so as not to confuse scheduler with a special task
				68	* that does not consume any time, but still wants to run.
				69	*/
				70	if (hardirq_count())
				71	irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
				72	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
				73	irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
				74	}
				75	EXPORT_SYMBOL_GPL(irqtime_account_irq);
				76
				77	static u64 irqtime_tick_accounted(u64 maxtime)
				78	{
				79	struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
				80	u64 delta;
				81
				82	delta = min(irqtime->tick_delta, maxtime);
				83	irqtime->tick_delta -= delta;
				84
				85	return delta;
				86	}
				87
				88	#else /* CONFIG_IRQ_TIME_ACCOUNTING */
				89
				90	#define sched_clock_irqtime (0)
				91
				92	static u64 irqtime_tick_accounted(u64 dummy)
				93	{
				94	return 0;
				95	}
				96
				97	#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
				98
				99	static inline void task_group_account_field(struct task_struct *p, int index,
				100	u64 tmp)
				101	{
				102	/*
				103	* Since all updates are sure to touch the root cgroup, we
				104	* get ourselves ahead and touch it first. If the root cgroup
				105	* is the only cgroup, then nothing else should be necessary.
				106	*
				107	*/
				108	__this_cpu_add(kernel_cpustat.cpustat[index], tmp);
				109
				110	cgroup_account_cputime_field(p, index, tmp);
				111	}
				112
				113	/*
				114	* Account user CPU time to a process.
				115	* @p: the process that the CPU time gets accounted to
				116	* @cputime: the CPU time spent in user space since the last update
				117	*/
				118	void account_user_time(struct task_struct *p, u64 cputime)
				119	{
				120	int index;
				121
				122	/* Add user time to process. */
				123	p->utime += cputime;
				124	account_group_user_time(p, cputime);
				125
				126	index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
				127
				128	/* Add user time to cpustat. */
				129	task_group_account_field(p, index, cputime);
				130
				131	/* Account for user time used */
				132	acct_account_cputime(p);
				133
				134	/* Account power usage for user time */
				135	cpufreq_acct_update_power(p, cputime);
				136	}
				137
				138	/*
				139	* Account guest CPU time to a process.
				140	* @p: the process that the CPU time gets accounted to
				141	* @cputime: the CPU time spent in virtual machine since the last update
				142	*/
				143	void account_guest_time(struct task_struct *p, u64 cputime)
				144	{
				145	u64 *cpustat = kcpustat_this_cpu->cpustat;
				146
				147	/* Add guest time to process. */
				148	p->utime += cputime;
				149	account_group_user_time(p, cputime);
				150	p->gtime += cputime;
				151
				152	/* Add guest time to cpustat. */
				153	if (task_nice(p) > 0) {
				154	task_group_account_field(p, CPUTIME_NICE, cputime);
				155	cpustat[CPUTIME_GUEST_NICE] += cputime;
				156	} else {
				157	task_group_account_field(p, CPUTIME_USER, cputime);
				158	cpustat[CPUTIME_GUEST] += cputime;
				159	}
				160	}
				161
				162	/*
				163	* Account system CPU time to a process and desired cpustat field
				164	* @p: the process that the CPU time gets accounted to
				165	* @cputime: the CPU time spent in kernel space since the last update
				166	* @index: index of the cpustat field that has to be updated
				167	*/
				168	void account_system_index_time(struct task_struct *p,
				169	u64 cputime, enum cpu_usage_stat index)
				170	{
				171	/* Add system time to process. */
				172	p->stime += cputime;
				173	account_group_system_time(p, cputime);
				174
				175	/* Add system time to cpustat. */
				176	task_group_account_field(p, index, cputime);
				177
				178	/* Account for system time used */
				179	acct_account_cputime(p);
				180
				181	/* Account power usage for system time */
				182	cpufreq_acct_update_power(p, cputime);
				183	}
				184
				185	/*
				186	* Account system CPU time to a process.
				187	* @p: the process that the CPU time gets accounted to
				188	* @hardirq_offset: the offset to subtract from hardirq_count()
				189	* @cputime: the CPU time spent in kernel space since the last update
				190	*/
				191	void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
				192	{
				193	int index;
				194
				195	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
				196	account_guest_time(p, cputime);
				197	return;
				198	}
				199
				200	if (hardirq_count() - hardirq_offset)
				201	index = CPUTIME_IRQ;
				202	else if (in_serving_softirq())
				203	index = CPUTIME_SOFTIRQ;
				204	else
				205	index = CPUTIME_SYSTEM;
				206
				207	account_system_index_time(p, cputime, index);
				208	}
				209
				210	/*
				211	* Account for involuntary wait time.
				212	* @cputime: the CPU time spent in involuntary wait
				213	*/
				214	void account_steal_time(u64 cputime)
				215	{
				216	u64 *cpustat = kcpustat_this_cpu->cpustat;
				217
				218	cpustat[CPUTIME_STEAL] += cputime;
				219	}
				220
				221	/*
				222	* Account for idle time.
				223	* @cputime: the CPU time spent in idle wait
				224	*/
				225	void account_idle_time(u64 cputime)
				226	{
				227	u64 *cpustat = kcpustat_this_cpu->cpustat;
				228	struct rq *rq = this_rq();
				229
				230	if (atomic_read(&rq->nr_iowait) > 0)
				231	cpustat[CPUTIME_IOWAIT] += cputime;
				232	else
				233	cpustat[CPUTIME_IDLE] += cputime;
				234	}
				235
				236	/*
				237	* When a guest is interrupted for a longer amount of time, missed clock
				238	* ticks are not redelivered later. Due to that, this function may on
				239	* occasion account more time than the calling functions think elapsed.
				240	*/
				241	static __always_inline u64 steal_account_process_time(u64 maxtime)
				242	{
				243	#ifdef CONFIG_PARAVIRT
				244	if (static_key_false(&paravirt_steal_enabled)) {
				245	u64 steal;
				246
				247	steal = paravirt_steal_clock(smp_processor_id());
				248	steal -= this_rq()->prev_steal_time;
				249	steal = min(steal, maxtime);
				250	account_steal_time(steal);
				251	this_rq()->prev_steal_time += steal;
				252
				253	return steal;
				254	}
				255	#endif
				256	return 0;
				257	}
				258
				259	/*
				260	* Account how much elapsed time was spent in steal, irq, or softirq time.
				261	*/
				262	static inline u64 account_other_time(u64 max)
				263	{
				264	u64 accounted;
				265
				266	lockdep_assert_irqs_disabled();
				267
				268	accounted = steal_account_process_time(max);
				269
				270	if (accounted < max)
				271	accounted += irqtime_tick_accounted(max - accounted);
				272
				273	return accounted;
				274	}
				275
				276	#ifdef CONFIG_64BIT
				277	static inline u64 read_sum_exec_runtime(struct task_struct *t)
				278	{
				279	return t->se.sum_exec_runtime;
				280	}
				281	#else
				282	static u64 read_sum_exec_runtime(struct task_struct *t)
				283	{
				284	u64 ns;
				285	struct rq_flags rf;
				286	struct rq *rq;
				287
				288	rq = task_rq_lock(t, &rf);
				289	ns = t->se.sum_exec_runtime;
				290	task_rq_unlock(rq, t, &rf);
				291
				292	return ns;
				293	}
				294	#endif
				295
				296	/*
				297	* Accumulate raw cputime values of dead tasks (sig->[us]time) and live
				298	* tasks (sum on group iteration) belonging to @tsk's group.
				299	*/
				300	void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
				301	{
				302	struct signal_struct *sig = tsk->signal;
				303	u64 utime, stime;
				304	struct task_struct *t;
				305	unsigned int seq, nextseq;
				306	unsigned long flags;
				307
				308	/*
				309	* Update current task runtime to account pending time since last
				310	* scheduler action or thread_group_cputime() call. This thread group
				311	* might have other running tasks on different CPUs, but updating
				312	* their runtime can affect syscall performance, so we skip accounting
				313	* those pending times and rely only on values updated on tick or
				314	* other scheduler action.
				315	*/
				316	if (same_thread_group(current, tsk))
				317	(void) task_sched_runtime(current);
				318
				319	rcu_read_lock();
				320	/* Attempt a lockless read on the first round. */
				321	nextseq = 0;
				322	do {
				323	seq = nextseq;
				324	flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
				325	times->utime = sig->utime;
				326	times->stime = sig->stime;
				327	times->sum_exec_runtime = sig->sum_sched_runtime;
				328
				329	for_each_thread(tsk, t) {
				330	task_cputime(t, &utime, &stime);
				331	times->utime += utime;
				332	times->stime += stime;
				333	times->sum_exec_runtime += read_sum_exec_runtime(t);
				334	}
				335	/* If lockless access failed, take the lock. */
				336	nextseq = 1;
				337	} while (need_seqretry(&sig->stats_lock, seq));
				338	done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
				339	rcu_read_unlock();
				340	}
				341
				342	#ifdef CONFIG_IRQ_TIME_ACCOUNTING
				343	/*
				344	* Account a tick to a process and cpustat
				345	* @p: the process that the CPU time gets accounted to
				346	* @user_tick: is the tick from userspace
				347	* @rq: the pointer to rq
				348	*
				349	* Tick demultiplexing follows the order
				350	* - pending hardirq update
				351	* - pending softirq update
				352	* - user_time
				353	* - idle_time
				354	* - system time
				355	* - check for guest_time
				356	* - else account as system_time
				357	*
				358	* Check for hardirq is done both for system and user time as there is
				359	* no timer going off while we are on hardirq and hence we may never get an
				360	* opportunity to update it solely in system time.
				361	* p->stime and friends are only updated on system time and not on irq or
				362	* softirq time, as those do not count in task exec_runtime any more.
				363	*/
				364	static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
				365	struct rq *rq, int ticks)
				366	{
				367	u64 other, cputime = TICK_NSEC * ticks;
				368
				369	/*
				370	* When returning from idle, many ticks can get accounted at
				371	* once, including some ticks of steal, irq, and softirq time.
				372	* Subtract those ticks from the amount of time accounted to
				373	* idle, or potentially user or system time. Due to rounding,
				374	* other time can exceed ticks occasionally.
				375	*/
				376	other = account_other_time(ULONG_MAX);
				377	if (other >= cputime)
				378	return;
				379
				380	cputime -= other;
				381
				382	if (this_cpu_ksoftirqd() == p) {
				383	/*
				384	* ksoftirqd time does not get accounted in cpu_softirq_time.
				385	* So, we have to handle it separately here.
				386	* Also, p->stime needs to be updated for ksoftirqd.
				387	*/
				388	account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
				389	} else if (user_tick) {
				390	account_user_time(p, cputime);
				391	} else if (p == rq->idle) {
				392	account_idle_time(cputime);
				393	} else if (p->flags & PF_VCPU) { /* System time or guest time */
				394	account_guest_time(p, cputime);
				395	} else {
				396	account_system_index_time(p, cputime, CPUTIME_SYSTEM);
				397	}
				398	}
				399
				400	static void irqtime_account_idle_ticks(int ticks)
				401	{
				402	struct rq *rq = this_rq();
				403
				404	irqtime_account_process_tick(current, 0, rq, ticks);
				405	}
				406	#else /* CONFIG_IRQ_TIME_ACCOUNTING */
				407	static inline void irqtime_account_idle_ticks(int ticks) { }
				408	static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
				409	struct rq *rq, int nr_ticks) { }
				410	#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
				411
				412	/*
				413	* Use precise platform statistics if available:
				414	*/
				415	#ifdef CONFIG_VIRT_CPU_ACCOUNTING
				416	# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
				417	void vtime_common_task_switch(struct task_struct *prev)
				418	{
				419	if (is_idle_task(prev))
				420	vtime_account_idle(prev);
				421	else
				422	vtime_account_system(prev);
				423
				424	vtime_flush(prev);
				425	arch_vtime_task_switch(prev);
				426	}
				427	# endif
				428	#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
				429
				430
				431	#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
				432	/*
				433	* Archs that account the whole time spent in the idle task
				434	* (outside irq) as idle time can rely on this and just implement
				435	* vtime_account_system() and vtime_account_idle(). Archs that
				436	* have other meaning of the idle time (s390 only includes the
				437	* time spent by the CPU when it's in low power mode) must override
				438	* vtime_account().
				439	*/
				440	#ifndef __ARCH_HAS_VTIME_ACCOUNT
				441	void vtime_account_irq_enter(struct task_struct *tsk)
				442	{
				443	if (!in_interrupt() && is_idle_task(tsk))
				444	vtime_account_idle(tsk);
				445	else
				446	vtime_account_system(tsk);
				447	}
				448	EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
				449	#endif /* __ARCH_HAS_VTIME_ACCOUNT */
				450
				451	void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
				452	u64 *ut, u64 *st)
				453	{
				454	*ut = curr->utime;
				455	*st = curr->stime;
				456	}
				457
				458	void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
				459	{
				460	*ut = p->utime;
				461	*st = p->stime;
				462	}
				463	EXPORT_SYMBOL_GPL(task_cputime_adjusted);
				464
				465	void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
				466	{
				467	struct task_cputime cputime;
				468
				469	thread_group_cputime(p, &cputime);
				470
				471	*ut = cputime.utime;
				472	*st = cputime.stime;
				473	}
				474	EXPORT_SYMBOL_GPL(thread_group_cputime_adjusted);
				475
				476	#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */
				477
				478	/*
				479	* Account a single tick of CPU time.
				480	* @p: the process that the CPU time gets accounted to
				481	* @user_tick: indicates if the tick is a user or a system tick
				482	*/
				483	void account_process_tick(struct task_struct *p, int user_tick)
				484	{
				485	u64 cputime, steal;
				486	struct rq *rq = this_rq();
				487
				488	if (vtime_accounting_cpu_enabled())
				489	return;
				490
				491	if (sched_clock_irqtime) {
				492	irqtime_account_process_tick(p, user_tick, rq, 1);
				493	return;
				494	}
				495
				496	cputime = TICK_NSEC;
				497	steal = steal_account_process_time(ULONG_MAX);
				498
				499	if (steal >= cputime)
				500	return;
				501
				502	cputime -= steal;
				503
				504	if (user_tick)
				505	account_user_time(p, cputime);
				506	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
				507	account_system_time(p, HARDIRQ_OFFSET, cputime);
				508	else
				509	account_idle_time(cputime);
				510	}
				511
				512	/*
				513	* Account multiple ticks of idle time.
				514	* @ticks: number of idle ticks to account
				515	*/
				516	void account_idle_ticks(unsigned long ticks)
				517	{
				518	u64 cputime, steal;
				519
				520	if (sched_clock_irqtime) {
				521	irqtime_account_idle_ticks(ticks);
				522	return;
				523	}
				524
				525	cputime = ticks * TICK_NSEC;
				526	steal = steal_account_process_time(ULONG_MAX);
				527
				528	if (steal >= cputime)
				529	return;
				530
				531	cputime -= steal;
				532	account_idle_time(cputime);
				533	}
				534
				535	/*
				536	* Perform (stime * rtime) / total, but avoid multiplication overflow by
				537	* losing precision when the numbers are big.
				538	*/
				539	static u64 scale_stime(u64 stime, u64 rtime, u64 total)
				540	{
				541	u64 scaled;
				542
				543	for (;;) {
				544	/* Make sure "rtime" is the bigger of stime/rtime */
				545	if (stime > rtime)
				546	swap(rtime, stime);
				547
				548	/* Make sure 'total' fits in 32 bits */
				549	if (total >> 32)
				550	goto drop_precision;
				551
				552	/* Does rtime (and thus stime) fit in 32 bits? */
				553	if (!(rtime >> 32))
				554	break;
				555
				556	/* Can we just balance rtime/stime rather than dropping bits? */
				557	if (stime >> 31)
				558	goto drop_precision;
				559
				560	/* We can grow stime and shrink rtime and try to make them both fit */
				561	stime <<= 1;
				562	rtime >>= 1;
				563	continue;
				564
				565	drop_precision:
				566	/* We drop from rtime, it has more bits than stime */
				567	rtime >>= 1;
				568	total >>= 1;
				569	}
				570
				571	/*
				572	* Make sure gcc understands that this is a 32x32->64 multiply,
				573	* followed by a 64/32->64 divide.
				574	*/
				575	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
				576	return scaled;
				577	}
				578
				579	/*
				580	* Adjust tick based cputime random precision against scheduler runtime
				581	* accounting.
				582	*
				583	* Tick based cputime accounting depends on random scheduling timeslices of a
				584	* task to be interrupted or not by the timer. Depending on these
				585	* circumstances, the number of these interrupts may be over or
				586	* under-optimistic, matching the real user and system cputime with a variable
				587	* precision.
				588	*
				589	* Fix this by scaling these tick based values against the total runtime
				590	* accounted by the CFS scheduler.
				591	*
				592	* This code provides the following guarantees:
				593	*
				594	* stime + utime == rtime
				595	* stime_i+1 >= stime_i, utime_i+1 >= utime_i
				596	*
				597	* Assuming that rtime_i+1 >= rtime_i.
				598	*/
				599	void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
				600	u64 *ut, u64 *st)
				601	{
				602	u64 rtime, stime, utime;
				603	unsigned long flags;
				604
				605	/* Serialize concurrent callers such that we can honour our guarantees */
				606	raw_spin_lock_irqsave(&prev->lock, flags);
				607	rtime = curr->sum_exec_runtime;
				608
				609	/*
				610	* This is possible under two circumstances:
				611	* - rtime isn't monotonic after all (a bug);
				612	* - we got reordered by the lock.
				613	*
				614	* In both cases this acts as a filter such that the rest of the code
				615	* can assume it is monotonic regardless of anything else.
				616	*/
				617	if (prev->stime + prev->utime >= rtime)
				618	goto out;
				619
				620	stime = curr->stime;
				621	utime = curr->utime;
				622
				623	/*
				624	* If either stime or utime is 0, assume all runtime belongs to the other.
				625	* Once a task gets some ticks, the monotonicity code at 'update:'
				626	* will ensure things converge to the observed ratio.
				627	*/
				628	if (stime == 0) {
				629	utime = rtime;
				630	goto update;
				631	}
				632
				633	if (utime == 0) {
				634	stime = rtime;
				635	goto update;
				636	}
				637
				638	stime = scale_stime(stime, rtime, stime + utime);
				639
				640	update:
				641	/*
				642	* Make sure stime doesn't go backwards; this preserves monotonicity
				643	* for utime because rtime is monotonic.
				644	*
				645	* utime_i+1 = rtime_i+1 - stime_i
				646	* = rtime_i+1 - (rtime_i - utime_i)
				647	* = (rtime_i+1 - rtime_i) + utime_i
				648	* >= utime_i
				649	*/
				650	if (stime < prev->stime)
				651	stime = prev->stime;
				652	utime = rtime - stime;
				653
				654	/*
				655	* Make sure utime doesn't go backwards; this still preserves
				656	* monotonicity for stime, analogous argument to above.
				657	*/
				658	if (utime < prev->utime) {
				659	utime = prev->utime;
				660	stime = rtime - utime;
				661	}
				662
				663	prev->stime = stime;
				664	prev->utime = utime;
				665	out:
				666	*ut = prev->utime;
				667	*st = prev->stime;
				668	raw_spin_unlock_irqrestore(&prev->lock, flags);
				669	}
				670
				671	void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
				672	{
				673	struct task_cputime cputime = {
				674	.sum_exec_runtime = p->se.sum_exec_runtime,
				675	};
				676
				677	task_cputime(p, &cputime.utime, &cputime.stime);
				678	cputime_adjust(&cputime, &p->prev_cputime, ut, st);
				679	}
				680	EXPORT_SYMBOL_GPL(task_cputime_adjusted);
				681
				682	void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
				683	{
				684	struct task_cputime cputime;
				685
				686	thread_group_cputime(p, &cputime);
				687	cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
				688	}
				689	EXPORT_SYMBOL_GPL(thread_group_cputime_adjusted);
				690
				691	#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
				692
				693	#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
				694	static u64 vtime_delta(struct vtime *vtime)
				695	{
				696	unsigned long long clock;
				697
				698	clock = sched_clock();
				699	if (clock < vtime->starttime)
				700	return 0;
				701
				702	return clock - vtime->starttime;
				703	}
				704
				705	static u64 get_vtime_delta(struct vtime *vtime)
				706	{
				707	u64 delta = vtime_delta(vtime);
				708	u64 other;
				709
				710	/*
				711	* Unlike tick based timing, vtime based timing never has lost
				712	* ticks, and no need for steal time accounting to make up for
				713	* lost ticks. Vtime accounts a rounded version of actual
				714	* elapsed time. Limit account_other_time to prevent rounding
				715	* errors from causing elapsed vtime to go negative.
				716	*/
				717	other = account_other_time(delta);
				718	WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
				719	vtime->starttime += delta;
				720
				721	return delta - other;
				722	}
				723
				724	static void __vtime_account_system(struct task_struct *tsk,
				725	struct vtime *vtime)
				726	{
				727	vtime->stime += get_vtime_delta(vtime);
				728	if (vtime->stime >= TICK_NSEC) {
				729	account_system_time(tsk, irq_count(), vtime->stime);
				730	vtime->stime = 0;
				731	}
				732	}
				733
				734	static void vtime_account_guest(struct task_struct *tsk,
				735	struct vtime *vtime)
				736	{
				737	vtime->gtime += get_vtime_delta(vtime);
				738	if (vtime->gtime >= TICK_NSEC) {
				739	account_guest_time(tsk, vtime->gtime);
				740	vtime->gtime = 0;
				741	}
				742	}
				743
				744	void vtime_account_system(struct task_struct *tsk)
				745	{
				746	struct vtime *vtime = &tsk->vtime;
				747
				748	if (!vtime_delta(vtime))
				749	return;
				750
				751	write_seqcount_begin(&vtime->seqcount);
				752	/* We might have scheduled out from guest path */
				753	if (tsk->flags & PF_VCPU)
				754	vtime_account_guest(tsk, vtime);
				755	else
				756	__vtime_account_system(tsk, vtime);
				757	write_seqcount_end(&vtime->seqcount);
				758	}
				759
				760	void vtime_user_enter(struct task_struct *tsk)
				761	{
				762	struct vtime *vtime = &tsk->vtime;
				763
				764	write_seqcount_begin(&vtime->seqcount);
				765	__vtime_account_system(tsk, vtime);
				766	vtime->state = VTIME_USER;
				767	write_seqcount_end(&vtime->seqcount);
				768	}
				769
				770	void vtime_user_exit(struct task_struct *tsk)
				771	{
				772	struct vtime *vtime = &tsk->vtime;
				773
				774	write_seqcount_begin(&vtime->seqcount);
				775	vtime->utime += get_vtime_delta(vtime);
				776	if (vtime->utime >= TICK_NSEC) {
				777	account_user_time(tsk, vtime->utime);
				778	vtime->utime = 0;
				779	}
				780	vtime->state = VTIME_SYS;
				781	write_seqcount_end(&vtime->seqcount);
				782	}
				783
				784	void vtime_guest_enter(struct task_struct *tsk)
				785	{
				786	struct vtime *vtime = &tsk->vtime;
				787	/*
				788	* The flags must be updated under the lock with
				789	* the vtime_starttime flush and update.
				790	* That enforces a right ordering and update sequence
				791	* synchronization against the reader (task_gtime())
				792	* that can thus safely catch up with a tickless delta.
				793	*/
				794	write_seqcount_begin(&vtime->seqcount);
				795	__vtime_account_system(tsk, vtime);
				796	tsk->flags |= PF_VCPU;
				797	write_seqcount_end(&vtime->seqcount);
				798	}
				799	EXPORT_SYMBOL_GPL(vtime_guest_enter);
				800
				801	void vtime_guest_exit(struct task_struct *tsk)
				802	{
				803	struct vtime *vtime = &tsk->vtime;
				804
				805	write_seqcount_begin(&vtime->seqcount);
				806	vtime_account_guest(tsk, vtime);
				807	tsk->flags &= ~PF_VCPU;
				808	write_seqcount_end(&vtime->seqcount);
				809	}
				810	EXPORT_SYMBOL_GPL(vtime_guest_exit);
				811
				812	void vtime_account_idle(struct task_struct *tsk)
				813	{
				814	account_idle_time(get_vtime_delta(&tsk->vtime));
				815	}
				816
				817	void arch_vtime_task_switch(struct task_struct *prev)
				818	{
				819	struct vtime *vtime = &prev->vtime;
				820
				821	write_seqcount_begin(&vtime->seqcount);
				822	vtime->state = VTIME_INACTIVE;
				823	write_seqcount_end(&vtime->seqcount);
				824
				825	vtime = &current->vtime;
				826
				827	write_seqcount_begin(&vtime->seqcount);
				828	vtime->state = VTIME_SYS;
				829	vtime->starttime = sched_clock();
				830	write_seqcount_end(&vtime->seqcount);
				831	}
				832
				833	void vtime_init_idle(struct task_struct *t, int cpu)
				834	{
				835	struct vtime *vtime = &t->vtime;
				836	unsigned long flags;
				837
				838	local_irq_save(flags);
				839	write_seqcount_begin(&vtime->seqcount);
				840	vtime->state = VTIME_SYS;
				841	vtime->starttime = sched_clock();
				842	write_seqcount_end(&vtime->seqcount);
				843	local_irq_restore(flags);
				844	}
				845
				846	u64 task_gtime(struct task_struct *t)
				847	{
				848	struct vtime *vtime = &t->vtime;
				849	unsigned int seq;
				850	u64 gtime;
				851
				852	if (!vtime_accounting_enabled())
				853	return t->gtime;
				854
				855	do {
				856	seq = read_seqcount_begin(&vtime->seqcount);
				857
				858	gtime = t->gtime;
				859	if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
				860	gtime += vtime->gtime + vtime_delta(vtime);
				861
				862	} while (read_seqcount_retry(&vtime->seqcount, seq));
				863
				864	return gtime;
				865	}
				866
				867	/*
				868	* Fetch cputime raw values from fields of task_struct and
				869	* add up the pending nohz execution time since the last
				870	* cputime snapshot.
				871	*/
				872	void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
				873	{
				874	struct vtime *vtime = &t->vtime;
				875	unsigned int seq;
				876	u64 delta;
				877
				878	if (!vtime_accounting_enabled()) {
				879	*utime = t->utime;
				880	*stime = t->stime;
				881	return;
				882	}
				883
				884	do {
				885	seq = read_seqcount_begin(&vtime->seqcount);
				886
				887	*utime = t->utime;
				888	*stime = t->stime;
				889
				890	/* Task is sleeping, nothing to add */
				891	if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
				892	continue;
				893
				894	delta = vtime_delta(vtime);
				895
				896	/*
				897	* Task runs either in user or kernel space, add pending nohz time to
				898	* the right place.
				899	*/
				900	if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
				901	*utime += vtime->utime + delta;
				902	else if (vtime->state == VTIME_SYS)
				903	*stime += vtime->stime + delta;
				904	} while (read_seqcount_retry(&vtime->seqcount, seq));
				905	}
				906	#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */