Blame - ap/os/linux/linux-3.4.x/kernel/posix-cpu-timers.c - T106_DC

blob: 54f30e425f4246adc96e818e0d957257cc441730 [file] [log] [blame]

lh	9ed821d	2023-04-07 01:36:19 -0700	[diff] [blame^]	1	/*
				2	* Implement CPU time clocks for the POSIX clock interface.
				3	*/
				4
				5	#include <linux/sched.h>
				6	#include <linux/posix-timers.h>
				7	#include <linux/errno.h>
				8	#include <linux/math64.h>
				9	#include <asm/uaccess.h>
				10	#include <linux/kernel_stat.h>
				11	#include <trace/events/timer.h>
				12
				13	/*
				14	* Called after updating RLIMIT_CPU to run cpu timer and update
				15	* tsk->signal->cputime_expires expiration cache if necessary. Needs
				16	* siglock protection since other code may update expiration cache as
				17	* well.
				18	*/
				19	void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
				20	{
				21	cputime_t cputime = secs_to_cputime(rlim_new);
				22
				23	spin_lock_irq(&task->sighand->siglock);
				24	set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL);
				25	spin_unlock_irq(&task->sighand->siglock);
				26	}
				27
				28	static int check_clock(const clockid_t which_clock)
				29	{
				30	int error = 0;
				31	struct task_struct *p;
				32	const pid_t pid = CPUCLOCK_PID(which_clock);
				33
				34	if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX)
				35	return -EINVAL;
				36
				37	if (pid == 0)
				38	return 0;
				39
				40	rcu_read_lock();
				41	p = find_task_by_vpid(pid);
				42	if (!p \|\| !(CPUCLOCK_PERTHREAD(which_clock) ?
				43	same_thread_group(p, current) : has_group_leader_pid(p))) {
				44	error = -EINVAL;
				45	}
				46	rcu_read_unlock();
				47
				48	return error;
				49	}
				50
				51	static inline union cpu_time_count
				52	timespec_to_sample(const clockid_t which_clock, const struct timespec *tp)
				53	{
				54	union cpu_time_count ret;
				55	ret.sched = 0; /* high half always zero when .cpu used */
				56	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
				57	ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
				58	} else {
				59	ret.cpu = timespec_to_cputime(tp);
				60	}
				61	return ret;
				62	}
				63
				64	static void sample_to_timespec(const clockid_t which_clock,
				65	union cpu_time_count cpu,
				66	struct timespec *tp)
				67	{
				68	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED)
				69	*tp = ns_to_timespec(cpu.sched);
				70	else
				71	cputime_to_timespec(cpu.cpu, tp);
				72	}
				73
				74	static inline int cpu_time_before(const clockid_t which_clock,
				75	union cpu_time_count now,
				76	union cpu_time_count then)
				77	{
				78	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
				79	return now.sched < then.sched;
				80	} else {
				81	return now.cpu < then.cpu;
				82	}
				83	}
				84	static inline void cpu_time_add(const clockid_t which_clock,
				85	union cpu_time_count *acc,
				86	union cpu_time_count val)
				87	{
				88	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
				89	acc->sched += val.sched;
				90	} else {
				91	acc->cpu += val.cpu;
				92	}
				93	}
				94	static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
				95	union cpu_time_count a,
				96	union cpu_time_count b)
				97	{
				98	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
				99	a.sched -= b.sched;
				100	} else {
				101	a.cpu -= b.cpu;
				102	}
				103	return a;
				104	}
				105
				106	/*
				107	* Update expiry time from increment, and increase overrun count,
				108	* given the current clock sample.
				109	*/
				110	static void bump_cpu_timer(struct k_itimer *timer,
				111	union cpu_time_count now)
				112	{
				113	int i;
				114
				115	if (timer->it.cpu.incr.sched == 0)
				116	return;
				117
				118	if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
				119	unsigned long long delta, incr;
				120
				121	if (now.sched < timer->it.cpu.expires.sched)
				122	return;
				123	incr = timer->it.cpu.incr.sched;
				124	delta = now.sched + incr - timer->it.cpu.expires.sched;
				125	/* Don't use (incr2 < delta), incr2 might overflow. */
				126	for (i = 0; incr < delta - incr; i++)
				127	incr = incr << 1;
				128	for (; i >= 0; incr >>= 1, i--) {
				129	if (delta < incr)
				130	continue;
				131	timer->it.cpu.expires.sched += incr;
				132	timer->it_overrun += 1 << i;
				133	delta -= incr;
				134	}
				135	} else {
				136	cputime_t delta, incr;
				137
				138	if (now.cpu < timer->it.cpu.expires.cpu)
				139	return;
				140	incr = timer->it.cpu.incr.cpu;
				141	delta = now.cpu + incr - timer->it.cpu.expires.cpu;
				142	/* Don't use (incr2 < delta), incr2 might overflow. */
				143	for (i = 0; incr < delta - incr; i++)
				144	incr += incr;
				145	for (; i >= 0; incr = incr >> 1, i--) {
				146	if (delta < incr)
				147	continue;
				148	timer->it.cpu.expires.cpu += incr;
				149	timer->it_overrun += 1 << i;
				150	delta -= incr;
				151	}
				152	}
				153	}
				154
				155	static inline cputime_t prof_ticks(struct task_struct *p)
				156	{
				157	return p->utime + p->stime;
				158	}
				159	static inline cputime_t virt_ticks(struct task_struct *p)
				160	{
				161	return p->utime;
				162	}
				163
				164	static int
				165	posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
				166	{
				167	int error = check_clock(which_clock);
				168	if (!error) {
				169	tp->tv_sec = 0;
				170	tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ);
				171	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
				172	/*
				173	* If sched_clock is using a cycle counter, we
				174	* don't have any idea of its true resolution
				175	* exported, but it is much more than 1s/HZ.
				176	*/
				177	tp->tv_nsec = 1;
				178	}
				179	}
				180	return error;
				181	}
				182
				183	static int
				184	posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
				185	{
				186	/*
				187	* You can never reset a CPU clock, but we check for other errors
				188	* in the call before failing with EPERM.
				189	*/
				190	int error = check_clock(which_clock);
				191	if (error == 0) {
				192	error = -EPERM;
				193	}
				194	return error;
				195	}
				196
				197
				198	/*
				199	* Sample a per-thread clock for the given task.
				200	*/
				201	static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
				202	union cpu_time_count *cpu)
				203	{
				204	switch (CPUCLOCK_WHICH(which_clock)) {
				205	default:
				206	return -EINVAL;
				207	case CPUCLOCK_PROF:
				208	cpu->cpu = prof_ticks(p);
				209	break;
				210	case CPUCLOCK_VIRT:
				211	cpu->cpu = virt_ticks(p);
				212	break;
				213	case CPUCLOCK_SCHED:
				214	cpu->sched = task_sched_runtime(p);
				215	break;
				216	}
				217	return 0;
				218	}
				219
				220	void thread_group_cputime(struct task_struct tsk, struct task_cputime times)
				221	{
				222	struct signal_struct *sig = tsk->signal;
				223	struct task_struct *t;
				224
				225	times->utime = sig->utime;
				226	times->stime = sig->stime;
				227	times->sum_exec_runtime = sig->sum_sched_runtime;
				228
				229	rcu_read_lock();
				230	/* make sure we can trust tsk->thread_group list */
				231	if (!likely(pid_alive(tsk)))
				232	goto out;
				233
				234	t = tsk;
				235	do {
				236	times->utime += t->utime;
				237	times->stime += t->stime;
				238	times->sum_exec_runtime += task_sched_runtime(t);
				239	} while_each_thread(tsk, t);
				240	out:
				241	rcu_read_unlock();
				242	}
				243
				244	static void update_gt_cputime(struct task_cputime a, struct task_cputime b)
				245	{
				246	if (b->utime > a->utime)
				247	a->utime = b->utime;
				248
				249	if (b->stime > a->stime)
				250	a->stime = b->stime;
				251
				252	if (b->sum_exec_runtime > a->sum_exec_runtime)
				253	a->sum_exec_runtime = b->sum_exec_runtime;
				254	}
				255
				256	void thread_group_cputimer(struct task_struct tsk, struct task_cputime times)
				257	{
				258	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
				259	struct task_cputime sum;
				260	unsigned long flags;
				261
				262	if (!cputimer->running) {
				263	/*
				264	* The POSIX timer interface allows for absolute time expiry
				265	* values through the TIMER_ABSTIME flag, therefore we have
				266	* to synchronize the timer to the clock every time we start
				267	* it.
				268	*/
				269	thread_group_cputime(tsk, &sum);
				270	raw_spin_lock_irqsave(&cputimer->lock, flags);
				271	cputimer->running = 1;
				272	update_gt_cputime(&cputimer->cputime, &sum);
				273	} else
				274	raw_spin_lock_irqsave(&cputimer->lock, flags);
				275	*times = cputimer->cputime;
				276	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
				277	}
				278
				279	/*
				280	* Sample a process (thread group) clock for the given group_leader task.
				281	* Must be called with tasklist_lock held for reading.
				282	*/
				283	static int cpu_clock_sample_group(const clockid_t which_clock,
				284	struct task_struct *p,
				285	union cpu_time_count *cpu)
				286	{
				287	struct task_cputime cputime;
				288
				289	switch (CPUCLOCK_WHICH(which_clock)) {
				290	default:
				291	return -EINVAL;
				292	case CPUCLOCK_PROF:
				293	thread_group_cputime(p, &cputime);
				294	cpu->cpu = cputime.utime + cputime.stime;
				295	break;
				296	case CPUCLOCK_VIRT:
				297	thread_group_cputime(p, &cputime);
				298	cpu->cpu = cputime.utime;
				299	break;
				300	case CPUCLOCK_SCHED:
				301	thread_group_cputime(p, &cputime);
				302	cpu->sched = cputime.sum_exec_runtime;
				303	break;
				304	}
				305	return 0;
				306	}
				307
				308
				309	static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
				310	{
				311	const pid_t pid = CPUCLOCK_PID(which_clock);
				312	int error = -EINVAL;
				313	union cpu_time_count rtn;
				314
				315	if (pid == 0) {
				316	/*
				317	* Special case constant value for our own clocks.
				318	* We don't have to do any lookup to find ourselves.
				319	*/
				320	if (CPUCLOCK_PERTHREAD(which_clock)) {
				321	/*
				322	* Sampling just ourselves we can do with no locking.
				323	*/
				324	error = cpu_clock_sample(which_clock,
				325	current, &rtn);
				326	} else {
				327	read_lock(&tasklist_lock);
				328	error = cpu_clock_sample_group(which_clock,
				329	current, &rtn);
				330	read_unlock(&tasklist_lock);
				331	}
				332	} else {
				333	/*
				334	* Find the given PID, and validate that the caller
				335	* should be able to see it.
				336	*/
				337	struct task_struct *p;
				338	rcu_read_lock();
				339	p = find_task_by_vpid(pid);
				340	if (p) {
				341	if (CPUCLOCK_PERTHREAD(which_clock)) {
				342	if (same_thread_group(p, current)) {
				343	error = cpu_clock_sample(which_clock,
				344	p, &rtn);
				345	}
				346	} else {
				347	read_lock(&tasklist_lock);
				348	if (thread_group_leader(p) && p->sighand) {
				349	error =
				350	cpu_clock_sample_group(which_clock,
				351	p, &rtn);
				352	}
				353	read_unlock(&tasklist_lock);
				354	}
				355	}
				356	rcu_read_unlock();
				357	}
				358
				359	if (error)
				360	return error;
				361	sample_to_timespec(which_clock, rtn, tp);
				362	return 0;
				363	}
				364
				365
				366	/*
				367	* Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
				368	* This is called from sys_timer_create() and do_cpu_nanosleep() with the
				369	* new timer already all-zeros initialized.
				370	*/
				371	static int posix_cpu_timer_create(struct k_itimer *new_timer)
				372	{
				373	int ret = 0;
				374	const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
				375	struct task_struct *p;
				376
				377	if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX)
				378	return -EINVAL;
				379
				380	INIT_LIST_HEAD(&new_timer->it.cpu.entry);
				381
				382	rcu_read_lock();
				383	if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
				384	if (pid == 0) {
				385	p = current;
				386	} else {
				387	p = find_task_by_vpid(pid);
				388	if (p && !same_thread_group(p, current))
				389	p = NULL;
				390	}
				391	} else {
				392	if (pid == 0) {
				393	p = current->group_leader;
				394	} else {
				395	p = find_task_by_vpid(pid);
				396	if (p && !has_group_leader_pid(p))
				397	p = NULL;
				398	}
				399	}
				400	new_timer->it.cpu.task = p;
				401	if (p) {
				402	get_task_struct(p);
				403	} else {
				404	ret = -EINVAL;
				405	}
				406	rcu_read_unlock();
				407
				408	return ret;
				409	}
				410
				411	/*
				412	* Clean up a CPU-clock timer that is about to be destroyed.
				413	* This is called from timer deletion with the timer already locked.
				414	* If we return TIMER_RETRY, it's necessary to release the timer's lock
				415	* and try again. (This happens when the timer is in the middle of firing.)
				416	*/
				417	static int posix_cpu_timer_del(struct k_itimer *timer)
				418	{
				419	struct task_struct *p = timer->it.cpu.task;
				420	int ret = 0;
				421
				422	if (likely(p != NULL)) {
				423	read_lock(&tasklist_lock);
				424	if (unlikely(p->sighand == NULL)) {
				425	/*
				426	* We raced with the reaping of the task.
				427	* The deletion should have cleared us off the list.
				428	*/
				429	BUG_ON(!list_empty(&timer->it.cpu.entry));
				430	} else {
				431	spin_lock(&p->sighand->siglock);
				432	if (timer->it.cpu.firing)
				433	ret = TIMER_RETRY;
				434	else
				435	list_del(&timer->it.cpu.entry);
				436	spin_unlock(&p->sighand->siglock);
				437	}
				438	read_unlock(&tasklist_lock);
				439
				440	if (!ret)
				441	put_task_struct(p);
				442	}
				443
				444	return ret;
				445	}
				446
				447	/*
				448	* Clean out CPU timers still ticking when a thread exited. The task
				449	* pointer is cleared, and the expiry time is replaced with the residual
				450	* time for later timer_gettime calls to return.
				451	* This must be called with the siglock held.
				452	*/
				453	static void cleanup_timers(struct list_head *head,
				454	cputime_t utime, cputime_t stime,
				455	unsigned long long sum_exec_runtime)
				456	{
				457	struct cpu_timer_list timer, next;
				458	cputime_t ptime = utime + stime;
				459
				460	list_for_each_entry_safe(timer, next, head, entry) {
				461	list_del_init(&timer->entry);
				462	if (timer->expires.cpu < ptime) {
				463	timer->expires.cpu = 0;
				464	} else {
				465	timer->expires.cpu -= ptime;
				466	}
				467	}
				468
				469	++head;
				470	list_for_each_entry_safe(timer, next, head, entry) {
				471	list_del_init(&timer->entry);
				472	if (timer->expires.cpu < utime) {
				473	timer->expires.cpu = 0;
				474	} else {
				475	timer->expires.cpu -= utime;
				476	}
				477	}
				478
				479	++head;
				480	list_for_each_entry_safe(timer, next, head, entry) {
				481	list_del_init(&timer->entry);
				482	if (timer->expires.sched < sum_exec_runtime) {
				483	timer->expires.sched = 0;
				484	} else {
				485	timer->expires.sched -= sum_exec_runtime;
				486	}
				487	}
				488	}
				489
				490	/*
				491	* These are both called with the siglock held, when the current thread
				492	* is being reaped. When the final (leader) thread in the group is reaped,
				493	* posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit.
				494	*/
				495	void posix_cpu_timers_exit(struct task_struct *tsk)
				496	{
				497	cleanup_timers(tsk->cpu_timers,
				498	tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
				499
				500	}
				501	void posix_cpu_timers_exit_group(struct task_struct *tsk)
				502	{
				503	struct signal_struct *const sig = tsk->signal;
				504
				505	cleanup_timers(tsk->signal->cpu_timers,
				506	tsk->utime + sig->utime, tsk->stime + sig->stime,
				507	tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
				508	}
				509
				510	static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
				511	{
				512	/*
				513	* That's all for this thread or process.
				514	* We leave our residual in expires to be reported.
				515	*/
				516	put_task_struct(timer->it.cpu.task);
				517	timer->it.cpu.task = NULL;
				518	timer->it.cpu.expires = cpu_time_sub(timer->it_clock,
				519	timer->it.cpu.expires,
				520	now);
				521	}
				522
				523	static inline int expires_gt(cputime_t expires, cputime_t new_exp)
				524	{
				525	return expires == 0 \|\| expires > new_exp;
				526	}
				527
				528	/*
				529	* Insert the timer on the appropriate list before any timers that
				530	* expire later. This must be called with the tasklist_lock held
				531	* for reading, interrupts disabled and p->sighand->siglock taken.
				532	*/
				533	static void arm_timer(struct k_itimer *timer)
				534	{
				535	struct task_struct *p = timer->it.cpu.task;
				536	struct list_head head, listpos;
				537	struct task_cputime *cputime_expires;
				538	struct cpu_timer_list *const nt = &timer->it.cpu;
				539	struct cpu_timer_list *next;
				540
				541	if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
				542	head = p->cpu_timers;
				543	cputime_expires = &p->cputime_expires;
				544	} else {
				545	head = p->signal->cpu_timers;
				546	cputime_expires = &p->signal->cputime_expires;
				547	}
				548	head += CPUCLOCK_WHICH(timer->it_clock);
				549
				550	listpos = head;
				551	list_for_each_entry(next, head, entry) {
				552	if (cpu_time_before(timer->it_clock, nt->expires, next->expires))
				553	break;
				554	listpos = &next->entry;
				555	}
				556	list_add(&nt->entry, listpos);
				557
				558	if (listpos == head) {
				559	union cpu_time_count *exp = &nt->expires;
				560
				561	/*
				562	* We are the new earliest-expiring POSIX 1.b timer, hence
				563	* need to update expiration cache. Take into account that
				564	* for process timers we share expiration cache with itimers
				565	* and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME.
				566	*/
				567
				568	switch (CPUCLOCK_WHICH(timer->it_clock)) {
				569	case CPUCLOCK_PROF:
				570	if (expires_gt(cputime_expires->prof_exp, exp->cpu))
				571	cputime_expires->prof_exp = exp->cpu;
				572	break;
				573	case CPUCLOCK_VIRT:
				574	if (expires_gt(cputime_expires->virt_exp, exp->cpu))
				575	cputime_expires->virt_exp = exp->cpu;
				576	break;
				577	case CPUCLOCK_SCHED:
				578	if (cputime_expires->sched_exp == 0 \|\|
				579	cputime_expires->sched_exp > exp->sched)
				580	cputime_expires->sched_exp = exp->sched;
				581	break;
				582	}
				583	}
				584	}
				585
				586	/*
				587	* The timer is locked, fire it and arrange for its reload.
				588	*/
				589	static void cpu_timer_fire(struct k_itimer *timer)
				590	{
				591	if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
				592	/*
				593	* User don't want any signal.
				594	*/
				595	timer->it.cpu.expires.sched = 0;
				596	} else if (unlikely(timer->sigq == NULL)) {
				597	/*
				598	* This a special case for clock_nanosleep,
				599	* not a normal timer from sys_timer_create.
				600	*/
				601	wake_up_process(timer->it_process);
				602	timer->it.cpu.expires.sched = 0;
				603	} else if (timer->it.cpu.incr.sched == 0) {
				604	/*
				605	* One-shot timer. Clear it as soon as it's fired.
				606	*/
				607	posix_timer_event(timer, 0);
				608	timer->it.cpu.expires.sched = 0;
				609	} else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
				610	/*
				611	* The signal did not get queued because the signal
				612	* was ignored, so we won't get any callback to
				613	* reload the timer. But we need to keep it
				614	* ticking in case the signal is deliverable next time.
				615	*/
				616	posix_cpu_timer_schedule(timer);
				617	}
				618	}
				619
				620	/*
				621	* Sample a process (thread group) timer for the given group_leader task.
				622	* Must be called with tasklist_lock held for reading.
				623	*/
				624	static int cpu_timer_sample_group(const clockid_t which_clock,
				625	struct task_struct *p,
				626	union cpu_time_count *cpu)
				627	{
				628	struct task_cputime cputime;
				629
				630	thread_group_cputimer(p, &cputime);
				631	switch (CPUCLOCK_WHICH(which_clock)) {
				632	default:
				633	return -EINVAL;
				634	case CPUCLOCK_PROF:
				635	cpu->cpu = cputime.utime + cputime.stime;
				636	break;
				637	case CPUCLOCK_VIRT:
				638	cpu->cpu = cputime.utime;
				639	break;
				640	case CPUCLOCK_SCHED:
				641	cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
				642	break;
				643	}
				644	return 0;
				645	}
				646
				647	/*
				648	* Guts of sys_timer_settime for CPU timers.
				649	* This is called with the timer locked and interrupts disabled.
				650	* If we return TIMER_RETRY, it's necessary to release the timer's lock
				651	* and try again. (This happens when the timer is in the middle of firing.)
				652	*/
				653	static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
				654	struct itimerspec new, struct itimerspec old)
				655	{
				656	struct task_struct *p = timer->it.cpu.task;
				657	union cpu_time_count old_expires, new_expires, old_incr, val;
				658	int ret;
				659
				660	if (unlikely(p == NULL)) {
				661	/*
				662	* Timer refers to a dead task's clock.
				663	*/
				664	return -ESRCH;
				665	}
				666
				667	new_expires = timespec_to_sample(timer->it_clock, &new->it_value);
				668
				669	read_lock(&tasklist_lock);
				670	/*
				671	* We need the tasklist_lock to protect against reaping that
				672	* clears p->sighand. If p has just been reaped, we can no
				673	* longer get any information about it at all.
				674	*/
				675	if (unlikely(p->sighand == NULL)) {
				676	read_unlock(&tasklist_lock);
				677	put_task_struct(p);
				678	timer->it.cpu.task = NULL;
				679	return -ESRCH;
				680	}
				681
				682	/*
				683	* Disarm any old timer after extracting its expiry time.
				684	*/
				685	BUG_ON_NONRT(!irqs_disabled());
				686
				687	ret = 0;
				688	old_incr = timer->it.cpu.incr;
				689	spin_lock(&p->sighand->siglock);
				690	old_expires = timer->it.cpu.expires;
				691	if (unlikely(timer->it.cpu.firing)) {
				692	timer->it.cpu.firing = -1;
				693	ret = TIMER_RETRY;
				694	} else
				695	list_del_init(&timer->it.cpu.entry);
				696
				697	/*
				698	* We need to sample the current value to convert the new
				699	* value from to relative and absolute, and to convert the
				700	* old value from absolute to relative. To set a process
				701	* timer, we need a sample to balance the thread expiry
				702	* times (in arm_timer). With an absolute time, we must
				703	* check if it's already passed. In short, we need a sample.
				704	*/
				705	if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
				706	cpu_clock_sample(timer->it_clock, p, &val);
				707	} else {
				708	cpu_timer_sample_group(timer->it_clock, p, &val);
				709	}
				710
				711	if (old) {
				712	if (old_expires.sched == 0) {
				713	old->it_value.tv_sec = 0;
				714	old->it_value.tv_nsec = 0;
				715	} else {
				716	/*
				717	* Update the timer in case it has
				718	* overrun already. If it has,
				719	* we'll report it as having overrun
				720	* and with the next reloaded timer
				721	* already ticking, though we are
				722	* swallowing that pending
				723	* notification here to install the
				724	* new setting.
				725	*/
				726	bump_cpu_timer(timer, val);
				727	if (cpu_time_before(timer->it_clock, val,
				728	timer->it.cpu.expires)) {
				729	old_expires = cpu_time_sub(
				730	timer->it_clock,
				731	timer->it.cpu.expires, val);
				732	sample_to_timespec(timer->it_clock,
				733	old_expires,
				734	&old->it_value);
				735	} else {
				736	old->it_value.tv_nsec = 1;
				737	old->it_value.tv_sec = 0;
				738	}
				739	}
				740	}
				741
				742	if (unlikely(ret)) {
				743	/*
				744	* We are colliding with the timer actually firing.
				745	* Punt after filling in the timer's old value, and
				746	* disable this firing since we are already reporting
				747	* it as an overrun (thanks to bump_cpu_timer above).
				748	*/
				749	spin_unlock(&p->sighand->siglock);
				750	read_unlock(&tasklist_lock);
				751	goto out;
				752	}
				753
				754	if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) {
				755	cpu_time_add(timer->it_clock, &new_expires, val);
				756	}
				757
				758	/*
				759	* Install the new expiry time (or zero).
				760	* For a timer with no notification action, we don't actually
				761	* arm the timer (we'll just fake it for timer_gettime).
				762	*/
				763	timer->it.cpu.expires = new_expires;
				764	if (new_expires.sched != 0 &&
				765	cpu_time_before(timer->it_clock, val, new_expires)) {
				766	arm_timer(timer);
				767	}
				768
				769	spin_unlock(&p->sighand->siglock);
				770	read_unlock(&tasklist_lock);
				771
				772	/*
				773	* Install the new reload setting, and
				774	* set up the signal and overrun bookkeeping.
				775	*/
				776	timer->it.cpu.incr = timespec_to_sample(timer->it_clock,
				777	&new->it_interval);
				778
				779	/*
				780	* This acts as a modification timestamp for the timer,
				781	* so any automatic reload attempt will punt on seeing
				782	* that we have reset the timer manually.
				783	*/
				784	timer->it_requeue_pending = (timer->it_requeue_pending + 2) &
				785	~REQUEUE_PENDING;
				786	timer->it_overrun_last = 0;
				787	timer->it_overrun = -1;
				788
				789	if (new_expires.sched != 0 &&
				790	!cpu_time_before(timer->it_clock, val, new_expires)) {
				791	/*
				792	* The designated time already passed, so we notify
				793	* immediately, even if the thread never runs to
				794	* accumulate more time on this clock.
				795	*/
				796	cpu_timer_fire(timer);
				797	}
				798
				799	ret = 0;
				800	out:
				801	if (old) {
				802	sample_to_timespec(timer->it_clock,
				803	old_incr, &old->it_interval);
				804	}
				805	return ret;
				806	}
				807
				808	static void posix_cpu_timer_get(struct k_itimer timer, struct itimerspec itp)
				809	{
				810	union cpu_time_count now;
				811	struct task_struct *p = timer->it.cpu.task;
				812	int clear_dead;
				813
				814	/*
				815	* Easy part: convert the reload time.
				816	*/
				817	sample_to_timespec(timer->it_clock,
				818	timer->it.cpu.incr, &itp->it_interval);
				819
				820	if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all. */
				821	itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
				822	return;
				823	}
				824
				825	if (unlikely(p == NULL)) {
				826	/*
				827	* This task already died and the timer will never fire.
				828	* In this case, expires is actually the dead value.
				829	*/
				830	dead:
				831	sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
				832	&itp->it_value);
				833	return;
				834	}
				835
				836	/*
				837	* Sample the clock to take the difference with the expiry time.
				838	*/
				839	if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
				840	cpu_clock_sample(timer->it_clock, p, &now);
				841	clear_dead = p->exit_state;
				842	} else {
				843	read_lock(&tasklist_lock);
				844	if (unlikely(p->sighand == NULL)) {
				845	/*
				846	* The process has been reaped.
				847	* We can't even collect a sample any more.
				848	* Call the timer disarmed, nothing else to do.
				849	*/
				850	put_task_struct(p);
				851	timer->it.cpu.task = NULL;
				852	timer->it.cpu.expires.sched = 0;
				853	read_unlock(&tasklist_lock);
				854	goto dead;
				855	} else {
				856	cpu_timer_sample_group(timer->it_clock, p, &now);
				857	clear_dead = (unlikely(p->exit_state) &&
				858	thread_group_empty(p));
				859	}
				860	read_unlock(&tasklist_lock);
				861	}
				862
				863	if (unlikely(clear_dead)) {
				864	/*
				865	* We've noticed that the thread is dead, but
				866	* not yet reaped. Take this opportunity to
				867	* drop our task ref.
				868	*/
				869	clear_dead_task(timer, now);
				870	goto dead;
				871	}
				872
				873	if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) {
				874	sample_to_timespec(timer->it_clock,
				875	cpu_time_sub(timer->it_clock,
				876	timer->it.cpu.expires, now),
				877	&itp->it_value);
				878	} else {
				879	/*
				880	* The timer should have expired already, but the firing
				881	* hasn't taken place yet. Say it's just about to expire.
				882	*/
				883	itp->it_value.tv_nsec = 1;
				884	itp->it_value.tv_sec = 0;
				885	}
				886	}
				887
				888	/*
				889	* Check for any per-thread CPU timers that have fired and move them off
				890	* the tsk->cpu_timers[N] list onto the firing list. Here we update the
				891	* tsk->it_*_expires values to reflect the remaining thread CPU timers.
				892	*/
				893	static void check_thread_timers(struct task_struct *tsk,
				894	struct list_head *firing)
				895	{
				896	int maxfire;
				897	struct list_head *timers = tsk->cpu_timers;
				898	struct signal_struct *const sig = tsk->signal;
				899	unsigned long soft;
				900
				901	maxfire = 20;
				902	tsk->cputime_expires.prof_exp = 0;
				903	while (!list_empty(timers)) {
				904	struct cpu_timer_list *t = list_first_entry(timers,
				905	struct cpu_timer_list,
				906	entry);
				907	if (!--maxfire \|\| prof_ticks(tsk) < t->expires.cpu) {
				908	tsk->cputime_expires.prof_exp = t->expires.cpu;
				909	break;
				910	}
				911	t->firing = 1;
				912	list_move_tail(&t->entry, firing);
				913	}
				914
				915	++timers;
				916	maxfire = 20;
				917	tsk->cputime_expires.virt_exp = 0;
				918	while (!list_empty(timers)) {
				919	struct cpu_timer_list *t = list_first_entry(timers,
				920	struct cpu_timer_list,
				921	entry);
				922	if (!--maxfire \|\| virt_ticks(tsk) < t->expires.cpu) {
				923	tsk->cputime_expires.virt_exp = t->expires.cpu;
				924	break;
				925	}
				926	t->firing = 1;
				927	list_move_tail(&t->entry, firing);
				928	}
				929
				930	++timers;
				931	maxfire = 20;
				932	tsk->cputime_expires.sched_exp = 0;
				933	while (!list_empty(timers)) {
				934	struct cpu_timer_list *t = list_first_entry(timers,
				935	struct cpu_timer_list,
				936	entry);
				937	if (!--maxfire \|\| tsk->se.sum_exec_runtime < t->expires.sched) {
				938	tsk->cputime_expires.sched_exp = t->expires.sched;
				939	break;
				940	}
				941	t->firing = 1;
				942	list_move_tail(&t->entry, firing);
				943	}
				944
				945	/*
				946	* Check for the special case thread timers.
				947	*/
				948	soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
				949	if (soft != RLIM_INFINITY) {
				950	unsigned long hard =
				951	ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
				952
				953	if (hard != RLIM_INFINITY &&
				954	tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
				955	/*
				956	* At the hard limit, we just die.
				957	* No need to calculate anything else now.
				958	*/
				959	__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
				960	return;
				961	}
				962	if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
				963	/*
				964	* At the soft limit, send a SIGXCPU every second.
				965	*/
				966	if (soft < hard) {
				967	soft += USEC_PER_SEC;
				968	sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
				969	}
				970	printk(KERN_INFO
				971	"RT Watchdog Timeout: %s[%d]\n",
				972	tsk->comm, task_pid_nr(tsk));
				973	__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
				974	}
				975	}
				976	}
				977
				978	static void stop_process_timers(struct signal_struct *sig)
				979	{
				980	struct thread_group_cputimer *cputimer = &sig->cputimer;
				981	unsigned long flags;
				982
				983	raw_spin_lock_irqsave(&cputimer->lock, flags);
				984	cputimer->running = 0;
				985	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
				986	}
				987
				988	static u32 onecputick;
				989
				990	static void check_cpu_itimer(struct task_struct tsk, struct cpu_itimer it,
				991	cputime_t *expires, cputime_t cur_time, int signo)
				992	{
				993	if (!it->expires)
				994	return;
				995
				996	if (cur_time >= it->expires) {
				997	if (it->incr) {
				998	it->expires += it->incr;
				999	it->error += it->incr_error;
				1000	if (it->error >= onecputick) {
				1001	it->expires -= cputime_one_jiffy;
				1002	it->error -= onecputick;
				1003	}
				1004	} else {
				1005	it->expires = 0;
				1006	}
				1007
				1008	trace_itimer_expire(signo == SIGPROF ?
				1009	ITIMER_PROF : ITIMER_VIRTUAL,
				1010	tsk->signal->leader_pid, cur_time);
				1011	__group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
				1012	}
				1013
				1014	if (it->expires && (!expires \|\| it->expires < expires)) {
				1015	*expires = it->expires;
				1016	}
				1017	}
				1018
				1019	/**
				1020	* task_cputime_zero - Check a task_cputime struct for all zero fields.
				1021	*
				1022	* @cputime: The struct to compare.
				1023	*
				1024	* Checks @cputime to see if all fields are zero. Returns true if all fields
				1025	* are zero, false if any field is nonzero.
				1026	*/
				1027	static inline int task_cputime_zero(const struct task_cputime *cputime)
				1028	{
				1029	if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
				1030	return 1;
				1031	return 0;
				1032	}
				1033
				1034	/*
				1035	* Check for any per-thread CPU timers that have fired and move them
				1036	* off the tsk->*_timers list onto the firing list. Per-thread timers
				1037	* have already been taken off.
				1038	*/
				1039	static void check_process_timers(struct task_struct *tsk,
				1040	struct list_head *firing)
				1041	{
				1042	int maxfire;
				1043	struct signal_struct *const sig = tsk->signal;
				1044	cputime_t utime, ptime, virt_expires, prof_expires;
				1045	unsigned long long sum_sched_runtime, sched_expires;
				1046	struct list_head *timers = sig->cpu_timers;
				1047	struct task_cputime cputime;
				1048	unsigned long soft;
				1049
				1050	/*
				1051	* Collect the current process totals.
				1052	*/
				1053	thread_group_cputimer(tsk, &cputime);
				1054	utime = cputime.utime;
				1055	ptime = utime + cputime.stime;
				1056	sum_sched_runtime = cputime.sum_exec_runtime;
				1057	maxfire = 20;
				1058	prof_expires = 0;
				1059	while (!list_empty(timers)) {
				1060	struct cpu_timer_list *tl = list_first_entry(timers,
				1061	struct cpu_timer_list,
				1062	entry);
				1063	if (!--maxfire \|\| ptime < tl->expires.cpu) {
				1064	prof_expires = tl->expires.cpu;
				1065	break;
				1066	}
				1067	tl->firing = 1;
				1068	list_move_tail(&tl->entry, firing);
				1069	}
				1070
				1071	++timers;
				1072	maxfire = 20;
				1073	virt_expires = 0;
				1074	while (!list_empty(timers)) {
				1075	struct cpu_timer_list *tl = list_first_entry(timers,
				1076	struct cpu_timer_list,
				1077	entry);
				1078	if (!--maxfire \|\| utime < tl->expires.cpu) {
				1079	virt_expires = tl->expires.cpu;
				1080	break;
				1081	}
				1082	tl->firing = 1;
				1083	list_move_tail(&tl->entry, firing);
				1084	}
				1085
				1086	++timers;
				1087	maxfire = 20;
				1088	sched_expires = 0;
				1089	while (!list_empty(timers)) {
				1090	struct cpu_timer_list *tl = list_first_entry(timers,
				1091	struct cpu_timer_list,
				1092	entry);
				1093	if (!--maxfire \|\| sum_sched_runtime < tl->expires.sched) {
				1094	sched_expires = tl->expires.sched;
				1095	break;
				1096	}
				1097	tl->firing = 1;
				1098	list_move_tail(&tl->entry, firing);
				1099	}
				1100
				1101	/*
				1102	* Check for the special case process timers.
				1103	*/
				1104	check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime,
				1105	SIGPROF);
				1106	check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
				1107	SIGVTALRM);
				1108	soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
				1109	if (soft != RLIM_INFINITY) {
				1110	unsigned long psecs = cputime_to_secs(ptime);
				1111	unsigned long hard =
				1112	ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
				1113	cputime_t x;
				1114	if (psecs >= hard) {
				1115	/*
				1116	* At the hard limit, we just die.
				1117	* No need to calculate anything else now.
				1118	*/
				1119	__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
				1120	return;
				1121	}
				1122	if (psecs >= soft) {
				1123	/*
				1124	* At the soft limit, send a SIGXCPU every second.
				1125	*/
				1126	__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
				1127	if (soft < hard) {
				1128	soft++;
				1129	sig->rlim[RLIMIT_CPU].rlim_cur = soft;
				1130	}
				1131	}
				1132	x = secs_to_cputime(soft);
				1133	if (!prof_expires \|\| x < prof_expires) {
				1134	prof_expires = x;
				1135	}
				1136	}
				1137
				1138	sig->cputime_expires.prof_exp = prof_expires;
				1139	sig->cputime_expires.virt_exp = virt_expires;
				1140	sig->cputime_expires.sched_exp = sched_expires;
				1141	if (task_cputime_zero(&sig->cputime_expires))
				1142	stop_process_timers(sig);
				1143	}
				1144
				1145	/*
				1146	* This is called from the signal code (via do_schedule_next_timer)
				1147	* when the last timer signal was delivered and we have to reload the timer.
				1148	*/
				1149	void posix_cpu_timer_schedule(struct k_itimer *timer)
				1150	{
				1151	struct task_struct *p = timer->it.cpu.task;
				1152	union cpu_time_count now;
				1153
				1154	if (unlikely(p == NULL))
				1155	/*
				1156	* The task was cleaned up already, no future firings.
				1157	*/
				1158	goto out;
				1159
				1160	/*
				1161	* Fetch the current sample and update the timer's expiry time.
				1162	*/
				1163	if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
				1164	cpu_clock_sample(timer->it_clock, p, &now);
				1165	bump_cpu_timer(timer, now);
				1166	if (unlikely(p->exit_state)) {
				1167	clear_dead_task(timer, now);
				1168	goto out;
				1169	}
				1170	read_lock(&tasklist_lock); /* arm_timer needs it. */
				1171	spin_lock(&p->sighand->siglock);
				1172	} else {
				1173	read_lock(&tasklist_lock);
				1174	if (unlikely(p->sighand == NULL)) {
				1175	/*
				1176	* The process has been reaped.
				1177	* We can't even collect a sample any more.
				1178	*/
				1179	put_task_struct(p);
				1180	timer->it.cpu.task = p = NULL;
				1181	timer->it.cpu.expires.sched = 0;
				1182	goto out_unlock;
				1183	} else if (unlikely(p->exit_state) && thread_group_empty(p)) {
				1184	/*
				1185	* We've noticed that the thread is dead, but
				1186	* not yet reaped. Take this opportunity to
				1187	* drop our task ref.
				1188	*/
				1189	clear_dead_task(timer, now);
				1190	goto out_unlock;
				1191	}
				1192	spin_lock(&p->sighand->siglock);
				1193	cpu_timer_sample_group(timer->it_clock, p, &now);
				1194	bump_cpu_timer(timer, now);
				1195	/* Leave the tasklist_lock locked for the call below. */
				1196	}
				1197
				1198	/*
				1199	* Now re-arm for the new expiry time.
				1200	*/
				1201	BUG_ON_NONRT(!irqs_disabled());
				1202	arm_timer(timer);
				1203	spin_unlock(&p->sighand->siglock);
				1204
				1205	out_unlock:
				1206	read_unlock(&tasklist_lock);
				1207
				1208	out:
				1209	timer->it_overrun_last = timer->it_overrun;
				1210	timer->it_overrun = -1;
				1211	++timer->it_requeue_pending;
				1212	}
				1213
				1214	/**
				1215	* task_cputime_expired - Compare two task_cputime entities.
				1216	*
				1217	* @sample: The task_cputime structure to be checked for expiration.
				1218	* @expires: Expiration times, against which @sample will be checked.
				1219	*
				1220	* Checks @sample against @expires to see if any field of @sample has expired.
				1221	* Returns true if any field of the former is greater than the corresponding
				1222	* field of the latter if the latter field is set. Otherwise returns false.
				1223	*/
				1224	static inline int task_cputime_expired(const struct task_cputime *sample,
				1225	const struct task_cputime *expires)
				1226	{
				1227	if (expires->utime && sample->utime >= expires->utime)
				1228	return 1;
				1229	if (expires->stime && sample->utime + sample->stime >= expires->stime)
				1230	return 1;
				1231	if (expires->sum_exec_runtime != 0 &&
				1232	sample->sum_exec_runtime >= expires->sum_exec_runtime)
				1233	return 1;
				1234	return 0;
				1235	}
				1236
				1237	/**
				1238	* fastpath_timer_check - POSIX CPU timers fast path.
				1239	*
				1240	* @tsk: The task (thread) being checked.
				1241	*
				1242	* Check the task and thread group timers. If both are zero (there are no
				1243	* timers set) return false. Otherwise snapshot the task and thread group
				1244	* timers and compare them with the corresponding expiration times. Return
				1245	* true if a timer has expired, else return false.
				1246	*/
				1247	static inline int fastpath_timer_check(struct task_struct *tsk)
				1248	{
				1249	struct signal_struct *sig;
				1250
				1251	if (!task_cputime_zero(&tsk->cputime_expires)) {
				1252	struct task_cputime task_sample = {
				1253	.utime = tsk->utime,
				1254	.stime = tsk->stime,
				1255	.sum_exec_runtime = tsk->se.sum_exec_runtime
				1256	};
				1257
				1258	if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
				1259	return 1;
				1260	}
				1261
				1262	sig = tsk->signal;
				1263	if (sig->cputimer.running) {
				1264	struct task_cputime group_sample;
				1265	unsigned long flags;
				1266
				1267	raw_spin_lock_irqsave(&sig->cputimer.lock, flags);
				1268	group_sample = sig->cputimer.cputime;
				1269	raw_spin_unlock_irqrestore(&sig->cputimer.lock, flags);
				1270
				1271	if (task_cputime_expired(&group_sample, &sig->cputime_expires))
				1272	return 1;
				1273	}
				1274
				1275	return 0;
				1276	}
				1277
				1278	/*
				1279	* This is called from the timer interrupt handler. The irq handler has
				1280	* already updated our counts. We need to check if any timers fire now.
				1281	* Interrupts are disabled.
				1282	*/
				1283	static void __run_posix_cpu_timers(struct task_struct *tsk)
				1284	{
				1285	LIST_HEAD(firing);
				1286	struct k_itimer timer, next;
				1287	unsigned long flags;
				1288
				1289	BUG_ON_NONRT(!irqs_disabled());
				1290
				1291	/*
				1292	* The fast path checks that there are no expired thread or thread
				1293	* group timers. If that's so, just return.
				1294	*/
				1295	if (!fastpath_timer_check(tsk))
				1296	return;
				1297
				1298	if (!lock_task_sighand(tsk, &flags))
				1299	return;
				1300	/*
				1301	* Here we take off tsk->signal->cpu_timers[N] and
				1302	* tsk->cpu_timers[N] all the timers that are firing, and
				1303	* put them on the firing list.
				1304	*/
				1305	check_thread_timers(tsk, &firing);
				1306	/*
				1307	* If there are any active process wide timers (POSIX 1.b, itimers,
				1308	* RLIMIT_CPU) cputimer must be running.
				1309	*/
				1310	if (tsk->signal->cputimer.running)
				1311	check_process_timers(tsk, &firing);
				1312
				1313	/*
				1314	* We must release these locks before taking any timer's lock.
				1315	* There is a potential race with timer deletion here, as the
				1316	* siglock now protects our private firing list. We have set
				1317	* the firing flag in each timer, so that a deletion attempt
				1318	* that gets the timer lock before we do will give it up and
				1319	* spin until we've taken care of that timer below.
				1320	*/
				1321	unlock_task_sighand(tsk, &flags);
				1322
				1323	/*
				1324	* Now that all the timers on our list have the firing flag,
				1325	* no one will touch their list entries but us. We'll take
				1326	* each timer's lock before clearing its firing flag, so no
				1327	* timer call will interfere.
				1328	*/
				1329	list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) {
				1330	int cpu_firing;
				1331
				1332	spin_lock(&timer->it_lock);
				1333	list_del_init(&timer->it.cpu.entry);
				1334	cpu_firing = timer->it.cpu.firing;
				1335	timer->it.cpu.firing = 0;
				1336	/*
				1337	* The firing flag is -1 if we collided with a reset
				1338	* of the timer, which already reported this
				1339	* almost-firing as an overrun. So don't generate an event.
				1340	*/
				1341	if (likely(cpu_firing >= 0))
				1342	cpu_timer_fire(timer);
				1343	spin_unlock(&timer->it_lock);
				1344	}
				1345	}
				1346
				1347	#ifdef CONFIG_PREEMPT_RT_BASE
				1348	#include <linux/kthread.h>
				1349	#include <linux/cpu.h>
				1350	DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
				1351	DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
				1352
				1353	static int posix_cpu_timers_thread(void *data)
				1354	{
				1355	int cpu = (long)data;
				1356
				1357	BUG_ON(per_cpu(posix_timer_task,cpu) != current);
				1358
				1359	while (!kthread_should_stop()) {
				1360	struct task_struct *tsk = NULL;
				1361	struct task_struct *next = NULL;
				1362
				1363	if (cpu_is_offline(cpu))
				1364	goto wait_to_die;
				1365
				1366	/* grab task list */
				1367	raw_local_irq_disable();
				1368	tsk = per_cpu(posix_timer_tasklist, cpu);
				1369	per_cpu(posix_timer_tasklist, cpu) = NULL;
				1370	raw_local_irq_enable();
				1371
				1372	/* its possible the list is empty, just return */
				1373	if (!tsk) {
				1374	set_current_state(TASK_INTERRUPTIBLE);
				1375	schedule();
				1376	__set_current_state(TASK_RUNNING);
				1377	continue;
				1378	}
				1379
				1380	/* Process task list */
				1381	while (1) {
				1382	/* save next */
				1383	next = tsk->posix_timer_list;
				1384
				1385	/* run the task timers, clear its ptr and
				1386	* unreference it
				1387	*/
				1388	__run_posix_cpu_timers(tsk);
				1389	tsk->posix_timer_list = NULL;
				1390	put_task_struct(tsk);
				1391
				1392	/* check if this is the last on the list */
				1393	if (next == tsk)
				1394	break;
				1395	tsk = next;
				1396	}
				1397	}
				1398	return 0;
				1399
				1400	wait_to_die:
				1401	/* Wait for kthread_stop */
				1402	set_current_state(TASK_INTERRUPTIBLE);
				1403	while (!kthread_should_stop()) {
				1404	schedule();
				1405	set_current_state(TASK_INTERRUPTIBLE);
				1406	}
				1407	__set_current_state(TASK_RUNNING);
				1408	return 0;
				1409	}
				1410
				1411	static inline int __fastpath_timer_check(struct task_struct *tsk)
				1412	{
				1413	/* tsk == current, ensure it is safe to use ->signal/sighand */
				1414	if (unlikely(tsk->exit_state))
				1415	return 0;
				1416
				1417	if (!task_cputime_zero(&tsk->cputime_expires))
				1418	return 1;
				1419
				1420	if (!task_cputime_zero(&tsk->signal->cputime_expires))
				1421	return 1;
				1422
				1423	return 0;
				1424	}
				1425
				1426	void run_posix_cpu_timers(struct task_struct *tsk)
				1427	{
				1428	unsigned long cpu = smp_processor_id();
				1429	struct task_struct *tasklist;
				1430
				1431	BUG_ON(!irqs_disabled());
				1432	if(!per_cpu(posix_timer_task, cpu))
				1433	return;
				1434	/* get per-cpu references */
				1435	tasklist = per_cpu(posix_timer_tasklist, cpu);
				1436
				1437	/* check to see if we're already queued */
				1438	if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
				1439	get_task_struct(tsk);
				1440	if (tasklist) {
				1441	tsk->posix_timer_list = tasklist;
				1442	} else {
				1443	/*
				1444	* The list is terminated by a self-pointing
				1445	* task_struct
				1446	*/
				1447	tsk->posix_timer_list = tsk;
				1448	}
				1449	per_cpu(posix_timer_tasklist, cpu) = tsk;
				1450
				1451	wake_up_process(per_cpu(posix_timer_task, cpu));
				1452	}
				1453	}
				1454
				1455	/*
				1456	* posix_cpu_thread_call - callback that gets triggered when a CPU is added.
				1457	* Here we can start up the necessary migration thread for the new CPU.
				1458	*/
				1459	static int posix_cpu_thread_call(struct notifier_block *nfb,
				1460	unsigned long action, void *hcpu)
				1461	{
				1462	int cpu = (long)hcpu;
				1463	struct task_struct *p;
				1464	struct sched_param param;
				1465
				1466	switch (action) {
				1467	case CPU_UP_PREPARE:
				1468	p = kthread_create(posix_cpu_timers_thread, hcpu,
				1469	"posixcputmr/%d",cpu);
				1470	if (IS_ERR(p))
				1471	return NOTIFY_BAD;
				1472	p->flags \|= PF_NOFREEZE;
				1473	kthread_bind(p, cpu);
				1474	/* Must be high prio to avoid getting starved */
				1475	param.sched_priority = MAX_RT_PRIO-1;
				1476	sched_setscheduler(p, SCHED_FIFO, &param);
				1477	per_cpu(posix_timer_task,cpu) = p;
				1478	break;
				1479	case CPU_ONLINE:
				1480	/* Strictly unneccessary, as first user will wake it. */
				1481	wake_up_process(per_cpu(posix_timer_task,cpu));
				1482	break;
				1483	#ifdef CONFIG_HOTPLUG_CPU
				1484	case CPU_UP_CANCELED:
				1485	/* Unbind it from offline cpu so it can run. Fall thru. */
				1486	kthread_bind(per_cpu(posix_timer_task, cpu),
				1487	cpumask_any(cpu_online_mask));
				1488	kthread_stop(per_cpu(posix_timer_task,cpu));
				1489	per_cpu(posix_timer_task,cpu) = NULL;
				1490	break;
				1491	case CPU_DEAD:
				1492	kthread_stop(per_cpu(posix_timer_task,cpu));
				1493	per_cpu(posix_timer_task,cpu) = NULL;
				1494	break;
				1495	#endif
				1496	}
				1497	return NOTIFY_OK;
				1498	}
				1499
				1500	/* Register at highest priority so that task migration (migrate_all_tasks)
				1501	* happens before everything else.
				1502	*/
				1503	static struct notifier_block __devinitdata posix_cpu_thread_notifier = {
				1504	.notifier_call = posix_cpu_thread_call,
				1505	.priority = 10
				1506	};
				1507
				1508	static int __init posix_cpu_thread_init(void)
				1509	{
				1510	void hcpu = (void )(long)smp_processor_id();
				1511	/* Start one for boot CPU. */
				1512	unsigned long cpu;
				1513
				1514	/* init the per-cpu posix_timer_tasklets */
				1515	for_each_possible_cpu(cpu)
				1516	per_cpu(posix_timer_tasklist, cpu) = NULL;
				1517
				1518	posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, hcpu);
				1519	posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, hcpu);
				1520	register_cpu_notifier(&posix_cpu_thread_notifier);
				1521	return 0;
				1522	}
				1523	early_initcall(posix_cpu_thread_init);
				1524	#else /* CONFIG_PREEMPT_RT_BASE */
				1525	void run_posix_cpu_timers(struct task_struct *tsk)
				1526	{
				1527	__run_posix_cpu_timers(tsk);
				1528	}
				1529	#endif /* CONFIG_PREEMPT_RT_BASE */
				1530
				1531	/*
				1532	* Set one of the process-wide special case CPU timers or RLIMIT_CPU.
				1533	* The tsk->sighand->siglock must be held by the caller.
				1534	*/
				1535	void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
				1536	cputime_t newval, cputime_t oldval)
				1537	{
				1538	union cpu_time_count now;
				1539
				1540	BUG_ON(clock_idx == CPUCLOCK_SCHED);
				1541	cpu_timer_sample_group(clock_idx, tsk, &now);
				1542
				1543	if (oldval) {
				1544	/*
				1545	* We are setting itimer. The *oldval is absolute and we update
				1546	* it to be relative, *newval argument is relative and we update
				1547	* it to be absolute.
				1548	*/
				1549	if (*oldval) {
				1550	if (*oldval <= now.cpu) {
				1551	/* Just about to fire. */
				1552	*oldval = cputime_one_jiffy;
				1553	} else {
				1554	*oldval -= now.cpu;
				1555	}
				1556	}
				1557
				1558	if (!*newval)
				1559	return;
				1560	*newval += now.cpu;
				1561	}
				1562
				1563	/*
				1564	* Update expiration cache if we are the earliest timer, or eventually
				1565	* RLIMIT_CPU limit is earlier than prof_exp cpu timer expire.
				1566	*/
				1567	switch (clock_idx) {
				1568	case CPUCLOCK_PROF:
				1569	if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval))
				1570	tsk->signal->cputime_expires.prof_exp = *newval;
				1571	break;
				1572	case CPUCLOCK_VIRT:
				1573	if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval))
				1574	tsk->signal->cputime_expires.virt_exp = *newval;
				1575	break;
				1576	}
				1577	}
				1578
				1579	static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
				1580	struct timespec rqtp, struct itimerspec it)
				1581	{
				1582	struct k_itimer timer;
				1583	int error;
				1584
				1585	/*
				1586	* Set up a temporary timer and then wait for it to go off.
				1587	*/
				1588	memset(&timer, 0, sizeof timer);
				1589	spin_lock_init(&timer.it_lock);
				1590	timer.it_clock = which_clock;
				1591	timer.it_overrun = -1;
				1592	error = posix_cpu_timer_create(&timer);
				1593	timer.it_process = current;
				1594	if (!error) {
				1595	static struct itimerspec zero_it;
				1596
				1597	memset(it, 0, sizeof *it);
				1598	it->it_value = *rqtp;
				1599
				1600	spin_lock_irq(&timer.it_lock);
				1601	error = posix_cpu_timer_set(&timer, flags, it, NULL);
				1602	if (error) {
				1603	spin_unlock_irq(&timer.it_lock);
				1604	return error;
				1605	}
				1606
				1607	while (!signal_pending(current)) {
				1608	if (timer.it.cpu.expires.sched == 0) {
				1609	/*
				1610	* Our timer fired and was reset, below
				1611	* deletion can not fail.
				1612	*/
				1613	posix_cpu_timer_del(&timer);
				1614	spin_unlock_irq(&timer.it_lock);
				1615	return 0;
				1616	}
				1617
				1618	/*
				1619	* Block until cpu_timer_fire (or a signal) wakes us.
				1620	*/
				1621	__set_current_state(TASK_INTERRUPTIBLE);
				1622	spin_unlock_irq(&timer.it_lock);
				1623	schedule();
				1624	spin_lock_irq(&timer.it_lock);
				1625	}
				1626
				1627	/*
				1628	* We were interrupted by a signal.
				1629	*/
				1630	sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
				1631	error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
				1632	if (!error) {
				1633	/*
				1634	* Timer is now unarmed, deletion can not fail.
				1635	*/
				1636	posix_cpu_timer_del(&timer);
				1637	}
				1638	spin_unlock_irq(&timer.it_lock);
				1639
				1640	while (error == TIMER_RETRY) {
				1641	/*
				1642	* We need to handle case when timer was or is in the
				1643	* middle of firing. In other cases we already freed
				1644	* resources.
				1645	*/
				1646	spin_lock_irq(&timer.it_lock);
				1647	error = posix_cpu_timer_del(&timer);
				1648	spin_unlock_irq(&timer.it_lock);
				1649	}
				1650
				1651	if ((it->it_value.tv_sec \| it->it_value.tv_nsec) == 0) {
				1652	/*
				1653	* It actually did fire already.
				1654	*/
				1655	return 0;
				1656	}
				1657
				1658	error = -ERESTART_RESTARTBLOCK;
				1659	}
				1660
				1661	return error;
				1662	}
				1663
				1664	static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
				1665
				1666	static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
				1667	struct timespec rqtp, struct timespec __user rmtp)
				1668	{
				1669	struct restart_block *restart_block =
				1670	&current_thread_info()->restart_block;
				1671	struct itimerspec it;
				1672	int error;
				1673
				1674	/*
				1675	* Diagnose required errors first.
				1676	*/
				1677	if (CPUCLOCK_PERTHREAD(which_clock) &&
				1678	(CPUCLOCK_PID(which_clock) == 0 \|\|
				1679	CPUCLOCK_PID(which_clock) == current->pid))
				1680	return -EINVAL;
				1681
				1682	error = do_cpu_nanosleep(which_clock, flags, rqtp, &it);
				1683
				1684	if (error == -ERESTART_RESTARTBLOCK) {
				1685
				1686	if (flags & TIMER_ABSTIME)
				1687	return -ERESTARTNOHAND;
				1688	/*
				1689	* Report back to the user the time still remaining.
				1690	*/
				1691	if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
				1692	return -EFAULT;
				1693
				1694	restart_block->fn = posix_cpu_nsleep_restart;
				1695	restart_block->nanosleep.clockid = which_clock;
				1696	restart_block->nanosleep.rmtp = rmtp;
				1697	restart_block->nanosleep.expires = timespec_to_ns(rqtp);
				1698	}
				1699	return error;
				1700	}
				1701
				1702	static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
				1703	{
				1704	clockid_t which_clock = restart_block->nanosleep.clockid;
				1705	struct timespec t;
				1706	struct itimerspec it;
				1707	int error;
				1708
				1709	t = ns_to_timespec(restart_block->nanosleep.expires);
				1710
				1711	error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
				1712
				1713	if (error == -ERESTART_RESTARTBLOCK) {
				1714	struct timespec __user *rmtp = restart_block->nanosleep.rmtp;
				1715	/*
				1716	* Report back to the user the time still remaining.
				1717	*/
				1718	if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
				1719	return -EFAULT;
				1720
				1721	restart_block->nanosleep.expires = timespec_to_ns(&t);
				1722	}
				1723	return error;
				1724
				1725	}
				1726
				1727	#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
				1728	#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
				1729
				1730	static int process_cpu_clock_getres(const clockid_t which_clock,
				1731	struct timespec *tp)
				1732	{
				1733	return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
				1734	}
				1735	static int process_cpu_clock_get(const clockid_t which_clock,
				1736	struct timespec *tp)
				1737	{
				1738	return posix_cpu_clock_get(PROCESS_CLOCK, tp);
				1739	}
				1740	static int process_cpu_timer_create(struct k_itimer *timer)
				1741	{
				1742	timer->it_clock = PROCESS_CLOCK;
				1743	return posix_cpu_timer_create(timer);
				1744	}
				1745	static int process_cpu_nsleep(const clockid_t which_clock, int flags,
				1746	struct timespec *rqtp,
				1747	struct timespec __user *rmtp)
				1748	{
				1749	return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
				1750	}
				1751	static long process_cpu_nsleep_restart(struct restart_block *restart_block)
				1752	{
				1753	return -EINVAL;
				1754	}
				1755	static int thread_cpu_clock_getres(const clockid_t which_clock,
				1756	struct timespec *tp)
				1757	{
				1758	return posix_cpu_clock_getres(THREAD_CLOCK, tp);
				1759	}
				1760	static int thread_cpu_clock_get(const clockid_t which_clock,
				1761	struct timespec *tp)
				1762	{
				1763	return posix_cpu_clock_get(THREAD_CLOCK, tp);
				1764	}
				1765	static int thread_cpu_timer_create(struct k_itimer *timer)
				1766	{
				1767	timer->it_clock = THREAD_CLOCK;
				1768	return posix_cpu_timer_create(timer);
				1769	}
				1770
				1771	struct k_clock clock_posix_cpu = {
				1772	.clock_getres = posix_cpu_clock_getres,
				1773	.clock_set = posix_cpu_clock_set,
				1774	.clock_get = posix_cpu_clock_get,
				1775	.timer_create = posix_cpu_timer_create,
				1776	.nsleep = posix_cpu_nsleep,
				1777	.nsleep_restart = posix_cpu_nsleep_restart,
				1778	.timer_set = posix_cpu_timer_set,
				1779	.timer_del = posix_cpu_timer_del,
				1780	.timer_get = posix_cpu_timer_get,
				1781	};
				1782
				1783	static __init int init_posix_cpu_timers(void)
				1784	{
				1785	struct k_clock process = {
				1786	.clock_getres = process_cpu_clock_getres,
				1787	.clock_get = process_cpu_clock_get,
				1788	.timer_create = process_cpu_timer_create,
				1789	.nsleep = process_cpu_nsleep,
				1790	.nsleep_restart = process_cpu_nsleep_restart,
				1791	};
				1792	struct k_clock thread = {
				1793	.clock_getres = thread_cpu_clock_getres,
				1794	.clock_get = thread_cpu_clock_get,
				1795	.timer_create = thread_cpu_timer_create,
				1796	};
				1797	struct timespec ts;
				1798
				1799	posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
				1800	posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
				1801
				1802	cputime_to_timespec(cputime_one_jiffy, &ts);
				1803	onecputick = ts.tv_nsec;
				1804	WARN_ON(ts.tv_sec != 0);
				1805
				1806	return 0;
				1807	}
				1808	__initcall(init_posix_cpu_timers);