Blame - marvell/linux/kernel/time/hrtimer.c - T108

blob: 0772b848c7a81df89b187c10d3ca1b5fcf138d1b [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
				4	* Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
				5	* Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
				6	*
				7	* High-resolution kernel timers
				8	*
				9	* In contrast to the low-resolution timeout API, aka timer wheel,
				10	* hrtimers provide finer resolution and accuracy depending on system
				11	* configuration and capabilities.
				12	*
				13	* Started by: Thomas Gleixner and Ingo Molnar
				14	*
				15	* Credits:
				16	* Based on the original timer wheel code
				17	*
				18	* Help, testing, suggestions, bugfixes, improvements were
				19	* provided by:
				20	*
				21	* George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel
				22	* et. al.
				23	*/
				24
				25	#include <linux/cpu.h>
				26	#include <linux/export.h>
				27	#include <linux/percpu.h>
				28	#include <linux/hrtimer.h>
				29	#include <linux/notifier.h>
				30	#include <linux/syscalls.h>
				31	#include <linux/interrupt.h>
				32	#include <linux/tick.h>
				33	#include <linux/err.h>
				34	#include <linux/debugobjects.h>
				35	#include <linux/sched/signal.h>
				36	#include <linux/sched/sysctl.h>
				37	#include <linux/sched/rt.h>
				38	#include <linux/sched/deadline.h>
				39	#include <linux/sched/nohz.h>
				40	#include <linux/sched/debug.h>
				41	#include <linux/timer.h>
				42	#include <linux/freezer.h>
				43	#include <linux/compat.h>
				44
				45	#include <linux/uaccess.h>
				46
				47	#include <trace/events/timer.h>
				48
				49	#include "tick-internal.h"
				50
				51	/*
				52	* Masks for selecting the soft and hard context timers from
				53	* cpu_base->active
				54	*/
				55	#define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT)
				56	#define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1)
				57	#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
				58	#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT \| HRTIMER_ACTIVE_HARD)
				59
				60	/*
				61	* The timer bases:
				62	*
				63	* There are more clockids than hrtimer bases. Thus, we index
				64	* into the timer bases by the hrtimer_base_type enum. When trying
				65	* to reach a base using a clockid, hrtimer_clockid_to_base()
				66	* is used to convert from clockid to the proper hrtimer_base_type.
				67	*/
				68	DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
				69	{
				70	.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
				71	.clock_base =
				72	{
				73	{
				74	.index = HRTIMER_BASE_MONOTONIC,
				75	.clockid = CLOCK_MONOTONIC,
				76	.get_time = &ktime_get,
				77	},
				78	{
				79	.index = HRTIMER_BASE_REALTIME,
				80	.clockid = CLOCK_REALTIME,
				81	.get_time = &ktime_get_real,
				82	},
				83	{
				84	.index = HRTIMER_BASE_BOOTTIME,
				85	.clockid = CLOCK_BOOTTIME,
				86	.get_time = &ktime_get_boottime,
				87	},
				88	{
				89	.index = HRTIMER_BASE_TAI,
				90	.clockid = CLOCK_TAI,
				91	.get_time = &ktime_get_clocktai,
				92	},
				93	{
				94	.index = HRTIMER_BASE_MONOTONIC_SOFT,
				95	.clockid = CLOCK_MONOTONIC,
				96	.get_time = &ktime_get,
				97	},
				98	{
				99	.index = HRTIMER_BASE_REALTIME_SOFT,
				100	.clockid = CLOCK_REALTIME,
				101	.get_time = &ktime_get_real,
				102	},
				103	{
				104	.index = HRTIMER_BASE_BOOTTIME_SOFT,
				105	.clockid = CLOCK_BOOTTIME,
				106	.get_time = &ktime_get_boottime,
				107	},
				108	{
				109	.index = HRTIMER_BASE_TAI_SOFT,
				110	.clockid = CLOCK_TAI,
				111	.get_time = &ktime_get_clocktai,
				112	},
				113	}
				114	};
				115
				116	static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
				117	/* Make sure we catch unsupported clockids */
				118	[0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES,
				119
				120	[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
				121	[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
				122	[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
				123	[CLOCK_TAI] = HRTIMER_BASE_TAI,
				124	};
				125
				126	/*
				127	* Functions and macros which are different for UP/SMP systems are kept in a
				128	* single place
				129	*/
				130	#ifdef CONFIG_SMP
				131
				132	/*
				133	* We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
				134	* such that hrtimer_callback_running() can unconditionally dereference
				135	* timer->base->cpu_base
				136	*/
				137	static struct hrtimer_cpu_base migration_cpu_base = {
				138	.clock_base = { { .cpu_base = &migration_cpu_base, }, },
				139	};
				140
				141	#define migration_base migration_cpu_base.clock_base[0]
				142
				143	static inline bool is_migration_base(struct hrtimer_clock_base *base)
				144	{
				145	return base == &migration_base;
				146	}
				147
				148	/*
				149	* We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
				150	* means that all timers which are tied to this base via timer->base are
				151	* locked, and the base itself is locked too.
				152	*
				153	* So __run_timers/migrate_timers can safely modify all timers which could
				154	* be found on the lists/queues.
				155	*
				156	* When the timer's base is locked, and the timer removed from list, it is
				157	* possible to set timer->base = &migration_base and drop the lock: the timer
				158	* remains locked.
				159	*/
				160	static
				161	struct hrtimer_clock_base lock_hrtimer_base(const struct hrtimer timer,
				162	unsigned long *flags)
				163	{
				164	struct hrtimer_clock_base *base;
				165
				166	for (;;) {
				167	base = READ_ONCE(timer->base);
				168	if (likely(base != &migration_base)) {
				169	raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
				170	if (likely(base == timer->base))
				171	return base;
				172	/* The timer has migrated to another CPU: */
				173	raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
				174	}
				175	cpu_relax();
				176	}
				177	}
				178
				179	/*
				180	* We do not migrate the timer when it is expiring before the next
				181	* event on the target cpu. When high resolution is enabled, we cannot
				182	* reprogram the target cpu hardware and we would cause it to fire
				183	* late. To keep it simple, we handle the high resolution enabled and
				184	* disabled case similar.
				185	*
				186	* Called with cpu_base->lock of target cpu held.
				187	*/
				188	static int
				189	hrtimer_check_target(struct hrtimer timer, struct hrtimer_clock_base new_base)
				190	{
				191	ktime_t expires;
				192
				193	expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
				194	return expires < new_base->cpu_base->expires_next;
				195	}
				196
				197	static inline
				198	struct hrtimer_cpu_base get_target_base(struct hrtimer_cpu_base base,
				199	int pinned)
				200	{
				201	#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
				202	if (static_branch_likely(&timers_migration_enabled) && !pinned)
				203	return &per_cpu(hrtimer_bases, get_nohz_timer_target());
				204	#endif
				205	return base;
				206	}
				207
				208	/*
				209	* We switch the timer base to a power-optimized selected CPU target,
				210	* if:
				211	* - NO_HZ_COMMON is enabled
				212	* - timer migration is enabled
				213	* - the timer callback is not running
				214	* - the timer is not the first expiring timer on the new target
				215	*
				216	* If one of the above requirements is not fulfilled we move the timer
				217	* to the current CPU or leave it on the previously assigned CPU if
				218	* the timer callback is currently running.
				219	*/
				220	static inline struct hrtimer_clock_base *
				221	switch_hrtimer_base(struct hrtimer timer, struct hrtimer_clock_base base,
				222	int pinned)
				223	{
				224	struct hrtimer_cpu_base new_cpu_base, this_cpu_base;
				225	struct hrtimer_clock_base *new_base;
				226	int basenum = base->index;
				227
				228	this_cpu_base = this_cpu_ptr(&hrtimer_bases);
				229	new_cpu_base = get_target_base(this_cpu_base, pinned);
				230	again:
				231	new_base = &new_cpu_base->clock_base[basenum];
				232
				233	if (base != new_base) {
				234	/*
				235	* We are trying to move timer to new_base.
				236	* However we can't change timer's base while it is running,
				237	* so we keep it on the same CPU. No hassle vs. reprogramming
				238	* the event source in the high resolution case. The softirq
				239	* code will take care of this when the timer function has
				240	* completed. There is no conflict as we hold the lock until
				241	* the timer is enqueued.
				242	*/
				243	if (unlikely(hrtimer_callback_running(timer)))
				244	return base;
				245
				246	/* See the comment in lock_hrtimer_base() */
				247	WRITE_ONCE(timer->base, &migration_base);
				248	raw_spin_unlock(&base->cpu_base->lock);
				249	raw_spin_lock(&new_base->cpu_base->lock);
				250
				251	if (new_cpu_base != this_cpu_base &&
				252	hrtimer_check_target(timer, new_base)) {
				253	raw_spin_unlock(&new_base->cpu_base->lock);
				254	raw_spin_lock(&base->cpu_base->lock);
				255	new_cpu_base = this_cpu_base;
				256	WRITE_ONCE(timer->base, base);
				257	goto again;
				258	}
				259	WRITE_ONCE(timer->base, new_base);
				260	} else {
				261	if (new_cpu_base != this_cpu_base &&
				262	hrtimer_check_target(timer, new_base)) {
				263	new_cpu_base = this_cpu_base;
				264	goto again;
				265	}
				266	}
				267	return new_base;
				268	}
				269
				270	#else /* CONFIG_SMP */
				271
				272	static inline bool is_migration_base(struct hrtimer_clock_base *base)
				273	{
				274	return false;
				275	}
				276
				277	static inline struct hrtimer_clock_base *
				278	lock_hrtimer_base(const struct hrtimer timer, unsigned long flags)
				279	{
				280	struct hrtimer_clock_base *base = timer->base;
				281
				282	raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
				283
				284	return base;
				285	}
				286
				287	# define switch_hrtimer_base(t, b, p) (b)
				288
				289	#endif /* !CONFIG_SMP */
				290
				291	/*
				292	* Functions for the union type storage format of ktime_t which are
				293	* too large for inlining:
				294	*/
				295	#if BITS_PER_LONG < 64
				296	/*
				297	* Divide a ktime value by a nanosecond value
				298	*/
				299	s64 __ktime_divns(const ktime_t kt, s64 div)
				300	{
				301	int sft = 0;
				302	s64 dclc;
				303	u64 tmp;
				304
				305	dclc = ktime_to_ns(kt);
				306	tmp = dclc < 0 ? -dclc : dclc;
				307
				308	/* Make sure the divisor is less than 2^32: */
				309	while (div >> 32) {
				310	sft++;
				311	div >>= 1;
				312	}
				313	tmp >>= sft;
				314	do_div(tmp, (unsigned long) div);
				315	return dclc < 0 ? -tmp : tmp;
				316	}
				317	EXPORT_SYMBOL_GPL(__ktime_divns);
				318	#endif /* BITS_PER_LONG < 64 */
				319
				320	/*
				321	* Add two ktime values and do a safety check for overflow:
				322	*/
				323	ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
				324	{
				325	ktime_t res = ktime_add_unsafe(lhs, rhs);
				326
				327	/*
				328	* We use KTIME_SEC_MAX here, the maximum timeout which we can
				329	* return to user space in a timespec:
				330	*/
				331	if (res < 0 \|\| res < lhs \|\| res < rhs)
				332	res = ktime_set(KTIME_SEC_MAX, 0);
				333
				334	return res;
				335	}
				336
				337	EXPORT_SYMBOL_GPL(ktime_add_safe);
				338
				339	#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
				340
				341	static struct debug_obj_descr hrtimer_debug_descr;
				342
				343	static void hrtimer_debug_hint(void addr)
				344	{
				345	return ((struct hrtimer *) addr)->function;
				346	}
				347
				348	/*
				349	* fixup_init is called when:
				350	* - an active object is initialized
				351	*/
				352	static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state)
				353	{
				354	struct hrtimer *timer = addr;
				355
				356	switch (state) {
				357	case ODEBUG_STATE_ACTIVE:
				358	hrtimer_cancel(timer);
				359	debug_object_init(timer, &hrtimer_debug_descr);
				360	return true;
				361	default:
				362	return false;
				363	}
				364	}
				365
				366	/*
				367	* fixup_activate is called when:
				368	* - an active object is activated
				369	* - an unknown non-static object is activated
				370	*/
				371	static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
				372	{
				373	switch (state) {
				374	case ODEBUG_STATE_ACTIVE:
				375	WARN_ON(1);
				376	/* fall through */
				377	default:
				378	return false;
				379	}
				380	}
				381
				382	/*
				383	* fixup_free is called when:
				384	* - an active object is freed
				385	*/
				386	static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state)
				387	{
				388	struct hrtimer *timer = addr;
				389
				390	switch (state) {
				391	case ODEBUG_STATE_ACTIVE:
				392	hrtimer_cancel(timer);
				393	debug_object_free(timer, &hrtimer_debug_descr);
				394	return true;
				395	default:
				396	return false;
				397	}
				398	}
				399
				400	static struct debug_obj_descr hrtimer_debug_descr = {
				401	.name = "hrtimer",
				402	.debug_hint = hrtimer_debug_hint,
				403	.fixup_init = hrtimer_fixup_init,
				404	.fixup_activate = hrtimer_fixup_activate,
				405	.fixup_free = hrtimer_fixup_free,
				406	};
				407
				408	static inline void debug_hrtimer_init(struct hrtimer *timer)
				409	{
				410	debug_object_init(timer, &hrtimer_debug_descr);
				411	}
				412
				413	static inline void debug_hrtimer_activate(struct hrtimer *timer,
				414	enum hrtimer_mode mode)
				415	{
				416	debug_object_activate(timer, &hrtimer_debug_descr);
				417	}
				418
				419	static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
				420	{
				421	debug_object_deactivate(timer, &hrtimer_debug_descr);
				422	}
				423
				424	static inline void debug_hrtimer_free(struct hrtimer *timer)
				425	{
				426	debug_object_free(timer, &hrtimer_debug_descr);
				427	}
				428
				429	static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
				430	enum hrtimer_mode mode);
				431
				432	void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
				433	enum hrtimer_mode mode)
				434	{
				435	debug_object_init_on_stack(timer, &hrtimer_debug_descr);
				436	__hrtimer_init(timer, clock_id, mode);
				437	}
				438	EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);
				439
				440	static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
				441	clockid_t clock_id, enum hrtimer_mode mode);
				442
				443	void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
				444	clockid_t clock_id, enum hrtimer_mode mode)
				445	{
				446	debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr);
				447	__hrtimer_init_sleeper(sl, clock_id, mode);
				448	}
				449	EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack);
				450
				451	void destroy_hrtimer_on_stack(struct hrtimer *timer)
				452	{
				453	debug_object_free(timer, &hrtimer_debug_descr);
				454	}
				455	EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
				456
				457	#else
				458
				459	static inline void debug_hrtimer_init(struct hrtimer *timer) { }
				460	static inline void debug_hrtimer_activate(struct hrtimer *timer,
				461	enum hrtimer_mode mode) { }
				462	static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
				463	#endif
				464
				465	static inline void
				466	debug_init(struct hrtimer *timer, clockid_t clockid,
				467	enum hrtimer_mode mode)
				468	{
				469	debug_hrtimer_init(timer);
				470	trace_hrtimer_init(timer, clockid, mode);
				471	}
				472
				473	static inline void debug_activate(struct hrtimer *timer,
				474	enum hrtimer_mode mode)
				475	{
				476	debug_hrtimer_activate(timer, mode);
				477	trace_hrtimer_start(timer, mode);
				478	}
				479
				480	static inline void debug_deactivate(struct hrtimer *timer)
				481	{
				482	debug_hrtimer_deactivate(timer);
				483	trace_hrtimer_cancel(timer);
				484	}
				485
				486	static struct hrtimer_clock_base *
				487	__next_base(struct hrtimer_cpu_base cpu_base, unsigned int active)
				488	{
				489	unsigned int idx;
				490
				491	if (!*active)
				492	return NULL;
				493
				494	idx = __ffs(*active);
				495	*active &= ~(1U << idx);
				496
				497	return &cpu_base->clock_base[idx];
				498	}
				499
				500	#define for_each_active_base(base, cpu_base, active) \
				501	while ((base = __next_base((cpu_base), &(active))))
				502
				503	static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
				504	const struct hrtimer *exclude,
				505	unsigned int active,
				506	ktime_t expires_next)
				507	{
				508	struct hrtimer_clock_base *base;
				509	ktime_t expires;
				510
				511	for_each_active_base(base, cpu_base, active) {
				512	struct timerqueue_node *next;
				513	struct hrtimer *timer;
				514
				515	next = timerqueue_getnext(&base->active);
				516	timer = container_of(next, struct hrtimer, node);
				517	if (timer == exclude) {
				518	/* Get to the next timer in the queue. */
				519	next = timerqueue_iterate_next(next);
				520	if (!next)
				521	continue;
				522
				523	timer = container_of(next, struct hrtimer, node);
				524	}
				525	expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
				526	if (expires < expires_next) {
				527	expires_next = expires;
				528
				529	/* Skip cpu_base update if a timer is being excluded. */
				530	if (exclude)
				531	continue;
				532
				533	if (timer->is_soft)
				534	cpu_base->softirq_next_timer = timer;
				535	else
				536	cpu_base->next_timer = timer;
				537	}
				538	}
				539	/*
				540	* clock_was_set() might have changed base->offset of any of
				541	* the clock bases so the result might be negative. Fix it up
				542	* to prevent a false positive in clockevents_program_event().
				543	*/
				544	if (expires_next < 0)
				545	expires_next = 0;
				546	return expires_next;
				547	}
				548
				549	/*
				550	* Recomputes cpu_base::*next_timer and returns the earliest expires_next
				551	* but does not set cpu_base::*expires_next, that is done by
				552	* hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating
				553	* cpu_base::*expires_next right away, reprogramming logic would no longer
				554	* work.
				555	*
				556	* When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases,
				557	* those timers will get run whenever the softirq gets handled, at the end of
				558	* hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases.
				559	*
				560	* Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases.
				561	* The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual
				562	* softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD.
				563	*
				564	* @active_mask must be one of:
				565	* - HRTIMER_ACTIVE_ALL,
				566	* - HRTIMER_ACTIVE_SOFT, or
				567	* - HRTIMER_ACTIVE_HARD.
				568	*/
				569	static ktime_t
				570	__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
				571	{
				572	unsigned int active;
				573	struct hrtimer *next_timer = NULL;
				574	ktime_t expires_next = KTIME_MAX;
				575
				576	if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
				577	active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
				578	cpu_base->softirq_next_timer = NULL;
				579	expires_next = __hrtimer_next_event_base(cpu_base, NULL,
				580	active, KTIME_MAX);
				581
				582	next_timer = cpu_base->softirq_next_timer;
				583	}
				584
				585	if (active_mask & HRTIMER_ACTIVE_HARD) {
				586	active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
				587	cpu_base->next_timer = next_timer;
				588	expires_next = __hrtimer_next_event_base(cpu_base, NULL, active,
				589	expires_next);
				590	}
				591
				592	return expires_next;
				593	}
				594
				595	static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base)
				596	{
				597	ktime_t expires_next, soft = KTIME_MAX;
				598
				599	/*
				600	* If the soft interrupt has already been activated, ignore the
				601	* soft bases. They will be handled in the already raised soft
				602	* interrupt.
				603	*/
				604	if (!cpu_base->softirq_activated) {
				605	soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
				606	/*
				607	* Update the soft expiry time. clock_settime() might have
				608	* affected it.
				609	*/
				610	cpu_base->softirq_expires_next = soft;
				611	}
				612
				613	expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD);
				614	/*
				615	* If a softirq timer is expiring first, update cpu_base->next_timer
				616	* and program the hardware with the soft expiry time.
				617	*/
				618	if (expires_next > soft) {
				619	cpu_base->next_timer = cpu_base->softirq_next_timer;
				620	expires_next = soft;
				621	}
				622
				623	return expires_next;
				624	}
				625
				626	static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
				627	{
				628	ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
				629	ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
				630	ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
				631
				632	ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq,
				633	offs_real, offs_boot, offs_tai);
				634
				635	base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real;
				636	base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot;
				637	base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai;
				638
				639	return now;
				640	}
				641
				642	/*
				643	* Is the high resolution mode active ?
				644	*/
				645	static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
				646	{
				647	return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
				648	cpu_base->hres_active : 0;
				649	}
				650
				651	static inline int hrtimer_hres_active(void)
				652	{
				653	return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases));
				654	}
				655
				656	/*
				657	* Reprogram the event source with checking both queues for the
				658	* next event
				659	* Called with interrupts disabled and base->lock held
				660	*/
				661	static void
				662	hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
				663	{
				664	ktime_t expires_next;
				665
				666	expires_next = hrtimer_update_next_event(cpu_base);
				667
				668	if (skip_equal && expires_next == cpu_base->expires_next)
				669	return;
				670
				671	cpu_base->expires_next = expires_next;
				672
				673	/*
				674	* If hres is not active, hardware does not have to be
				675	* reprogrammed yet.
				676	*
				677	* If a hang was detected in the last timer interrupt then we
				678	* leave the hang delay active in the hardware. We want the
				679	* system to make progress. That also prevents the following
				680	* scenario:
				681	* T1 expires 50ms from now
				682	* T2 expires 5s from now
				683	*
				684	* T1 is removed, so this code is called and would reprogram
				685	* the hardware to 5s from now. Any hrtimer_start after that
				686	* will not reprogram the hardware due to hang_detected being
				687	* set. So we'd effectivly block all timers until the T2 event
				688	* fires.
				689	*/
				690	if (!__hrtimer_hres_active(cpu_base) \|\| cpu_base->hang_detected)
				691	return;
				692
				693	tick_program_event(cpu_base->expires_next, 1);
				694	}
				695
				696	/* High resolution timer related functions */
				697	#ifdef CONFIG_HIGH_RES_TIMERS
				698
				699	/*
				700	* High resolution timer enabled ?
				701	*/
				702	static bool hrtimer_hres_enabled __read_mostly = true;
				703	unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
				704	EXPORT_SYMBOL_GPL(hrtimer_resolution);
				705
				706	/*
				707	* Enable / Disable high resolution mode
				708	*/
				709	static int __init setup_hrtimer_hres(char *str)
				710	{
				711	return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
				712	}
				713
				714	__setup("highres=", setup_hrtimer_hres);
				715
				716	/*
				717	* hrtimer_is_hres_enabled - query, if the highres mode is enabled
				718	*/
				719	static inline int hrtimer_is_hres_enabled(void)
				720	{
				721	return hrtimer_hres_enabled;
				722	}
				723
				724	/*
				725	* Retrigger next event is called after clock was set
				726	*
				727	* Called with interrupts disabled via on_each_cpu()
				728	*/
				729	static void retrigger_next_event(void *arg)
				730	{
				731	struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
				732
				733	if (!__hrtimer_hres_active(base))
				734	return;
				735
				736	raw_spin_lock(&base->lock);
				737	hrtimer_update_base(base);
				738	hrtimer_force_reprogram(base, 0);
				739	raw_spin_unlock(&base->lock);
				740	}
				741
				742	/*
				743	* Switch to high resolution mode
				744	*/
				745	static void hrtimer_switch_to_hres(void)
				746	{
				747	struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
				748
				749	if (tick_init_highres()) {
				750	pr_warn("Could not switch to high resolution mode on CPU %u\n",
				751	base->cpu);
				752	return;
				753	}
				754	base->hres_active = 1;
				755	hrtimer_resolution = HIGH_RES_NSEC;
				756
				757	tick_setup_sched_timer();
				758	/* "Retrigger" the interrupt to get things going */
				759	retrigger_next_event(NULL);
				760	}
				761
				762	#else
				763
				764	static inline int hrtimer_is_hres_enabled(void) { return 0; }
				765	static inline void hrtimer_switch_to_hres(void) { }
				766	static inline void retrigger_next_event(void *arg) { }
				767
				768	#endif /* CONFIG_HIGH_RES_TIMERS */
				769
				770	/*
				771	* When a timer is enqueued and expires earlier than the already enqueued
				772	* timers, we have to check, whether it expires earlier than the timer for
				773	* which the clock event device was armed.
				774	*
				775	* Called with interrupts disabled and base->cpu_base.lock held
				776	*/
				777	static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
				778	{
				779	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
				780	struct hrtimer_clock_base *base = timer->base;
				781	ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
				782
				783	WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
				784
				785	/*
				786	* CLOCK_REALTIME timer might be requested with an absolute
				787	* expiry time which is less than base->offset. Set it to 0.
				788	*/
				789	if (expires < 0)
				790	expires = 0;
				791
				792	if (timer->is_soft) {
				793	/*
				794	* soft hrtimer could be started on a remote CPU. In this
				795	* case softirq_expires_next needs to be updated on the
				796	* remote CPU. The soft hrtimer will not expire before the
				797	* first hard hrtimer on the remote CPU -
				798	* hrtimer_check_target() prevents this case.
				799	*/
				800	struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base;
				801
				802	if (timer_cpu_base->softirq_activated)
				803	return;
				804
				805	if (!ktime_before(expires, timer_cpu_base->softirq_expires_next))
				806	return;
				807
				808	timer_cpu_base->softirq_next_timer = timer;
				809	timer_cpu_base->softirq_expires_next = expires;
				810
				811	if (!ktime_before(expires, timer_cpu_base->expires_next) \|\|
				812	!reprogram)
				813	return;
				814	}
				815
				816	/*
				817	* If the timer is not on the current cpu, we cannot reprogram
				818	* the other cpus clock event device.
				819	*/
				820	if (base->cpu_base != cpu_base)
				821	return;
				822
				823	/*
				824	* If the hrtimer interrupt is running, then it will
				825	* reevaluate the clock bases and reprogram the clock event
				826	* device. The callbacks are always executed in hard interrupt
				827	* context so we don't need an extra check for a running
				828	* callback.
				829	*/
				830	if (cpu_base->in_hrtirq)
				831	return;
				832
				833	if (expires >= cpu_base->expires_next)
				834	return;
				835
				836	/* Update the pointer to the next expiring timer */
				837	cpu_base->next_timer = timer;
				838	cpu_base->expires_next = expires;
				839
				840	/*
				841	* If hres is not active, hardware does not have to be
				842	* programmed yet.
				843	*
				844	* If a hang was detected in the last timer interrupt then we
				845	* do not schedule a timer which is earlier than the expiry
				846	* which we enforced in the hang detection. We want the system
				847	* to make progress.
				848	*/
				849	if (!__hrtimer_hres_active(cpu_base) \|\| cpu_base->hang_detected)
				850	return;
				851
				852	/*
				853	* Program the timer hardware. We enforce the expiry for
				854	* events which are already in the past.
				855	*/
				856	tick_program_event(expires, 1);
				857	}
				858
				859	/*
				860	* Clock realtime was set
				861	*
				862	* Change the offset of the realtime clock vs. the monotonic
				863	* clock.
				864	*
				865	* We might have to reprogram the high resolution timer interrupt. On
				866	* SMP we call the architecture specific code to retrigger _all_ high
				867	* resolution timer interrupts. On UP we just disable interrupts and
				868	* call the high resolution interrupt code.
				869	*/
				870	void clock_was_set(void)
				871	{
				872	#ifdef CONFIG_HIGH_RES_TIMERS
				873	/* Retrigger the CPU local events everywhere */
				874	on_each_cpu(retrigger_next_event, NULL, 1);
				875	#endif
				876	timerfd_clock_was_set();
				877	}
				878
				879	static void clock_was_set_work(struct work_struct *work)
				880	{
				881	clock_was_set();
				882	}
				883
				884	static DECLARE_WORK(hrtimer_work, clock_was_set_work);
				885
				886	/*
				887	* Called from timekeeping and resume code to reprogram the hrtimer
				888	* interrupt device on all cpus and to notify timerfd.
				889	*/
				890	void clock_was_set_delayed(void)
				891	{
				892	schedule_work(&hrtimer_work);
				893	}
				894
				895	/*
				896	* During resume we might have to reprogram the high resolution timer
				897	* interrupt on all online CPUs. However, all other CPUs will be
				898	* stopped with IRQs interrupts disabled so the clock_was_set() call
				899	* must be deferred.
				900	*/
				901	void hrtimers_resume(void)
				902	{
				903	lockdep_assert_irqs_disabled();
				904	/* Retrigger on the local CPU */
				905	retrigger_next_event(NULL);
				906	/* And schedule a retrigger for all others */
				907	clock_was_set_delayed();
				908	}
				909
				910	/*
				911	* Counterpart to lock_hrtimer_base above:
				912	*/
				913	static inline
				914	void unlock_hrtimer_base(const struct hrtimer timer, unsigned long flags)
				915	{
				916	raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
				917	}
				918
				919	/**
				920	* hrtimer_forward - forward the timer expiry
				921	* @timer: hrtimer to forward
				922	* @now: forward past this time
				923	* @interval: the interval to forward
				924	*
				925	* Forward the timer expiry so it will expire in the future.
				926	* Returns the number of overruns.
				927	*
				928	* Can be safely called from the callback function of @timer. If
				929	* called from other contexts @timer must neither be enqueued nor
				930	* running the callback and the caller needs to take care of
				931	* serialization.
				932	*
				933	* Note: This only updates the timer expiry value and does not requeue
				934	* the timer.
				935	*/
				936	u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
				937	{
				938	u64 orun = 1;
				939	ktime_t delta;
				940
				941	delta = ktime_sub(now, hrtimer_get_expires(timer));
				942
				943	if (delta < 0)
				944	return 0;
				945
				946	if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED))
				947	return 0;
				948
				949	if (interval < hrtimer_resolution)
				950	interval = hrtimer_resolution;
				951
				952	if (unlikely(delta >= interval)) {
				953	s64 incr = ktime_to_ns(interval);
				954
				955	orun = ktime_divns(delta, incr);
				956	hrtimer_add_expires_ns(timer, incr * orun);
				957	if (hrtimer_get_expires_tv64(timer) > now)
				958	return orun;
				959	/*
				960	* This (and the ktime_add() below) is the
				961	* correction for exact:
				962	*/
				963	orun++;
				964	}
				965	hrtimer_add_expires(timer, interval);
				966
				967	return orun;
				968	}
				969	EXPORT_SYMBOL_GPL(hrtimer_forward);
				970
				971	/*
				972	* enqueue_hrtimer - internal function to (re)start a timer
				973	*
				974	* The timer is inserted in expiry order. Insertion into the
				975	* red black tree is O(log(n)). Must hold the base lock.
				976	*
				977	* Returns 1 when the new timer is the leftmost timer in the tree.
				978	*/
				979	static int enqueue_hrtimer(struct hrtimer *timer,
				980	struct hrtimer_clock_base *base,
				981	enum hrtimer_mode mode)
				982	{
				983	debug_activate(timer, mode);
				984	WARN_ON_ONCE(!base->cpu_base->online);
				985
				986	base->cpu_base->active_bases \|= 1 << base->index;
				987
				988	/* Pairs with the lockless read in hrtimer_is_queued() */
				989	WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED);
				990
				991	return timerqueue_add(&base->active, &timer->node);
				992	}
				993
				994	/*
				995	* __remove_hrtimer - internal function to remove a timer
				996	*
				997	* Caller must hold the base lock.
				998	*
				999	* High resolution timer mode reprograms the clock event device when the
				1000	* timer is the one which expires next. The caller can disable this by setting
				1001	* reprogram to zero. This is useful, when the context does a reprogramming
				1002	* anyway (e.g. timer interrupt)
				1003	*/
				1004	static void __remove_hrtimer(struct hrtimer *timer,
				1005	struct hrtimer_clock_base *base,
				1006	u8 newstate, int reprogram)
				1007	{
				1008	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
				1009	u8 state = timer->state;
				1010
				1011	/* Pairs with the lockless read in hrtimer_is_queued() */
				1012	WRITE_ONCE(timer->state, newstate);
				1013	if (!(state & HRTIMER_STATE_ENQUEUED))
				1014	return;
				1015
				1016	if (!timerqueue_del(&base->active, &timer->node))
				1017	cpu_base->active_bases &= ~(1 << base->index);
				1018
				1019	/*
				1020	* Note: If reprogram is false we do not update
				1021	* cpu_base->next_timer. This happens when we remove the first
				1022	* timer on a remote cpu. No harm as we never dereference
				1023	* cpu_base->next_timer. So the worst thing what can happen is
				1024	* an superflous call to hrtimer_force_reprogram() on the
				1025	* remote cpu later on if the same timer gets enqueued again.
				1026	*/
				1027	if (reprogram && timer == cpu_base->next_timer)
				1028	hrtimer_force_reprogram(cpu_base, 1);
				1029	}
				1030
				1031	/*
				1032	* remove hrtimer, called with base lock held
				1033	*/
				1034	static inline int
				1035	remove_hrtimer(struct hrtimer timer, struct hrtimer_clock_base base,
				1036	bool restart, bool keep_local)
				1037	{
				1038	u8 state = timer->state;
				1039
				1040	if (state & HRTIMER_STATE_ENQUEUED) {
				1041	bool reprogram;
				1042
				1043	/*
				1044	* Remove the timer and force reprogramming when high
				1045	* resolution mode is active and the timer is on the current
				1046	* CPU. If we remove a timer on another CPU, reprogramming is
				1047	* skipped. The interrupt event on this CPU is fired and
				1048	* reprogramming happens in the interrupt handler. This is a
				1049	* rare case and less expensive than a smp call.
				1050	*/
				1051	debug_deactivate(timer);
				1052	reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
				1053
				1054	/*
				1055	* If the timer is not restarted then reprogramming is
				1056	* required if the timer is local. If it is local and about
				1057	* to be restarted, avoid programming it twice (on removal
				1058	* and a moment later when it's requeued).
				1059	*/
				1060	if (!restart)
				1061	state = HRTIMER_STATE_INACTIVE;
				1062	else
				1063	reprogram &= !keep_local;
				1064
				1065	__remove_hrtimer(timer, base, state, reprogram);
				1066	return 1;
				1067	}
				1068	return 0;
				1069	}
				1070
				1071	static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
				1072	const enum hrtimer_mode mode)
				1073	{
				1074	#ifdef CONFIG_TIME_LOW_RES
				1075	/*
				1076	* CONFIG_TIME_LOW_RES indicates that the system has no way to return
				1077	* granular time values. For relative timers we add hrtimer_resolution
				1078	* (i.e. one jiffie) to prevent short timeouts.
				1079	*/
				1080	timer->is_rel = mode & HRTIMER_MODE_REL;
				1081	if (timer->is_rel)
				1082	tim = ktime_add_safe(tim, hrtimer_resolution);
				1083	#endif
				1084	return tim;
				1085	}
				1086
				1087	static void
				1088	hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
				1089	{
				1090	ktime_t expires;
				1091
				1092	/*
				1093	* Find the next SOFT expiration.
				1094	*/
				1095	expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
				1096
				1097	/*
				1098	* reprogramming needs to be triggered, even if the next soft
				1099	* hrtimer expires at the same time than the next hard
				1100	* hrtimer. cpu_base->softirq_expires_next needs to be updated!
				1101	*/
				1102	if (expires == KTIME_MAX)
				1103	return;
				1104
				1105	/*
				1106	* cpu_base->*next_timer is recomputed by __hrtimer_get_next_event()
				1107	* cpu_base->*expires_next is only set by hrtimer_reprogram()
				1108	*/
				1109	hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram);
				1110	}
				1111
				1112	static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
				1113	u64 delta_ns, const enum hrtimer_mode mode,
				1114	struct hrtimer_clock_base *base)
				1115	{
				1116	struct hrtimer_clock_base *new_base;
				1117	bool force_local, first;
				1118
				1119	/*
				1120	* If the timer is on the local cpu base and is the first expiring
				1121	* timer then this might end up reprogramming the hardware twice
				1122	* (on removal and on enqueue). To avoid that by prevent the
				1123	* reprogram on removal, keep the timer local to the current CPU
				1124	* and enforce reprogramming after it is queued no matter whether
				1125	* it is the new first expiring timer again or not.
				1126	*/
				1127	force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
				1128	force_local &= base->cpu_base->next_timer == timer;
				1129
				1130	/*
				1131	* Remove an active timer from the queue. In case it is not queued
				1132	* on the current CPU, make sure that remove_hrtimer() updates the
				1133	* remote data correctly.
				1134	*
				1135	* If it's on the current CPU and the first expiring timer, then
				1136	* skip reprogramming, keep the timer local and enforce
				1137	* reprogramming later if it was the first expiring timer. This
				1138	* avoids programming the underlying clock event twice (once at
				1139	* removal and once after enqueue).
				1140	*/
				1141	remove_hrtimer(timer, base, true, force_local);
				1142
				1143	if (mode & HRTIMER_MODE_REL)
				1144	tim = ktime_add_safe(tim, base->get_time());
				1145
				1146	tim = hrtimer_update_lowres(timer, tim, mode);
				1147
				1148	hrtimer_set_expires_range_ns(timer, tim, delta_ns);
				1149
				1150	/* Switch the timer base, if necessary: */
				1151	if (!force_local) {
				1152	new_base = switch_hrtimer_base(timer, base,
				1153	mode & HRTIMER_MODE_PINNED);
				1154	} else {
				1155	new_base = base;
				1156	}
				1157
				1158	first = enqueue_hrtimer(timer, new_base, mode);
				1159	if (!force_local)
				1160	return first;
				1161
				1162	/*
				1163	* Timer was forced to stay on the current CPU to avoid
				1164	* reprogramming on removal and enqueue. Force reprogram the
				1165	* hardware by evaluating the new first expiring timer.
				1166	*/
				1167	hrtimer_force_reprogram(new_base->cpu_base, 1);
				1168	return 0;
				1169	}
				1170
				1171	/**
				1172	* hrtimer_start_range_ns - (re)start an hrtimer
				1173	* @timer: the timer to be added
				1174	* @tim: expiry time
				1175	* @delta_ns: "slack" range for the timer
				1176	* @mode: timer mode: absolute (HRTIMER_MODE_ABS) or
				1177	* relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
				1178	* softirq based mode is considered for debug purpose only!
				1179	*/
				1180	void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
				1181	u64 delta_ns, const enum hrtimer_mode mode)
				1182	{
				1183	struct hrtimer_clock_base *base;
				1184	unsigned long flags;
				1185
				1186	if (WARN_ON_ONCE(!timer->function))
				1187	return;
				1188	/*
				1189	* Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
				1190	* match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard
				1191	* expiry mode because unmarked timers are moved to softirq expiry.
				1192	*/
				1193	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
				1194	WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
				1195	else
				1196	WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard);
				1197
				1198	base = lock_hrtimer_base(timer, &flags);
				1199
				1200	if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base))
				1201	hrtimer_reprogram(timer, true);
				1202
				1203	unlock_hrtimer_base(timer, &flags);
				1204	}
				1205	EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
				1206
				1207	/**
				1208	* hrtimer_try_to_cancel - try to deactivate a timer
				1209	* @timer: hrtimer to stop
				1210	*
				1211	* Returns:
				1212	*
				1213	* * 0 when the timer was not active
				1214	* * 1 when the timer was active
				1215	* * -1 when the timer is currently executing the callback function and
				1216	* cannot be stopped
				1217	*/
				1218	int hrtimer_try_to_cancel(struct hrtimer *timer)
				1219	{
				1220	struct hrtimer_clock_base *base;
				1221	unsigned long flags;
				1222	int ret = -1;
				1223
				1224	/*
				1225	* Check lockless first. If the timer is not active (neither
				1226	* enqueued nor running the callback, nothing to do here. The
				1227	* base lock does not serialize against a concurrent enqueue,
				1228	* so we can avoid taking it.
				1229	*/
				1230	if (!hrtimer_active(timer))
				1231	return 0;
				1232
				1233	base = lock_hrtimer_base(timer, &flags);
				1234
				1235	if (!hrtimer_callback_running(timer))
				1236	ret = remove_hrtimer(timer, base, false, false);
				1237
				1238	unlock_hrtimer_base(timer, &flags);
				1239
				1240	return ret;
				1241
				1242	}
				1243	EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
				1244
				1245	#ifdef CONFIG_PREEMPT_RT
				1246	static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base)
				1247	{
				1248	spin_lock_init(&base->softirq_expiry_lock);
				1249	}
				1250
				1251	static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base)
				1252	{
				1253	spin_lock(&base->softirq_expiry_lock);
				1254	}
				1255
				1256	static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base)
				1257	{
				1258	spin_unlock(&base->softirq_expiry_lock);
				1259	}
				1260
				1261	/*
				1262	* The counterpart to hrtimer_cancel_wait_running().
				1263	*
				1264	* If there is a waiter for cpu_base->expiry_lock, then it was waiting for
				1265	* the timer callback to finish. Drop expiry_lock and reaquire it. That
				1266	* allows the waiter to acquire the lock and make progress.
				1267	*/
				1268	static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base,
				1269	unsigned long flags)
				1270	{
				1271	if (atomic_read(&cpu_base->timer_waiters)) {
				1272	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
				1273	spin_unlock(&cpu_base->softirq_expiry_lock);
				1274	spin_lock(&cpu_base->softirq_expiry_lock);
				1275	raw_spin_lock_irq(&cpu_base->lock);
				1276	}
				1277	}
				1278
				1279	/*
				1280	* This function is called on PREEMPT_RT kernels when the fast path
				1281	* deletion of a timer failed because the timer callback function was
				1282	* running.
				1283	*
				1284	* This prevents priority inversion: if the soft irq thread is preempted
				1285	* in the middle of a timer callback, then calling del_timer_sync() can
				1286	* lead to two issues:
				1287	*
				1288	* - If the caller is on a remote CPU then it has to spin wait for the timer
				1289	* handler to complete. This can result in unbound priority inversion.
				1290	*
				1291	* - If the caller originates from the task which preempted the timer
				1292	* handler on the same CPU, then spin waiting for the timer handler to
				1293	* complete is never going to end.
				1294	*/
				1295	void hrtimer_cancel_wait_running(const struct hrtimer *timer)
				1296	{
				1297	/* Lockless read. Prevent the compiler from reloading it below */
				1298	struct hrtimer_clock_base *base = READ_ONCE(timer->base);
				1299
				1300	/*
				1301	* Just relax if the timer expires in hard interrupt context or if
				1302	* it is currently on the migration base.
				1303	*/
				1304	if (!timer->is_soft \|\| is_migration_base(base)) {
				1305	cpu_relax();
				1306	return;
				1307	}
				1308
				1309	/*
				1310	* Mark the base as contended and grab the expiry lock, which is
				1311	* held by the softirq across the timer callback. Drop the lock
				1312	* immediately so the softirq can expire the next timer. In theory
				1313	* the timer could already be running again, but that's more than
				1314	* unlikely and just causes another wait loop.
				1315	*/
				1316	atomic_inc(&base->cpu_base->timer_waiters);
				1317	spin_lock_bh(&base->cpu_base->softirq_expiry_lock);
				1318	atomic_dec(&base->cpu_base->timer_waiters);
				1319	spin_unlock_bh(&base->cpu_base->softirq_expiry_lock);
				1320	}
				1321	#else
				1322	static inline void
				1323	hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { }
				1324	static inline void
				1325	hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { }
				1326	static inline void
				1327	hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { }
				1328	static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base,
				1329	unsigned long flags) { }
				1330	#endif
				1331
				1332	/**
				1333	* hrtimer_cancel - cancel a timer and wait for the handler to finish.
				1334	* @timer: the timer to be cancelled
				1335	*
				1336	* Returns:
				1337	* 0 when the timer was not active
				1338	* 1 when the timer was active
				1339	*/
				1340	int hrtimer_cancel(struct hrtimer *timer)
				1341	{
				1342	int ret;
				1343
				1344	do {
				1345	ret = hrtimer_try_to_cancel(timer);
				1346
				1347	if (ret < 0)
				1348	hrtimer_cancel_wait_running(timer);
				1349	} while (ret < 0);
				1350	return ret;
				1351	}
				1352	EXPORT_SYMBOL_GPL(hrtimer_cancel);
				1353
				1354	/**
				1355	* hrtimer_get_remaining - get remaining time for the timer
				1356	* @timer: the timer to read
				1357	* @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y
				1358	*/
				1359	ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust)
				1360	{
				1361	unsigned long flags;
				1362	ktime_t rem;
				1363
				1364	lock_hrtimer_base(timer, &flags);
				1365	if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust)
				1366	rem = hrtimer_expires_remaining_adjusted(timer);
				1367	else
				1368	rem = hrtimer_expires_remaining(timer);
				1369	unlock_hrtimer_base(timer, &flags);
				1370
				1371	return rem;
				1372	}
				1373	EXPORT_SYMBOL_GPL(__hrtimer_get_remaining);
				1374
				1375	#ifdef CONFIG_NO_HZ_COMMON
				1376	/**
				1377	* hrtimer_get_next_event - get the time until next expiry event
				1378	*
				1379	* Returns the next expiry time or KTIME_MAX if no timer is pending.
				1380	*/
				1381	u64 hrtimer_get_next_event(void)
				1382	{
				1383	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
				1384	u64 expires = KTIME_MAX;
				1385	unsigned long flags;
				1386
				1387	raw_spin_lock_irqsave(&cpu_base->lock, flags);
				1388
				1389	if (!__hrtimer_hres_active(cpu_base))
				1390	expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
				1391
				1392	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
				1393
				1394	return expires;
				1395	}
				1396
				1397	/**
				1398	* hrtimer_next_event_without - time until next expiry event w/o one timer
				1399	* @exclude: timer to exclude
				1400	*
				1401	* Returns the next expiry time over all timers except for the @exclude one or
				1402	* KTIME_MAX if none of them is pending.
				1403	*/
				1404	u64 hrtimer_next_event_without(const struct hrtimer *exclude)
				1405	{
				1406	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
				1407	u64 expires = KTIME_MAX;
				1408	unsigned long flags;
				1409
				1410	raw_spin_lock_irqsave(&cpu_base->lock, flags);
				1411
				1412	if (__hrtimer_hres_active(cpu_base)) {
				1413	unsigned int active;
				1414
				1415	if (!cpu_base->softirq_activated) {
				1416	active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
				1417	expires = __hrtimer_next_event_base(cpu_base, exclude,
				1418	active, KTIME_MAX);
				1419	}
				1420	active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
				1421	expires = __hrtimer_next_event_base(cpu_base, exclude, active,
				1422	expires);
				1423	}
				1424
				1425	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
				1426
				1427	return expires;
				1428	}
				1429	#endif
				1430
				1431	static inline int hrtimer_clockid_to_base(clockid_t clock_id)
				1432	{
				1433	if (likely(clock_id < MAX_CLOCKS)) {
				1434	int base = hrtimer_clock_to_base_table[clock_id];
				1435
				1436	if (likely(base != HRTIMER_MAX_CLOCK_BASES))
				1437	return base;
				1438	}
				1439	WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
				1440	return HRTIMER_BASE_MONOTONIC;
				1441	}
				1442
				1443	static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
				1444	enum hrtimer_mode mode)
				1445	{
				1446	bool softtimer = !!(mode & HRTIMER_MODE_SOFT);
				1447	struct hrtimer_cpu_base *cpu_base;
				1448	int base;
				1449
				1450	/*
				1451	* On PREEMPT_RT enabled kernels hrtimers which are not explicitely
				1452	* marked for hard interrupt expiry mode are moved into soft
				1453	* interrupt context for latency reasons and because the callbacks
				1454	* can invoke functions which might sleep on RT, e.g. spin_lock().
				1455	*/
				1456	if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD))
				1457	softtimer = true;
				1458
				1459	memset(timer, 0, sizeof(struct hrtimer));
				1460
				1461	cpu_base = raw_cpu_ptr(&hrtimer_bases);
				1462
				1463	/*
				1464	* POSIX magic: Relative CLOCK_REALTIME timers are not affected by
				1465	* clock modifications, so they needs to become CLOCK_MONOTONIC to
				1466	* ensure POSIX compliance.
				1467	*/
				1468	if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL)
				1469	clock_id = CLOCK_MONOTONIC;
				1470
				1471	base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;
				1472	base += hrtimer_clockid_to_base(clock_id);
				1473	timer->is_soft = softtimer;
				1474	timer->is_hard = !softtimer;
				1475	timer->base = &cpu_base->clock_base[base];
				1476	timerqueue_init(&timer->node);
				1477	}
				1478
				1479	/**
				1480	* hrtimer_init - initialize a timer to the given clock
				1481	* @timer: the timer to be initialized
				1482	* @clock_id: the clock to be used
				1483	* @mode: The modes which are relevant for intitialization:
				1484	* HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
				1485	* HRTIMER_MODE_REL_SOFT
				1486	*
				1487	* The PINNED variants of the above can be handed in,
				1488	* but the PINNED bit is ignored as pinning happens
				1489	* when the hrtimer is started
				1490	*/
				1491	void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
				1492	enum hrtimer_mode mode)
				1493	{
				1494	debug_init(timer, clock_id, mode);
				1495	__hrtimer_init(timer, clock_id, mode);
				1496	}
				1497	EXPORT_SYMBOL_GPL(hrtimer_init);
				1498
				1499	/*
				1500	* A timer is active, when it is enqueued into the rbtree or the
				1501	* callback function is running or it's in the state of being migrated
				1502	* to another cpu.
				1503	*
				1504	* It is important for this function to not return a false negative.
				1505	*/
				1506	bool hrtimer_active(const struct hrtimer *timer)
				1507	{
				1508	struct hrtimer_clock_base *base;
				1509	unsigned int seq;
				1510
				1511	do {
				1512	base = READ_ONCE(timer->base);
				1513	seq = raw_read_seqcount_begin(&base->seq);
				1514
				1515	if (timer->state != HRTIMER_STATE_INACTIVE \|\|
				1516	base->running == timer)
				1517	return true;
				1518
				1519	} while (read_seqcount_retry(&base->seq, seq) \|\|
				1520	base != READ_ONCE(timer->base));
				1521
				1522	return false;
				1523	}
				1524	EXPORT_SYMBOL_GPL(hrtimer_active);
				1525
/*
 * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
 * distinct sections:
 *
 *  - queued:	the timer is queued
 *  - callback:	the timer is being run
 *  - post:	the timer is inactive or (re)queued
 *
 * On the read side we ensure we observe timer->state and cpu_base->running
 * from the same section; if anything changed while we looked at it, we retry.
 * This includes timer->base changing because sequence numbers alone are
 * insufficient for that.
 *
 * The sequence numbers are required because otherwise we could still observe
 * a false negative if the read side got smeared over multiple consecutive
 * __run_hrtimer() invocations.
 */
				1543
				1544	static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
				1545	struct hrtimer_clock_base *base,
				1546	struct hrtimer timer, ktime_t now,
				1547	unsigned long flags)
				1548	{
				1549	enum hrtimer_restart (fn)(struct hrtimer );
				1550	int restart;
				1551
				1552	lockdep_assert_held(&cpu_base->lock);
				1553
				1554	debug_deactivate(timer);
				1555	base->running = timer;
				1556
				1557	/*
				1558	* Separate the ->running assignment from the ->state assignment.
				1559	*
				1560	* As with a regular write barrier, this ensures the read side in
				1561	* hrtimer_active() cannot observe base->running == NULL &&
				1562	* timer->state == INACTIVE.
				1563	*/
				1564	raw_write_seqcount_barrier(&base->seq);
				1565
				1566	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
				1567	fn = timer->function;
				1568
				1569	/*
				1570	* Clear the 'is relative' flag for the TIME_LOW_RES case. If the
				1571	* timer is restarted with a period then it becomes an absolute
				1572	* timer. If its not restarted it does not matter.
				1573	*/
				1574	if (IS_ENABLED(CONFIG_TIME_LOW_RES))
				1575	timer->is_rel = false;
				1576
				1577	/*
				1578	* The timer is marked as running in the CPU base, so it is
				1579	* protected against migration to a different CPU even if the lock
				1580	* is dropped.
				1581	*/
				1582	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
				1583	trace_hrtimer_expire_entry(timer, now);
				1584	restart = fn(timer);
				1585	trace_hrtimer_expire_exit(timer);
				1586	raw_spin_lock_irq(&cpu_base->lock);
				1587
				1588	/*
				1589	* Note: We clear the running state after enqueue_hrtimer and
				1590	* we do not reprogram the event hardware. Happens either in
				1591	* hrtimer_start_range_ns() or in hrtimer_interrupt()
				1592	*
				1593	* Note: Because we dropped the cpu_base->lock above,
				1594	* hrtimer_start_range_ns() can have popped in and enqueued the timer
				1595	* for us already.
				1596	*/
				1597	if (restart != HRTIMER_NORESTART &&
				1598	!(timer->state & HRTIMER_STATE_ENQUEUED))
				1599	enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);
				1600
				1601	/*
				1602	* Separate the ->running assignment from the ->state assignment.
				1603	*
				1604	* As with a regular write barrier, this ensures the read side in
				1605	* hrtimer_active() cannot observe base->running.timer == NULL &&
				1606	* timer->state == INACTIVE.
				1607	*/
				1608	raw_write_seqcount_barrier(&base->seq);
				1609
				1610	WARN_ON_ONCE(base->running != timer);
				1611	base->running = NULL;
				1612	}
				1613
				1614	static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
				1615	unsigned long flags, unsigned int active_mask)
				1616	{
				1617	struct hrtimer_clock_base *base;
				1618	unsigned int active = cpu_base->active_bases & active_mask;
				1619
				1620	for_each_active_base(base, cpu_base, active) {
				1621	struct timerqueue_node *node;
				1622	ktime_t basenow;
				1623
				1624	basenow = ktime_add(now, base->offset);
				1625
				1626	while ((node = timerqueue_getnext(&base->active))) {
				1627	struct hrtimer *timer;
				1628
				1629	timer = container_of(node, struct hrtimer, node);
				1630
				1631	/*
				1632	* The immediate goal for using the softexpires is
				1633	* minimizing wakeups, not running timers at the
				1634	* earliest interrupt after their soft expiration.
				1635	* This allows us to avoid using a Priority Search
				1636	* Tree, which can answer a stabbing querry for
				1637	* overlapping intervals and instead use the simple
				1638	* BST we already have.
				1639	* We don't add extra wakeups by delaying timers that
				1640	* are right-of a not yet expired timer, because that
				1641	* timer will have to trigger a wakeup anyway.
				1642	*/
				1643	if (basenow < hrtimer_get_softexpires_tv64(timer))
				1644	break;
				1645
				1646	__run_hrtimer(cpu_base, base, timer, &basenow, flags);
				1647	if (active_mask == HRTIMER_ACTIVE_SOFT)
				1648	hrtimer_sync_wait_running(cpu_base, flags);
				1649	}
				1650	}
				1651	}
				1652
				1653	static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
				1654	{
				1655	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
				1656	unsigned long flags;
				1657	ktime_t now;
				1658
				1659	hrtimer_cpu_base_lock_expiry(cpu_base);
				1660	raw_spin_lock_irqsave(&cpu_base->lock, flags);
				1661
				1662	now = hrtimer_update_base(cpu_base);
				1663	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT);
				1664
				1665	cpu_base->softirq_activated = 0;
				1666	hrtimer_update_softirq_timer(cpu_base, true);
				1667
				1668	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
				1669	hrtimer_cpu_base_unlock_expiry(cpu_base);
				1670	}
				1671
				1672	#ifdef CONFIG_HIGH_RES_TIMERS
				1673
				1674	/*
				1675	* High resolution timer interrupt
				1676	* Called with interrupts disabled
				1677	*/
				1678	void hrtimer_interrupt(struct clock_event_device *dev)
				1679	{
				1680	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
				1681	ktime_t expires_next, now, entry_time, delta;
				1682	unsigned long flags;
				1683	int retries = 0;
				1684
				1685	BUG_ON(!cpu_base->hres_active);
				1686	cpu_base->nr_events++;
				1687	dev->next_event = KTIME_MAX;
				1688
				1689	raw_spin_lock_irqsave(&cpu_base->lock, flags);
				1690	entry_time = now = hrtimer_update_base(cpu_base);
				1691	retry:
				1692	cpu_base->in_hrtirq = 1;
				1693	/*
				1694	* We set expires_next to KTIME_MAX here with cpu_base->lock
				1695	* held to prevent that a timer is enqueued in our queue via
				1696	* the migration code. This does not affect enqueueing of
				1697	* timers which run their callback and need to be requeued on
				1698	* this CPU.
				1699	*/
				1700	cpu_base->expires_next = KTIME_MAX;
				1701
				1702	if (!ktime_before(now, cpu_base->softirq_expires_next)) {
				1703	cpu_base->softirq_expires_next = KTIME_MAX;
				1704	cpu_base->softirq_activated = 1;
				1705	raise_softirq_irqoff(HRTIMER_SOFTIRQ);
				1706	}
				1707
				1708	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
				1709
				1710	/* Reevaluate the clock bases for the [soft] next expiry */
				1711	expires_next = hrtimer_update_next_event(cpu_base);
				1712	/*
				1713	* Store the new expiry value so the migration code can verify
				1714	* against it.
				1715	*/
				1716	cpu_base->expires_next = expires_next;
				1717	cpu_base->in_hrtirq = 0;
				1718	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
				1719
				1720	/* Reprogramming necessary ? */
				1721	if (!tick_program_event(expires_next, 0)) {
				1722	cpu_base->hang_detected = 0;
				1723	return;
				1724	}
				1725
				1726	/*
				1727	* The next timer was already expired due to:
				1728	* - tracing
				1729	* - long lasting callbacks
				1730	* - being scheduled away when running in a VM
				1731	*
				1732	* We need to prevent that we loop forever in the hrtimer
				1733	* interrupt routine. We give it 3 attempts to avoid
				1734	* overreacting on some spurious event.
				1735	*
				1736	* Acquire base lock for updating the offsets and retrieving
				1737	* the current time.
				1738	*/
				1739	raw_spin_lock_irqsave(&cpu_base->lock, flags);
				1740	now = hrtimer_update_base(cpu_base);
				1741	cpu_base->nr_retries++;
				1742	if (++retries < 3)
				1743	goto retry;
				1744	/*
				1745	* Give the system a chance to do something else than looping
				1746	* here. We stored the entry time, so we know exactly how long
				1747	* we spent here. We schedule the next event this amount of
				1748	* time away.
				1749	*/
				1750	cpu_base->nr_hangs++;
				1751	cpu_base->hang_detected = 1;
				1752	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
				1753
				1754	delta = ktime_sub(now, entry_time);
				1755	if ((unsigned int)delta > cpu_base->max_hang_time)
				1756	cpu_base->max_hang_time = (unsigned int) delta;
				1757	/*
				1758	* Limit it to a sensible value as we enforce a longer
				1759	* delay. Give the CPU at least 100ms to catch up.
				1760	*/
				1761	if (delta > 100 * NSEC_PER_MSEC)
				1762	expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
				1763	else
				1764	expires_next = ktime_add(now, delta);
				1765	tick_program_event(expires_next, 1);
				1766	pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta));
				1767	}
				1768
				1769	/* called with interrupts disabled */
				1770	static inline void __hrtimer_peek_ahead_timers(void)
				1771	{
				1772	struct tick_device *td;
				1773
				1774	if (!hrtimer_hres_active())
				1775	return;
				1776
				1777	td = this_cpu_ptr(&tick_cpu_device);
				1778	if (td && td->evtdev)
				1779	hrtimer_interrupt(td->evtdev);
				1780	}
				1781
				1782	#else /* CONFIG_HIGH_RES_TIMERS */
				1783
				1784	static inline void __hrtimer_peek_ahead_timers(void) { }
				1785
				1786	#endif /* !CONFIG_HIGH_RES_TIMERS */
				1787
				1788	/*
				1789	* Called from run_local_timers in hardirq context every jiffy
				1790	*/
				1791	void hrtimer_run_queues(void)
				1792	{
				1793	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
				1794	unsigned long flags;
				1795	ktime_t now;
				1796
				1797	if (__hrtimer_hres_active(cpu_base))
				1798	return;
				1799
				1800	/*
				1801	* This _is_ ugly: We have to check periodically, whether we
				1802	* can switch to highres and / or nohz mode. The clocksource
				1803	* switch happens with xtime_lock held. Notification from
				1804	* there only sets the check bit in the tick_oneshot code,
				1805	* otherwise we might deadlock vs. xtime_lock.
				1806	*/
				1807	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) {
				1808	hrtimer_switch_to_hres();
				1809	return;
				1810	}
				1811
				1812	raw_spin_lock_irqsave(&cpu_base->lock, flags);
				1813	now = hrtimer_update_base(cpu_base);
				1814
				1815	if (!ktime_before(now, cpu_base->softirq_expires_next)) {
				1816	cpu_base->softirq_expires_next = KTIME_MAX;
				1817	cpu_base->softirq_activated = 1;
				1818	raise_softirq_irqoff(HRTIMER_SOFTIRQ);
				1819	}
				1820
				1821	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
				1822	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
				1823	}
				1824
				1825	/*
				1826	* Sleep related functions:
				1827	*/
				1828	static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
				1829	{
				1830	struct hrtimer_sleeper *t =
				1831	container_of(timer, struct hrtimer_sleeper, timer);
				1832	struct task_struct *task = t->task;
				1833
				1834	t->task = NULL;
				1835	if (task)
				1836	wake_up_process(task);
				1837
				1838	return HRTIMER_NORESTART;
				1839	}
				1840
				1841	/**
				1842	* hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer
				1843	* @sl: sleeper to be started
				1844	* @mode: timer mode abs/rel
				1845	*
				1846	* Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers
				1847	* to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context)
				1848	*/
				1849	void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
				1850	enum hrtimer_mode mode)
				1851	{
				1852	/*
				1853	* Make the enqueue delivery mode check work on RT. If the sleeper
				1854	* was initialized for hard interrupt delivery, force the mode bit.
				1855	* This is a special case for hrtimer_sleepers because
				1856	* hrtimer_init_sleeper() determines the delivery mode on RT so the
				1857	* fiddling with this decision is avoided at the call sites.
				1858	*/
				1859	if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard)
				1860	mode \|= HRTIMER_MODE_HARD;
				1861
				1862	hrtimer_start_expires(&sl->timer, mode);
				1863	}
				1864	EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires);
				1865
				1866	static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
				1867	clockid_t clock_id, enum hrtimer_mode mode)
				1868	{
				1869	/*
				1870	* On PREEMPT_RT enabled kernels hrtimers which are not explicitely
				1871	* marked for hard interrupt expiry mode are moved into soft
				1872	* interrupt context either for latency reasons or because the
				1873	* hrtimer callback takes regular spinlocks or invokes other
				1874	* functions which are not suitable for hard interrupt context on
				1875	* PREEMPT_RT.
				1876	*
				1877	* The hrtimer_sleeper callback is RT compatible in hard interrupt
				1878	* context, but there is a latency concern: Untrusted userspace can
				1879	* spawn many threads which arm timers for the same expiry time on
				1880	* the same CPU. That causes a latency spike due to the wakeup of
				1881	* a gazillion threads.
				1882	*
				1883	* OTOH, priviledged real-time user space applications rely on the
				1884	* low latency of hard interrupt wakeups. If the current task is in
				1885	* a real-time scheduling class, mark the mode for hard interrupt
				1886	* expiry.
				1887	*/
				1888	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
				1889	if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT))
				1890	mode \|= HRTIMER_MODE_HARD;
				1891	}
				1892
				1893	__hrtimer_init(&sl->timer, clock_id, mode);
				1894	sl->timer.function = hrtimer_wakeup;
				1895	sl->task = current;
				1896	}
				1897
				1898	/**
				1899	* hrtimer_init_sleeper - initialize sleeper to the given clock
				1900	* @sl: sleeper to be initialized
				1901	* @clock_id: the clock to be used
				1902	* @mode: timer mode abs/rel
				1903	*/
				1904	void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
				1905	enum hrtimer_mode mode)
				1906	{
				1907	debug_init(&sl->timer, clock_id, mode);
				1908	__hrtimer_init_sleeper(sl, clock_id, mode);
				1909
				1910	}
				1911	EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
				1912
				1913	int nanosleep_copyout(struct restart_block restart, struct timespec64 ts)
				1914	{
				1915	switch(restart->nanosleep.type) {
				1916	#ifdef CONFIG_COMPAT_32BIT_TIME
				1917	case TT_COMPAT:
				1918	if (put_old_timespec32(ts, restart->nanosleep.compat_rmtp))
				1919	return -EFAULT;
				1920	break;
				1921	#endif
				1922	case TT_NATIVE:
				1923	if (put_timespec64(ts, restart->nanosleep.rmtp))
				1924	return -EFAULT;
				1925	break;
				1926	default:
				1927	BUG();
				1928	}
				1929	return -ERESTART_RESTARTBLOCK;
				1930	}
				1931
				1932	static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
				1933	{
				1934	struct restart_block *restart;
				1935
				1936	do {
				1937	set_current_state(TASK_INTERRUPTIBLE);
				1938	hrtimer_sleeper_start_expires(t, mode);
				1939
				1940	if (likely(t->task))
				1941	freezable_schedule();
				1942
				1943	hrtimer_cancel(&t->timer);
				1944	mode = HRTIMER_MODE_ABS;
				1945
				1946	} while (t->task && !signal_pending(current));
				1947
				1948	__set_current_state(TASK_RUNNING);
				1949
				1950	if (!t->task)
				1951	return 0;
				1952
				1953	restart = &current->restart_block;
				1954	if (restart->nanosleep.type != TT_NONE) {
				1955	ktime_t rem = hrtimer_expires_remaining(&t->timer);
				1956	struct timespec64 rmt;
				1957
				1958	if (rem <= 0)
				1959	return 0;
				1960	rmt = ktime_to_timespec64(rem);
				1961
				1962	return nanosleep_copyout(restart, &rmt);
				1963	}
				1964	return -ERESTART_RESTARTBLOCK;
				1965	}
				1966
				1967	static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
				1968	{
				1969	struct hrtimer_sleeper t;
				1970	int ret;
				1971
				1972	hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid,
				1973	HRTIMER_MODE_ABS);
				1974	hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
				1975	ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
				1976	destroy_hrtimer_on_stack(&t.timer);
				1977	return ret;
				1978	}
				1979
				1980	long hrtimer_nanosleep(const struct timespec64 *rqtp,
				1981	const enum hrtimer_mode mode, const clockid_t clockid)
				1982	{
				1983	struct restart_block *restart;
				1984	struct hrtimer_sleeper t;
				1985	int ret = 0;
				1986	u64 slack;
				1987
				1988	slack = current->timer_slack_ns;
				1989	if (dl_task(current) \|\| rt_task(current))
				1990	slack = 0;
				1991
				1992	hrtimer_init_sleeper_on_stack(&t, clockid, mode);
				1993	hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack);
				1994	ret = do_nanosleep(&t, mode);
				1995	if (ret != -ERESTART_RESTARTBLOCK)
				1996	goto out;
				1997
				1998	/* Absolute timers do not update the rmtp value and restart: */
				1999	if (mode == HRTIMER_MODE_ABS) {
				2000	ret = -ERESTARTNOHAND;
				2001	goto out;
				2002	}
				2003
				2004	restart = &current->restart_block;
				2005	restart->nanosleep.clockid = t.timer.base->clockid;
				2006	restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
				2007	set_restart_fn(restart, hrtimer_nanosleep_restart);
				2008	out:
				2009	destroy_hrtimer_on_stack(&t.timer);
				2010	return ret;
				2011	}
				2012
				2013	#ifdef CONFIG_64BIT
				2014
				2015	SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
				2016	struct __kernel_timespec __user *, rmtp)
				2017	{
				2018	struct timespec64 tu;
				2019
				2020	if (get_timespec64(&tu, rqtp))
				2021	return -EFAULT;
				2022
				2023	if (!timespec64_valid(&tu))
				2024	return -EINVAL;
				2025
				2026	current->restart_block.fn = do_no_restart_syscall;
				2027	current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
				2028	current->restart_block.nanosleep.rmtp = rmtp;
				2029	return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
				2030	}
				2031
				2032	#endif
				2033
				2034	#ifdef CONFIG_COMPAT_32BIT_TIME
				2035
				2036	SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
				2037	struct old_timespec32 __user *, rmtp)
				2038	{
				2039	struct timespec64 tu;
				2040
				2041	if (get_old_timespec32(&tu, rqtp))
				2042	return -EFAULT;
				2043
				2044	if (!timespec64_valid(&tu))
				2045	return -EINVAL;
				2046
				2047	current->restart_block.fn = do_no_restart_syscall;
				2048	current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
				2049	current->restart_block.nanosleep.compat_rmtp = rmtp;
				2050	return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
				2051	}
				2052	#endif
				2053
				2054	/*
				2055	* Functions related to boot-time initialization:
				2056	*/
				2057	int hrtimers_prepare_cpu(unsigned int cpu)
				2058	{
				2059	struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
				2060	int i;
				2061
				2062	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
				2063	cpu_base->clock_base[i].cpu_base = cpu_base;
				2064	timerqueue_init_head(&cpu_base->clock_base[i].active);
				2065	}
				2066
				2067	cpu_base->cpu = cpu;
				2068	hrtimer_cpu_base_init_expiry_lock(cpu_base);
				2069	return 0;
				2070	}
				2071
				2072	int hrtimers_cpu_starting(unsigned int cpu)
				2073	{
				2074	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
				2075
				2076	/* Clear out any left over state from a CPU down operation */
				2077	cpu_base->active_bases = 0;
				2078	cpu_base->hres_active = 0;
				2079	cpu_base->hang_detected = 0;
				2080	cpu_base->next_timer = NULL;
				2081	cpu_base->softirq_next_timer = NULL;
				2082	cpu_base->expires_next = KTIME_MAX;
				2083	cpu_base->softirq_expires_next = KTIME_MAX;
				2084	cpu_base->online = 1;
				2085	return 0;
				2086	}
				2087
				2088	#ifdef CONFIG_HOTPLUG_CPU
				2089
				2090	static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
				2091	struct hrtimer_clock_base *new_base)
				2092	{
				2093	struct hrtimer *timer;
				2094	struct timerqueue_node *node;
				2095
				2096	while ((node = timerqueue_getnext(&old_base->active))) {
				2097	timer = container_of(node, struct hrtimer, node);
				2098	BUG_ON(hrtimer_callback_running(timer));
				2099	debug_deactivate(timer);
				2100
				2101	/*
				2102	* Mark it as ENQUEUED not INACTIVE otherwise the
				2103	* timer could be seen as !active and just vanish away
				2104	* under us on another CPU
				2105	*/
				2106	__remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
				2107	timer->base = new_base;
				2108	/*
				2109	* Enqueue the timers on the new cpu. This does not
				2110	* reprogram the event device in case the timer
				2111	* expires before the earliest on this CPU, but we run
				2112	* hrtimer_interrupt after we migrated everything to
				2113	* sort out already expired timers and reprogram the
				2114	* event device.
				2115	*/
				2116	enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
				2117	}
				2118	}
				2119
				2120	int hrtimers_cpu_dying(unsigned int dying_cpu)
				2121	{
				2122	struct hrtimer_cpu_base old_base, new_base;
				2123	int i, ncpu = cpumask_first(cpu_active_mask);
				2124
				2125	tick_cancel_sched_timer(dying_cpu);
				2126
				2127	old_base = this_cpu_ptr(&hrtimer_bases);
				2128	new_base = &per_cpu(hrtimer_bases, ncpu);
				2129
				2130	/*
				2131	* The caller is globally serialized and nobody else
				2132	* takes two locks at once, deadlock is not possible.
				2133	*/
				2134	raw_spin_lock(&old_base->lock);
				2135	raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING);
				2136
				2137	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
				2138	migrate_hrtimer_list(&old_base->clock_base[i],
				2139	&new_base->clock_base[i]);
				2140	}
				2141
				2142	/*
				2143	* The migration might have changed the first expiring softirq
				2144	* timer on this CPU. Update it.
				2145	*/
				2146	__hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT);
				2147	/* Tell the other CPU to retrigger the next event */
				2148	smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
				2149
				2150	raw_spin_unlock(&new_base->lock);
				2151	old_base->online = 0;
				2152	raw_spin_unlock(&old_base->lock);
				2153
				2154	return 0;
				2155	}
				2156
				2157	#endif /* CONFIG_HOTPLUG_CPU */
				2158
				2159	void __init hrtimers_init(void)
				2160	{
				2161	hrtimers_prepare_cpu(smp_processor_id());
				2162	hrtimers_cpu_starting(smp_processor_id());
				2163	open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
				2164	}
				2165
				2166	/**
				2167	* schedule_hrtimeout_range_clock - sleep until timeout
				2168	* @expires: timeout value (ktime_t)
				2169	* @delta: slack in expires timeout (ktime_t)
				2170	* @mode: timer mode
				2171	* @clock_id: timer clock to be used
				2172	*/
				2173	int __sched
				2174	schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
				2175	const enum hrtimer_mode mode, clockid_t clock_id)
				2176	{
				2177	struct hrtimer_sleeper t;
				2178
				2179	/*
				2180	* Optimize when a zero timeout value is given. It does not
				2181	* matter whether this is an absolute or a relative time.
				2182	*/
				2183	if (expires && *expires == 0) {
				2184	__set_current_state(TASK_RUNNING);
				2185	return 0;
				2186	}
				2187
				2188	/*
				2189	* A NULL parameter means "infinite"
				2190	*/
				2191	if (!expires) {
				2192	schedule();
				2193	return -EINTR;
				2194	}
				2195
				2196	hrtimer_init_sleeper_on_stack(&t, clock_id, mode);
				2197	hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
				2198	hrtimer_sleeper_start_expires(&t, mode);
				2199
				2200	if (likely(t.task))
				2201	schedule();
				2202
				2203	hrtimer_cancel(&t.timer);
				2204	destroy_hrtimer_on_stack(&t.timer);
				2205
				2206	__set_current_state(TASK_RUNNING);
				2207
				2208	return !t.task ? 0 : -EINTR;
				2209	}
				2210
				2211	/**
				2212	* schedule_hrtimeout_range - sleep until timeout
				2213	* @expires: timeout value (ktime_t)
				2214	* @delta: slack in expires timeout (ktime_t)
				2215	* @mode: timer mode
				2216	*
				2217	* Make the current task sleep until the given expiry time has
				2218	* elapsed. The routine will return immediately unless
				2219	* the current task state has been set (see set_current_state()).
				2220	*
				2221	* The @delta argument gives the kernel the freedom to schedule the
				2222	* actual wakeup to a time that is both power and performance friendly.
				2223	* The kernel give the normal best effort behavior for "@expires+@delta",
				2224	* but may decide to fire the timer earlier, but no earlier than @expires.
				2225	*
				2226	* You can set the task state as follows -
				2227	*
				2228	* %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
				2229	* pass before the routine returns unless the current task is explicitly
				2230	* woken up, (e.g. by wake_up_process()).
				2231	*
				2232	* %TASK_INTERRUPTIBLE - the routine may return early if a signal is
				2233	* delivered to the current task or the current task is explicitly woken
				2234	* up.
				2235	*
				2236	* The current task state is guaranteed to be TASK_RUNNING when this
				2237	* routine returns.
				2238	*
				2239	* Returns 0 when the timer has expired. If the task was woken before the
				2240	* timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or
				2241	* by an explicit wakeup, it returns -EINTR.
				2242	*/
				2243	int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta,
				2244	const enum hrtimer_mode mode)
				2245	{
				2246	return schedule_hrtimeout_range_clock(expires, delta, mode,
				2247	CLOCK_MONOTONIC);
				2248	}
				2249	EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
				2250
				2251	/**
				2252	* schedule_hrtimeout - sleep until timeout
				2253	* @expires: timeout value (ktime_t)
				2254	* @mode: timer mode
				2255	*
				2256	* Make the current task sleep until the given expiry time has
				2257	* elapsed. The routine will return immediately unless
				2258	* the current task state has been set (see set_current_state()).
				2259	*
				2260	* You can set the task state as follows -
				2261	*
				2262	* %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
				2263	* pass before the routine returns unless the current task is explicitly
				2264	* woken up, (e.g. by wake_up_process()).
				2265	*
				2266	* %TASK_INTERRUPTIBLE - the routine may return early if a signal is
				2267	* delivered to the current task or the current task is explicitly woken
				2268	* up.
				2269	*
				2270	* The current task state is guaranteed to be TASK_RUNNING when this
				2271	* routine returns.
				2272	*
				2273	* Returns 0 when the timer has expired. If the task was woken before the
				2274	* timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or
				2275	* by an explicit wakeup, it returns -EINTR.
				2276	*/
				2277	int __sched schedule_hrtimeout(ktime_t *expires,
				2278	const enum hrtimer_mode mode)
				2279	{
				2280	return schedule_hrtimeout_range(expires, 0, mode);
				2281	}
				2282	EXPORT_SYMBOL_GPL(schedule_hrtimeout);