Blame - marvell/linux/arch/x86/kernel/tsc_sync.c - T108

blob: 369c7357bdef23dff1a5b07397c0e5d0318e64c1 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* check TSC synchronization.
				4	*
				5	* Copyright (C) 2006, Red Hat, Inc., Ingo Molnar
				6	*
				7	* We check whether all boot CPUs have their TSC's synchronized,
				8	* print a warning if not and turn off the TSC clock-source.
				9	*
				10	* The warp-check is point-to-point between two CPUs, the CPU
				11	* initiating the bootup is the 'source CPU', the freshly booting
				12	* CPU is the 'target CPU'.
				13	*
				14	* Only two CPUs may participate - they can enter in any order.
				15	* ( The serial nature of the boot logic and the CPU hotplug lock
				16	* protects against more than 2 CPUs entering this code. )
				17	*/
				18	#include <linux/topology.h>
				19	#include <linux/spinlock.h>
				20	#include <linux/kernel.h>
				21	#include <linux/smp.h>
				22	#include <linux/nmi.h>
				23	#include <asm/tsc.h>
				24
				25	struct tsc_adjust {
				26	s64 bootval;
				27	s64 adjusted;
				28	unsigned long nextcheck;
				29	bool warned;
				30	};
				31
				32	static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
				33	static struct timer_list tsc_sync_check_timer;
				34
				35	/*
				36	* TSC's on different sockets may be reset asynchronously.
				37	* This may cause the TSC ADJUST value on socket 0 to be NOT 0.
				38	*/
				39	bool __read_mostly tsc_async_resets;
				40
				41	void mark_tsc_async_resets(char *reason)
				42	{
				43	if (tsc_async_resets)
				44	return;
				45	tsc_async_resets = true;
				46	pr_info("tsc: Marking TSC async resets true due to %s\n", reason);
				47	}
				48
				49	void tsc_verify_tsc_adjust(bool resume)
				50	{
				51	struct tsc_adjust *adj = this_cpu_ptr(&tsc_adjust);
				52	s64 curval;
				53
				54	if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
				55	return;
				56
				57	/* Skip unnecessary error messages if TSC already unstable */
				58	if (check_tsc_unstable())
				59	return;
				60
				61	/* Rate limit the MSR check */
				62	if (!resume && time_before(jiffies, adj->nextcheck))
				63	return;
				64
				65	adj->nextcheck = jiffies + HZ;
				66
				67	rdmsrl(MSR_IA32_TSC_ADJUST, curval);
				68	if (adj->adjusted == curval)
				69	return;
				70
				71	/* Restore the original value */
				72	wrmsrl(MSR_IA32_TSC_ADJUST, adj->adjusted);
				73
				74	if (!adj->warned \|\| resume) {
				75	pr_warn(FW_BUG "TSC ADJUST differs: CPU%u %lld --> %lld. Restoring\n",
				76	smp_processor_id(), adj->adjusted, curval);
				77	adj->warned = true;
				78	}
				79	}
				80
				81	/*
				82	* Normally the tsc_sync will be checked every time system enters idle
				83	* state, but there is still caveat that a system won't enter idle,
				84	* either because it's too busy or configured purposely to not enter
				85	* idle.
				86	*
				87	* So setup a periodic timer (every 10 minutes) to make sure the check
				88	* is always on.
				89	*/
				90
				91	#define SYNC_CHECK_INTERVAL (HZ * 600)
				92
				93	static void tsc_sync_check_timer_fn(struct timer_list *unused)
				94	{
				95	int next_cpu;
				96
				97	tsc_verify_tsc_adjust(false);
				98
				99	/* Run the check for all onlined CPUs in turn */
				100	next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
				101	if (next_cpu >= nr_cpu_ids)
				102	next_cpu = cpumask_first(cpu_online_mask);
				103
				104	tsc_sync_check_timer.expires += SYNC_CHECK_INTERVAL;
				105	add_timer_on(&tsc_sync_check_timer, next_cpu);
				106	}
				107
				108	static int __init start_sync_check_timer(void)
				109	{
				110	if (!cpu_feature_enabled(X86_FEATURE_TSC_ADJUST) \|\| tsc_clocksource_reliable)
				111	return 0;
				112
				113	timer_setup(&tsc_sync_check_timer, tsc_sync_check_timer_fn, 0);
				114	tsc_sync_check_timer.expires = jiffies + SYNC_CHECK_INTERVAL;
				115	add_timer(&tsc_sync_check_timer);
				116
				117	return 0;
				118	}
				119	late_initcall(start_sync_check_timer);
				120
				121	static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
				122	unsigned int cpu, bool bootcpu)
				123	{
				124	/*
				125	* First online CPU in a package stores the boot value in the
				126	* adjustment value. This value might change later via the sync
				127	* mechanism. If that fails we still can yell about boot values not
				128	* being consistent.
				129	*
				130	* On the boot cpu we just force set the ADJUST value to 0 if it's
				131	* non zero. We don't do that on non boot cpus because physical
				132	* hotplug should have set the ADJUST register to a value > 0 so
				133	* the TSC is in sync with the already running cpus.
				134	*
				135	* Also don't force the ADJUST value to zero if that is a valid value
				136	* for socket 0 as determined by the system arch. This is required
				137	* when multiple sockets are reset asynchronously with each other
				138	* and socket 0 may not have an TSC ADJUST value of 0.
				139	*/
				140	if (bootcpu && bootval != 0) {
				141	if (likely(!tsc_async_resets)) {
				142	pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n",
				143	cpu, bootval);
				144	wrmsrl(MSR_IA32_TSC_ADJUST, 0);
				145	bootval = 0;
				146	} else {
				147	pr_info("TSC ADJUST: CPU%u: %lld NOT forced to 0\n",
				148	cpu, bootval);
				149	}
				150	}
				151	cur->adjusted = bootval;
				152	}
				153
				154	#ifndef CONFIG_SMP
				155	bool __init tsc_store_and_check_tsc_adjust(bool bootcpu)
				156	{
				157	struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
				158	s64 bootval;
				159
				160	if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
				161	return false;
				162
				163	/* Skip unnecessary error messages if TSC already unstable */
				164	if (check_tsc_unstable())
				165	return false;
				166
				167	rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
				168	cur->bootval = bootval;
				169	cur->nextcheck = jiffies + HZ;
				170	tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), bootcpu);
				171	return false;
				172	}
				173
				174	#else /* !CONFIG_SMP */
				175
				176	/*
				177	* Store and check the TSC ADJUST MSR if available
				178	*/
				179	bool tsc_store_and_check_tsc_adjust(bool bootcpu)
				180	{
				181	struct tsc_adjust ref, cur = this_cpu_ptr(&tsc_adjust);
				182	unsigned int refcpu, cpu = smp_processor_id();
				183	struct cpumask *mask;
				184	s64 bootval;
				185
				186	if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
				187	return false;
				188
				189	rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
				190	cur->bootval = bootval;
				191	cur->nextcheck = jiffies + HZ;
				192	cur->warned = false;
				193
				194	/*
				195	* The default adjust value cannot be assumed to be zero on any socket.
				196	*/
				197	cur->adjusted = bootval;
				198
				199	/*
				200	* Check whether this CPU is the first in a package to come up. In
				201	* this case do not check the boot value against another package
				202	* because the new package might have been physically hotplugged,
				203	* where TSC_ADJUST is expected to be different. When called on the
				204	* boot CPU topology_core_cpumask() might not be available yet.
				205	*/
				206	mask = topology_core_cpumask(cpu);
				207	refcpu = mask ? cpumask_any_but(mask, cpu) : nr_cpu_ids;
				208
				209	if (refcpu >= nr_cpu_ids) {
				210	tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(),
				211	bootcpu);
				212	return false;
				213	}
				214
				215	ref = per_cpu_ptr(&tsc_adjust, refcpu);
				216	/*
				217	* Compare the boot value and complain if it differs in the
				218	* package.
				219	*/
				220	if (bootval != ref->bootval)
				221	printk_once(FW_BUG "TSC ADJUST differs within socket(s), fixing all errors\n");
				222
				223	/*
				224	* The TSC_ADJUST values in a package must be the same. If the boot
				225	* value on this newly upcoming CPU differs from the adjustment
				226	* value of the already online CPU in this package, set it to that
				227	* adjusted value.
				228	*/
				229	if (bootval != ref->adjusted) {
				230	cur->adjusted = ref->adjusted;
				231	wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted);
				232	}
				233	/*
				234	* We have the TSCs forced to be in sync on this package. Skip sync
				235	* test:
				236	*/
				237	return true;
				238	}
				239
				240	/*
				241	* Entry/exit counters that make sure that both CPUs
				242	* run the measurement code at once:
				243	*/
				244	static atomic_t start_count;
				245	static atomic_t stop_count;
				246	static atomic_t skip_test;
				247	static atomic_t test_runs;
				248
				249	/*
				250	* We use a raw spinlock in this exceptional case, because
				251	* we want to have the fastest, inlined, non-debug version
				252	* of a critical section, to be able to prove TSC time-warps:
				253	*/
				254	static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
				255
				256	static cycles_t last_tsc;
				257	static cycles_t max_warp;
				258	static int nr_warps;
				259	static int random_warps;
				260
				261	/*
				262	* TSC-warp measurement loop running on both CPUs. This is not called
				263	* if there is no TSC.
				264	*/
				265	static cycles_t check_tsc_warp(unsigned int timeout)
				266	{
				267	cycles_t start, now, prev, end, cur_max_warp = 0;
				268	int i, cur_warps = 0;
				269
				270	start = rdtsc_ordered();
				271	/*
				272	* The measurement runs for 'timeout' msecs:
				273	*/
				274	end = start + (cycles_t) tsc_khz * timeout;
				275	now = start;
				276
				277	for (i = 0; ; i++) {
				278	/*
				279	* We take the global lock, measure TSC, save the
				280	* previous TSC that was measured (possibly on
				281	* another CPU) and update the previous TSC timestamp.
				282	*/
				283	arch_spin_lock(&sync_lock);
				284	prev = last_tsc;
				285	now = rdtsc_ordered();
				286	last_tsc = now;
				287	arch_spin_unlock(&sync_lock);
				288
				289	/*
				290	* Be nice every now and then (and also check whether
				291	* measurement is done [we also insert a 10 million
				292	* loops safety exit, so we dont lock up in case the
				293	* TSC readout is totally broken]):
				294	*/
				295	if (unlikely(!(i & 7))) {
				296	if (now > end \|\| i > 10000000)
				297	break;
				298	cpu_relax();
				299	touch_nmi_watchdog();
				300	}
				301	/*
				302	* Outside the critical section we can now see whether
				303	* we saw a time-warp of the TSC going backwards:
				304	*/
				305	if (unlikely(prev > now)) {
				306	arch_spin_lock(&sync_lock);
				307	max_warp = max(max_warp, prev - now);
				308	cur_max_warp = max_warp;
				309	/*
				310	* Check whether this bounces back and forth. Only
				311	* one CPU should observe time going backwards.
				312	*/
				313	if (cur_warps != nr_warps)
				314	random_warps++;
				315	nr_warps++;
				316	cur_warps = nr_warps;
				317	arch_spin_unlock(&sync_lock);
				318	}
				319	}
				320	WARN(!(now-start),
				321	"Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
				322	now-start, end-start);
				323	return cur_max_warp;
				324	}
				325
				326	/*
				327	* If the target CPU coming online doesn't have any of its core-siblings
				328	* online, a timeout of 20msec will be used for the TSC-warp measurement
				329	* loop. Otherwise a smaller timeout of 2msec will be used, as we have some
				330	* information about this socket already (and this information grows as we
				331	* have more and more logical-siblings in that socket).
				332	*
				333	* Ideally we should be able to skip the TSC sync check on the other
				334	* core-siblings, if the first logical CPU in a socket passed the sync test.
				335	* But as the TSC is per-logical CPU and can potentially be modified wrongly
				336	* by the bios, TSC sync test for smaller duration should be able
				337	* to catch such errors. Also this will catch the condition where all the
				338	* cores in the socket doesn't get reset at the same time.
				339	*/
				340	static inline unsigned int loop_timeout(int cpu)
				341	{
				342	return (cpumask_weight(topology_core_cpumask(cpu)) > 1) ? 2 : 20;
				343	}
				344
				345	/*
				346	* Source CPU calls into this - it waits for the freshly booted
				347	* target CPU to arrive and then starts the measurement:
				348	*/
				349	void check_tsc_sync_source(int cpu)
				350	{
				351	int cpus = 2;
				352
				353	/*
				354	* No need to check if we already know that the TSC is not
				355	* synchronized or if we have no TSC.
				356	*/
				357	if (unsynchronized_tsc())
				358	return;
				359
				360	/*
				361	* Set the maximum number of test runs to
				362	* 1 if the CPU does not provide the TSC_ADJUST MSR
				363	* 3 if the MSR is available, so the target can try to adjust
				364	*/
				365	if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
				366	atomic_set(&test_runs, 1);
				367	else
				368	atomic_set(&test_runs, 3);
				369	retry:
				370	/*
				371	* Wait for the target to start or to skip the test:
				372	*/
				373	while (atomic_read(&start_count) != cpus - 1) {
				374	if (atomic_read(&skip_test) > 0) {
				375	atomic_set(&skip_test, 0);
				376	return;
				377	}
				378	cpu_relax();
				379	}
				380
				381	/*
				382	* Trigger the target to continue into the measurement too:
				383	*/
				384	atomic_inc(&start_count);
				385
				386	check_tsc_warp(loop_timeout(cpu));
				387
				388	while (atomic_read(&stop_count) != cpus-1)
				389	cpu_relax();
				390
				391	/*
				392	* If the test was successful set the number of runs to zero and
				393	* stop. If not, decrement the number of runs an check if we can
				394	* retry. In case of random warps no retry is attempted.
				395	*/
				396	if (!nr_warps) {
				397	atomic_set(&test_runs, 0);
				398
				399	pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
				400	smp_processor_id(), cpu);
				401
				402	} else if (atomic_dec_and_test(&test_runs) \|\| random_warps) {
				403	/* Force it to 0 if random warps brought us here */
				404	atomic_set(&test_runs, 0);
				405
				406	pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n",
				407	smp_processor_id(), cpu);
				408	pr_warning("Measured %Ld cycles TSC warp between CPUs, "
				409	"turning off TSC clock.\n", max_warp);
				410	if (random_warps)
				411	pr_warning("TSC warped randomly between CPUs\n");
				412	mark_tsc_unstable("check_tsc_sync_source failed");
				413	}
				414
				415	/*
				416	* Reset it - just in case we boot another CPU later:
				417	*/
				418	atomic_set(&start_count, 0);
				419	random_warps = 0;
				420	nr_warps = 0;
				421	max_warp = 0;
				422	last_tsc = 0;
				423
				424	/*
				425	* Let the target continue with the bootup:
				426	*/
				427	atomic_inc(&stop_count);
				428
				429	/*
				430	* Retry, if there is a chance to do so.
				431	*/
				432	if (atomic_read(&test_runs) > 0)
				433	goto retry;
				434	}
				435
				436	/*
				437	* Freshly booted CPUs call into this:
				438	*/
				439	void check_tsc_sync_target(void)
				440	{
				441	struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
				442	unsigned int cpu = smp_processor_id();
				443	cycles_t cur_max_warp, gbl_max_warp;
				444	int cpus = 2;
				445
				446	/* Also aborts if there is no TSC. */
				447	if (unsynchronized_tsc())
				448	return;
				449
				450	/*
				451	* Store, verify and sanitize the TSC adjust register. If
				452	* successful skip the test.
				453	*
				454	* The test is also skipped when the TSC is marked reliable. This
				455	* is true for SoCs which have no fallback clocksource. On these
				456	* SoCs the TSC is frequency synchronized, but still the TSC ADJUST
				457	* register might have been wreckaged by the BIOS..
				458	*/
				459	if (tsc_store_and_check_tsc_adjust(false) \|\| tsc_clocksource_reliable) {
				460	atomic_inc(&skip_test);
				461	return;
				462	}
				463
				464	retry:
				465	/*
				466	* Register this CPU's participation and wait for the
				467	* source CPU to start the measurement:
				468	*/
				469	atomic_inc(&start_count);
				470	while (atomic_read(&start_count) != cpus)
				471	cpu_relax();
				472
				473	cur_max_warp = check_tsc_warp(loop_timeout(cpu));
				474
				475	/*
				476	* Store the maximum observed warp value for a potential retry:
				477	*/
				478	gbl_max_warp = max_warp;
				479
				480	/*
				481	* Ok, we are done:
				482	*/
				483	atomic_inc(&stop_count);
				484
				485	/*
				486	* Wait for the source CPU to print stuff:
				487	*/
				488	while (atomic_read(&stop_count) != cpus)
				489	cpu_relax();
				490
				491	/*
				492	* Reset it for the next sync test:
				493	*/
				494	atomic_set(&stop_count, 0);
				495
				496	/*
				497	* Check the number of remaining test runs. If not zero, the test
				498	* failed and a retry with adjusted TSC is possible. If zero the
				499	* test was either successful or failed terminally.
				500	*/
				501	if (!atomic_read(&test_runs))
				502	return;
				503
				504	/*
				505	* If the warp value of this CPU is 0, then the other CPU
				506	* observed time going backwards so this TSC was ahead and
				507	* needs to move backwards.
				508	*/
				509	if (!cur_max_warp)
				510	cur_max_warp = -gbl_max_warp;
				511
				512	/*
				513	* Add the result to the previous adjustment value.
				514	*
				515	* The adjustement value is slightly off by the overhead of the
				516	* sync mechanism (observed values are ~200 TSC cycles), but this
				517	* really depends on CPU, node distance and frequency. So
				518	* compensating for this is hard to get right. Experiments show
				519	* that the warp is not longer detectable when the observed warp
				520	* value is used. In the worst case the adjustment needs to go
				521	* through a 3rd run for fine tuning.
				522	*/
				523	cur->adjusted += cur_max_warp;
				524
				525	pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n",
				526	cpu, cur_max_warp, cur->adjusted);
				527
				528	wrmsrl(MSR_IA32_TSC_ADJUST, cur->adjusted);
				529	goto retry;
				530
				531	}
				532
				533	#endif /* CONFIG_SMP */