Blame - marvell/linux/arch/sparc/kernel/smp_64.c - T108

blob: 02deb12b413d0a978e39c852e625acee369246ef [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0
				2	/* smp.c: Sparc64 SMP support.
				3	*
				4	* Copyright (C) 1997, 2007, 2008 David S. Miller (davem@davemloft.net)
				5	*/
				6
				7	#include <linux/export.h>
				8	#include <linux/kernel.h>
				9	#include <linux/sched/mm.h>
				10	#include <linux/sched/hotplug.h>
				11	#include <linux/mm.h>
				12	#include <linux/pagemap.h>
				13	#include <linux/threads.h>
				14	#include <linux/smp.h>
				15	#include <linux/interrupt.h>
				16	#include <linux/kernel_stat.h>
				17	#include <linux/delay.h>
				18	#include <linux/init.h>
				19	#include <linux/spinlock.h>
				20	#include <linux/fs.h>
				21	#include <linux/seq_file.h>
				22	#include <linux/cache.h>
				23	#include <linux/jiffies.h>
				24	#include <linux/profile.h>
				25	#include <linux/memblock.h>
				26	#include <linux/vmalloc.h>
				27	#include <linux/ftrace.h>
				28	#include <linux/cpu.h>
				29	#include <linux/slab.h>
				30	#include <linux/kgdb.h>
				31
				32	#include <asm/head.h>
				33	#include <asm/ptrace.h>
				34	#include <linux/atomic.h>
				35	#include <asm/tlbflush.h>
				36	#include <asm/mmu_context.h>
				37	#include <asm/cpudata.h>
				38	#include <asm/hvtramp.h>
				39	#include <asm/io.h>
				40	#include <asm/timer.h>
				41	#include <asm/setup.h>
				42
				43	#include <asm/irq.h>
				44	#include <asm/irq_regs.h>
				45	#include <asm/page.h>
				46	#include <asm/pgtable.h>
				47	#include <asm/oplib.h>
				48	#include <linux/uaccess.h>
				49	#include <asm/starfire.h>
				50	#include <asm/tlb.h>
				51	#include <asm/sections.h>
				52	#include <asm/prom.h>
				53	#include <asm/mdesc.h>
				54	#include <asm/ldc.h>
				55	#include <asm/hypervisor.h>
				56	#include <asm/pcr.h>
				57
				58	#include "cpumap.h"
				59	#include "kernel.h"
				60
				61	DEFINE_PER_CPU(cpumask_t, cpu_sibling_map) = CPU_MASK_NONE;
				62	cpumask_t cpu_core_map[NR_CPUS] __read_mostly =
				63	{ [0 ... NR_CPUS-1] = CPU_MASK_NONE };
				64
				65	cpumask_t cpu_core_sib_map[NR_CPUS] __read_mostly = {
				66	[0 ... NR_CPUS-1] = CPU_MASK_NONE };
				67
				68	cpumask_t cpu_core_sib_cache_map[NR_CPUS] __read_mostly = {
				69	[0 ... NR_CPUS - 1] = CPU_MASK_NONE };
				70
				71	EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
				72	EXPORT_SYMBOL(cpu_core_map);
				73	EXPORT_SYMBOL(cpu_core_sib_map);
				74	EXPORT_SYMBOL(cpu_core_sib_cache_map);
				75
				76	static cpumask_t smp_commenced_mask;
				77
				78	static DEFINE_PER_CPU(bool, poke);
				79	static bool cpu_poke;
				80
				81	void smp_info(struct seq_file *m)
				82	{
				83	int i;
				84
				85	seq_printf(m, "State:\n");
				86	for_each_online_cpu(i)
				87	seq_printf(m, "CPU%d:\t\tonline\n", i);
				88	}
				89
				90	void smp_bogo(struct seq_file *m)
				91	{
				92	int i;
				93
				94	for_each_online_cpu(i)
				95	seq_printf(m,
				96	"Cpu%dClkTck\t: %016lx\n",
				97	i, cpu_data(i).clock_tick);
				98	}
				99
				100	extern void setup_sparc64_timer(void);
				101
				102	static volatile unsigned long callin_flag = 0;
				103
				104	void smp_callin(void)
				105	{
				106	int cpuid = hard_smp_processor_id();
				107
				108	__local_per_cpu_offset = __per_cpu_offset(cpuid);
				109
				110	if (tlb_type == hypervisor)
				111	sun4v_ktsb_register();
				112
				113	__flush_tlb_all();
				114
				115	setup_sparc64_timer();
				116
				117	if (cheetah_pcache_forced_on)
				118	cheetah_enable_pcache();
				119
				120	callin_flag = 1;
				121	__asm__ __volatile__("membar #Sync\n\t"
				122	"flush %%g6" : : : "memory");
				123
				124	/* Clear this or we will die instantly when we
				125	* schedule back to this idler...
				126	*/
				127	current_thread_info()->new_child = 0;
				128
				129	/* Attach to the address space of init_task. */
				130	mmgrab(&init_mm);
				131	current->active_mm = &init_mm;
				132
				133	/* inform the notifiers about the new cpu */
				134	notify_cpu_starting(cpuid);
				135
				136	while (!cpumask_test_cpu(cpuid, &smp_commenced_mask))
				137	rmb();
				138
				139	set_cpu_online(cpuid, true);
				140
				141	/* idle thread is expected to have preempt disabled */
				142	preempt_disable();
				143
				144	local_irq_enable();
				145
				146	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
				147	}
				148
				149	void cpu_panic(void)
				150	{
				151	printk("CPU[%d]: Returns from cpu_idle!\n", smp_processor_id());
				152	panic("SMP bolixed\n");
				153	}
				154
/* This tick register synchronization scheme is taken entirely from
 * the ia64 port, see arch/ia64/kernel/smpboot.c for details and credit.
 *
 * The only change I've made is to rework it so that the master
 * initiates the synchronization instead of the slave. -DaveM
 */
				161
				162	#define MASTER 0
				163	#define SLAVE (SMP_CACHE_BYTES/sizeof(unsigned long))
				164
				165	#define NUM_ROUNDS 64 /* magic value */
				166	#define NUM_ITERS 5 /* likewise */
				167
				168	static DEFINE_RAW_SPINLOCK(itc_sync_lock);
				169	static unsigned long go[SLAVE + 1];
				170
				171	#define DEBUG_TICK_SYNC 0
				172
				173	static inline long get_delta (long rt, long master)
				174	{
				175	unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
				176	unsigned long tcenter, t0, t1, tm;
				177	unsigned long i;
				178
				179	for (i = 0; i < NUM_ITERS; i++) {
				180	t0 = tick_ops->get_tick();
				181	go[MASTER] = 1;
				182	membar_safe("#StoreLoad");
				183	while (!(tm = go[SLAVE]))
				184	rmb();
				185	go[SLAVE] = 0;
				186	wmb();
				187	t1 = tick_ops->get_tick();
				188
				189	if (t1 - t0 < best_t1 - best_t0)
				190	best_t0 = t0, best_t1 = t1, best_tm = tm;
				191	}
				192
				193	*rt = best_t1 - best_t0;
				194	*master = best_tm - best_t0;
				195
				196	/* average best_t0 and best_t1 without overflow: */
				197	tcenter = (best_t0/2 + best_t1/2);
				198	if (best_t0 % 2 + best_t1 % 2 == 2)
				199	tcenter++;
				200	return tcenter - best_tm;
				201	}
				202
				203	void smp_synchronize_tick_client(void)
				204	{
				205	long i, delta, adj, adjust_latency = 0, done = 0;
				206	unsigned long flags, rt, master_time_stamp;
				207	#if DEBUG_TICK_SYNC
				208	struct {
				209	long rt; /* roundtrip time */
				210	long master; /* master's timestamp */
				211	long diff; /* difference between midpoint and master's timestamp */
				212	long lat; /* estimate of itc adjustment latency */
				213	} t[NUM_ROUNDS];
				214	#endif
				215
				216	go[MASTER] = 1;
				217
				218	while (go[MASTER])
				219	rmb();
				220
				221	local_irq_save(flags);
				222	{
				223	for (i = 0; i < NUM_ROUNDS; i++) {
				224	delta = get_delta(&rt, &master_time_stamp);
				225	if (delta == 0)
				226	done = 1; /* let's lock on to this... */
				227
				228	if (!done) {
				229	if (i > 0) {
				230	adjust_latency += -delta;
				231	adj = -delta + adjust_latency/4;
				232	} else
				233	adj = -delta;
				234
				235	tick_ops->add_tick(adj);
				236	}
				237	#if DEBUG_TICK_SYNC
				238	t[i].rt = rt;
				239	t[i].master = master_time_stamp;
				240	t[i].diff = delta;
				241	t[i].lat = adjust_latency/4;
				242	#endif
				243	}
				244	}
				245	local_irq_restore(flags);
				246
				247	#if DEBUG_TICK_SYNC
				248	for (i = 0; i < NUM_ROUNDS; i++)
				249	printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
				250	t[i].rt, t[i].master, t[i].diff, t[i].lat);
				251	#endif
				252
				253	printk(KERN_INFO "CPU %d: synchronized TICK with master CPU "
				254	"(last diff %ld cycles, maxerr %lu cycles)\n",
				255	smp_processor_id(), delta, rt);
				256	}
				257
				258	static void smp_start_sync_tick_client(int cpu);
				259
				260	static void smp_synchronize_one_tick(int cpu)
				261	{
				262	unsigned long flags, i;
				263
				264	go[MASTER] = 0;
				265
				266	smp_start_sync_tick_client(cpu);
				267
				268	/* wait for client to be ready */
				269	while (!go[MASTER])
				270	rmb();
				271
				272	/* now let the client proceed into his loop */
				273	go[MASTER] = 0;
				274	membar_safe("#StoreLoad");
				275
				276	raw_spin_lock_irqsave(&itc_sync_lock, flags);
				277	{
				278	for (i = 0; i < NUM_ROUNDS*NUM_ITERS; i++) {
				279	while (!go[MASTER])
				280	rmb();
				281	go[MASTER] = 0;
				282	wmb();
				283	go[SLAVE] = tick_ops->get_tick();
				284	membar_safe("#StoreLoad");
				285	}
				286	}
				287	raw_spin_unlock_irqrestore(&itc_sync_lock, flags);
				288	}
				289
				290	#if defined(CONFIG_SUN_LDOMS) && defined(CONFIG_HOTPLUG_CPU)
				291	static void ldom_startcpu_cpuid(unsigned int cpu, unsigned long thread_reg,
				292	void **descrp)
				293	{
				294	extern unsigned long sparc64_ttable_tl0;
				295	extern unsigned long kern_locked_tte_data;
				296	struct hvtramp_descr *hdesc;
				297	unsigned long trampoline_ra;
				298	struct trap_per_cpu *tb;
				299	u64 tte_vaddr, tte_data;
				300	unsigned long hv_err;
				301	int i;
				302
				303	hdesc = kzalloc(sizeof(*hdesc) +
				304	(sizeof(struct hvtramp_mapping) *
				305	num_kernel_image_mappings - 1),
				306	GFP_KERNEL);
				307	if (!hdesc) {
				308	printk(KERN_ERR "ldom_startcpu_cpuid: Cannot allocate "
				309	"hvtramp_descr.\n");
				310	return;
				311	}
				312	*descrp = hdesc;
				313
				314	hdesc->cpu = cpu;
				315	hdesc->num_mappings = num_kernel_image_mappings;
				316
				317	tb = &trap_block[cpu];
				318
				319	hdesc->fault_info_va = (unsigned long) &tb->fault_info;
				320	hdesc->fault_info_pa = kimage_addr_to_ra(&tb->fault_info);
				321
				322	hdesc->thread_reg = thread_reg;
				323
				324	tte_vaddr = (unsigned long) KERNBASE;
				325	tte_data = kern_locked_tte_data;
				326
				327	for (i = 0; i < hdesc->num_mappings; i++) {
				328	hdesc->maps[i].vaddr = tte_vaddr;
				329	hdesc->maps[i].tte = tte_data;
				330	tte_vaddr += 0x400000;
				331	tte_data += 0x400000;
				332	}
				333
				334	trampoline_ra = kimage_addr_to_ra(hv_cpu_startup);
				335
				336	hv_err = sun4v_cpu_start(cpu, trampoline_ra,
				337	kimage_addr_to_ra(&sparc64_ttable_tl0),
				338	__pa(hdesc));
				339	if (hv_err)
				340	printk(KERN_ERR "ldom_startcpu_cpuid: sun4v_cpu_start() "
				341	"gives error %lu\n", hv_err);
				342	}
				343	#endif
				344
				345	extern unsigned long sparc64_cpu_startup;
				346
				347	/* The OBP cpu startup callback truncates the 3rd arg cookie to
				348	* 32-bits (I think) so to be safe we have it read the pointer
				349	* contained here so we work on >4GB machines. -DaveM
				350	*/
				351	static struct thread_info *cpu_new_thread = NULL;
				352
				353	static int smp_boot_one_cpu(unsigned int cpu, struct task_struct *idle)
				354	{
				355	unsigned long entry =
				356	(unsigned long)(&sparc64_cpu_startup);
				357	unsigned long cookie =
				358	(unsigned long)(&cpu_new_thread);
				359	void *descr = NULL;
				360	int timeout, ret;
				361
				362	callin_flag = 0;
				363	cpu_new_thread = task_thread_info(idle);
				364
				365	if (tlb_type == hypervisor) {
				366	#if defined(CONFIG_SUN_LDOMS) && defined(CONFIG_HOTPLUG_CPU)
				367	if (ldom_domaining_enabled)
				368	ldom_startcpu_cpuid(cpu,
				369	(unsigned long) cpu_new_thread,
				370	&descr);
				371	else
				372	#endif
				373	prom_startcpu_cpuid(cpu, entry, cookie);
				374	} else {
				375	struct device_node *dp = of_find_node_by_cpuid(cpu);
				376
				377	prom_startcpu(dp->phandle, entry, cookie);
				378	}
				379
				380	for (timeout = 0; timeout < 50000; timeout++) {
				381	if (callin_flag)
				382	break;
				383	udelay(100);
				384	}
				385
				386	if (callin_flag) {
				387	ret = 0;
				388	} else {
				389	printk("Processor %d is stuck.\n", cpu);
				390	ret = -ENODEV;
				391	}
				392	cpu_new_thread = NULL;
				393
				394	kfree(descr);
				395
				396	return ret;
				397	}
				398
				399	static void spitfire_xcall_helper(u64 data0, u64 data1, u64 data2, u64 pstate, unsigned long cpu)
				400	{
				401	u64 result, target;
				402	int stuck, tmp;
				403
				404	if (this_is_starfire) {
				405	/* map to real upaid */
				406	cpu = (((cpu & 0x3c) << 1) \|
				407	((cpu & 0x40) >> 4) \|
				408	(cpu & 0x3));
				409	}
				410
				411	target = (cpu << 14) \| 0x70;
				412	again:
				413	/* Ok, this is the real Spitfire Errata #54.
				414	* One must read back from a UDB internal register
				415	* after writes to the UDB interrupt dispatch, but
				416	* before the membar Sync for that write.
				417	* So we use the high UDB control register (ASI 0x7f,
				418	* ADDR 0x20) for the dummy read. -DaveM
				419	*/
				420	tmp = 0x40;
				421	__asm__ __volatile__(
				422	"wrpr %1, %2, %%pstate\n\t"
				423	"stxa %4, [%0] %3\n\t"
				424	"stxa %5, [%0+%8] %3\n\t"
				425	"add %0, %8, %0\n\t"
				426	"stxa %6, [%0+%8] %3\n\t"
				427	"membar #Sync\n\t"
				428	"stxa %%g0, [%7] %3\n\t"
				429	"membar #Sync\n\t"
				430	"mov 0x20, %%g1\n\t"
				431	"ldxa [%%g1] 0x7f, %%g0\n\t"
				432	"membar #Sync"
				433	: "=r" (tmp)
				434	: "r" (pstate), "i" (PSTATE_IE), "i" (ASI_INTR_W),
				435	"r" (data0), "r" (data1), "r" (data2), "r" (target),
				436	"r" (0x10), "0" (tmp)
				437	: "g1");
				438
				439	/* NOTE: PSTATE_IE is still clear. */
				440	stuck = 100000;
				441	do {
				442	__asm__ __volatile__("ldxa [%%g0] %1, %0"
				443	: "=r" (result)
				444	: "i" (ASI_INTR_DISPATCH_STAT));
				445	if (result == 0) {
				446	__asm__ __volatile__("wrpr %0, 0x0, %%pstate"
				447	: : "r" (pstate));
				448	return;
				449	}
				450	stuck -= 1;
				451	if (stuck == 0)
				452	break;
				453	} while (result & 0x1);
				454	__asm__ __volatile__("wrpr %0, 0x0, %%pstate"
				455	: : "r" (pstate));
				456	if (stuck == 0) {
				457	printk("CPU[%d]: mondo stuckage result[%016llx]\n",
				458	smp_processor_id(), result);
				459	} else {
				460	udelay(2);
				461	goto again;
				462	}
				463	}
				464
				465	static void spitfire_xcall_deliver(struct trap_per_cpu *tb, int cnt)
				466	{
				467	u64 *mondo, data0, data1, data2;
				468	u16 *cpu_list;
				469	u64 pstate;
				470	int i;
				471
				472	__asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate));
				473	cpu_list = __va(tb->cpu_list_pa);
				474	mondo = __va(tb->cpu_mondo_block_pa);
				475	data0 = mondo[0];
				476	data1 = mondo[1];
				477	data2 = mondo[2];
				478	for (i = 0; i < cnt; i++)
				479	spitfire_xcall_helper(data0, data1, data2, pstate, cpu_list[i]);
				480	}
				481
				482	/* Cheetah now allows to send the whole 64-bytes of data in the interrupt
				483	* packet, but we have no use for that. However we do take advantage of
				484	* the new pipelining feature (ie. dispatch to multiple cpus simultaneously).
				485	*/
				486	static void cheetah_xcall_deliver(struct trap_per_cpu *tb, int cnt)
				487	{
				488	int nack_busy_id, is_jbus, need_more;
				489	u64 *mondo, pstate, ver, busy_mask;
				490	u16 *cpu_list;
				491
				492	cpu_list = __va(tb->cpu_list_pa);
				493	mondo = __va(tb->cpu_mondo_block_pa);
				494
				495	/* Unfortunately, someone at Sun had the brilliant idea to make the
				496	* busy/nack fields hard-coded by ITID number for this Ultra-III
				497	* derivative processor.
				498	*/
				499	__asm__ ("rdpr %%ver, %0" : "=r" (ver));
				500	is_jbus = ((ver >> 32) == __JALAPENO_ID \|\|
				501	(ver >> 32) == __SERRANO_ID);
				502
				503	__asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate));
				504
				505	retry:
				506	need_more = 0;
				507	__asm__ __volatile__("wrpr %0, %1, %%pstate\n\t"
				508	: : "r" (pstate), "i" (PSTATE_IE));
				509
				510	/* Setup the dispatch data registers. */
				511	__asm__ __volatile__("stxa %0, [%3] %6\n\t"
				512	"stxa %1, [%4] %6\n\t"
				513	"stxa %2, [%5] %6\n\t"
				514	"membar #Sync\n\t"
				515	: /* no outputs */
				516	: "r" (mondo[0]), "r" (mondo[1]), "r" (mondo[2]),
				517	"r" (0x40), "r" (0x50), "r" (0x60),
				518	"i" (ASI_INTR_W));
				519
				520	nack_busy_id = 0;
				521	busy_mask = 0;
				522	{
				523	int i;
				524
				525	for (i = 0; i < cnt; i++) {
				526	u64 target, nr;
				527
				528	nr = cpu_list[i];
				529	if (nr == 0xffff)
				530	continue;
				531
				532	target = (nr << 14) \| 0x70;
				533	if (is_jbus) {
				534	busy_mask \|= (0x1UL << (nr * 2));
				535	} else {
				536	target \|= (nack_busy_id << 24);
				537	busy_mask \|= (0x1UL <<
				538	(nack_busy_id * 2));
				539	}
				540	__asm__ __volatile__(
				541	"stxa %%g0, [%0] %1\n\t"
				542	"membar #Sync\n\t"
				543	: /* no outputs */
				544	: "r" (target), "i" (ASI_INTR_W));
				545	nack_busy_id++;
				546	if (nack_busy_id == 32) {
				547	need_more = 1;
				548	break;
				549	}
				550	}
				551	}
				552
				553	/* Now, poll for completion. */
				554	{
				555	u64 dispatch_stat, nack_mask;
				556	long stuck;
				557
				558	stuck = 100000 * nack_busy_id;
				559	nack_mask = busy_mask << 1;
				560	do {
				561	__asm__ __volatile__("ldxa [%%g0] %1, %0"
				562	: "=r" (dispatch_stat)
				563	: "i" (ASI_INTR_DISPATCH_STAT));
				564	if (!(dispatch_stat & (busy_mask \| nack_mask))) {
				565	__asm__ __volatile__("wrpr %0, 0x0, %%pstate"
				566	: : "r" (pstate));
				567	if (unlikely(need_more)) {
				568	int i, this_cnt = 0;
				569	for (i = 0; i < cnt; i++) {
				570	if (cpu_list[i] == 0xffff)
				571	continue;
				572	cpu_list[i] = 0xffff;
				573	this_cnt++;
				574	if (this_cnt == 32)
				575	break;
				576	}
				577	goto retry;
				578	}
				579	return;
				580	}
				581	if (!--stuck)
				582	break;
				583	} while (dispatch_stat & busy_mask);
				584
				585	__asm__ __volatile__("wrpr %0, 0x0, %%pstate"
				586	: : "r" (pstate));
				587
				588	if (dispatch_stat & busy_mask) {
				589	/* Busy bits will not clear, continue instead
				590	* of freezing up on this cpu.
				591	*/
				592	printk("CPU[%d]: mondo stuckage result[%016llx]\n",
				593	smp_processor_id(), dispatch_stat);
				594	} else {
				595	int i, this_busy_nack = 0;
				596
				597	/* Delay some random time with interrupts enabled
				598	* to prevent deadlock.
				599	*/
				600	udelay(2 * nack_busy_id);
				601
				602	/* Clear out the mask bits for cpus which did not
				603	* NACK us.
				604	*/
				605	for (i = 0; i < cnt; i++) {
				606	u64 check_mask, nr;
				607
				608	nr = cpu_list[i];
				609	if (nr == 0xffff)
				610	continue;
				611
				612	if (is_jbus)
				613	check_mask = (0x2UL << (2*nr));
				614	else
				615	check_mask = (0x2UL <<
				616	this_busy_nack);
				617	if ((dispatch_stat & check_mask) == 0)
				618	cpu_list[i] = 0xffff;
				619	this_busy_nack += 2;
				620	if (this_busy_nack == 64)
				621	break;
				622	}
				623
				624	goto retry;
				625	}
				626	}
				627	}
				628
				629	#define CPU_MONDO_COUNTER(cpuid) (cpu_mondo_counter[cpuid])
				630	#define MONDO_USEC_WAIT_MIN 2
				631	#define MONDO_USEC_WAIT_MAX 100
				632	#define MONDO_RETRY_LIMIT 500000
				633
				634	/* Multi-cpu list version.
				635	*
				636	* Deliver xcalls to 'cnt' number of cpus in 'cpu_list'.
				637	* Sometimes not all cpus receive the mondo, requiring us to re-send
				638	* the mondo until all cpus have received, or cpus are truly stuck
				639	* unable to receive mondo, and we timeout.
				640	* Occasionally a target cpu strand is borrowed briefly by hypervisor to
				641	* perform guest service, such as PCIe error handling. Consider the
				642	* service time, 1 second overall wait is reasonable for 1 cpu.
				643	* Here two in-between mondo check wait time are defined: 2 usec for
				644	* single cpu quick turn around and up to 100usec for large cpu count.
				645	* Deliver mondo to large number of cpus could take longer, we adjusts
				646	* the retry count as long as target cpus are making forward progress.
				647	*/
				648	static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt)
				649	{
				650	int this_cpu, tot_cpus, prev_sent, i, rem;
				651	int usec_wait, retries, tot_retries;
				652	u16 first_cpu = 0xffff;
				653	unsigned long xc_rcvd = 0;
				654	unsigned long status;
				655	int ecpuerror_id = 0;
				656	int enocpu_id = 0;
				657	u16 *cpu_list;
				658	u16 cpu;
				659
				660	this_cpu = smp_processor_id();
				661	cpu_list = __va(tb->cpu_list_pa);
				662	usec_wait = cnt * MONDO_USEC_WAIT_MIN;
				663	if (usec_wait > MONDO_USEC_WAIT_MAX)
				664	usec_wait = MONDO_USEC_WAIT_MAX;
				665	retries = tot_retries = 0;
				666	tot_cpus = cnt;
				667	prev_sent = 0;
				668
				669	do {
				670	int n_sent, mondo_delivered, target_cpu_busy;
				671
				672	status = sun4v_cpu_mondo_send(cnt,
				673	tb->cpu_list_pa,
				674	tb->cpu_mondo_block_pa);
				675
				676	/* HV_EOK means all cpus received the xcall, we're done. */
				677	if (likely(status == HV_EOK))
				678	goto xcall_done;
				679
				680	/* If not these non-fatal errors, panic */
				681	if (unlikely((status != HV_EWOULDBLOCK) &&
				682	(status != HV_ECPUERROR) &&
				683	(status != HV_ENOCPU)))
				684	goto fatal_errors;
				685
				686	/* First, see if we made any forward progress.
				687	*
				688	* Go through the cpu_list, count the target cpus that have
				689	* received our mondo (n_sent), and those that did not (rem).
				690	* Re-pack cpu_list with the cpus remain to be retried in the
				691	* front - this simplifies tracking the truly stalled cpus.
				692	*
				693	* The hypervisor indicates successful sends by setting
				694	* cpu list entries to the value 0xffff.
				695	*
				696	* EWOULDBLOCK means some target cpus did not receive the
				697	* mondo and retry usually helps.
				698	*
				699	* ECPUERROR means at least one target cpu is in error state,
				700	* it's usually safe to skip the faulty cpu and retry.
				701	*
				702	* ENOCPU means one of the target cpu doesn't belong to the
				703	* domain, perhaps offlined which is unexpected, but not
				704	* fatal and it's okay to skip the offlined cpu.
				705	*/
				706	rem = 0;
				707	n_sent = 0;
				708	for (i = 0; i < cnt; i++) {
				709	cpu = cpu_list[i];
				710	if (likely(cpu == 0xffff)) {
				711	n_sent++;
				712	} else if ((status == HV_ECPUERROR) &&
				713	(sun4v_cpu_state(cpu) == HV_CPU_STATE_ERROR)) {
				714	ecpuerror_id = cpu + 1;
				715	} else if (status == HV_ENOCPU && !cpu_online(cpu)) {
				716	enocpu_id = cpu + 1;
				717	} else {
				718	cpu_list[rem++] = cpu;
				719	}
				720	}
				721
				722	/* No cpu remained, we're done. */
				723	if (rem == 0)
				724	break;
				725
				726	/* Otherwise, update the cpu count for retry. */
				727	cnt = rem;
				728
				729	/* Record the overall number of mondos received by the
				730	* first of the remaining cpus.
				731	*/
				732	if (first_cpu != cpu_list[0]) {
				733	first_cpu = cpu_list[0];
				734	xc_rcvd = CPU_MONDO_COUNTER(first_cpu);
				735	}
				736
				737	/* Was any mondo delivered successfully? */
				738	mondo_delivered = (n_sent > prev_sent);
				739	prev_sent = n_sent;
				740
				741	/* or, was any target cpu busy processing other mondos? */
				742	target_cpu_busy = (xc_rcvd < CPU_MONDO_COUNTER(first_cpu));
				743	xc_rcvd = CPU_MONDO_COUNTER(first_cpu);
				744
				745	/* Retry count is for no progress. If we're making progress,
				746	* reset the retry count.
				747	*/
				748	if (likely(mondo_delivered \|\| target_cpu_busy)) {
				749	tot_retries += retries;
				750	retries = 0;
				751	} else if (unlikely(retries > MONDO_RETRY_LIMIT)) {
				752	goto fatal_mondo_timeout;
				753	}
				754
				755	/* Delay a little bit to let other cpus catch up on
				756	* their cpu mondo queue work.
				757	*/
				758	if (!mondo_delivered)
				759	udelay(usec_wait);
				760
				761	retries++;
				762	} while (1);
				763
				764	xcall_done:
				765	if (unlikely(ecpuerror_id > 0)) {
				766	pr_crit("CPU[%d]: SUN4V mondo cpu error, target cpu(%d) was in error state\n",
				767	this_cpu, ecpuerror_id - 1);
				768	} else if (unlikely(enocpu_id > 0)) {
				769	pr_crit("CPU[%d]: SUN4V mondo cpu error, target cpu(%d) does not belong to the domain\n",
				770	this_cpu, enocpu_id - 1);
				771	}
				772	return;
				773
				774	fatal_errors:
				775	/* fatal errors include bad alignment, etc */
				776	pr_crit("CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) mondo_block_pa(%lx)\n",
				777	this_cpu, tot_cpus, tb->cpu_list_pa, tb->cpu_mondo_block_pa);
				778	panic("Unexpected SUN4V mondo error %lu\n", status);
				779
				780	fatal_mondo_timeout:
				781	/* some cpus being non-responsive to the cpu mondo */
				782	pr_crit("CPU[%d]: SUN4V mondo timeout, cpu(%d) made no forward progress after %d retries. Total target cpus(%d).\n",
				783	this_cpu, first_cpu, (tot_retries + retries), tot_cpus);
				784	panic("SUN4V mondo timeout panic\n");
				785	}
				786
				787	static void (xcall_deliver_impl)(struct trap_per_cpu , int);
				788
				789	static void xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask)
				790	{
				791	struct trap_per_cpu *tb;
				792	int this_cpu, i, cnt;
				793	unsigned long flags;
				794	u16 *cpu_list;
				795	u64 *mondo;
				796
				797	/* We have to do this whole thing with interrupts fully disabled.
				798	* Otherwise if we send an xcall from interrupt context it will
				799	* corrupt both our mondo block and cpu list state.
				800	*
				801	* One consequence of this is that we cannot use timeout mechanisms
				802	* that depend upon interrupts being delivered locally. So, for
				803	* example, we cannot sample jiffies and expect it to advance.
				804	*
				805	* Fortunately, udelay() uses %stick/%tick so we can use that.
				806	*/
				807	local_irq_save(flags);
				808
				809	this_cpu = smp_processor_id();
				810	tb = &trap_block[this_cpu];
				811
				812	mondo = __va(tb->cpu_mondo_block_pa);
				813	mondo[0] = data0;
				814	mondo[1] = data1;
				815	mondo[2] = data2;
				816	wmb();
				817
				818	cpu_list = __va(tb->cpu_list_pa);
				819
				820	/* Setup the initial cpu list. */
				821	cnt = 0;
				822	for_each_cpu(i, mask) {
				823	if (i == this_cpu \|\| !cpu_online(i))
				824	continue;
				825	cpu_list[cnt++] = i;
				826	}
				827
				828	if (cnt)
				829	xcall_deliver_impl(tb, cnt);
				830
				831	local_irq_restore(flags);
				832	}
				833
				834	/* Send cross call to all processors mentioned in MASK_P
				835	* except self. Really, there are only two cases currently,
				836	* "cpu_online_mask" and "mm_cpumask(mm)".
				837	*/
				838	static void smp_cross_call_masked(unsigned long func, u32 ctx, u64 data1, u64 data2, const cpumask_t mask)
				839	{
				840	u64 data0 = (((u64)ctx)<<32 \| (((u64)func) & 0xffffffff));
				841
				842	xcall_deliver(data0, data1, data2, mask);
				843	}
				844
				845	/* Send cross call to all processors except self. */
				846	static void smp_cross_call(unsigned long *func, u32 ctx, u64 data1, u64 data2)
				847	{
				848	smp_cross_call_masked(func, ctx, data1, data2, cpu_online_mask);
				849	}
				850
				851	extern unsigned long xcall_sync_tick;
				852
				853	static void smp_start_sync_tick_client(int cpu)
				854	{
				855	xcall_deliver((u64) &xcall_sync_tick, 0, 0,
				856	cpumask_of(cpu));
				857	}
				858
				859	extern unsigned long xcall_call_function;
				860
				861	void arch_send_call_function_ipi_mask(const struct cpumask *mask)
				862	{
				863	xcall_deliver((u64) &xcall_call_function, 0, 0, mask);
				864	}
				865
				866	extern unsigned long xcall_call_function_single;
				867
				868	void arch_send_call_function_single_ipi(int cpu)
				869	{
				870	xcall_deliver((u64) &xcall_call_function_single, 0, 0,
				871	cpumask_of(cpu));
				872	}
				873
				874	void __irq_entry smp_call_function_client(int irq, struct pt_regs *regs)
				875	{
				876	clear_softint(1 << irq);
				877	irq_enter();
				878	generic_smp_call_function_interrupt();
				879	irq_exit();
				880	}
				881
				882	void __irq_entry smp_call_function_single_client(int irq, struct pt_regs *regs)
				883	{
				884	clear_softint(1 << irq);
				885	irq_enter();
				886	generic_smp_call_function_single_interrupt();
				887	irq_exit();
				888	}
				889
				890	static void tsb_sync(void *info)
				891	{
				892	struct trap_per_cpu *tp = &trap_block[raw_smp_processor_id()];
				893	struct mm_struct *mm = info;
				894
				895	/* It is not valid to test "current->active_mm == mm" here.
				896	*
				897	* The value of "current" is not changed atomically with
				898	* switch_mm(). But that's OK, we just need to check the
				899	* current cpu's trap block PGD physical address.
				900	*/
				901	if (tp->pgd_paddr == __pa(mm->pgd))
				902	tsb_context_switch(mm);
				903	}
				904
				905	void smp_tsb_sync(struct mm_struct *mm)
				906	{
				907	smp_call_function_many(mm_cpumask(mm), tsb_sync, mm, 1);
				908	}
				909
				910	extern unsigned long xcall_flush_tlb_mm;
				911	extern unsigned long xcall_flush_tlb_page;
				912	extern unsigned long xcall_flush_tlb_kernel_range;
				913	extern unsigned long xcall_fetch_glob_regs;
				914	extern unsigned long xcall_fetch_glob_pmu;
				915	extern unsigned long xcall_fetch_glob_pmu_n4;
				916	extern unsigned long xcall_receive_signal;
				917	extern unsigned long xcall_new_mmu_context_version;
				918	#ifdef CONFIG_KGDB
				919	extern unsigned long xcall_kgdb_capture;
				920	#endif
				921
				922	#ifdef DCACHE_ALIASING_POSSIBLE
				923	extern unsigned long xcall_flush_dcache_page_cheetah;
				924	#endif
				925	extern unsigned long xcall_flush_dcache_page_spitfire;
				926
				927	static inline void __local_flush_dcache_page(struct page *page)
				928	{
				929	#ifdef DCACHE_ALIASING_POSSIBLE
				930	__flush_dcache_page(page_address(page),
				931	((tlb_type == spitfire) &&
				932	page_mapping_file(page) != NULL));
				933	#else
				934	if (page_mapping_file(page) != NULL &&
				935	tlb_type == spitfire)
				936	__flush_icache_page(__pa(page_address(page)));
				937	#endif
				938	}
				939
				940	void smp_flush_dcache_page_impl(struct page *page, int cpu)
				941	{
				942	int this_cpu;
				943
				944	if (tlb_type == hypervisor)
				945	return;
				946
				947	#ifdef CONFIG_DEBUG_DCFLUSH
				948	atomic_inc(&dcpage_flushes);
				949	#endif
				950
				951	this_cpu = get_cpu();
				952
				953	if (cpu == this_cpu) {
				954	__local_flush_dcache_page(page);
				955	} else if (cpu_online(cpu)) {
				956	void *pg_addr = page_address(page);
				957	u64 data0 = 0;
				958
				959	if (tlb_type == spitfire) {
				960	data0 = ((u64)&xcall_flush_dcache_page_spitfire);
				961	if (page_mapping_file(page) != NULL)
				962	data0 \|= ((u64)1 << 32);
				963	} else if (tlb_type == cheetah \|\| tlb_type == cheetah_plus) {
				964	#ifdef DCACHE_ALIASING_POSSIBLE
				965	data0 = ((u64)&xcall_flush_dcache_page_cheetah);
				966	#endif
				967	}
				968	if (data0) {
				969	xcall_deliver(data0, __pa(pg_addr),
				970	(u64) pg_addr, cpumask_of(cpu));
				971	#ifdef CONFIG_DEBUG_DCFLUSH
				972	atomic_inc(&dcpage_flushes_xcall);
				973	#endif
				974	}
				975	}
				976
				977	put_cpu();
				978	}
				979
				980	void flush_dcache_page_all(struct mm_struct mm, struct page page)
				981	{
				982	void *pg_addr;
				983	u64 data0;
				984
				985	if (tlb_type == hypervisor)
				986	return;
				987
				988	preempt_disable();
				989
				990	#ifdef CONFIG_DEBUG_DCFLUSH
				991	atomic_inc(&dcpage_flushes);
				992	#endif
				993	data0 = 0;
				994	pg_addr = page_address(page);
				995	if (tlb_type == spitfire) {
				996	data0 = ((u64)&xcall_flush_dcache_page_spitfire);
				997	if (page_mapping_file(page) != NULL)
				998	data0 \|= ((u64)1 << 32);
				999	} else if (tlb_type == cheetah \|\| tlb_type == cheetah_plus) {
				1000	#ifdef DCACHE_ALIASING_POSSIBLE
				1001	data0 = ((u64)&xcall_flush_dcache_page_cheetah);
				1002	#endif
				1003	}
				1004	if (data0) {
				1005	xcall_deliver(data0, __pa(pg_addr),
				1006	(u64) pg_addr, cpu_online_mask);
				1007	#ifdef CONFIG_DEBUG_DCFLUSH
				1008	atomic_inc(&dcpage_flushes_xcall);
				1009	#endif
				1010	}
				1011	__local_flush_dcache_page(page);
				1012
				1013	preempt_enable();
				1014	}
				1015
				1016	#ifdef CONFIG_KGDB
				/* Issue the xcall_kgdb_capture cross call through smp_cross_call(),
				 * with all three arguments zero.  Only built when CONFIG_KGDB is set.
				 */
				1017	void kgdb_roundup_cpus(void)
				1018	{
				1019	smp_cross_call(&xcall_kgdb_capture, 0, 0, 0);
				1020	}
				1021	#endif
				1022
				/* Issue the xcall_fetch_glob_regs cross call through smp_cross_call(),
				 * with all three arguments zero.
				 */
				1023	void smp_fetch_global_regs(void)
				1024	{
				1025	smp_cross_call(&xcall_fetch_glob_regs, 0, 0, 0);
				1026	}
				1027
				1028	void smp_fetch_global_pmu(void)
				1029	{
				1030	if (tlb_type == hypervisor &&
				1031	sun4v_chip_type >= SUN4V_CHIP_NIAGARA4)
				1032	smp_cross_call(&xcall_fetch_glob_pmu_n4, 0, 0, 0);
				1033	else
				1034	smp_cross_call(&xcall_fetch_glob_pmu, 0, 0, 0);
				1035	}
				1036
				1037	/* We know that the window frames of the user have been flushed
				1038	* to the stack before we get here because all callers of us
				1039	* are flush_tlb_*() routines, and these run after flush_cache_*()
				1040	* which performs the flushw.
				1041	*
				1042	* mm->cpu_vm_mask is a bit mask of which cpus an address
				1043	* space has (potentially) executed on, this is the heuristic
				1044	* we use to limit cross calls.
				1045	*/
				1046
				1047	/* This currently is only used by the hugetlb arch pre-fault
				1048	* hook on UltraSPARC-III+ and later when changing the pagesize
				1049	* bits of the context register for an address space.
				 *
				 * Sends the xcall_flush_tlb_mm cross call, carrying the hardware
				 * context bits of @mm, to the cpus in mm_cpumask(mm), then calls
				 * __flush_tlb_mm() for the same context on the calling cpu.  Both
				 * steps sit between get_cpu() and put_cpu().
				1050	*/
				1051	void smp_flush_tlb_mm(struct mm_struct *mm)
				1052	{
				1053	u32 ctx = CTX_HWBITS(mm->context);
				1054
				1055	get_cpu();
				1056
				1057	smp_cross_call_masked(&xcall_flush_tlb_mm,
				1058	ctx, 0, 0,
				1059	mm_cpumask(mm));
				1060
				/* Local flush of the same context. */
				1061	__flush_tlb_mm(ctx, SECONDARY_CONTEXT);
				1062
				1063	put_cpu();
				1064	}
				1065
				/* Argument bundle passed to tlb_pending_func() through its void *
				 * parameter.  The three fields are forwarded, in this order, to
				 * __flush_tlb_pending().
				 */
				1066	struct tlb_pending_info {
				1067	unsigned long ctx;
				1068	unsigned long nr;
				1069	unsigned long *vaddrs;
				1070	};
				1071
				/* Callback handed to smp_call_function_many(): unpack the
				 * struct tlb_pending_info behind @info and call __flush_tlb_pending()
				 * with its fields.
				 */
				1072	static void tlb_pending_func(void *info)
				1073	{
				1074	struct tlb_pending_info *t = info;
				1075
				1076	__flush_tlb_pending(t->ctx, t->nr, t->vaddrs);
				1077	}
				1078
				1079	void smp_flush_tlb_pending(struct mm_struct mm, unsigned long nr, unsigned long vaddrs)
				1080	{
				1081	u32 ctx = CTX_HWBITS(mm->context);
				1082	struct tlb_pending_info info;
				1083
				1084	get_cpu();
				1085
				1086	info.ctx = ctx;
				1087	info.nr = nr;
				1088	info.vaddrs = vaddrs;
				1089
				1090	smp_call_function_many(mm_cpumask(mm), tlb_pending_func,
				1091	&info, 1);
				1092
				1093	__flush_tlb_pending(ctx, nr, vaddrs);
				1094
				1095	put_cpu();
				1096	}
				1097
				/* Flush the TLB entry for @vaddr in the context of @mm.
				 *
				 * Sends the xcall_flush_tlb_page cross call, carrying the hardware
				 * context bits and @vaddr, to the cpus in mm_cpumask(mm), then calls
				 * __flush_tlb_page() on the calling cpu.  Both steps sit between
				 * get_cpu() and put_cpu().
				 */
				1098	void smp_flush_tlb_page(struct mm_struct *mm, unsigned long vaddr)
				1099	{
				1100	unsigned long context = CTX_HWBITS(mm->context);
				1101
				1102	get_cpu();
				1103
				1104	smp_cross_call_masked(&xcall_flush_tlb_page,
				1105	context, vaddr, 0,
				1106	mm_cpumask(mm));
				1107
				/* Local flush of the same page. */
				1108	__flush_tlb_page(context, vaddr);
				1109
				1110	put_cpu();
				1111	}
				1112
				1113	void smp_flush_tlb_kernel_range(unsigned long start, unsigned long end)
				1114	{
				1115	start &= PAGE_MASK;
				1116	end = PAGE_ALIGN(end);
				1117	if (start != end) {
				1118	smp_cross_call(&xcall_flush_tlb_kernel_range,
				1119	0, start, end);
				1120
				1121	__flush_tlb_kernel_range(start, end);
				1122	}
				1123	}
				1124
				1125	/* CPU capture. */
				/* Define CAPTURE_DEBUG to compile in the printk() calls in
				 * smp_capture() and smp_release().
				 */
				1126	/* #define CAPTURE_DEBUG */
				1127	extern unsigned long xcall_capture;
				1128
				/* Nesting depth: incremented by smp_capture(), decremented by smp_release(). */
				1129	static atomic_t smp_capture_depth = ATOMIC_INIT(0);
				/* Incremented by the capturing cpu and by each cpu entering
				 * smp_penguin_jailcell(); smp_capture() spins until it equals
				 * num_online_cpus().
				 */
				1130	static atomic_t smp_capture_registry = ATOMIC_INIT(0);
				/* Set to 1 by smp_capture() and to 0 by smp_release(); cpus in
				 * smp_penguin_jailcell() spin while it is non-zero.
				 */
				1131	static unsigned long penguins_are_doing_time;
				1132
				1133	void smp_capture(void)
				1134	{
				1135	int result = atomic_add_return(1, &smp_capture_depth);
				1136
				1137	if (result == 1) {
				1138	int ncpus = num_online_cpus();
				1139
				1140	#ifdef CAPTURE_DEBUG
				1141	printk("CPU[%d]: Sending penguins to jail...",
				1142	smp_processor_id());
				1143	#endif
				1144	penguins_are_doing_time = 1;
				1145	atomic_inc(&smp_capture_registry);
				1146	smp_cross_call(&xcall_capture, 0, 0, 0);
				1147	while (atomic_read(&smp_capture_registry) != ncpus)
				1148	rmb();
				1149	#ifdef CAPTURE_DEBUG
				1150	printk("done\n");
				1151	#endif
				1152	}
				1153	}
				1154
				/* Undo one smp_capture() call.
				 *
				 * smp_capture_depth is decremented; only when it reaches zero is
				 * penguins_are_doing_time cleared, followed by a #StoreLoad barrier and
				 * the removal of this cpu's own entry from smp_capture_registry.
				 */
				1155	void smp_release(void)
				1156	{
				1157	if (atomic_dec_and_test(&smp_capture_depth)) {
				1158	#ifdef CAPTURE_DEBUG
				1159	printk("CPU[%d]: Giving pardon to "
				1160	"imprisoned penguins\n",
				1161	smp_processor_id());
				1162	#endif
				/* Cpus spinning in smp_penguin_jailcell() exit once they see this. */
				1163	penguins_are_doing_time = 0;
				1164	membar_safe("#StoreLoad");
				1165	atomic_dec(&smp_capture_registry);
				1166	}
				1167	}
				1168
				1169	/* Imprisoned penguins run with %pil == PIL_NORMAL_MAX, but PSTATE_IE
				1170	* set, so they can service tlb flush xcalls...
				1171	*/
				1172	extern void prom_world(int);
				1173
				/* Softint handler for a captured cpu.
				 *
				 * Acknowledges the softint for @irq, then with preemption disabled:
				 * executes flushw, calls prom_world(1), registers in
				 * smp_capture_registry, spins while penguins_are_doing_time is non-zero,
				 * unregisters and calls prom_world(0).  @regs is not used.
				 */
				1174	void __irq_entry smp_penguin_jailcell(int irq, struct pt_regs *regs)
				1175	{
				1176	clear_softint(1 << irq);
				1177
				1178	preempt_disable();
				1179
				1180	__asm__ __volatile__("flushw");
				1181	prom_world(1);
				/* Check in; smp_capture() is counting these increments. */
				1182	atomic_inc(&smp_capture_registry);
				1183	membar_safe("#StoreLoad");
				/* Held here until smp_release() clears the flag. */
				1184	while (penguins_are_doing_time)
				1185	rmb();
				1186	atomic_dec(&smp_capture_registry);
				1187	prom_world(0);
				1188
				1189	preempt_enable();
				1190	}
				1191
				1192	/* /proc/profile writes can call this, don't __init it please. */
				/* Always fails: @multiplier is ignored and -EINVAL is returned. */
				1193	int setup_profiling_timer(unsigned int multiplier)
				1194	{
				1195	return -EINVAL;
				1196	}
				1197
				/* Intentionally empty: @max_cpus is ignored and no work is done here. */
				1198	void __init smp_prepare_cpus(unsigned int max_cpus)
				1199	{
				1200	}
				1201
				/* Intentionally empty: no work is done here. */
				1202	void smp_prepare_boot_cpu(void)
				1203	{
				1204	}
				1205
				1206	void __init smp_setup_processor_id(void)
				1207	{
				1208	if (tlb_type == spitfire)
				1209	xcall_deliver_impl = spitfire_xcall_deliver;
				1210	else if (tlb_type == cheetah \|\| tlb_type == cheetah_plus)
				1211	xcall_deliver_impl = cheetah_xcall_deliver;
				1212	else
				1213	xcall_deliver_impl = hypervisor_xcall_deliver;
				1214	}
				1215
				/* Rebuild the per-cpu topology masks from cpu_data() for every
				 * present cpu.  Three passes are made:
				 *
				 *  1. cpu_core_map[i]: cleared, then filled with every present cpu
				 *     sharing core_id with i.  A core_id of 0 yields a mask holding
				 *     only i.
				 *  2. cpu_core_sib_cache_map[i] and cpu_core_sib_map[i]: bits are set
				 *     for present cpus sharing max_cache_id and sock_id respectively.
				 *     These two masks are not cleared in this function.
				 *  3. per-cpu cpu_sibling_map: cleared, then filled with every present
				 *     cpu sharing proc_id with i.  A proc_id of -1 yields a mask
				 *     holding only i.
				 */
				1216	void smp_fill_in_sib_core_maps(void)
				1217	{
				1218	unsigned int i;
				1219
				/* Pass 1: core map, keyed on core_id. */
				1220	for_each_present_cpu(i) {
				1221	unsigned int j;
				1222
				1223	cpumask_clear(&cpu_core_map[i]);
				1224	if (cpu_data(i).core_id == 0) {
				1225	cpumask_set_cpu(i, &cpu_core_map[i]);
				1226	continue;
				1227	}
				1228
				1229	for_each_present_cpu(j) {
				1230	if (cpu_data(i).core_id ==
				1231	cpu_data(j).core_id)
				1232	cpumask_set_cpu(j, &cpu_core_map[i]);
				1233	}
				1234	}
				1235
				/* Pass 2: shared-cache and socket maps. */
				1236	for_each_present_cpu(i) {
				1237	unsigned int j;
				1238
				1239	for_each_present_cpu(j) {
				1240	if (cpu_data(i).max_cache_id ==
				1241	cpu_data(j).max_cache_id)
				1242	cpumask_set_cpu(j, &cpu_core_sib_cache_map[i]);
				1243
				1244	if (cpu_data(i).sock_id == cpu_data(j).sock_id)
				1245	cpumask_set_cpu(j, &cpu_core_sib_map[i]);
				1246	}
				1247	}
				1248
				/* Pass 3: sibling map, keyed on proc_id. */
				1249	for_each_present_cpu(i) {
				1250	unsigned int j;
				1251
				1252	cpumask_clear(&per_cpu(cpu_sibling_map, i));
				1253	if (cpu_data(i).proc_id == -1) {
				1254	cpumask_set_cpu(i, &per_cpu(cpu_sibling_map, i));
				1255	continue;
				1256	}
				1257
				1258	for_each_present_cpu(j) {
				1259	if (cpu_data(i).proc_id ==
				1260	cpu_data(j).proc_id)
				1261	cpumask_set_cpu(j, &per_cpu(cpu_sibling_map, i));
				1262	}
				1263	}
				1264	}
				1265
				/* Bring @cpu up using @tidle as its idle task.
				 *
				 * Calls smp_boot_one_cpu(); on success marks @cpu in
				 * smp_commenced_mask and spins until cpu_online(cpu) is true.  When
				 * tlb_type is not hypervisor, smp_synchronize_one_tick() is then
				 * called for @cpu.
				 *
				 * Returns the smp_boot_one_cpu() result, or -ENODEV from the branch
				 * noted below.
				 */
				1266	int __cpu_up(unsigned int cpu, struct task_struct *tidle)
				1267	{
				1268	int ret = smp_boot_one_cpu(cpu, tidle);
				1269
				1270	if (!ret) {
				1271	cpumask_set_cpu(cpu, &smp_commenced_mask);
				/* Unbounded wait: there is no timeout on this loop. */
				1272	while (!cpu_online(cpu))
				1273	mb();
				/* NOTE(review): the loop above only exits once cpu_online(cpu)
				 * is true, so this -ENODEV branch looks unreachable unless
				 * the cpu goes offline again in between -- confirm.
				 */
				1274	if (!cpu_online(cpu)) {
				1275	ret = -ENODEV;
				1276	} else {
				1277	/* On SUN4V, writes to %tick and %stick are
				1278	* not allowed.
				1279	*/
				1280	if (tlb_type != hypervisor)
				1281	smp_synchronize_one_tick(cpu);
				1282	}
				1283	}
				1284	return ret;
				1285	}
				1286
				1287	#ifdef CONFIG_HOTPLUG_CPU
				/* Final code run by a cpu that is going offline; never returns.
				 *
				 * After idle_task_exit(), and under the hypervisor only, the four
				 * mondo/error queues of this cpu are re-configured with
				 * sun4v_cpu_qconf() using a last argument of 0.  The cpu then clears
				 * itself from smp_commenced_mask (which __cpu_die() polls), disables
				 * interrupts, rewrites %pstate and spins forever.
				 */
				1288	void cpu_play_dead(void)
				1289	{
				1290	int cpu = smp_processor_id();
				1291	unsigned long pstate;
				1292
				1293	idle_task_exit();
				1294
				1295	if (tlb_type == hypervisor) {
				1296	struct trap_per_cpu *tb = &trap_block[cpu];
				1297
				/* NOTE(review): the trailing 0 is presumably an entry count
				 * that unconfigures each queue -- confirm against the
				 * sun4v hypervisor API.
				 */
				1298	sun4v_cpu_qconf(HV_CPU_QUEUE_CPU_MONDO,
				1299	tb->cpu_mondo_pa, 0);
				1300	sun4v_cpu_qconf(HV_CPU_QUEUE_DEVICE_MONDO,
				1301	tb->dev_mondo_pa, 0);
				1302	sun4v_cpu_qconf(HV_CPU_QUEUE_RES_ERROR,
				1303	tb->resum_mondo_pa, 0);
				1304	sun4v_cpu_qconf(HV_CPU_QUEUE_NONRES_ERROR,
				1305	tb->nonresum_mondo_pa, 0);
				1306	}
				1307
				/* Signal __cpu_die() that this cpu has reached the dead state. */
				1308	cpumask_clear_cpu(cpu, &smp_commenced_mask);
				1309	membar_safe("#Sync");
				1310
				1311	local_irq_disable();
				1312
				/* Read %pstate, then write it back combined with PSTATE_IE. */
				1313	__asm__ __volatile__(
				1314	"rdpr %%pstate, %0\n\t"
				1315	"wrpr %0, %1, %%pstate"
				1316	: "=r" (pstate)
				1317	: "i" (PSTATE_IE));
				1318
				1319	while (1)
				1320	barrier();
				1321	}
				1322
				/* Take the calling cpu out of service; always returns 0.
				 *
				 * Steps, in order: remove this cpu from the core map and sibling map
				 * of every cpu that listed it and clear its own two masks; reset its
				 * core_id to 0 and proc_id to -1; call fixup_irqs(); open a 1 ms
				 * window with interrupts enabled; mark the cpu offline; call
				 * cpu_map_rebuild().
				 */
				1323	int __cpu_disable(void)
				1324	{
				1325	int cpu = smp_processor_id();
				1326	cpuinfo_sparc *c;
				1327	int i;
				1328
				1329	for_each_cpu(i, &cpu_core_map[cpu])
				1330	cpumask_clear_cpu(cpu, &cpu_core_map[i]);
				1331	cpumask_clear(&cpu_core_map[cpu]);
				1332
				1333	for_each_cpu(i, &per_cpu(cpu_sibling_map, cpu))
				1334	cpumask_clear_cpu(cpu, &per_cpu(cpu_sibling_map, i));
				1335	cpumask_clear(&per_cpu(cpu_sibling_map, cpu));
				1336
				1337	c = &cpu_data(cpu);
				1338
				/* Same values smp_fill_in_sib_core_maps() treats as "no core/proc id". */
				1339	c->core_id = 0;
				1340	c->proc_id = -1;
				1341
				1342	smp_wmb();
				1343
				1344	/* Make sure no interrupts point to this cpu. */
				1345	fixup_irqs();
				1346
				/* NOTE(review): presumably lets already-pending interrupts be
				 * serviced before the cpu is marked offline -- confirm.
				 */
				1347	local_irq_enable();
				1348	mdelay(1);
				1349	local_irq_disable();
				1350
				1351	set_cpu_online(cpu, false);
				1352
				1353	cpu_map_rebuild();
				1354
				1355	return 0;
				1356	}
				1357
				/* Wait for @cpu to finish dying and, with CONFIG_SUN_LDOMS, stop it.
				 *
				 * Polls smp_commenced_mask up to 100 times with msleep(100) between
				 * polls, waiting for @cpu to clear its bit.  If the bit is still set
				 * an error is logged.  Otherwise, when CONFIG_SUN_LDOMS is defined,
				 * sun4v_cpu_stop() is tried up to 100 times; on HV_EOK the cpu is
				 * marked not present, and if every attempt fails the last error is
				 * logged.
				 */
				1358	void __cpu_die(unsigned int cpu)
				1359	{
				1360	int i;
				1361
				1362	for (i = 0; i < 100; i++) {
				1363	smp_rmb();
				1364	if (!cpumask_test_cpu(cpu, &smp_commenced_mask))
				1365	break;
				1366	msleep(100);
				1367	}
				1368	if (cpumask_test_cpu(cpu, &smp_commenced_mask)) {
				1369	printk(KERN_ERR "CPU %u didn't die...\n", cpu);
				1370	} else {
				1371	#if defined(CONFIG_SUN_LDOMS)
				1372	unsigned long hv_err;
				1373	int limit = 100;
				1374
				/* Retry loop; the break on success leaves limit above zero. */
				1375	do {
				1376	hv_err = sun4v_cpu_stop(cpu);
				1377	if (hv_err == HV_EOK) {
				1378	set_cpu_present(cpu, false);
				1379	break;
				1380	}
				1381	} while (--limit > 0);
				1382	if (limit <= 0) {
				1383	printk(KERN_ERR "sun4v_cpu_stop() fails err=%lu\n",
				1384	hv_err);
				1385	}
				1386	#endif
				1387	}
				1388	}
				1389	#endif
				1390
				/* Intentionally empty: @max_cpus is ignored and no work is done here. */
				1391	void __init smp_cpus_done(unsigned int max_cpus)
				1392	{
				1393	}
				1394
				/* Deliver the xcall_receive_signal cross call to @cpu only, with both
				 * data words zero.
				 */
				1395	static void send_cpu_ipi(int cpu)
				1396	{
				1397	xcall_deliver((u64) &xcall_receive_signal,
				1398	0, 0, cpumask_of(cpu));
				1399	}
				1400
				1401	void scheduler_poke(void)
				1402	{
				1403	if (!cpu_poke)
				1404	return;
				1405
				1406	if (!__this_cpu_read(poke))
				1407	return;
				1408
				1409	__this_cpu_write(poke, false);
				1410	set_softint(1 << PIL_SMP_RECEIVE_SIGNAL);
				1411	}
				1412
				/* Poke @cpu through the hypervisor.
				 *
				 * The per-cpu poke flag of @cpu is set before sun4v_cpu_poke() is
				 * called.  If the call does not return HV_EOK the flag is cleared
				 * again and a rate-limited error is logged.
				 *
				 * Returns the sun4v_cpu_poke() result.
				 */
				1413	static unsigned long send_cpu_poke(int cpu)
				1414	{
				1415	unsigned long hv_err;
				1416
				/* Flag first, so scheduler_poke() on the target finds it set. */
				1417	per_cpu(poke, cpu) = true;
				1418	hv_err = sun4v_cpu_poke(cpu);
				1419	if (hv_err != HV_EOK) {
				1420	per_cpu(poke, cpu) = false;
				1421	pr_err_ratelimited("%s: sun4v_cpu_poke() fails err=%lu\n",
				1422	__func__, hv_err);
				1423	}
				1424
				1425	return hv_err;
				1426	}
				1427
				1428	void smp_send_reschedule(int cpu)
				1429	{
				1430	if (cpu == smp_processor_id()) {
				1431	WARN_ON_ONCE(preemptible());
				1432	set_softint(1 << PIL_SMP_RECEIVE_SIGNAL);
				1433	return;
				1434	}
				1435
				1436	/* Use cpu poke to resume idle cpu if supported. */
				1437	if (cpu_poke && idle_cpu(cpu)) {
				1438	unsigned long ret;
				1439
				1440	ret = send_cpu_poke(cpu);
				1441	if (ret == HV_EOK)
				1442	return;
				1443	}
				1444
				1445	/* Use IPI in following cases:
				1446	* - cpu poke not supported
				1447	* - cpu not idle
				1448	* - send_cpu_poke() returns with error
				1449	*/
				1450	send_cpu_ipi(cpu);
				1451	}
				1452
				1453	void smp_init_cpu_poke(void)
				1454	{
				1455	unsigned long major;
				1456	unsigned long minor;
				1457	int ret;
				1458
				1459	if (tlb_type != hypervisor)
				1460	return;
				1461
				1462	ret = sun4v_hvapi_get(HV_GRP_CORE, &major, &minor);
				1463	if (ret) {
				1464	pr_debug("HV_GRP_CORE is not registered\n");
				1465	return;
				1466	}
				1467
				1468	if (major == 1 && minor >= 6) {
				1469	/* CPU POKE is registered. */
				1470	cpu_poke = true;
				1471	return;
				1472	}
				1473
				1474	pr_debug("CPU_POKE not supported\n");
				1475	}
				1476
				/* Softint handler: acknowledge the softint for @irq, then call
				 * scheduler_ipi().  @regs is not used.
				 */
				1477	void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs)
				1478	{
				1479	clear_softint(1 << irq);
				1480	scheduler_ipi();
				1481	}
				1482
				/* Callback used by smp_send_stop(): mark the calling cpu offline and
				 * call prom_stopself().  @dummy is not used.
				 */
				1483	static void stop_this_cpu(void *dummy)
				1484	{
				1485	set_cpu_online(smp_processor_id(), false);
				1486	prom_stopself();
				1487	}
				1488
				/* Stop every cpu other than the caller.
				 *
				 * Under the hypervisor each other online cpu is marked offline and
				 * then stopped, using sun4v_cpu_stop() when CONFIG_SUN_LDOMS is built
				 * in and ldom_domaining_enabled is set, and prom_stopcpu_cpuid()
				 * otherwise.  With CONFIG_SERIAL_SUNHV,
				 * sunhv_migrate_hvcons_irq(this_cpu) is called first.  On other
				 * tlb types stop_this_cpu() is run on the other cpus through
				 * smp_call_function() with a last argument of 0.
				 */
				1489	void smp_send_stop(void)
				1490	{
				1491	int cpu;
				1492
				1493	if (tlb_type == hypervisor) {
				1494	int this_cpu = smp_processor_id();
				1495	#ifdef CONFIG_SERIAL_SUNHV
				1496	sunhv_migrate_hvcons_irq(this_cpu);
				1497	#endif
				1498	for_each_online_cpu(cpu) {
				1499	if (cpu == this_cpu)
				1500	continue;
				1501
				1502	set_cpu_online(cpu, false);
				1503	#ifdef CONFIG_SUN_LDOMS
				1504	if (ldom_domaining_enabled) {
				1505	unsigned long hv_err;
				1506	hv_err = sun4v_cpu_stop(cpu);
				1507	if (hv_err)
				1508	printk(KERN_ERR "sun4v_cpu_stop() "
				1509	"failed err=%lu\n", hv_err);
				1510	} else
				1511	#endif
				/* Body of the else above, or unconditional when CONFIG_SUN_LDOMS is off. */
				1512	prom_stopcpu_cpuid(cpu);
				1513	}
				1514	} else
				1515	smp_call_function(stop_this_cpu, NULL, 0);
				1516	}
				1517
				1518	/**
				1519	* pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
				1520	* @cpu: cpu to allocate for
				1521	* @size: size allocation in bytes
				1522	* @align: alignment
				1523	*
				1524	* Allocate @size bytes aligned at @align for cpu @cpu. This wrapper
				1525	* does the right thing for NUMA regardless of the current
				1526	* configuration.
				1527	*
				1528	* RETURNS:
				1529	* Pointer to the allocated area on success, NULL on failure.
				1530	*/
				1531	static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size,
				1532	size_t align)
				1533	{
				1534	const unsigned long goal = __pa(MAX_DMA_ADDRESS);
				1535	#ifdef CONFIG_NEED_MULTIPLE_NODES
				1536	int node = cpu_to_node(cpu);
				1537	void *ptr;
				1538
				1539	if (!node_online(node) \|\| !NODE_DATA(node)) {
				1540	ptr = memblock_alloc_from(size, align, goal);
				1541	pr_info("cpu %d has no node %d or node-local memory\n",
				1542	cpu, node);
				1543	pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
				1544	cpu, size, __pa(ptr));
				1545	} else {
				1546	ptr = memblock_alloc_try_nid(size, align, goal,
				1547	MEMBLOCK_ALLOC_ACCESSIBLE, node);
				1548	pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
				1549	"%016lx\n", cpu, size, node, __pa(ptr));
				1550	}
				1551	return ptr;
				1552	#else
				1553	return memblock_alloc_from(size, align, goal);
				1554	#endif
				1555	}
				1556
				/* Release @size bytes at @ptr back to memblock; memblock_free() is
				 * given the physical address __pa(ptr).
				 */
				1557	static void __init pcpu_free_bootmem(void *ptr, size_t size)
				1558	{
				1559	memblock_free(__pa(ptr), size);
				1560	}
				1561
				1562	static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
				1563	{
				1564	if (cpu_to_node(from) == cpu_to_node(to))
				1565	return LOCAL_DISTANCE;
				1566	else
				1567	return REMOTE_DISTANCE;
				1568	}
				1569
				/* Make sure kernel page-table levels exist for @addr.
				 *
				 * Starting from the kernel pgd entry for @addr, each missing level
				 * (pud table, pmd table, pte table) is allocated as one page from
				 * memblock and installed in init_mm.  Any allocation failure panics.
				 */
				1570	static void __init pcpu_populate_pte(unsigned long addr)
				1571	{
				1572	pgd_t *pgd = pgd_offset_k(addr);
				1573	pud_t *pud;
				1574	pmd_t *pmd;
				1575
				1576	if (pgd_none(*pgd)) {
				1577	pud_t *new;
				1578
				1579	new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
				1580	if (!new)
				1581	goto err_alloc;
				1582	pgd_populate(&init_mm, pgd, new);
				1583	}
				1584
				1585	pud = pud_offset(pgd, addr);
				1586	if (pud_none(*pud)) {
				1587	pmd_t *new;
				1588
				1589	new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
				1590	if (!new)
				1591	goto err_alloc;
				1592	pud_populate(&init_mm, pud, new);
				1593	}
				1594
				1595	pmd = pmd_offset(pud, addr);
				1596	if (!pmd_present(*pmd)) {
				1597	pte_t *new;
				1598
				1599	new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
				1600	if (!new)
				1601	goto err_alloc;
				1602	pmd_populate_kernel(&init_mm, pmd, new);
				1603	}
				1604
				1605	return;
				1606
				/* Shared failure exit for all three allocations above. */
				1607	err_alloc:
				1608	panic("%s: Failed to allocate %lu bytes align=%lx from=%lx\n",
				1609	__func__, PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
				1610	}
				1611
				/* Set up the percpu first chunk and the per-cpu offsets.
				 *
				 * Unless pcpu_chosen_fc is PCPU_FC_PAGE, the embed first-chunk
				 * allocator is tried first; if it fails, or was skipped, the page
				 * first-chunk allocator is used.  Failure of that one panics.
				 * Afterwards __per_cpu_offset() is filled in for every possible cpu,
				 * __local_per_cpu_offset is set for the calling cpu, and cpu data is
				 * filled in through of_fill_in_cpu_data() and, under the hypervisor,
				 * mdesc_fill_in_cpu_data().
				 */
				1612	void __init setup_per_cpu_areas(void)
				1613	{
				1614	unsigned long delta;
				1615	unsigned int cpu;
				/* Start negative so the page allocator runs when embed is skipped. */
				1616	int rc = -EINVAL;
				1617
				1618	if (pcpu_chosen_fc != PCPU_FC_PAGE) {
				1619	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
				1620	PERCPU_DYNAMIC_RESERVE, 4 << 20,
				1621	pcpu_cpu_distance,
				1622	pcpu_alloc_bootmem,
				1623	pcpu_free_bootmem);
				1624	if (rc)
				1625	pr_warning("PERCPU: %s allocator failed (%d), "
				1626	"falling back to page size\n",
				1627	pcpu_fc_names[pcpu_chosen_fc], rc);
				1628	}
				1629	if (rc < 0)
				1630	rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE,
				1631	pcpu_alloc_bootmem,
				1632	pcpu_free_bootmem,
				1633	pcpu_populate_pte);
				1634	if (rc < 0)
				1635	panic("cannot initialize percpu area (err=%d)", rc);
				1636
				/* Offset of the allocated chunk from the linked percpu section start. */
				1637	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
				1638	for_each_possible_cpu(cpu)
				1639	__per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
				1640
				1641	/* Setup %g5 for the boot cpu. */
				1642	__local_per_cpu_offset = __per_cpu_offset(smp_processor_id());
				1643
				1644	of_fill_in_cpu_data();
				1645	if (tlb_type == hypervisor)
				1646	mdesc_fill_in_cpu_data(cpu_all_mask);
				1647	}