1/*
2 * pSeries NUMA support
3 *
4 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#define pr_fmt(fmt) "numa: " fmt
12
13#include <linux/threads.h>
14#include <linux/bootmem.h>
15#include <linux/init.h>
16#include <linux/mm.h>
17#include <linux/mmzone.h>
18#include <linux/export.h>
19#include <linux/nodemask.h>
20#include <linux/cpu.h>
21#include <linux/notifier.h>
22#include <linux/memblock.h>
23#include <linux/of.h>
24#include <linux/pfn.h>
25#include <linux/cpuset.h>
26#include <linux/node.h>
27#include <linux/stop_machine.h>
28#include <linux/proc_fs.h>
29#include <linux/seq_file.h>
30#include <linux/uaccess.h>
31#include <linux/slab.h>
32#include <asm/cputhreads.h>
33#include <asm/sparsemem.h>
34#include <asm/prom.h>
35#include <asm/smp.h>
37#include <asm/topology.h>
38#include <asm/firmware.h>
39#include <asm/paca.h>
40#include <asm/hvcall.h>
41#include <asm/setup.h>
42#include <asm/vdso.h>
43
44static int numa_enabled = 1;
45
46static char *cmdline __initdata;
47
48static int numa_debug;
49#define dbg(args...) do { if (numa_debug) printk(KERN_INFO args); } while (0)
50
51int numa_cpu_lookup_table[NR_CPUS];
52cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
53struct pglist_data *node_data[MAX_NUMNODES];
54
55EXPORT_SYMBOL(numa_cpu_lookup_table);
56EXPORT_SYMBOL(node_to_cpumask_map);
57EXPORT_SYMBOL(node_data);
58
59static int min_common_depth;
60static int n_mem_addr_cells, n_mem_size_cells;
61static int form1_affinity;
62
63#define MAX_DISTANCE_REF_POINTS 4
64static int distance_ref_points_depth;
65static const __be32 *distance_ref_points;
66static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
67
68/*
69 * Allocate node_to_cpumask_map based on number of available nodes
70 * Requires node_possible_map to be valid.
71 *
72 * Note: cpumask_of_node() is not valid until after this is done.
73 */
74static void __init setup_node_to_cpumask_map(void)
75{
76 unsigned int node;
77
78 /* setup nr_node_ids if not done yet */
79 if (nr_node_ids == MAX_NUMNODES)
80 setup_nr_node_ids();
81
82 /* allocate the map */
83 for_each_node(node)
84 alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
85
86 /* cpumask_of_node() will now work */
87 dbg("Node to cpumask map for %d nodes\n", nr_node_ids);
88}
89
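/*
 * Parse the next boundary from the "numa=fake=" command line option, a
 * comma-separated list of addresses in memparse() notation (e.g.
 * "numa=fake=1G,3G").  Once end_pfn crosses the current boundary, a new
 * fake node is started and *nid is updated to it.  Returns 1 if a new
 * fake node was created, 0 otherwise.
 */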
90static int __init fake_numa_create_new_node(unsigned long end_pfn,
91 unsigned int *nid)
92{
93 unsigned long long mem;
94 char *p = cmdline;
95 static unsigned int fake_nid;
96 static unsigned long long curr_boundary;
97
98 /*
99 * Modify the node id only if we have already started creating NUMA nodes;
100 * we want to continue from where we left off the last time.
101 */
102 if (fake_nid)
103 *nid = fake_nid;
104 /*
105 * In case there are no more arguments to parse, the
106 * node_id should be the same as the last fake node id
107 * (we've handled this above).
108 */
109 if (!p)
110 return 0;
111
112 mem = memparse(p, &p);
113 if (!mem)
114 return 0;
115
116 if (mem < curr_boundary)
117 return 0;
118
119 curr_boundary = mem;
120
121 if ((end_pfn << PAGE_SHIFT) > mem) {
122 /*
123 * Skip commas and spaces
124 */
125 while (*p == ',' || *p == ' ' || *p == '\t')
126 p++;
127
128 cmdline = p;
129 fake_nid++;
130 *nid = fake_nid;
131 dbg("created new fake_node with id %d\n", fake_nid);
132 return 1;
133 }
134 return 0;
135}
136
137static void reset_numa_cpu_lookup_table(void)
138{
139 unsigned int cpu;
140
141 for_each_possible_cpu(cpu)
142 numa_cpu_lookup_table[cpu] = -1;
143}
144
145static void map_cpu_to_node(int cpu, int node)
146{
147 update_numa_cpu_lookup_table(cpu, node);
148
149 dbg("adding cpu %d to node %d\n", cpu, node);
150
151 if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node])))
152 cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
153}
154
155#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
156static void unmap_cpu_from_node(unsigned long cpu)
157{
158 int node = numa_cpu_lookup_table[cpu];
159
160 dbg("removing cpu %lu from node %d\n", cpu, node);
161
162 if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
163 cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
164 } else {
165 printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
166 cpu, node);
167 }
168}
169#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
170
171/* must hold reference to node during call */
172static const __be32 *of_get_associativity(struct device_node *dev)
173{
174 return of_get_property(dev, "ibm,associativity", NULL);
175}
176
177/*
178 * Returns the property linux,drconf-usable-memory if
179 * it exists (the property exists only in kexec/kdump kernels,
180 * added by kexec-tools)
181 */
182static const __be32 *of_get_usable_memory(struct device_node *memory)
183{
184 const __be32 *prop;
185 u32 len;
186 prop = of_get_property(memory, "linux,drconf-usable-memory", &len);
187 if (!prop || len < sizeof(unsigned int))
188 return NULL;
189 return prop;
190}
191
192int __node_distance(int a, int b)
193{
194 int i;
195 int distance = LOCAL_DISTANCE;
196
197 if (!form1_affinity)
198 return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
199
200 for (i = 0; i < distance_ref_points_depth; i++) {
201 if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
202 break;
203
204 /* Double the distance for each NUMA level */
205 distance *= 2;
206 }
207
208 return distance;
209}
210EXPORT_SYMBOL(__node_distance);
211
212static void initialize_distance_lookup_table(int nid,
213 const __be32 *associativity)
214{
215 int i;
216
217 if (!form1_affinity)
218 return;
219
220 for (i = 0; i < distance_ref_points_depth; i++) {
221 const __be32 *entry;
222
223 entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
224 distance_lookup_table[nid][i] = of_read_number(entry, 1);
225 }
226}
227
228/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
229 * info is found.
230 */
231static int associativity_to_nid(const __be32 *associativity)
232{
233 int nid = -1;
234
235 if (min_common_depth == -1)
236 goto out;
237
238 if (of_read_number(associativity, 1) >= min_common_depth)
239 nid = of_read_number(&associativity[min_common_depth], 1);
240
241 /* POWER4 LPAR uses 0xffff as invalid node */
242 if (nid == 0xffff || nid >= MAX_NUMNODES)
243 nid = -1;
244
245 if (nid > 0 &&
246 of_read_number(associativity, 1) >= distance_ref_points_depth) {
247 /*
248 * Skip the length field and send start of associativity array
249 */
250 initialize_distance_lookup_table(nid, associativity + 1);
251 }
252
253out:
254 return nid;
255}
256
257/* Returns the nid associated with the given device tree node,
258 * or -1 if not found.
259 */
260static int of_node_to_nid_single(struct device_node *device)
261{
262 int nid = -1;
263 const __be32 *tmp;
264
265 tmp = of_get_associativity(device);
266 if (tmp)
267 nid = associativity_to_nid(tmp);
268 return nid;
269}
270
271/* Walk the device tree upwards, looking for an associativity id */
272int of_node_to_nid(struct device_node *device)
273{
274 int nid = -1;
275
276 of_node_get(device);
277 while (device) {
278 nid = of_node_to_nid_single(device);
279 if (nid != -1)
280 break;
281
282 device = of_get_next_parent(device);
283 }
284 of_node_put(device);
285
286 return nid;
287}
288EXPORT_SYMBOL(of_node_to_nid);
289
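/*
 * Determine which level of the ibm,associativity arrays is used as the
 * NUMA node id, based on the ibm,associativity-reference-points property
 * (form 0 vs. form 1 affinity, as described below).  Also snapshots
 * distance_ref_points for later node distance calculations.  Returns the
 * depth to use, or -1 if no usable property was found.
 */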
290static int __init find_min_common_depth(void)
291{
292 int depth;
293 struct device_node *root;
294
295 if (firmware_has_feature(FW_FEATURE_OPAL))
296 root = of_find_node_by_path("/ibm,opal");
297 else
298 root = of_find_node_by_path("/rtas");
299 if (!root)
300 root = of_find_node_by_path("/");
301
302 /*
303 * This property is a set of 32-bit integers, each representing
304 * an index into the ibm,associativity nodes.
305 *
306 * With form 0 affinity the first integer is for an SMP configuration
307 * (should be all 0's) and the second is for a normal NUMA
308 * configuration. We have only one level of NUMA.
309 *
310 * With form 1 affinity the first integer is the most significant
311 * NUMA boundary and the following are progressively less significant
312 * boundaries. There can be more than one level of NUMA.
313 */
314 distance_ref_points = of_get_property(root,
315 "ibm,associativity-reference-points",
316 &distance_ref_points_depth);
317
318 if (!distance_ref_points) {
319 dbg("NUMA: ibm,associativity-reference-points not found.\n");
320 goto err;
321 }
322
323 distance_ref_points_depth /= sizeof(int);
324
325 if (firmware_has_feature(FW_FEATURE_OPAL) ||
326 firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
327 dbg("Using form 1 affinity\n");
328 form1_affinity = 1;
329 }
330
331 if (form1_affinity) {
332 depth = of_read_number(distance_ref_points, 1);
333 } else {
334 if (distance_ref_points_depth < 2) {
335 printk(KERN_WARNING "NUMA: "
336 "short ibm,associativity-reference-points\n");
337 goto err;
338 }
339
340 depth = of_read_number(&distance_ref_points[1], 1);
341 }
342
343 /*
344 * Warn and cap if the hardware supports more than
345 * MAX_DISTANCE_REF_POINTS domains.
346 */
347 if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
348 printk(KERN_WARNING "NUMA: distance array capped at "
349 "%d entries\n", MAX_DISTANCE_REF_POINTS);
350 distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
351 }
352
353 of_node_put(root);
354 return depth;
355
356err:
357 of_node_put(root);
358 return -1;
359}
360
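/*
 * Read the #address-cells and #size-cells values that apply to memory
 * nodes so that "reg"-style properties can be decoded.  Panics if the
 * device tree has no memory nodes at all.
 */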
361static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
362{
363 struct device_node *memory = NULL;
364
365 memory = of_find_node_by_type(memory, "memory");
366 if (!memory)
367 panic("numa.c: No memory nodes found!");
368
369 *n_addr_cells = of_n_addr_cells(memory);
370 *n_size_cells = of_n_size_cells(memory);
371 of_node_put(memory);
372}
373
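/*
 * Assemble an unsigned long from 'n' consecutive 32-bit big-endian cells
 * and advance *buf past them; e.g. with n = 2 and cells { 0x1, 0x0 } the
 * result is 0x100000000.
 */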
374static unsigned long read_n_cells(int n, const __be32 **buf)
375{
376 unsigned long result = 0;
377
378 while (n--) {
379 result = (result << 32) | of_read_number(*buf, 1);
380 (*buf)++;
381 }
382 return result;
383}
384
385/*
386 * Read the next memblock list entry from the ibm,dynamic-memory property
387 * and return the information in the provided of_drconf_cell structure.
388 */
389static void read_drconf_cell(struct of_drconf_cell *drmem, const __be32 **cellp)
390{
391 const __be32 *cp;
392
393 drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp);
394
395 cp = *cellp;
396 drmem->drc_index = of_read_number(cp, 1);
397 drmem->reserved = of_read_number(&cp[1], 1);
398 drmem->aa_index = of_read_number(&cp[2], 1);
399 drmem->flags = of_read_number(&cp[3], 1);
400
401 *cellp = cp + 4;
402}
403
404/*
405 * Retrieve and validate the ibm,dynamic-memory property of the device tree.
406 *
407 * The layout of the ibm,dynamic-memory property is a count N of memblock
408 * list entries followed by the N entries themselves. Each memblock list
409 * entry contains the fields laid out in struct of_drconf_cell.
410 */
411static int of_get_drconf_memory(struct device_node *memory, const __be32 **dm)
412{
413 const __be32 *prop;
414 u32 len, entries;
415
416 prop = of_get_property(memory, "ibm,dynamic-memory", &len);
417 if (!prop || len < sizeof(unsigned int))
418 return 0;
419
420 entries = of_read_number(prop++, 1);
421
422 /* Now that we know the number of entries, revalidate the size
423 * of the property read in to ensure we have everything
424 */
425 if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int))
426 return 0;
427
428 *dm = prop;
429 return entries;
430}
431
432/*
433 * Retrieve and validate the ibm,lmb-size property for drconf memory
434 * from the device tree.
435 */
436static u64 of_get_lmb_size(struct device_node *memory)
437{
438 const __be32 *prop;
439 u32 len;
440
441 prop = of_get_property(memory, "ibm,lmb-size", &len);
442 if (!prop || len < sizeof(unsigned int))
443 return 0;
444
445 return read_n_cells(n_mem_size_cells, &prop);
446}
447
448struct assoc_arrays {
449 u32 n_arrays;
450 u32 array_sz;
451 const __be32 *arrays;
452};
453
454/*
455 * Retrieve and validate the list of associativity arrays for drconf
456 * memory from the ibm,associativity-lookup-arrays property of the
457 * device tree.
458 *
459 * The layout of the ibm,associativity-lookup-arrays property is a number N
460 * indicating the number of associativity arrays, followed by a number M
461 * indicating the size of each associativity array, followed by a list
462 * of N associativity arrays.
463 */
464static int of_get_assoc_arrays(struct device_node *memory,
465 struct assoc_arrays *aa)
466{
467 const __be32 *prop;
468 u32 len;
469
470 prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
471 if (!prop || len < 2 * sizeof(unsigned int))
472 return -1;
473
474 aa->n_arrays = of_read_number(prop++, 1);
475 aa->array_sz = of_read_number(prop++, 1);
476
477 /* Now that we know the number of arrays and size of each array,
478 * revalidate the size of the property read in.
479 */
480 if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
481 return -1;
482
483 aa->arrays = prop;
484 return 0;
485}
486
487/*
488 * This is like of_node_to_nid_single() for memory represented in the
489 * ibm,dynamic-reconfiguration-memory node.
490 */
491static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
492 struct assoc_arrays *aa)
493{
494 int default_nid = 0;
495 int nid = default_nid;
496 int index;
497
498 if (min_common_depth > 0 && min_common_depth <= aa->array_sz &&
499 !(drmem->flags & DRCONF_MEM_AI_INVALID) &&
500 drmem->aa_index < aa->n_arrays) {
501 index = drmem->aa_index * aa->array_sz + min_common_depth - 1;
502 nid = of_read_number(&aa->arrays[index], 1);
503
504 if (nid == 0xffff || nid >= MAX_NUMNODES)
505 nid = default_nid;
506
507 if (nid > 0) {
508 index = drmem->aa_index * aa->array_sz;
509 initialize_distance_lookup_table(nid,
510 &aa->arrays[index]);
511 }
512 }
513
514 return nid;
515}
516
517/*
518 * Figure out to which domain a cpu belongs and stick it there.
519 * Return the id of the domain used.
520 */
521static int numa_setup_cpu(unsigned long lcpu)
522{
523 int nid = -1;
524 struct device_node *cpu;
525
526 /*
527 * If a valid cpu-to-node mapping is already available, use it
528 * directly instead of querying the firmware, since it represents
529 * the most recent mapping notified to us by the platform (eg: VPHN).
530 */
531 if ((nid = numa_cpu_lookup_table[lcpu]) >= 0) {
532 map_cpu_to_node(lcpu, nid);
533 return nid;
534 }
535
536 cpu = of_get_cpu_node(lcpu, NULL);
537
538 if (!cpu) {
539 WARN_ON(1);
540 if (cpu_present(lcpu))
541 goto out_present;
542 else
543 goto out;
544 }
545
546 nid = of_node_to_nid_single(cpu);
547
548out_present:
549 if (nid < 0 || !node_possible(nid))
550 nid = first_online_node;
551
552 map_cpu_to_node(lcpu, nid);
553 of_node_put(cpu);
554out:
555 return nid;
556}
557
558static void verify_cpu_node_mapping(int cpu, int node)
559{
560 int base, sibling, i;
561
562 /* Verify that all the threads in the core belong to the same node */
563 base = cpu_first_thread_sibling(cpu);
564
565 for (i = 0; i < threads_per_core; i++) {
566 sibling = base + i;
567
568 if (sibling == cpu || cpu_is_offline(sibling))
569 continue;
570
571 if (cpu_to_node(sibling) != node) {
572 WARN(1, "CPU thread siblings %d and %d don't belong"
573 " to the same node!\n", cpu, sibling);
574 break;
575 }
576 }
577}
578
579/* Must run before sched domains notifier. */
580static int ppc_numa_cpu_prepare(unsigned int cpu)
581{
582 int nid;
583
584 nid = numa_setup_cpu(cpu);
585 verify_cpu_node_mapping(cpu, nid);
586 return 0;
587}
588
589static int ppc_numa_cpu_dead(unsigned int cpu)
590{
591#ifdef CONFIG_HOTPLUG_CPU
592 unmap_cpu_from_node(cpu);
593#endif
594 return 0;
595}
596
597/*
598 * Check and possibly modify a memory region to enforce the memory limit.
599 *
600 * Returns the size the region should have to enforce the memory limit.
601 * This will either be the original value of size, a truncated value,
602 * or zero. If the returned value of size is 0 the region should be
603 * discarded as it lies wholly above the memory limit.
604 */
605static unsigned long __init numa_enforce_memory_limit(unsigned long start,
606 unsigned long size)
607{
608 /*
609 * We use memblock_end_of_DRAM() in here instead of memory_limit because
610 * we've already adjusted it for the limit and it takes care of
611 * having memory holes below the limit. Also, in the case of
612 * iommu_is_off, memory_limit is not set but is implicitly enforced.
613 */
614
615 if (start + size <= memblock_end_of_DRAM())
616 return size;
617
618 if (start >= memblock_end_of_DRAM())
619 return 0;
620
621 return memblock_end_of_DRAM() - start;
622}
623
624/*
625 * Reads the counter for a given entry in
626 * linux,drconf-usable-memory property
627 */
628static inline int __init read_usm_ranges(const __be32 **usm)
629{
630 /*
631 * For each LMB in ibm,dynamic-memory, the corresponding entry in the
632 * linux,drconf-usable-memory property contains a counter followed by
633 * that many (base, size) tuples.
634 * Read and return that counter.
635 */
636 return read_n_cells(n_mem_size_cells, usm);
637}
638
639/*
640 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
641 * node. This assumes n_mem_{addr,size}_cells have been set.
642 */
643static void __init parse_drconf_memory(struct device_node *memory)
644{
645 const __be32 *uninitialized_var(dm), *usm;
646 unsigned int n, rc, ranges, is_kexec_kdump = 0;
647 unsigned long lmb_size, base, size, sz;
648 int nid;
649 struct assoc_arrays aa = { .arrays = NULL };
650
651 n = of_get_drconf_memory(memory, &dm);
652 if (!n)
653 return;
654
655 lmb_size = of_get_lmb_size(memory);
656 if (!lmb_size)
657 return;
658
659 rc = of_get_assoc_arrays(memory, &aa);
660 if (rc)
661 return;
662
663 /* check if this is a kexec/kdump kernel */
664 usm = of_get_usable_memory(memory);
665 if (usm != NULL)
666 is_kexec_kdump = 1;
667
668 for (; n != 0; --n) {
669 struct of_drconf_cell drmem;
670
671 read_drconf_cell(&drmem, &dm);
672
673 /* skip this block if the reserved bit is set in flags (0x80)
674 * or if the block is not assigned to this partition (0x8) */
675 if ((drmem.flags & DRCONF_MEM_RESERVED)
676 || !(drmem.flags & DRCONF_MEM_ASSIGNED))
677 continue;
678
679 base = drmem.base_addr;
680 size = lmb_size;
681 ranges = 1;
682
683 if (is_kexec_kdump) {
684 ranges = read_usm_ranges(&usm);
685 if (!ranges) /* there are no (base, size) tuples */
686 continue;
687 }
688 do {
689 if (is_kexec_kdump) {
690 base = read_n_cells(n_mem_addr_cells, &usm);
691 size = read_n_cells(n_mem_size_cells, &usm);
692 }
693 nid = of_drconf_to_nid_single(&drmem, &aa);
694 fake_numa_create_new_node(
695 ((base + size) >> PAGE_SHIFT),
696 &nid);
697 node_set_online(nid);
698 sz = numa_enforce_memory_limit(base, size);
699 if (sz)
700 memblock_set_node(base, sz,
701 &memblock.memory, nid);
702 } while (--ranges);
703 }
704}
705
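/*
 * Build the NUMA topology from the device tree: work out min_common_depth,
 * online a node for every present cpu that has associativity information,
 * then assign each memory range (from memory@ nodes and, if present, the
 * ibm,dynamic-reconfiguration-memory node) to its node in memblock.
 * Returns 0 on success, or a negative value if NUMA is disabled or no
 * usable associativity information was found.
 */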
706static int __init parse_numa_properties(void)
707{
708 struct device_node *memory;
709 int default_nid = 0;
710 unsigned long i;
711
712 if (numa_enabled == 0) {
713 printk(KERN_WARNING "NUMA disabled by user\n");
714 return -1;
715 }
716
717 min_common_depth = find_min_common_depth();
718
719 if (min_common_depth < 0)
720 return min_common_depth;
721
722 dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
723
724 /*
725 * Even though we connect cpus to numa domains later in SMP
726 * init, we need to know the node ids now. This is because
727 * each node to be onlined must have NODE_DATA etc backing it.
728 */
729 for_each_present_cpu(i) {
730 struct device_node *cpu;
731 int nid;
732
733 cpu = of_get_cpu_node(i, NULL);
734 BUG_ON(!cpu);
735 nid = of_node_to_nid_single(cpu);
736 of_node_put(cpu);
737
738 /*
739 * Don't fall back to default_nid yet -- we will plug
740 * cpus into nodes once the memory scan has discovered
741 * the topology.
742 */
743 if (nid < 0)
744 continue;
745 node_set_online(nid);
746 }
747
748 get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
749
750 for_each_node_by_type(memory, "memory") {
751 unsigned long start;
752 unsigned long size;
753 int nid;
754 int ranges;
755 const __be32 *memcell_buf;
756 unsigned int len;
757
758 memcell_buf = of_get_property(memory,
759 "linux,usable-memory", &len);
760 if (!memcell_buf || len <= 0)
761 memcell_buf = of_get_property(memory, "reg", &len);
762 if (!memcell_buf || len <= 0)
763 continue;
764
765 /* ranges in cell */
766 ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
767new_range:
768 /* these are order-sensitive, and modify the buffer pointer */
769 start = read_n_cells(n_mem_addr_cells, &memcell_buf);
770 size = read_n_cells(n_mem_size_cells, &memcell_buf);
771
772 /*
773 * Assumption: either all memory nodes or none will
774 * have associativity properties. If none, then
775 * everything goes to default_nid.
776 */
777 nid = of_node_to_nid_single(memory);
778 if (nid < 0)
779 nid = default_nid;
780
781 fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
782 node_set_online(nid);
783
784 size = numa_enforce_memory_limit(start, size);
785 if (size)
786 memblock_set_node(start, size, &memblock.memory, nid);
787
788 if (--ranges)
789 goto new_range;
790 }
791
792 /*
793 * Now do the same thing for each MEMBLOCK listed in the
794 * ibm,dynamic-memory property in the
795 * ibm,dynamic-reconfiguration-memory node.
796 */
797 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
798 if (memory)
799 parse_drconf_memory(memory);
800
801 return 0;
802}
803
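/*
 * Fallback when no usable NUMA information exists: place every memblock
 * region on node 0 (or on fake nodes if "numa=fake=" was given) and mark
 * those nodes online.
 */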
804static void __init setup_nonnuma(void)
805{
806 unsigned long top_of_ram = memblock_end_of_DRAM();
807 unsigned long total_ram = memblock_phys_mem_size();
808 unsigned long start_pfn, end_pfn;
809 unsigned int nid = 0;
810 struct memblock_region *reg;
811
812 printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
813 top_of_ram, total_ram);
814 printk(KERN_DEBUG "Memory hole size: %ldMB\n",
815 (top_of_ram - total_ram) >> 20);
816
817 for_each_memblock(memory, reg) {
818 start_pfn = memblock_region_memory_base_pfn(reg);
819 end_pfn = memblock_region_memory_end_pfn(reg);
820
821 fake_numa_create_new_node(end_pfn, &nid);
822 memblock_set_node(PFN_PHYS(start_pfn),
823 PFN_PHYS(end_pfn - start_pfn),
824 &memblock.memory, nid);
825 node_set_online(nid);
826 }
827}
828
829void __init dump_numa_cpu_topology(void)
830{
831 unsigned int node;
832 unsigned int cpu, count;
833
834 if (min_common_depth == -1 || !numa_enabled)
835 return;
836
837 for_each_online_node(node) {
838 pr_info("Node %d CPUs:", node);
839
840 count = 0;
841 /*
842 * If we used a CPU iterator here we would miss printing
843 * the holes in the cpumap.
844 */
845 for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
846 if (cpumask_test_cpu(cpu,
847 node_to_cpumask_map[node])) {
848 if (count == 0)
849 pr_cont(" %u", cpu);
850 ++count;
851 } else {
852 if (count > 1)
853 pr_cont("-%u", cpu - 1);
854 count = 0;
855 }
856 }
857
858 if (count > 1)
859 pr_cont("-%u", nr_cpu_ids - 1);
860 pr_cont("\n");
861 }
862}
863
864/* Initialize NODE_DATA for a node on the local memory */
865static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
866{
867 u64 spanned_pages = end_pfn - start_pfn;
868 const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
869 u64 nd_pa;
870 void *nd;
871 int tnid;
872
873 nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
874 nd = __va(nd_pa);
875
876 /* report and initialize */
877 pr_info(" NODE_DATA [mem %#010Lx-%#010Lx]\n",
878 nd_pa, nd_pa + nd_size - 1);
879 tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
880 if (tnid != nid)
881 pr_info(" NODE_DATA(%d) on node %d\n", nid, tnid);
882
883 node_data[nid] = nd;
884 memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
885 NODE_DATA(nid)->node_id = nid;
886 NODE_DATA(nid)->node_start_pfn = start_pfn;
887 NODE_DATA(nid)->node_spanned_pages = spanned_pages;
888}
889
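/*
 * Consult the ibm,max-associativity-domains property under /rtas to mark
 * as possible any node ids the platform may use beyond those currently
 * online, so they can be onlined later (e.g. by cpu or memory hotplug).
 */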
890static void __init find_possible_nodes(void)
891{
892 struct device_node *rtas;
893 u32 numnodes, i;
894
895 if (min_common_depth <= 0)
896 return;
897
898 rtas = of_find_node_by_path("/rtas");
899 if (!rtas)
900 return;
901
902 if (of_property_read_u32_index(rtas,
903 "ibm,max-associativity-domains",
904 min_common_depth, &numnodes))
905 goto out;
906
907 for (i = 0; i < numnodes; i++) {
908 if (!node_possible(i))
909 node_set(i, node_possible_map);
910 }
911
912out:
913 of_node_put(rtas);
914}
915
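/*
 * Boot-time NUMA initialization: parse the device tree (falling back to a
 * single node if that fails), trim and extend the possible-node map,
 * allocate per-node data and sparsemem structures, and hook up the cpu
 * hotplug callbacks so that present and future cpus get mapped to their
 * nodes.
 */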
916void __init initmem_init(void)
917{
918 int nid, cpu;
919
920 max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
921 max_pfn = max_low_pfn;
922
923 if (parse_numa_properties())
924 setup_nonnuma();
925
926 memblock_dump_all();
927
928 /*
929 * Modify the set of possible NUMA nodes to reflect information
930 * available about the set of online nodes, and the set of nodes
931 * that we expect to make use of for this platform's affinity
932 * calculations.
933 */
934 nodes_and(node_possible_map, node_possible_map, node_online_map);
935
936 find_possible_nodes();
937
938 for_each_online_node(nid) {
939 unsigned long start_pfn, end_pfn;
940
941 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
942 setup_node_data(nid, start_pfn, end_pfn);
943 sparse_memory_present_with_active_regions(nid);
944 }
945
946 sparse_init();
947
948 setup_node_to_cpumask_map();
949
950 reset_numa_cpu_lookup_table();
951
952 /*
953 * We need the numa_cpu_lookup_table to be accurate for all CPUs,
954 * even before we online them, so that we can use cpu_to_{node,mem}
955 * early in boot, cf. smp_prepare_cpus().
956 * _nocalls() + manual invocation is used because cpuhp is not yet
957 * initialized for the boot CPU.
958 */
959 cpuhp_setup_state_nocalls(CPUHP_POWER_NUMA_PREPARE, "powerpc/numa:prepare",
960 ppc_numa_cpu_prepare, ppc_numa_cpu_dead);
961 for_each_present_cpu(cpu)
962 numa_setup_cpu(cpu);
963}
964
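/*
 * Early parsing of the "numa=" kernel parameter.  Recognised values, based
 * on the strstr() checks below: "off" disables NUMA, "debug" enables the
 * dbg() messages, and "fake=<boundary>[,<boundary>...]" hands the boundary
 * list to fake_numa_create_new_node().
 */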
965static int __init early_numa(char *p)
966{
967 if (!p)
968 return 0;
969
970 if (strstr(p, "off"))
971 numa_enabled = 0;
972
973 if (strstr(p, "debug"))
974 numa_debug = 1;
975
976 p = strstr(p, "fake=");
977 if (p)
978 cmdline = p + strlen("fake=");
979
980 return 0;
981}
982early_param("numa", early_numa);
983
984static bool topology_updates_enabled = true;
985
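/*
 * "topology_updates=off" on the kernel command line disables the VPHN/PRRN
 * topology update machinery before it is ever started.
 */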
986static int __init early_topology_updates(char *p)
987{
988 if (!p)
989 return 0;
990
991 if (!strcmp(p, "off")) {
992 pr_info("Disabling topology updates\n");
993 topology_updates_enabled = false;
994 }
995
996 return 0;
997}
998early_param("topology_updates", early_topology_updates);
999
1000#ifdef CONFIG_MEMORY_HOTPLUG
1001/*
1002 * Find the node associated with a hot added memory section for
1003 * memory represented in the device tree by the property
1004 * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
1005 */
1006static int hot_add_drconf_scn_to_nid(struct device_node *memory,
1007 unsigned long scn_addr)
1008{
1009 const __be32 *dm;
1010 unsigned int drconf_cell_cnt, rc;
1011 unsigned long lmb_size;
1012 struct assoc_arrays aa;
1013 int nid = -1;
1014
1015 drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
1016 if (!drconf_cell_cnt)
1017 return -1;
1018
1019 lmb_size = of_get_lmb_size(memory);
1020 if (!lmb_size)
1021 return -1;
1022
1023 rc = of_get_assoc_arrays(memory, &aa);
1024 if (rc)
1025 return -1;
1026
1027 for (; drconf_cell_cnt != 0; --drconf_cell_cnt) {
1028 struct of_drconf_cell drmem;
1029
1030 read_drconf_cell(&drmem, &dm);
1031
1032 /* skip this block if it is reserved or not assigned to
1033 * this partition */
1034 if ((drmem.flags & DRCONF_MEM_RESERVED)
1035 || !(drmem.flags & DRCONF_MEM_ASSIGNED))
1036 continue;
1037
1038 if ((scn_addr < drmem.base_addr)
1039 || (scn_addr >= (drmem.base_addr + lmb_size)))
1040 continue;
1041
1042 nid = of_drconf_to_nid_single(&drmem, &aa);
1043 break;
1044 }
1045
1046 return nid;
1047}
1048
1049/*
1050 * Find the node associated with a hot added memory section for memory
1051 * represented in the device tree as a node (i.e. memory@XXXX) for
1052 * each memblock.
1053 */
1054static int hot_add_node_scn_to_nid(unsigned long scn_addr)
1055{
1056 struct device_node *memory;
1057 int nid = -1;
1058
1059 for_each_node_by_type(memory, "memory") {
1060 unsigned long start, size;
1061 int ranges;
1062 const __be32 *memcell_buf;
1063 unsigned int len;
1064
1065 memcell_buf = of_get_property(memory, "reg", &len);
1066 if (!memcell_buf || len <= 0)
1067 continue;
1068
1069 /* ranges in cell */
1070 ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
1071
1072 while (ranges--) {
1073 start = read_n_cells(n_mem_addr_cells, &memcell_buf);
1074 size = read_n_cells(n_mem_size_cells, &memcell_buf);
1075
1076 if ((scn_addr < start) || (scn_addr >= (start + size)))
1077 continue;
1078
1079 nid = of_node_to_nid_single(memory);
1080 break;
1081 }
1082
1083 if (nid >= 0)
1084 break;
1085 }
1086
1087 of_node_put(memory);
1088
1089 return nid;
1090}
1091
1092/*
1093 * Find the node associated with a hot added memory section. Section
1094 * corresponds to a SPARSEMEM section, not a MEMBLOCK. It is assumed that
1095 * sections are fully contained within a single MEMBLOCK.
1096 */
1097int hot_add_scn_to_nid(unsigned long scn_addr)
1098{
1099 struct device_node *memory = NULL;
1100 int nid;
1101
1102 if (!numa_enabled || (min_common_depth < 0))
1103 return first_online_node;
1104
1105 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
1106 if (memory) {
1107 nid = hot_add_drconf_scn_to_nid(memory, scn_addr);
1108 of_node_put(memory);
1109 } else {
1110 nid = hot_add_node_scn_to_nid(scn_addr);
1111 }
1112
1113 if (nid < 0 || !node_possible(nid))
1114 nid = first_online_node;
1115
1116 return nid;
1117}
1118
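/*
 * Return the highest address that hot-added memory may reach: the
 * ibm,lrdr-capacity value from /rtas if present, otherwise the end of the
 * last LMB described by ibm,dynamic-memory, or 0 if neither exists.
 */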
1119static u64 hot_add_drconf_memory_max(void)
1120{
1121 struct device_node *memory = NULL;
1122 struct device_node *dn = NULL;
1123 unsigned int drconf_cell_cnt = 0;
1124 u64 lmb_size = 0;
1125 const __be32 *dm = NULL;
1126 const __be64 *lrdr = NULL;
1127 struct of_drconf_cell drmem;
1128
1129 dn = of_find_node_by_path("/rtas");
1130 if (dn) {
1131 lrdr = of_get_property(dn, "ibm,lrdr-capacity", NULL);
1132 of_node_put(dn);
1133 if (lrdr)
1134 return be64_to_cpup(lrdr);
1135 }
1136
1137 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
1138 if (memory) {
1139 drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
1140 lmb_size = of_get_lmb_size(memory);
1141
1142 /* Advance to the last cell; each cell has 6 32-bit integers */
1143 dm += (drconf_cell_cnt - 1) * 6;
1144 read_drconf_cell(&drmem, &dm);
1145 of_node_put(memory);
1146 return drmem.base_addr + lmb_size;
1147 }
1148 return 0;
1149}
1150
1151/*
1152 * memory_hotplug_max - return max address of memory that may be added
1153 *
1154 * This is currently only used on systems that support drconfig memory
1155 * hotplug.
1156 */
1157u64 memory_hotplug_max(void)
1158{
1159 return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM());
1160}
1161#endif /* CONFIG_MEMORY_HOTPLUG */
1162
1163/* Virtual Processor Home Node (VPHN) support */
1164#ifdef CONFIG_PPC_SPLPAR
1165
1166#include "vphn.h"
1167
1168struct topology_update_data {
1169 struct topology_update_data *next;
1170 unsigned int cpu;
1171 int old_nid;
1172 int new_nid;
1173};
1174
1175static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
1176static cpumask_t cpu_associativity_changes_mask;
1177static int vphn_enabled;
1178static int prrn_enabled;
1179static void reset_topology_timer(void);
1180
1181/*
1182 * Take a snapshot of the associativity change counters that the
1183 * hypervisor maintains in each cpu's VPA.
1184 */
1185static void setup_cpu_associativity_change_counters(void)
1186{
1187 int cpu;
1188
1189 /* The VPHN feature supports a maximum of 8 reference points */
1190 BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8);
1191
1192 for_each_possible_cpu(cpu) {
1193 int i;
1194 u8 *counts = vphn_cpu_change_counts[cpu];
1195 volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;
1196
1197 for (i = 0; i < distance_ref_points_depth; i++)
1198 counts[i] = hypervisor_counts[i];
1199 }
1200}
1201
1202/*
1203 * The hypervisor maintains a set of 8 associativity change counters in
1204 * the VPA of each cpu that correspond to the associativity levels in the
1205 * ibm,associativity-reference-points property. When an associativity
1206 * level changes, the corresponding counter is incremented.
1207 *
1208 * Set a bit in cpu_associativity_changes_mask for each cpu whose home
1209 * node associativity levels have changed.
1210 *
1211 * Returns the number of cpus with unhandled associativity changes.
1212 */
1213static int update_cpu_associativity_changes_mask(void)
1214{
1215 int cpu;
1216 cpumask_t *changes = &cpu_associativity_changes_mask;
1217
1218 for_each_possible_cpu(cpu) {
1219 int i, changed = 0;
1220 u8 *counts = vphn_cpu_change_counts[cpu];
1221 volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;
1222
1223 for (i = 0; i < distance_ref_points_depth; i++) {
1224 if (hypervisor_counts[i] != counts[i]) {
1225 counts[i] = hypervisor_counts[i];
1226 changed = 1;
1227 }
1228 }
1229 if (changed) {
1230 cpumask_or(changes, changes, cpu_sibling_mask(cpu));
1231 cpu = cpu_last_thread_sibling(cpu);
1232 }
1233 }
1234
1235 return cpumask_weight(changes);
1236}
1237
1238/*
1239 * Retrieve the new associativity information for a virtual processor's
1240 * home node.
1241 */
1242static long hcall_vphn(unsigned long cpu, __be32 *associativity)
1243{
1244 long rc;
1245 long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
1246 u64 flags = 1;
1247 int hwcpu = get_hard_smp_processor_id(cpu);
1248
1249 rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu);
1250 vphn_unpack_associativity(retbuf, associativity);
1251
1252 return rc;
1253}
1254
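/*
 * Wrapper around hcall_vphn() that stops topology polling if the
 * hypervisor reports the call as unsupported (H_FUNCTION) or failing
 * (H_HARDWARE).
 */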
1255static long vphn_get_associativity(unsigned long cpu,
1256 __be32 *associativity)
1257{
1258 long rc;
1259
1260 rc = hcall_vphn(cpu, associativity);
1261
1262 switch (rc) {
1263 case H_FUNCTION:
1264 printk_once(KERN_INFO
1265 "VPHN is not supported. Disabling polling...\n");
1266 stop_topology_update();
1267 break;
1268 case H_HARDWARE:
1269 printk(KERN_ERR
1270 "hcall_vphn() experienced a hardware fault "
1271 "preventing VPHN. Disabling polling...\n");
1272 stop_topology_update();
1273 }
1274
1275 return rc;
1276}
1277
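/*
 * Ask the hypervisor (via VPHN) for the cpu's current home node, falling
 * back to first_online_node when the result is invalid or not possible,
 * and make sure NODE_DATA exists for the node that is returned.
 */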
1278static inline int find_and_online_cpu_nid(int cpu)
1279{
1280 __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
1281 int new_nid;
1282
1283 /* Use associativity from first thread for all siblings */
1284 vphn_get_associativity(cpu, associativity);
1285 new_nid = associativity_to_nid(associativity);
1286 if (new_nid < 0 || !node_possible(new_nid))
1287 new_nid = first_online_node;
1288
1289 if (NODE_DATA(new_nid) == NULL) {
1290#ifdef CONFIG_MEMORY_HOTPLUG
1291 /*
1292 * Need to ensure that NODE_DATA is initialized for a node from
1293 * available memory (see memblock_alloc_try_nid). If unable to
1294 * init the node, then default to nearest node that has memory
1295 * installed.
1296 */
1297 if (try_online_node(new_nid))
1298 new_nid = first_online_node;
1299#else
1300 /*
1301 * Default to using the nearest node that has memory installed.
1302 * Otherwise, it would be necessary to patch the kernel MM code
1303 * to deal with more memoryless-node error conditions.
1304 */
1305 new_nid = first_online_node;
1306#endif
1307 }
1308
1309 return new_nid;
1310}
1311
1312/*
1313 * Update the CPU maps and sysfs entries for a single CPU when its NUMA
1314 * characteristics change. This function doesn't perform any locking and is
1315 * only safe to call from stop_machine().
1316 */
1317static int update_cpu_topology(void *data)
1318{
1319 struct topology_update_data *update;
1320 unsigned long cpu;
1321
1322 if (!data)
1323 return -EINVAL;
1324
1325 cpu = smp_processor_id();
1326
1327 for (update = data; update; update = update->next) {
1328 int new_nid = update->new_nid;
1329 if (cpu != update->cpu)
1330 continue;
1331
1332 unmap_cpu_from_node(cpu);
1333 map_cpu_to_node(cpu, new_nid);
1334 set_cpu_numa_node(cpu, new_nid);
1335 set_cpu_numa_mem(cpu, local_memory_node(new_nid));
1336 vdso_getcpu_init();
1337 }
1338
1339 return 0;
1340}
1341
1342static int update_lookup_table(void *data)
1343{
1344 struct topology_update_data *update;
1345
1346 if (!data)
1347 return -EINVAL;
1348
1349 /*
1350 * Upon topology update, the numa-cpu lookup table needs to be updated
1351 * for all threads in the core, including offline CPUs, to ensure that
1352 * future hotplug operations respect the cpu-to-node associativity
1353 * properly.
1354 */
1355 for (update = data; update; update = update->next) {
1356 int nid, base, j;
1357
1358 nid = update->new_nid;
1359 base = cpu_first_thread_sibling(update->cpu);
1360
1361 for (j = 0; j < threads_per_core; j++) {
1362 update_numa_cpu_lookup_table(base + j, nid);
1363 }
1364 }
1365
1366 return 0;
1367}
1368
1369/*
1370 * Update the node maps and sysfs entries for each cpu whose home node
1371 * has changed. Returns 1 when the topology has changed, and 0 otherwise.
1372 *
1373 * cpus_locked says whether we already hold cpu_hotplug_lock.
1374 */
1375int numa_update_cpu_topology(bool cpus_locked)
1376{
1377 unsigned int cpu, sibling, changed = 0;
1378 struct topology_update_data *updates, *ud;
1379 cpumask_t updated_cpus;
1380 struct device *dev;
1381 int weight, new_nid, i = 0;
1382
1383 if (!prrn_enabled && !vphn_enabled)
1384 return 0;
1385
1386 weight = cpumask_weight(&cpu_associativity_changes_mask);
1387 if (!weight)
1388 return 0;
1389
1390 updates = kzalloc(weight * (sizeof(*updates)), GFP_KERNEL);
1391 if (!updates)
1392 return 0;
1393
1394 cpumask_clear(&updated_cpus);
1395
1396 for_each_cpu(cpu, &cpu_associativity_changes_mask) {
1397 /*
1398 * If siblings aren't flagged for changes, the updates list
1399 * will be too short.  Skip this update now and flag the
1400 * siblings so they are handled on the next update.
1401 */
1402 if (!cpumask_subset(cpu_sibling_mask(cpu),
1403 &cpu_associativity_changes_mask)) {
1404 pr_info("Sibling bits not set for associativity "
1405 "change, cpu%d\n", cpu);
1406 cpumask_or(&cpu_associativity_changes_mask,
1407 &cpu_associativity_changes_mask,
1408 cpu_sibling_mask(cpu));
1409 cpu = cpu_last_thread_sibling(cpu);
1410 continue;
1411 }
1412
1413 new_nid = find_and_online_cpu_nid(cpu);
1414
1415 if (new_nid == numa_cpu_lookup_table[cpu]) {
1416 cpumask_andnot(&cpu_associativity_changes_mask,
1417 &cpu_associativity_changes_mask,
1418 cpu_sibling_mask(cpu));
1419 cpu = cpu_last_thread_sibling(cpu);
1420 continue;
1421 }
1422
1423 for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
1424 ud = &updates[i++];
1425 ud->cpu = sibling;
1426 ud->new_nid = new_nid;
1427 ud->old_nid = numa_cpu_lookup_table[sibling];
1428 cpumask_set_cpu(sibling, &updated_cpus);
1429 if (i < weight)
1430 ud->next = &updates[i];
1431 }
1432 cpu = cpu_last_thread_sibling(cpu);
1433 }
1434
1435 pr_debug("Topology update for the following CPUs:\n");
1436 if (cpumask_weight(&updated_cpus)) {
1437 for (ud = &updates[0]; ud; ud = ud->next) {
1438 pr_debug("cpu %d moving from node %d "
1439 "to %d\n", ud->cpu,
1440 ud->old_nid, ud->new_nid);
1441 }
1442 }
1443
1444 /*
1445 * In cases where we have nothing to update (because the updates list
1446 * is too short or because the new topology is same as the old one),
1447 * skip invoking update_cpu_topology() via stop-machine(). This is
1448 * necessary (and not just a fast-path optimization) since stop-machine
1449 * can end up electing a random CPU to run update_cpu_topology(), and
1450 * thus trick us into setting up incorrect cpu-node mappings (since
1451 * 'updates' is kzalloc()'ed).
1452 *
1453 * For the same reason, we also skip the rest of the update steps below.
1454 */
1455 if (!cpumask_weight(&updated_cpus))
1456 goto out;
1457
1458 if (cpus_locked)
1459 stop_machine_cpuslocked(update_cpu_topology, &updates[0],
1460 &updated_cpus);
1461 else
1462 stop_machine(update_cpu_topology, &updates[0], &updated_cpus);
1463
1464 /*
1465 * Update the numa-cpu lookup table with the new mappings, even for
1466 * offline CPUs. It is best to perform this update from the stop-
1467 * machine context.
1468 */
1469 if (cpus_locked)
1470 stop_machine_cpuslocked(update_lookup_table, &updates[0],
1471 cpumask_of(raw_smp_processor_id()));
1472 else
1473 stop_machine(update_lookup_table, &updates[0],
1474 cpumask_of(raw_smp_processor_id()));
1475
1476 for (ud = &updates[0]; ud; ud = ud->next) {
1477 unregister_cpu_under_node(ud->cpu, ud->old_nid);
1478 register_cpu_under_node(ud->cpu, ud->new_nid);
1479
1480 dev = get_cpu_device(ud->cpu);
1481 if (dev)
1482 kobject_uevent(&dev->kobj, KOBJ_CHANGE);
1483 cpumask_clear_cpu(ud->cpu, &cpu_associativity_changes_mask);
1484 changed = 1;
1485 }
1486
1487out:
1488 kfree(updates);
1489 return changed;
1490}
1491
1492int arch_update_cpu_topology(void)
1493{
1494 return numa_update_cpu_topology(true);
1495}
1496
1497static void topology_work_fn(struct work_struct *work)
1498{
1499 rebuild_sched_domains();
1500}
1501static DECLARE_WORK(topology_work, topology_work_fn);
1502
1503static void topology_schedule_update(void)
1504{
1505 schedule_work(&topology_work);
1506}
1507
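/*
 * Periodic poll: with PRRN, schedule an update if any cpu is flagged in
 * cpu_associativity_changes_mask; with VPHN, refresh that mask from the
 * hypervisor's change counters, schedule an update if anything changed,
 * and re-arm the timer.
 */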
1508static void topology_timer_fn(unsigned long ignored)
1509{
1510 if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask))
1511 topology_schedule_update();
1512 else if (vphn_enabled) {
1513 if (update_cpu_associativity_changes_mask() > 0)
1514 topology_schedule_update();
1515 reset_topology_timer();
1516 }
1517}
1518static struct timer_list topology_timer =
1519 TIMER_INITIALIZER(topology_timer_fn, 0, 0);
1520
1521static void reset_topology_timer(void)
1522{
1523 topology_timer.data = 0;
1524 topology_timer.expires = jiffies + 60 * HZ;
1525 mod_timer(&topology_timer, topology_timer.expires);
1526}
1527
1528#ifdef CONFIG_SMP
1529
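/*
 * Device tree reconfiguration notifier: when the ibm,associativity
 * property of a cpu node is updated (as happens for PRRN events), re-add
 * the affected core so it picks up its new node assignment.
 */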
1530static int dt_update_callback(struct notifier_block *nb,
1531 unsigned long action, void *data)
1532{
1533 struct of_reconfig_data *update = data;
1534 int rc = NOTIFY_DONE;
1535
1536 switch (action) {
1537 case OF_RECONFIG_UPDATE_PROPERTY:
1538 if (!of_prop_cmp(update->dn->type, "cpu") &&
1539 !of_prop_cmp(update->prop->name, "ibm,associativity")) {
1540 u32 core_id;
1541 of_property_read_u32(update->dn, "reg", &core_id);
1542 rc = dlpar_cpu_readd(core_id);
1543 rc = NOTIFY_OK;
1544 }
1545 break;
1546 }
1547
1548 return rc;
1549}
1550
1551static struct notifier_block dt_update_nb = {
1552 .notifier_call = dt_update_callback,
1553};
1554
1555#endif
1556
1557/*
1558 * Start polling for associativity changes.
1559 */
1560int start_topology_update(void)
1561{
1562 int rc = 0;
1563
1564 if (!topology_updates_enabled)
1565 return 0;
1566
1567 if (firmware_has_feature(FW_FEATURE_PRRN)) {
1568 if (!prrn_enabled) {
1569 prrn_enabled = 1;
1570 vphn_enabled = 0;
1571#ifdef CONFIG_SMP
1572 rc = of_reconfig_notifier_register(&dt_update_nb);
1573#endif
1574 }
1575 } else if (firmware_has_feature(FW_FEATURE_VPHN) &&
1576 lppaca_shared_proc(get_lppaca())) {
1577 if (!vphn_enabled) {
1578 prrn_enabled = 0;
1579 vphn_enabled = 1;
1580 setup_cpu_associativity_change_counters();
1581 init_timer_deferrable(&topology_timer);
1582 reset_topology_timer();
1583 }
1584 }
1585
1586 return rc;
1587}
1588
1589/*
1590 * Disable polling for VPHN associativity changes.
1591 */
1592int stop_topology_update(void)
1593{
1594 int rc = 0;
1595
1596 if (!topology_updates_enabled)
1597 return 0;
1598
1599 if (prrn_enabled) {
1600 prrn_enabled = 0;
1601#ifdef CONFIG_SMP
1602 rc = of_reconfig_notifier_unregister(&dt_update_nb);
1603#endif
1604 } else if (vphn_enabled) {
1605 vphn_enabled = 0;
1606 rc = del_timer_sync(&topology_timer);
1607 }
1608
1609 return rc;
1610}
1611
1612int prrn_is_enabled(void)
1613{
1614 return prrn_enabled;
1615}
1616
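/*
 * /proc/powerpc/topology_updates: reading reports whether VPHN/PRRN
 * polling is active ("on"/"off"); writing "on" or "off" starts or stops
 * it at runtime.
 */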
1617static int topology_read(struct seq_file *file, void *v)
1618{
1619 if (vphn_enabled || prrn_enabled)
1620 seq_puts(file, "on\n");
1621 else
1622 seq_puts(file, "off\n");
1623
1624 return 0;
1625}
1626
1627static int topology_open(struct inode *inode, struct file *file)
1628{
1629 return single_open(file, topology_read, NULL);
1630}
1631
1632static ssize_t topology_write(struct file *file, const char __user *buf,
1633 size_t count, loff_t *off)
1634{
1635 char kbuf[4]; /* "on" or "off" plus null. */
1636 int read_len;
1637
1638 read_len = count < 3 ? count : 3;
1639 if (copy_from_user(kbuf, buf, read_len))
1640 return -EFAULT;
1641
1642 kbuf[read_len] = '\0';
1643
1644 if (!strncmp(kbuf, "on", 2)) {
1645 topology_updates_enabled = true;
1646 start_topology_update();
1647 } else if (!strncmp(kbuf, "off", 3)) {
1648 stop_topology_update();
1649 topology_updates_enabled = false;
1650 } else
1651 return -EINVAL;
1652
1653 return count;
1654}
1655
1656static const struct file_operations topology_ops = {
1657 .read = seq_read,
1658 .write = topology_write,
1659 .open = topology_open,
1660 .release = single_release
1661};
1662
1663static int topology_update_init(void)
1664{
1665 start_topology_update();
1666
1667 if (!proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops))
1668 return -ENOMEM;
1669
1670 return 0;
1671}
1672device_initcall(topology_update_init);
1673#endif /* CONFIG_PPC_SPLPAR */