/*
 * pSeries NUMA support
 *
 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#define pr_fmt(fmt) "numa: " fmt

#include <linux/threads.h>
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/export.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/memblock.h>
#include <linux/of.h>
#include <linux/pfn.h>
#include <linux/cpuset.h>
#include <linux/node.h>
#include <linux/stop_machine.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <asm/cputhreads.h>
#include <asm/sparsemem.h>
#include <asm/prom.h>
#include <asm/smp.h>
#include <asm/topology.h>
#include <asm/firmware.h>
#include <asm/paca.h>
#include <asm/hvcall.h>
#include <asm/setup.h>
#include <asm/vdso.h>

static int numa_enabled = 1;

static char *cmdline __initdata;

static int numa_debug;
#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }

int numa_cpu_lookup_table[NR_CPUS];
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
struct pglist_data *node_data[MAX_NUMNODES];

EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(node_to_cpumask_map);
EXPORT_SYMBOL(node_data);

static int min_common_depth;
static int n_mem_addr_cells, n_mem_size_cells;
static int form1_affinity;

#define MAX_DISTANCE_REF_POINTS 4
static int distance_ref_points_depth;
static const __be32 *distance_ref_points;
static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 */
static void __init setup_node_to_cpumask_map(void)
{
	unsigned int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate the map */
	for_each_node(node)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	dbg("Node to cpumask map for %d nodes\n", nr_node_ids);
}

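/*
 * Fake NUMA node layout is driven by the "numa=fake=" boot option, whose
 * argument is saved in 'cmdline' by early_numa() below.  For example (a
 * hypothetical command line), "numa=fake=2G,4G" asks for fake node
 * boundaries at 2GB and 4GB: each call below consumes one boundary with
 * memparse() and bumps the fake node id once that boundary is crossed.
 */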
static int __init fake_numa_create_new_node(unsigned long end_pfn,
						unsigned int *nid)
{
	unsigned long long mem;
	char *p = cmdline;
	static unsigned int fake_nid;
	static unsigned long long curr_boundary;

	/*
	 * Modify node id, iff we started creating NUMA nodes
	 * We want to continue from where we left off the last time
	 */
	if (fake_nid)
		*nid = fake_nid;
	/*
	 * In case there are no more arguments to parse, the
	 * node_id should be the same as the last fake node id
	 * (we've handled this above).
	 */
	if (!p)
		return 0;

	mem = memparse(p, &p);
	if (!mem)
		return 0;

	if (mem < curr_boundary)
		return 0;

	curr_boundary = mem;

	if ((end_pfn << PAGE_SHIFT) > mem) {
		/*
		 * Skip commas and spaces
		 */
		while (*p == ',' || *p == ' ' || *p == '\t')
			p++;

		cmdline = p;
		fake_nid++;
		*nid = fake_nid;
		dbg("created new fake_node with id %d\n", fake_nid);
		return 1;
	}
	return 0;
}

static void reset_numa_cpu_lookup_table(void)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu)
		numa_cpu_lookup_table[cpu] = -1;
}

static void update_numa_cpu_lookup_table(unsigned int cpu, int node)
{
	numa_cpu_lookup_table[cpu] = node;
}

static void map_cpu_to_node(int cpu, int node)
{
	update_numa_cpu_lookup_table(cpu, node);

	dbg("adding cpu %d to node %d\n", cpu, node);

	if (!cpumask_test_cpu(cpu, node_to_cpumask_map[node]))
		cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
}

#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
static void unmap_cpu_from_node(unsigned long cpu)
{
	int node = numa_cpu_lookup_table[cpu];

	dbg("removing cpu %lu from node %d\n", cpu, node);

	if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
		cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
	} else {
		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
		       cpu, node);
	}
}
#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */

/* must hold reference to node during call */
static const __be32 *of_get_associativity(struct device_node *dev)
{
	return of_get_property(dev, "ibm,associativity", NULL);
}

/*
 * Returns the property linux,drconf-usable-memory if
 * it exists (the property exists only in kexec/kdump kernels,
 * added by kexec-tools)
 */
static const __be32 *of_get_usable_memory(struct device_node *memory)
{
	const __be32 *prop;
	u32 len;

	prop = of_get_property(memory, "linux,drconf-usable-memory", &len);
	if (!prop || len < sizeof(unsigned int))
		return NULL;
	return prop;
}

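/*
 * Worked example (hypothetical values): with distance_ref_points_depth = 2,
 * the loop below stops at the first associativity level at which both nodes
 * share a domain.  Nodes that already share the first (most significant)
 * domain are LOCAL_DISTANCE apart, nodes that only share the second domain
 * are LOCAL_DISTANCE * 2 apart, and nodes that share neither end up at
 * LOCAL_DISTANCE * 4.
 */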
int __node_distance(int a, int b)
{
	int i;
	int distance = LOCAL_DISTANCE;

	if (!form1_affinity)
		return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);

	for (i = 0; i < distance_ref_points_depth; i++) {
		if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
			break;

		/* Double the distance for each NUMA level */
		distance *= 2;
	}

	return distance;
}
EXPORT_SYMBOL(__node_distance);

static void initialize_distance_lookup_table(int nid,
		const __be32 *associativity)
{
	int i;

	if (!form1_affinity)
		return;

	for (i = 0; i < distance_ref_points_depth; i++) {
		const __be32 *entry;

		entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
		distance_lookup_table[nid][i] = of_read_number(entry, 1);
	}
}

/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
 * info is found.
 */
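/*
 * A hypothetical ibm,associativity property { 5, d1, d2, d3, d4, d5 } encodes
 * its own length (5) in the first cell followed by the domain ids, so with
 * min_common_depth = 4 the node id would be read from d4.
 */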
static int associativity_to_nid(const __be32 *associativity)
{
	int nid = -1;

	if (min_common_depth == -1)
		goto out;

	if (of_read_number(associativity, 1) >= min_common_depth)
		nid = of_read_number(&associativity[min_common_depth], 1);

	/* POWER4 LPAR uses 0xffff as invalid node */
	if (nid == 0xffff || nid >= MAX_NUMNODES)
		nid = -1;

	if (nid > 0 &&
		of_read_number(associativity, 1) >= distance_ref_points_depth) {
		/*
		 * Skip the length field and send the start of the
		 * associativity array.
		 */
		initialize_distance_lookup_table(nid, associativity + 1);
	}

out:
	return nid;
}

/* Returns the nid associated with the given device tree node,
 * or -1 if not found.
 */
static int of_node_to_nid_single(struct device_node *device)
{
	int nid = -1;
	const __be32 *tmp;

	tmp = of_get_associativity(device);
	if (tmp)
		nid = associativity_to_nid(tmp);
	return nid;
}

/* Walk the device tree upwards, looking for an associativity id */
int of_node_to_nid(struct device_node *device)
{
	int nid = -1;

	of_node_get(device);
	while (device) {
		nid = of_node_to_nid_single(device);
		if (nid != -1)
			break;

		device = of_get_next_parent(device);
	}
	of_node_put(device);

	return nid;
}
EXPORT_SYMBOL_GPL(of_node_to_nid);

static int __init find_min_common_depth(void)
{
	int depth;
	struct device_node *root;

	if (firmware_has_feature(FW_FEATURE_OPAL))
		root = of_find_node_by_path("/ibm,opal");
	else
		root = of_find_node_by_path("/rtas");
	if (!root)
		root = of_find_node_by_path("/");

	/*
	 * This property is a set of 32-bit integers, each representing
	 * an index into the ibm,associativity nodes.
	 *
	 * With form 0 affinity the first integer is for an SMP configuration
	 * (should be all 0's) and the second is for a normal NUMA
	 * configuration. We have only one level of NUMA.
	 *
	 * With form 1 affinity the first integer is the most significant
	 * NUMA boundary and the following are progressively less significant
	 * boundaries. There can be more than one level of NUMA.
	 */
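	/*
	 * Example (hypothetical property value): with form 1 affinity and
	 * ibm,associativity-reference-points = <4 2>, the first entry (4)
	 * is returned as the depth, so node ids are taken from the fourth
	 * associativity domain; the remaining entries only feed the
	 * distance lookup table.
	 */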
	distance_ref_points = of_get_property(root,
					"ibm,associativity-reference-points",
					&distance_ref_points_depth);

	if (!distance_ref_points) {
		dbg("NUMA: ibm,associativity-reference-points not found.\n");
		goto err;
	}

	distance_ref_points_depth /= sizeof(int);

	if (firmware_has_feature(FW_FEATURE_OPAL) ||
	    firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
		dbg("Using form 1 affinity\n");
		form1_affinity = 1;
	}

	if (form1_affinity) {
		depth = of_read_number(distance_ref_points, 1);
	} else {
		if (distance_ref_points_depth < 2) {
			printk(KERN_WARNING "NUMA: "
				"short ibm,associativity-reference-points\n");
			goto err;
		}

		depth = of_read_number(&distance_ref_points[1], 1);
	}

	/*
	 * Warn and cap if the hardware supports more than
	 * MAX_DISTANCE_REF_POINTS domains.
	 */
	if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
		printk(KERN_WARNING "NUMA: distance array capped at "
			"%d entries\n", MAX_DISTANCE_REF_POINTS);
		distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
	}

	of_node_put(root);
	return depth;

err:
	of_node_put(root);
	return -1;
}

static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
{
	struct device_node *memory = NULL;

	memory = of_find_node_by_type(memory, "memory");
	if (!memory)
		panic("numa.c: No memory nodes found!");

	*n_addr_cells = of_n_addr_cells(memory);
	*n_size_cells = of_n_size_cells(memory);
	of_node_put(memory);
}

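/*
 * Example: read_n_cells(2, &buf) over the cells { 0x00000001, 0x80000000 }
 * combines them big-endian style into the 64-bit value 0x180000000.
 */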
static unsigned long read_n_cells(int n, const __be32 **buf)
{
	unsigned long result = 0;

	while (n--) {
		result = (result << 32) | of_read_number(*buf, 1);
		(*buf)++;
	}
	return result;
}

/*
 * Read the next memblock list entry from the ibm,dynamic-memory property
 * and return the information in the provided of_drconf_cell structure.
 */
static void read_drconf_cell(struct of_drconf_cell *drmem, const __be32 **cellp)
{
	const __be32 *cp;

	drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp);

	cp = *cellp;
	drmem->drc_index = of_read_number(cp, 1);
	drmem->reserved = of_read_number(&cp[1], 1);
	drmem->aa_index = of_read_number(&cp[2], 1);
	drmem->flags = of_read_number(&cp[3], 1);

	*cellp = cp + 4;
}

/*
 * Retrieve and validate the ibm,dynamic-memory property of the device tree.
 *
 * The layout of the ibm,dynamic-memory property is an entry count N followed
 * by N memblock list entries.  Each memblock list entry contains information
 * as laid out in struct of_drconf_cell.
 */
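/*
 * A hypothetical encoding, assuming n_mem_addr_cells = 2, would look like
 *   <N> { <base_hi> <base_lo> <drc_index> <reserved> <aa_index> <flags> } x N
 * which matches the n_mem_addr_cells + 4 cells per entry consumed by
 * read_drconf_cell() and checked against 'len' below.
 */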
static int of_get_drconf_memory(struct device_node *memory, const __be32 **dm)
{
	const __be32 *prop;
	u32 len, entries;

	prop = of_get_property(memory, "ibm,dynamic-memory", &len);
	if (!prop || len < sizeof(unsigned int))
		return 0;

	entries = of_read_number(prop++, 1);

	/* Now that we know the number of entries, revalidate the size
	 * of the property read in to ensure we have everything
	 */
	if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int))
		return 0;

	*dm = prop;
	return entries;
}

/*
 * Retrieve and validate the ibm,lmb-size property for drconf memory
 * from the device tree.
 */
static u64 of_get_lmb_size(struct device_node *memory)
{
	const __be32 *prop;
	u32 len;

	prop = of_get_property(memory, "ibm,lmb-size", &len);
	if (!prop || len < sizeof(unsigned int))
		return 0;

	return read_n_cells(n_mem_size_cells, &prop);
}

struct assoc_arrays {
	u32	n_arrays;
	u32	array_sz;
	const __be32 *arrays;
};

/*
 * Retrieve and validate the list of associativity arrays for drconf
 * memory from the ibm,associativity-lookup-arrays property of the
 * device tree.
 *
 * The layout of the ibm,associativity-lookup-arrays property is a number N
 * indicating the number of associativity arrays, followed by a number M
 * indicating the size of each associativity array, followed by a list
 * of N associativity arrays.
 */
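/*
 * For instance (hypothetical values), N = 2 and M = 4 would be encoded as
 *   <2> <4> <a0 a1 a2 a3> <b0 b1 b2 b3>
 * and a memblock whose aa_index is 1 would use the second array.
 */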
static int of_get_assoc_arrays(struct device_node *memory,
			       struct assoc_arrays *aa)
{
	const __be32 *prop;
	u32 len;

	prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
	if (!prop || len < 2 * sizeof(unsigned int))
		return -1;

	aa->n_arrays = of_read_number(prop++, 1);
	aa->array_sz = of_read_number(prop++, 1);

	/* Now that we know the number of arrays and size of each array,
	 * revalidate the size of the property read in.
	 */
	if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
		return -1;

	aa->arrays = prop;
	return 0;
}

/*
 * This is like of_node_to_nid_single() for memory represented in the
 * ibm,dynamic-reconfiguration-memory node.
 */
static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
				   struct assoc_arrays *aa)
{
	int default_nid = 0;
	int nid = default_nid;
	int index;

	if (min_common_depth > 0 && min_common_depth <= aa->array_sz &&
	    !(drmem->flags & DRCONF_MEM_AI_INVALID) &&
	    drmem->aa_index < aa->n_arrays) {
		index = drmem->aa_index * aa->array_sz + min_common_depth - 1;
		nid = of_read_number(&aa->arrays[index], 1);

		if (nid == 0xffff || nid >= MAX_NUMNODES)
			nid = default_nid;

		if (nid > 0) {
			index = drmem->aa_index * aa->array_sz;
			initialize_distance_lookup_table(nid,
							&aa->arrays[index]);
		}
	}

	return nid;
}

/*
 * Figure out to which domain a cpu belongs and stick it there.
 * Return the id of the domain used.
 */
static int numa_setup_cpu(unsigned long lcpu)
{
	int nid = -1;
	struct device_node *cpu;

	/*
	 * If a valid cpu-to-node mapping is already available, use it
	 * directly instead of querying the firmware, since it represents
	 * the most recent mapping notified to us by the platform (eg: VPHN).
	 */
	if ((nid = numa_cpu_lookup_table[lcpu]) >= 0) {
		map_cpu_to_node(lcpu, nid);
		return nid;
	}

	cpu = of_get_cpu_node(lcpu, NULL);

	if (!cpu) {
		WARN_ON(1);
		if (cpu_present(lcpu))
			goto out_present;
		else
			goto out;
	}

	nid = of_node_to_nid_single(cpu);

out_present:
	if (nid < 0 || !node_online(nid))
		nid = first_online_node;

	map_cpu_to_node(lcpu, nid);
	of_node_put(cpu);
out:
	return nid;
}

static void verify_cpu_node_mapping(int cpu, int node)
{
	int base, sibling, i;

	/* Verify that all the threads in the core belong to the same node */
	base = cpu_first_thread_sibling(cpu);

	for (i = 0; i < threads_per_core; i++) {
		sibling = base + i;

		if (sibling == cpu || cpu_is_offline(sibling))
			continue;

		if (cpu_to_node(sibling) != node) {
			WARN(1, "CPU thread siblings %d and %d don't belong"
				" to the same node!\n", cpu, sibling);
			break;
		}
	}
}

static int cpu_numa_callback(struct notifier_block *nfb, unsigned long action,
			     void *hcpu)
{
	unsigned long lcpu = (unsigned long)hcpu;
	int ret = NOTIFY_DONE, nid;

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		nid = numa_setup_cpu(lcpu);
		verify_cpu_node_mapping((int)lcpu, nid);
		ret = NOTIFY_OK;
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
		unmap_cpu_from_node(lcpu);
		ret = NOTIFY_OK;
		break;
#endif
	}
	return ret;
}

/*
 * Check and possibly modify a memory region to enforce the memory limit.
 *
 * Returns the size the region should have to enforce the memory limit.
 * This will either be the original value of size, a truncated value,
 * or zero. If the returned value of size is 0 the region should be
 * discarded as it lies wholly above the memory limit.
 */
static unsigned long __init numa_enforce_memory_limit(unsigned long start,
						      unsigned long size)
{
	/*
	 * We use memblock_end_of_DRAM() in here instead of memory_limit because
	 * we've already adjusted it for the limit and it takes care of
	 * having memory holes below the limit.  Also, in the case of
	 * iommu_is_off, memory_limit is not set but is implicitly enforced.
	 */

	if (start + size <= memblock_end_of_DRAM())
		return size;

	if (start >= memblock_end_of_DRAM())
		return 0;

	return memblock_end_of_DRAM() - start;
}

/*
 * Reads the counter for a given entry in the
 * linux,drconf-usable-memory property
 */
static inline int __init read_usm_ranges(const __be32 **usm)
{
	/*
	 * For each LMB in ibm,dynamic-memory, the corresponding entry in
	 * the linux,drconf-usable-memory property contains a counter
	 * followed by that many (base, size) pairs.  Read the counter from
	 * linux,drconf-usable-memory.
	 */
	return read_n_cells(n_mem_size_cells, usm);
}
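
/*
 * A hypothetical linux,drconf-usable-memory layout for two LMBs, each
 * restricted to a single usable sub-range, would be
 *   <1> <base0 size0>  <1> <base1 size1>
 * A counter of 0 means the corresponding LMB contributes no usable memory
 * and is skipped by parse_drconf_memory() below.
 */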

/*
 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
 * node.  This assumes n_mem_{addr,size}_cells have been set.
 */
static void __init parse_drconf_memory(struct device_node *memory)
{
	const __be32 *uninitialized_var(dm), *usm;
	unsigned int n, rc, ranges, is_kexec_kdump = 0;
	unsigned long lmb_size, base, size, sz;
	int nid;
	struct assoc_arrays aa = { .arrays = NULL };

	n = of_get_drconf_memory(memory, &dm);
	if (!n)
		return;

	lmb_size = of_get_lmb_size(memory);
	if (!lmb_size)
		return;

	rc = of_get_assoc_arrays(memory, &aa);
	if (rc)
		return;

	/* check if this is a kexec/kdump kernel */
	usm = of_get_usable_memory(memory);
	if (usm != NULL)
		is_kexec_kdump = 1;

	for (; n != 0; --n) {
		struct of_drconf_cell drmem;

		read_drconf_cell(&drmem, &dm);

		/*
		 * Skip this block if the reserved bit is set in flags (0x80)
		 * or if the block is not assigned to this partition (0x8).
		 */
		if ((drmem.flags & DRCONF_MEM_RESERVED)
		    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
			continue;

		base = drmem.base_addr;
		size = lmb_size;
		ranges = 1;

		if (is_kexec_kdump) {
			ranges = read_usm_ranges(&usm);
			if (!ranges) /* there are no (base, size) pairs */
				continue;
		}
		do {
			if (is_kexec_kdump) {
				base = read_n_cells(n_mem_addr_cells, &usm);
				size = read_n_cells(n_mem_size_cells, &usm);
			}
			nid = of_drconf_to_nid_single(&drmem, &aa);
			fake_numa_create_new_node(((base + size) >> PAGE_SHIFT),
						  &nid);
			node_set_online(nid);
			sz = numa_enforce_memory_limit(base, size);
			if (sz)
				memblock_set_node(base, sz,
						  &memblock.memory, nid);
		} while (--ranges);
	}
}

static int __init parse_numa_properties(void)
{
	struct device_node *memory;
	int default_nid = 0;
	unsigned long i;

	if (numa_enabled == 0) {
		printk(KERN_WARNING "NUMA disabled by user\n");
		return -1;
	}

	min_common_depth = find_min_common_depth();

	if (min_common_depth < 0)
		return min_common_depth;

	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);

	/*
	 * Even though we connect cpus to numa domains later in SMP
	 * init, we need to know the node ids now. This is because
	 * each node to be onlined must have NODE_DATA etc backing it.
	 */
	for_each_present_cpu(i) {
		struct device_node *cpu;
		int nid;

		cpu = of_get_cpu_node(i, NULL);
		BUG_ON(!cpu);
		nid = of_node_to_nid_single(cpu);
		of_node_put(cpu);

		/*
		 * Don't fall back to default_nid yet -- we will plug
		 * cpus into nodes once the memory scan has discovered
		 * the topology.
		 */
		if (nid < 0)
			continue;
		node_set_online(nid);
	}

	get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);

	for_each_node_by_type(memory, "memory") {
		unsigned long start;
		unsigned long size;
		int nid;
		int ranges;
		const __be32 *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory,
			"linux,usable-memory", &len);
		if (!memcell_buf || len <= 0)
			memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cells */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
new_range:
		/* these are order-sensitive, and modify the buffer pointer */
		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
		size = read_n_cells(n_mem_size_cells, &memcell_buf);

		/*
		 * Assumption: either all memory nodes or none will
		 * have associativity properties.  If none, then
		 * everything goes to default_nid.
		 */
		nid = of_node_to_nid_single(memory);
		if (nid < 0)
			nid = default_nid;

		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
		node_set_online(nid);

		if (!(size = numa_enforce_memory_limit(start, size))) {
			if (--ranges)
				goto new_range;
			else
				continue;
		}

		memblock_set_node(start, size, &memblock.memory, nid);

		if (--ranges)
			goto new_range;
	}

	/*
	 * Now do the same thing for each MEMBLOCK listed in the
	 * ibm,dynamic-memory property in the
	 * ibm,dynamic-reconfiguration-memory node.
	 */
	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory)
		parse_drconf_memory(memory);

	return 0;
}

static void __init setup_nonnuma(void)
{
	unsigned long top_of_ram = memblock_end_of_DRAM();
	unsigned long total_ram = memblock_phys_mem_size();
	unsigned long start_pfn, end_pfn;
	unsigned int nid = 0;
	struct memblock_region *reg;

	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
	       top_of_ram, total_ram);
	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
	       (top_of_ram - total_ram) >> 20);

	for_each_memblock(memory, reg) {
		start_pfn = memblock_region_memory_base_pfn(reg);
		end_pfn = memblock_region_memory_end_pfn(reg);

		fake_numa_create_new_node(end_pfn, &nid);
		memblock_set_node(PFN_PHYS(start_pfn),
				  PFN_PHYS(end_pfn - start_pfn),
				  &memblock.memory, nid);
		node_set_online(nid);
	}
}

void __init dump_numa_cpu_topology(void)
{
	unsigned int node;
	unsigned int cpu, count;

	if (min_common_depth == -1 || !numa_enabled)
		return;

	for_each_online_node(node) {
		printk(KERN_DEBUG "Node %d CPUs:", node);

		count = 0;
		/*
		 * If we used a CPU iterator here we would miss printing
		 * the holes in the cpumap.
		 */
		for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
			if (cpumask_test_cpu(cpu,
					node_to_cpumask_map[node])) {
				if (count == 0)
					printk(" %u", cpu);
				++count;
			} else {
				if (count > 1)
					printk("-%u", cpu - 1);
				count = 0;
			}
		}

		if (count > 1)
			printk("-%u", nr_cpu_ids - 1);
		printk("\n");
	}
}

static void __init dump_numa_memory_topology(void)
{
	unsigned int node;
	unsigned int count;

	if (min_common_depth == -1 || !numa_enabled)
		return;

	for_each_online_node(node) {
		unsigned long i;

		printk(KERN_DEBUG "Node %d Memory:", node);

		count = 0;

		for (i = 0; i < memblock_end_of_DRAM();
		     i += (1 << SECTION_SIZE_BITS)) {
			if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) {
				if (count == 0)
					printk(" 0x%lx", i);
				++count;
			} else {
				if (count > 0)
					printk("-0x%lx", i);
				count = 0;
			}
		}

		if (count > 0)
			printk("-0x%lx", i);
		printk("\n");
	}
}

static struct notifier_block ppc64_numa_nb = {
	.notifier_call = cpu_numa_callback,
	.priority = 1 /* Must run before sched domains notifier. */
};

/* Initialize NODE_DATA for a node on the local memory */
static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
{
	u64 spanned_pages = end_pfn - start_pfn;
	const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
	u64 nd_pa;
	void *nd;
	int tnid;

	if (spanned_pages)
		pr_info("Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
			nid, start_pfn << PAGE_SHIFT,
			(end_pfn << PAGE_SHIFT) - 1);
	else
		pr_info("Initmem setup node %d\n", nid);

	nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
	nd = __va(nd_pa);

	/* report and initialize */
	pr_info("  NODE_DATA [mem %#010Lx-%#010Lx]\n",
		nd_pa, nd_pa + nd_size - 1);
	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
	if (tnid != nid)
		pr_info("    NODE_DATA(%d) on node %d\n", nid, tnid);

	node_data[nid] = nd;
	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
	NODE_DATA(nid)->node_id = nid;
	NODE_DATA(nid)->node_start_pfn = start_pfn;
	NODE_DATA(nid)->node_spanned_pages = spanned_pages;
}

void __init initmem_init(void)
{
	int nid, cpu;

	max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
	max_pfn = max_low_pfn;

	if (parse_numa_properties())
		setup_nonnuma();
	else
		dump_numa_memory_topology();

	memblock_dump_all();

	/*
	 * Reduce the possible NUMA nodes to the online NUMA nodes,
	 * since we do not support node hotplug. This ensures that we
	 * lower the maximum NUMA node ID to what is actually present.
	 */
	nodes_and(node_possible_map, node_possible_map, node_online_map);

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;

		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
		setup_node_data(nid, start_pfn, end_pfn);
		sparse_memory_present_with_active_regions(nid);
	}

	sparse_init();

	setup_node_to_cpumask_map();

	reset_numa_cpu_lookup_table();
	register_cpu_notifier(&ppc64_numa_nb);
	/*
	 * We need the numa_cpu_lookup_table to be accurate for all CPUs,
	 * even before we online them, so that we can use cpu_to_{node,mem}
	 * early in boot, cf. smp_prepare_cpus().
	 */
	for_each_present_cpu(cpu) {
		numa_setup_cpu((unsigned long)cpu);
	}
}

static int __init early_numa(char *p)
{
	if (!p)
		return 0;

	if (strstr(p, "off"))
		numa_enabled = 0;

	if (strstr(p, "debug"))
		numa_debug = 1;

	p = strstr(p, "fake=");
	if (p)
		cmdline = p + strlen("fake=");

	return 0;
}
early_param("numa", early_numa);

static bool topology_updates_enabled = true;

static int __init early_topology_updates(char *p)
{
	if (!p)
		return 0;

	if (!strcmp(p, "off")) {
		pr_info("Disabling topology updates\n");
		topology_updates_enabled = false;
	}

	return 0;
}
early_param("topology_updates", early_topology_updates);

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Find the node associated with a hot added memory section for
 * memory represented in the device tree by the property
 * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
 */
static int hot_add_drconf_scn_to_nid(struct device_node *memory,
				     unsigned long scn_addr)
{
	const __be32 *dm;
	unsigned int drconf_cell_cnt, rc;
	unsigned long lmb_size;
	struct assoc_arrays aa;
	int nid = -1;

	drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
	if (!drconf_cell_cnt)
		return -1;

	lmb_size = of_get_lmb_size(memory);
	if (!lmb_size)
		return -1;

	rc = of_get_assoc_arrays(memory, &aa);
	if (rc)
		return -1;

	for (; drconf_cell_cnt != 0; --drconf_cell_cnt) {
		struct of_drconf_cell drmem;

		read_drconf_cell(&drmem, &dm);

		/*
		 * Skip this block if it is reserved or not assigned to
		 * this partition.
		 */
		if ((drmem.flags & DRCONF_MEM_RESERVED)
		    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
			continue;

		if ((scn_addr < drmem.base_addr)
		    || (scn_addr >= (drmem.base_addr + lmb_size)))
			continue;

		nid = of_drconf_to_nid_single(&drmem, &aa);
		break;
	}

	return nid;
}

/*
 * Find the node associated with a hot added memory section for memory
 * represented in the device tree as a node (i.e. memory@XXXX) for
 * each memblock.
 */
static int hot_add_node_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory;
	int nid = -1;

	for_each_node_by_type(memory, "memory") {
		unsigned long start, size;
		int ranges;
		const __be32 *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cells */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);

		while (ranges--) {
			start = read_n_cells(n_mem_addr_cells, &memcell_buf);
			size = read_n_cells(n_mem_size_cells, &memcell_buf);

			if ((scn_addr < start) || (scn_addr >= (start + size)))
				continue;

			nid = of_node_to_nid_single(memory);
			break;
		}

		if (nid >= 0)
			break;
	}

	of_node_put(memory);

	return nid;
}

/*
 * Find the node associated with a hot added memory section.  Section
 * corresponds to a SPARSEMEM section, not a MEMBLOCK.  It is assumed that
 * sections are fully contained within a single MEMBLOCK.
 */
int hot_add_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory = NULL;
	int nid, found = 0;

	if (!numa_enabled || (min_common_depth < 0))
		return first_online_node;

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		nid = hot_add_drconf_scn_to_nid(memory, scn_addr);
		of_node_put(memory);
	} else {
		nid = hot_add_node_scn_to_nid(scn_addr);
	}

	if (nid < 0 || !node_online(nid))
		nid = first_online_node;

	if (NODE_DATA(nid)->node_spanned_pages)
		return nid;

	for_each_online_node(nid) {
		if (NODE_DATA(nid)->node_spanned_pages) {
			found = 1;
			break;
		}
	}

	BUG_ON(!found);
	return nid;
}

static u64 hot_add_drconf_memory_max(void)
{
	struct device_node *memory = NULL;
	unsigned int drconf_cell_cnt = 0;
	u64 lmb_size = 0;
	const __be32 *dm = NULL;

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
		lmb_size = of_get_lmb_size(memory);
		of_node_put(memory);
	}
	return lmb_size * drconf_cell_cnt;
}

/*
 * memory_hotplug_max - return max address of memory that may be added
 *
 * This is currently only used on systems that support drconfig memory
 * hotplug.
 */
u64 memory_hotplug_max(void)
{
	return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM());
}
#endif /* CONFIG_MEMORY_HOTPLUG */

/* Virtual Processor Home Node (VPHN) support */
#ifdef CONFIG_PPC_SPLPAR

#include "vphn.h"

struct topology_update_data {
	struct topology_update_data *next;
	unsigned int cpu;
	int old_nid;
	int new_nid;
};

static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
static cpumask_t cpu_associativity_changes_mask;
static int vphn_enabled;
static int prrn_enabled;
static void reset_topology_timer(void);

/*
 * Store the current values of the associativity change counters provided
 * by the hypervisor.
 */
static void setup_cpu_associativity_change_counters(void)
{
	int cpu;

	/* The VPHN feature supports a maximum of 8 reference points */
	BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8);

	for_each_possible_cpu(cpu) {
		int i;
		u8 *counts = vphn_cpu_change_counts[cpu];
		volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;

		for (i = 0; i < distance_ref_points_depth; i++)
			counts[i] = hypervisor_counts[i];
	}
}

/*
 * The hypervisor maintains a set of 8 associativity change counters in
 * the VPA of each cpu that correspond to the associativity levels in the
 * ibm,associativity-reference-points property. When an associativity
 * level changes, the corresponding counter is incremented.
 *
 * Set a bit in cpu_associativity_changes_mask for each cpu whose home
 * node associativity levels have changed.
 *
 * Returns the number of cpus with unhandled associativity changes.
 */
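/*
 * For example (hypothetical counter values), if the cached counts for a cpu
 * were { 1, 2 } and the hypervisor now reports { 1, 3 }, the second
 * associativity level of that cpu's home node has changed: the cache is
 * refreshed and all of the cpu's siblings are flagged in the mask.
 */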
static int update_cpu_associativity_changes_mask(void)
{
	int cpu;
	cpumask_t *changes = &cpu_associativity_changes_mask;

	for_each_possible_cpu(cpu) {
		int i, changed = 0;
		u8 *counts = vphn_cpu_change_counts[cpu];
		volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;

		for (i = 0; i < distance_ref_points_depth; i++) {
			if (hypervisor_counts[i] != counts[i]) {
				counts[i] = hypervisor_counts[i];
				changed = 1;
			}
		}
		if (changed) {
			cpumask_or(changes, changes, cpu_sibling_mask(cpu));
			cpu = cpu_last_thread_sibling(cpu);
		}
	}

	return cpumask_weight(changes);
}

/*
 * Retrieve the new associativity information for a virtual processor's
 * home node.
 */
static long hcall_vphn(unsigned long cpu, __be32 *associativity)
{
	long rc;
	long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
	u64 flags = 1;
	int hwcpu = get_hard_smp_processor_id(cpu);

	rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu);
	vphn_unpack_associativity(retbuf, associativity);

	return rc;
}

static long vphn_get_associativity(unsigned long cpu,
					__be32 *associativity)
{
	long rc;

	rc = hcall_vphn(cpu, associativity);

	switch (rc) {
	case H_FUNCTION:
		printk(KERN_INFO
			"VPHN is not supported. Disabling polling...\n");
		stop_topology_update();
		break;
	case H_HARDWARE:
		printk(KERN_ERR
			"hcall_vphn() experienced a hardware fault "
			"preventing VPHN. Disabling polling...\n");
		stop_topology_update();
	}

	return rc;
}

/*
 * Update the CPU maps and sysfs entries for a single CPU when its NUMA
 * characteristics change. This function doesn't perform any locking and is
 * only safe to call from stop_machine().
 */
static int update_cpu_topology(void *data)
{
	struct topology_update_data *update;
	unsigned long cpu;

	if (!data)
		return -EINVAL;

	cpu = smp_processor_id();

	for (update = data; update; update = update->next) {
		int new_nid = update->new_nid;

		if (cpu != update->cpu)
			continue;

		unmap_cpu_from_node(cpu);
		map_cpu_to_node(cpu, new_nid);
		set_cpu_numa_node(cpu, new_nid);
		set_cpu_numa_mem(cpu, local_memory_node(new_nid));
		vdso_getcpu_init();
	}

	return 0;
}

static int update_lookup_table(void *data)
{
	struct topology_update_data *update;

	if (!data)
		return -EINVAL;

	/*
	 * Upon topology update, the numa-cpu lookup table needs to be updated
	 * for all threads in the core, including offline CPUs, to ensure that
	 * future hotplug operations respect the cpu-to-node associativity
	 * properly.
	 */
	for (update = data; update; update = update->next) {
		int nid, base, j;

		nid = update->new_nid;
		base = cpu_first_thread_sibling(update->cpu);

		for (j = 0; j < threads_per_core; j++) {
			update_numa_cpu_lookup_table(base + j, nid);
		}
	}

	return 0;
}

/*
 * Update the node maps and sysfs entries for each cpu whose home node
 * has changed. Returns 1 when the topology has changed, and 0 otherwise.
 */
int arch_update_cpu_topology(void)
{
	unsigned int cpu, sibling, changed = 0;
	struct topology_update_data *updates, *ud;
	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
	cpumask_t updated_cpus;
	struct device *dev;
	int weight, new_nid, i = 0;

	if (!prrn_enabled && !vphn_enabled)
		return 0;

	weight = cpumask_weight(&cpu_associativity_changes_mask);
	if (!weight)
		return 0;

	updates = kzalloc(weight * (sizeof(*updates)), GFP_KERNEL);
	if (!updates)
		return 0;

	cpumask_clear(&updated_cpus);

	for_each_cpu(cpu, &cpu_associativity_changes_mask) {
		/*
		 * If the siblings aren't flagged for changes, the updates
		 * list will be too short. Skip this cpu for now and flag
		 * its siblings so they are picked up on the next pass.
		 */
		if (!cpumask_subset(cpu_sibling_mask(cpu),
					&cpu_associativity_changes_mask)) {
			pr_info("Sibling bits not set for associativity "
					"change, cpu%d\n", cpu);
			cpumask_or(&cpu_associativity_changes_mask,
					&cpu_associativity_changes_mask,
					cpu_sibling_mask(cpu));
			cpu = cpu_last_thread_sibling(cpu);
			continue;
		}

		/* Use associativity from first thread for all siblings */
		vphn_get_associativity(cpu, associativity);
		new_nid = associativity_to_nid(associativity);
		if (new_nid < 0 || !node_online(new_nid))
			new_nid = first_online_node;

		if (new_nid == numa_cpu_lookup_table[cpu]) {
			cpumask_andnot(&cpu_associativity_changes_mask,
					&cpu_associativity_changes_mask,
					cpu_sibling_mask(cpu));
			cpu = cpu_last_thread_sibling(cpu);
			continue;
		}

		for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
			ud = &updates[i++];
			ud->cpu = sibling;
			ud->new_nid = new_nid;
			ud->old_nid = numa_cpu_lookup_table[sibling];
			cpumask_set_cpu(sibling, &updated_cpus);
			if (i < weight)
				ud->next = &updates[i];
		}
		cpu = cpu_last_thread_sibling(cpu);
	}

	pr_debug("Topology update for the following CPUs:\n");
	if (cpumask_weight(&updated_cpus)) {
		for (ud = &updates[0]; ud; ud = ud->next) {
			pr_debug("cpu %d moving from node %d "
					  "to %d\n", ud->cpu,
					  ud->old_nid, ud->new_nid);
		}
	}

	/*
	 * In cases where we have nothing to update (because the updates list
	 * is too short or because the new topology is the same as the old
	 * one), skip invoking update_cpu_topology() via stop-machine(). This
	 * is necessary (and not just a fast-path optimization) since
	 * stop-machine can end up electing a random CPU to run
	 * update_cpu_topology(), and thus trick us into setting up incorrect
	 * cpu-node mappings (since 'updates' is kzalloc()'ed).
	 *
	 * For the same reason, we also skip all the subsequent updates below.
	 */
	if (!cpumask_weight(&updated_cpus))
		goto out;

	stop_machine(update_cpu_topology, &updates[0], &updated_cpus);

	/*
	 * Update the numa-cpu lookup table with the new mappings, even for
	 * offline CPUs. It is best to perform this update from the stop-
	 * machine context.
	 */
	stop_machine(update_lookup_table, &updates[0],
					cpumask_of(raw_smp_processor_id()));

	for (ud = &updates[0]; ud; ud = ud->next) {
		unregister_cpu_under_node(ud->cpu, ud->old_nid);
		register_cpu_under_node(ud->cpu, ud->new_nid);

		dev = get_cpu_device(ud->cpu);
		if (dev)
			kobject_uevent(&dev->kobj, KOBJ_CHANGE);
		cpumask_clear_cpu(ud->cpu, &cpu_associativity_changes_mask);
		changed = 1;
	}

out:
	kfree(updates);
	return changed;
}

static void topology_work_fn(struct work_struct *work)
{
	rebuild_sched_domains();
}
static DECLARE_WORK(topology_work, topology_work_fn);

static void topology_schedule_update(void)
{
	schedule_work(&topology_work);
}

static void topology_timer_fn(unsigned long ignored)
{
	if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask))
		topology_schedule_update();
	else if (vphn_enabled) {
		if (update_cpu_associativity_changes_mask() > 0)
			topology_schedule_update();
		reset_topology_timer();
	}
}
static struct timer_list topology_timer =
	TIMER_INITIALIZER(topology_timer_fn, 0, 0);

static void reset_topology_timer(void)
{
	topology_timer.data = 0;
	topology_timer.expires = jiffies + 60 * HZ;
	mod_timer(&topology_timer, topology_timer.expires);
}

#ifdef CONFIG_SMP

static void stage_topology_update(int core_id)
{
	cpumask_or(&cpu_associativity_changes_mask,
		&cpu_associativity_changes_mask, cpu_sibling_mask(core_id));
	reset_topology_timer();
}

static int dt_update_callback(struct notifier_block *nb,
				unsigned long action, void *data)
{
	struct of_reconfig_data *update = data;
	int rc = NOTIFY_DONE;

	switch (action) {
	case OF_RECONFIG_UPDATE_PROPERTY:
		if (!of_prop_cmp(update->dn->type, "cpu") &&
		    !of_prop_cmp(update->prop->name, "ibm,associativity")) {
			u32 core_id;

			of_property_read_u32(update->dn, "reg", &core_id);
			stage_topology_update(core_id);
			rc = NOTIFY_OK;
		}
		break;
	}

	return rc;
}

static struct notifier_block dt_update_nb = {
	.notifier_call = dt_update_callback,
};

#endif

/*
 * Start polling for associativity changes.
 */
int start_topology_update(void)
{
	int rc = 0;

	if (firmware_has_feature(FW_FEATURE_PRRN)) {
		if (!prrn_enabled) {
			prrn_enabled = 1;
			vphn_enabled = 0;
#ifdef CONFIG_SMP
			rc = of_reconfig_notifier_register(&dt_update_nb);
#endif
		}
	} else if (firmware_has_feature(FW_FEATURE_VPHN) &&
		   lppaca_shared_proc(get_lppaca())) {
		if (!vphn_enabled) {
			prrn_enabled = 0;
			vphn_enabled = 1;
			setup_cpu_associativity_change_counters();
			init_timer_deferrable(&topology_timer);
			reset_topology_timer();
		}
	}

	return rc;
}

/*
 * Disable polling for VPHN associativity changes.
 */
int stop_topology_update(void)
{
	int rc = 0;

	if (prrn_enabled) {
		prrn_enabled = 0;
#ifdef CONFIG_SMP
		rc = of_reconfig_notifier_unregister(&dt_update_nb);
#endif
	} else if (vphn_enabled) {
		vphn_enabled = 0;
		rc = del_timer_sync(&topology_timer);
	}

	return rc;
}

int prrn_is_enabled(void)
{
	return prrn_enabled;
}

static int topology_read(struct seq_file *file, void *v)
{
	if (vphn_enabled || prrn_enabled)
		seq_puts(file, "on\n");
	else
		seq_puts(file, "off\n");

	return 0;
}

static int topology_open(struct inode *inode, struct file *file)
{
	return single_open(file, topology_read, NULL);
}

static ssize_t topology_write(struct file *file, const char __user *buf,
			      size_t count, loff_t *off)
{
	char kbuf[4]; /* "on" or "off" plus null. */
	int read_len;

	read_len = count < 3 ? count : 3;
	if (copy_from_user(kbuf, buf, read_len))
		return -EINVAL;

	kbuf[read_len] = '\0';

	if (!strncmp(kbuf, "on", 2))
		start_topology_update();
	else if (!strncmp(kbuf, "off", 3))
		stop_topology_update();
	else
		return -EINVAL;

	return count;
}

static const struct file_operations topology_ops = {
	.read = seq_read,
	.write = topology_write,
	.open = topology_open,
	.release = single_release
};

static int topology_update_init(void)
{
	/* Do not poll for changes if disabled at boot */
	if (topology_updates_enabled)
		start_topology_update();

	if (!proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops))
		return -ENOMEM;

	return 0;
}
device_initcall(topology_update_init);
#endif /* CONFIG_PPC_SPLPAR */