/*
 * pSeries NUMA support
 *
 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/threads.h>
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/memblock.h>
#include <linux/of.h>
#include <linux/pfn.h>
#include <linux/cpuset.h>
#include <linux/node.h>
#include <asm/sparsemem.h>
#include <asm/prom.h>
#include <asm/system.h>
#include <asm/smp.h>
#include <asm/firmware.h>
#include <asm/paca.h>
#include <asm/hvcall.h>

static int numa_enabled = 1;

static char *cmdline __initdata;

static int numa_debug;
#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }

int numa_cpu_lookup_table[NR_CPUS];
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
struct pglist_data *node_data[MAX_NUMNODES];

EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(node_to_cpumask_map);
EXPORT_SYMBOL(node_data);

static int min_common_depth;
static int n_mem_addr_cells, n_mem_size_cells;
static int form1_affinity;

#define MAX_DISTANCE_REF_POINTS 4
static int distance_ref_points_depth;
static const unsigned int *distance_ref_points;
static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: node_to_cpumask() is not valid until after this is done.
 */
static void __init setup_node_to_cpumask_map(void)
{
	unsigned int node, num = 0;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES) {
		for_each_node_mask(node, node_possible_map)
			num = node;
		nr_node_ids = num + 1;
	}

	/* allocate the map */
	for (node = 0; node < nr_node_ids; node++)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	dbg("Node to cpumask map for %d nodes\n", nr_node_ids);
}

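/*
 * Split memory into additional fake NUMA nodes as the scan crosses each
 * boundary parsed (via memparse()) from the fake NUMA command line held
 * in cmdline.  Returns 1 and updates *nid when a new node is created,
 * 0 otherwise.
 */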
static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
						unsigned int *nid)
{
	unsigned long long mem;
	char *p = cmdline;
	static unsigned int fake_nid;
	static unsigned long long curr_boundary;

	/*
	 * Modify node id, iff we started creating NUMA nodes
	 * We want to continue from where we left off the last time
	 */
	if (fake_nid)
		*nid = fake_nid;
	/*
	 * In case there are no more arguments to parse, the
	 * node_id should be the same as the last fake node id
	 * (we've handled this above).
	 */
	if (!p)
		return 0;

	mem = memparse(p, &p);
	if (!mem)
		return 0;

	if (mem < curr_boundary)
		return 0;

	curr_boundary = mem;

	if ((end_pfn << PAGE_SHIFT) > mem) {
		/*
		 * Skip commas and spaces
		 */
		while (*p == ',' || *p == ' ' || *p == '\t')
			p++;

		cmdline = p;
		fake_nid++;
		*nid = fake_nid;
		dbg("created new fake_node with id %d\n", fake_nid);
		return 1;
	}
	return 0;
}

/*
 * get_active_region_work_fn - A helper function for get_node_active_region
 *	Returns datax set to the start_pfn and end_pfn if they contain
 *	the initial value of datax->start_pfn between them
 * @start_pfn: start page(inclusive) of region to check
 * @end_pfn: end page(exclusive) of region to check
 * @datax: comes in with ->start_pfn set to value to search for and
 *	goes out with active range if it contains it
 * Returns 1 if search value is in range else 0
 */
static int __init get_active_region_work_fn(unsigned long start_pfn,
					unsigned long end_pfn, void *datax)
{
	struct node_active_region *data;
	data = (struct node_active_region *)datax;

	if (start_pfn <= data->start_pfn && end_pfn > data->start_pfn) {
		data->start_pfn = start_pfn;
		data->end_pfn = end_pfn;
		return 1;
	}
	return 0;
}

/*
 * get_node_active_region - Return active region containing start_pfn
 * Active range returned is empty if none found.
 * @start_pfn: The page to return the region for.
 * @node_ar: Returned set to the active region containing start_pfn
 */
static void __init get_node_active_region(unsigned long start_pfn,
		       struct node_active_region *node_ar)
{
	int nid = early_pfn_to_nid(start_pfn);

	node_ar->nid = nid;
	node_ar->start_pfn = start_pfn;
	node_ar->end_pfn = start_pfn;
	work_with_active_regions(nid, get_active_region_work_fn, node_ar);
}

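/* Record cpu's node in the lookup table and set cpu in that node's cpumask. */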
static void map_cpu_to_node(int cpu, int node)
{
	numa_cpu_lookup_table[cpu] = node;

	dbg("adding cpu %d to node %d\n", cpu, node);

	if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node])))
		cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
}

#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
static void unmap_cpu_from_node(unsigned long cpu)
{
	int node = numa_cpu_lookup_table[cpu];

	dbg("removing cpu %lu from node %d\n", cpu, node);

	if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
		cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
	} else {
		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
		       cpu, node);
	}
}
#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */

/* must hold reference to node during call */
static const int *of_get_associativity(struct device_node *dev)
{
	return of_get_property(dev, "ibm,associativity", NULL);
}

/*
 * Returns the property linux,drconf-usable-memory if
 * it exists (the property exists only in kexec/kdump kernels,
 * added by kexec-tools)
 */
static const u32 *of_get_usable_memory(struct device_node *memory)
{
	const u32 *prop;
	u32 len;
	prop = of_get_property(memory, "linux,drconf-usable-memory", &len);
	if (!prop || len < sizeof(unsigned int))
		return NULL;
	return prop;
}

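/*
 * Node distance doubles for each associativity level, starting from the
 * most significant reference point, at which two nodes' entries differ;
 * the walk stops at the first level where they match.
 */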
int __node_distance(int a, int b)
{
	int i;
	int distance = LOCAL_DISTANCE;

	if (!form1_affinity)
		return distance;

	for (i = 0; i < distance_ref_points_depth; i++) {
		if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
			break;

		/* Double the distance for each NUMA level */
		distance *= 2;
	}

	return distance;
}

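/* Cache this node's associativity entry at each distance reference point. */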
static void initialize_distance_lookup_table(int nid,
		const unsigned int *associativity)
{
	int i;

	if (!form1_affinity)
		return;

	for (i = 0; i < distance_ref_points_depth; i++) {
		distance_lookup_table[nid][i] =
			associativity[distance_ref_points[i]];
	}
}

/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
 * info is found.
 */
static int associativity_to_nid(const unsigned int *associativity)
{
	int nid = -1;

	if (min_common_depth == -1)
		goto out;

	if (associativity[0] >= min_common_depth)
		nid = associativity[min_common_depth];

	/* POWER4 LPAR uses 0xffff as invalid node */
	if (nid == 0xffff || nid >= MAX_NUMNODES)
		nid = -1;

	if (nid > 0 && associativity[0] >= distance_ref_points_depth)
		initialize_distance_lookup_table(nid, associativity);

out:
	return nid;
}

/* Returns the nid associated with the given device tree node,
 * or -1 if not found.
 */
static int of_node_to_nid_single(struct device_node *device)
{
	int nid = -1;
	const unsigned int *tmp;

	tmp = of_get_associativity(device);
	if (tmp)
		nid = associativity_to_nid(tmp);
	return nid;
}

/* Walk the device tree upwards, looking for an associativity id */
int of_node_to_nid(struct device_node *device)
{
	struct device_node *tmp;
	int nid = -1;

	of_node_get(device);
	while (device) {
		nid = of_node_to_nid_single(device);
		if (nid != -1)
			break;

		tmp = device;
		device = of_get_parent(tmp);
		of_node_put(tmp);
	}
	of_node_put(device);

	return nid;
}
EXPORT_SYMBOL_GPL(of_node_to_nid);

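/*
 * Work out which level of the ibm,associativity property identifies a
 * NUMA domain, based on ibm,associativity-reference-points and the
 * affinity form negotiated with firmware.  Returns -1 on failure.
 */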
static int __init find_min_common_depth(void)
{
	int depth;
	struct device_node *rtas_root;
	struct device_node *chosen;
	const char *vec5;

	rtas_root = of_find_node_by_path("/rtas");

	if (!rtas_root)
		return -1;

	/*
	 * This property is a set of 32-bit integers, each representing
	 * an index into the ibm,associativity nodes.
	 *
	 * With form 0 affinity the first integer is for an SMP configuration
	 * (should be all 0's) and the second is for a normal NUMA
	 * configuration. We have only one level of NUMA.
	 *
	 * With form 1 affinity the first integer is the most significant
	 * NUMA boundary and the following are progressively less significant
	 * boundaries. There can be more than one level of NUMA.
	 */
	distance_ref_points = of_get_property(rtas_root,
					"ibm,associativity-reference-points",
					&distance_ref_points_depth);

	if (!distance_ref_points) {
		dbg("NUMA: ibm,associativity-reference-points not found.\n");
		goto err;
	}

	distance_ref_points_depth /= sizeof(int);

#define VEC5_AFFINITY_BYTE	5
#define VEC5_AFFINITY		0x80
	chosen = of_find_node_by_path("/chosen");
	if (chosen) {
		vec5 = of_get_property(chosen, "ibm,architecture-vec-5", NULL);
		if (vec5 && (vec5[VEC5_AFFINITY_BYTE] & VEC5_AFFINITY)) {
			dbg("Using form 1 affinity\n");
			form1_affinity = 1;
		}
	}

	if (form1_affinity) {
		depth = distance_ref_points[0];
	} else {
		if (distance_ref_points_depth < 2) {
			printk(KERN_WARNING "NUMA: "
				"short ibm,associativity-reference-points\n");
			goto err;
		}

		depth = distance_ref_points[1];
	}

	/*
	 * Warn and cap if the hardware supports more than
	 * MAX_DISTANCE_REF_POINTS domains.
	 */
	if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
		printk(KERN_WARNING "NUMA: distance array capped at "
			"%d entries\n", MAX_DISTANCE_REF_POINTS);
		distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
	}

	of_node_put(rtas_root);
	return depth;

err:
	of_node_put(rtas_root);
	return -1;
}

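/*
 * Read the #address-cells and #size-cells values that apply to memory
 * nodes, for use when decoding "reg"-style properties.
 */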
static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
{
	struct device_node *memory = NULL;

	memory = of_find_node_by_type(memory, "memory");
	if (!memory)
		panic("numa.c: No memory nodes found!");

	*n_addr_cells = of_n_addr_cells(memory);
	*n_size_cells = of_n_size_cells(memory);
	of_node_put(memory);
}

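/*
 * Combine n 32-bit device tree cells into one unsigned long, most
 * significant cell first, advancing *buf past the cells consumed.
 */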
static unsigned long __devinit read_n_cells(int n, const unsigned int **buf)
{
	unsigned long result = 0;

	while (n--) {
		result = (result << 32) | **buf;
		(*buf)++;
	}
	return result;
}

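/* Layout of one ibm,dynamic-memory property entry; see read_drconf_cell(). */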
struct of_drconf_cell {
	u64	base_addr;
	u32	drc_index;
	u32	reserved;
	u32	aa_index;
	u32	flags;
};

#define DRCONF_MEM_ASSIGNED	0x00000008
#define DRCONF_MEM_AI_INVALID	0x00000040
#define DRCONF_MEM_RESERVED	0x00000080

/*
 * Read the next memblock list entry from the ibm,dynamic-memory property
 * and return the information in the provided of_drconf_cell structure.
 */
static void read_drconf_cell(struct of_drconf_cell *drmem, const u32 **cellp)
{
	const u32 *cp;

	drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp);

	cp = *cellp;
	drmem->drc_index = cp[0];
	drmem->reserved = cp[1];
	drmem->aa_index = cp[2];
	drmem->flags = cp[3];

	*cellp = cp + 4;
}

/*
 * Retrieve and validate the ibm,dynamic-memory property of the device tree.
 *
 * The layout of the ibm,dynamic-memory property is a number N of memblock
 * list entries followed by N memblock list entries.  Each memblock list entry
 * contains information as laid out in the of_drconf_cell struct above.
 */
static int of_get_drconf_memory(struct device_node *memory, const u32 **dm)
{
	const u32 *prop;
	u32 len, entries;

	prop = of_get_property(memory, "ibm,dynamic-memory", &len);
	if (!prop || len < sizeof(unsigned int))
		return 0;

	entries = *prop++;

	/* Now that we know the number of entries, revalidate the size
	 * of the property read in to ensure we have everything
	 */
	if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int))
		return 0;

	*dm = prop;
	return entries;
}

/*
 * Retrieve and validate the ibm,lmb-size property for drconf memory
 * from the device tree.
 */
static u64 of_get_lmb_size(struct device_node *memory)
{
	const u32 *prop;
	u32 len;

	prop = of_get_property(memory, "ibm,lmb-size", &len);
	if (!prop || len < sizeof(unsigned int))
		return 0;

	return read_n_cells(n_mem_size_cells, &prop);
}

struct assoc_arrays {
	u32	n_arrays;
	u32	array_sz;
	const u32 *arrays;
};

/*
 * Retrieve and validate the list of associativity arrays for drconf
 * memory from the ibm,associativity-lookup-arrays property of the
 * device tree.
 *
 * The layout of the ibm,associativity-lookup-arrays property is a number N
 * indicating the number of associativity arrays, followed by a number M
 * indicating the size of each associativity array, followed by a list
 * of N associativity arrays.
 */
static int of_get_assoc_arrays(struct device_node *memory,
			       struct assoc_arrays *aa)
{
	const u32 *prop;
	u32 len;

	prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
	if (!prop || len < 2 * sizeof(unsigned int))
		return -1;

	aa->n_arrays = *prop++;
	aa->array_sz = *prop++;

	/* Now that we know the number of arrays and size of each array,
	 * revalidate the size of the property read in.
	 */
	if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
		return -1;

	aa->arrays = prop;
	return 0;
}

/*
 * This is like of_node_to_nid_single() for memory represented in the
 * ibm,dynamic-reconfiguration-memory node.
 */
static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
				   struct assoc_arrays *aa)
{
	int default_nid = 0;
	int nid = default_nid;
	int index;

	if (min_common_depth > 0 && min_common_depth <= aa->array_sz &&
	    !(drmem->flags & DRCONF_MEM_AI_INVALID) &&
	    drmem->aa_index < aa->n_arrays) {
		index = drmem->aa_index * aa->array_sz + min_common_depth - 1;
		nid = aa->arrays[index];

		if (nid == 0xffff || nid >= MAX_NUMNODES)
			nid = default_nid;
	}

	return nid;
}

/*
 * Figure out to which domain a cpu belongs and stick it there.
 * Return the id of the domain used.
 */
static int __cpuinit numa_setup_cpu(unsigned long lcpu)
{
	int nid = 0;
	struct device_node *cpu = of_get_cpu_node(lcpu, NULL);

	if (!cpu) {
		WARN_ON(1);
		goto out;
	}

	nid = of_node_to_nid_single(cpu);

	if (nid < 0 || !node_online(nid))
		nid = first_online_node;
out:
	map_cpu_to_node(lcpu, nid);

	of_node_put(cpu);

	return nid;
}

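/*
 * CPU hotplug notifier: map a cpu to its node as it is prepared for
 * bring-up and unmap it when it dies or bring-up is cancelled.
 */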
static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
			     unsigned long action,
			     void *hcpu)
{
	unsigned long lcpu = (unsigned long)hcpu;
	int ret = NOTIFY_DONE;

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		numa_setup_cpu(lcpu);
		ret = NOTIFY_OK;
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
		unmap_cpu_from_node(lcpu);
		ret = NOTIFY_OK;
		break;
#endif
	}
	return ret;
}

/*
 * Check and possibly modify a memory region to enforce the memory limit.
 *
 * Returns the size the region should have to enforce the memory limit.
 * This will either be the original value of size, a truncated value,
 * or zero. If the returned value of size is 0 the region should be
 * discarded as it lies wholly above the memory limit.
 */
static unsigned long __init numa_enforce_memory_limit(unsigned long start,
						      unsigned long size)
{
	/*
	 * We use memblock_end_of_DRAM() in here instead of memory_limit because
	 * we've already adjusted it for the limit and it takes care of
	 * having memory holes below the limit.  Also, in the case of
	 * iommu_is_off, memory_limit is not set but is implicitly enforced.
	 */

	if (start + size <= memblock_end_of_DRAM())
		return size;

	if (start >= memblock_end_of_DRAM())
		return 0;

	return memblock_end_of_DRAM() - start;
}

/*
 * Reads the counter for a given entry in
 * linux,drconf-usable-memory property
 */
static inline int __init read_usm_ranges(const u32 **usm)
{
	/*
	 * For each lmb in ibm,dynamic-memory a corresponding
	 * entry in linux,drconf-usable-memory property contains
	 * a counter followed by that many (base, size) pairs.
	 * Read the counter from linux,drconf-usable-memory.
	 */
	return read_n_cells(n_mem_size_cells, usm);
}

/*
 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
 * node.  This assumes n_mem_{addr,size}_cells have been set.
 */
static void __init parse_drconf_memory(struct device_node *memory)
{
	const u32 *dm, *usm;
	unsigned int n, rc, ranges, is_kexec_kdump = 0;
	unsigned long lmb_size, base, size, sz;
	int nid;
	struct assoc_arrays aa;

	n = of_get_drconf_memory(memory, &dm);
	if (!n)
		return;

	lmb_size = of_get_lmb_size(memory);
	if (!lmb_size)
		return;

	rc = of_get_assoc_arrays(memory, &aa);
	if (rc)
		return;

	/* check if this is a kexec/kdump kernel */
	usm = of_get_usable_memory(memory);
	if (usm != NULL)
		is_kexec_kdump = 1;

	for (; n != 0; --n) {
		struct of_drconf_cell drmem;

		read_drconf_cell(&drmem, &dm);

		/* skip this block if the reserved bit is set in flags (0x80)
		   or if the block is not assigned to this partition (0x8) */
		if ((drmem.flags & DRCONF_MEM_RESERVED)
		    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
			continue;

		base = drmem.base_addr;
		size = lmb_size;
		ranges = 1;

		if (is_kexec_kdump) {
			ranges = read_usm_ranges(&usm);
			if (!ranges) /* there are no (base, size) pairs */
				continue;
		}
		do {
			if (is_kexec_kdump) {
				base = read_n_cells(n_mem_addr_cells, &usm);
				size = read_n_cells(n_mem_size_cells, &usm);
			}
			nid = of_drconf_to_nid_single(&drmem, &aa);
			fake_numa_create_new_node(
				((base + size) >> PAGE_SHIFT),
					   &nid);
			node_set_online(nid);
			sz = numa_enforce_memory_limit(base, size);
			if (sz)
				add_active_range(nid, base >> PAGE_SHIFT,
						 (base >> PAGE_SHIFT)
						 + (sz >> PAGE_SHIFT));
		} while (--ranges);
	}
}

static int __init parse_numa_properties(void)
{
	struct device_node *cpu = NULL;
	struct device_node *memory = NULL;
	int default_nid = 0;
	unsigned long i;

	if (numa_enabled == 0) {
		printk(KERN_WARNING "NUMA disabled by user\n");
		return -1;
	}

	min_common_depth = find_min_common_depth();

	if (min_common_depth < 0)
		return min_common_depth;

	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);

	/*
	 * Even though we connect cpus to numa domains later in SMP
	 * init, we need to know the node ids now. This is because
	 * each node to be onlined must have NODE_DATA etc backing it.
	 */
	for_each_present_cpu(i) {
		int nid;

		cpu = of_get_cpu_node(i, NULL);
		BUG_ON(!cpu);
		nid = of_node_to_nid_single(cpu);
		of_node_put(cpu);

		/*
		 * Don't fall back to default_nid yet -- we will plug
		 * cpus into nodes once the memory scan has discovered
		 * the topology.
		 */
		if (nid < 0)
			continue;
		node_set_online(nid);
	}

	get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
	memory = NULL;
	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
		unsigned long start;
		unsigned long size;
		int nid;
		int ranges;
		const unsigned int *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory,
			"linux,usable-memory", &len);
		if (!memcell_buf || len <= 0)
			memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
new_range:
		/* these are order-sensitive, and modify the buffer pointer */
		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
		size = read_n_cells(n_mem_size_cells, &memcell_buf);

		/*
		 * Assumption: either all memory nodes or none will
		 * have associativity properties.  If none, then
		 * everything goes to default_nid.
		 */
		nid = of_node_to_nid_single(memory);
		if (nid < 0)
			nid = default_nid;

		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
		node_set_online(nid);

		if (!(size = numa_enforce_memory_limit(start, size))) {
			if (--ranges)
				goto new_range;
			else
				continue;
		}

		add_active_range(nid, start >> PAGE_SHIFT,
				(start >> PAGE_SHIFT) + (size >> PAGE_SHIFT));

		if (--ranges)
			goto new_range;
	}

	/*
	 * Now do the same thing for each MEMBLOCK listed in the ibm,dynamic-memory
	 * property in the ibm,dynamic-reconfiguration-memory node.
	 */
	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory)
		parse_drconf_memory(memory);

	return 0;
}

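/* With no usable NUMA information, place all memory on node 0 (or on fake nodes if requested). */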
static void __init setup_nonnuma(void)
{
	unsigned long top_of_ram = memblock_end_of_DRAM();
	unsigned long total_ram = memblock_phys_mem_size();
	unsigned long start_pfn, end_pfn;
	unsigned int nid = 0;
	struct memblock_region *reg;

	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
	       top_of_ram, total_ram);
	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
	       (top_of_ram - total_ram) >> 20);

	for_each_memblock(memory, reg) {
		start_pfn = memblock_region_memory_base_pfn(reg);
		end_pfn = memblock_region_memory_end_pfn(reg);

		fake_numa_create_new_node(end_pfn, &nid);
		add_active_range(nid, start_pfn, end_pfn);
		node_set_online(nid);
	}
}

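/* Print each online node and the ranges of cpu ids mapped to it. */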
void __init dump_numa_cpu_topology(void)
{
	unsigned int node;
	unsigned int cpu, count;

	if (min_common_depth == -1 || !numa_enabled)
		return;

	for_each_online_node(node) {
		printk(KERN_DEBUG "Node %d CPUs:", node);

		count = 0;
		/*
		 * If we used a CPU iterator here we would miss printing
		 * the holes in the cpumap.
		 */
		for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
			if (cpumask_test_cpu(cpu,
					node_to_cpumask_map[node])) {
				if (count == 0)
					printk(" %u", cpu);
				++count;
			} else {
				if (count > 1)
					printk("-%u", cpu - 1);
				count = 0;
			}
		}

		if (count > 1)
			printk("-%u", nr_cpu_ids - 1);
		printk("\n");
	}
}

static void __init dump_numa_memory_topology(void)
{
	unsigned int node;
	unsigned int count;

	if (min_common_depth == -1 || !numa_enabled)
		return;

	for_each_online_node(node) {
		unsigned long i;

		printk(KERN_DEBUG "Node %d Memory:", node);

		count = 0;

		for (i = 0; i < memblock_end_of_DRAM();
		     i += (1 << SECTION_SIZE_BITS)) {
			if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) {
				if (count == 0)
					printk(" 0x%lx", i);
				++count;
			} else {
				if (count > 0)
					printk("-0x%lx", i);
				count = 0;
			}
		}

		if (count > 0)
			printk("-0x%lx", i);
		printk("\n");
	}
}

/*
 * Allocate some memory, satisfying the memblock or bootmem allocator where
 * required. nid is the preferred node and end is the physical address of
 * the highest address in the node.
 *
 * Returns the virtual address of the memory.
 */
static void __init *careful_zallocation(int nid, unsigned long size,
				       unsigned long align,
				       unsigned long end_pfn)
{
	void *ret;
	int new_nid;
	unsigned long ret_paddr;

	ret_paddr = __memblock_alloc_base(size, align, end_pfn << PAGE_SHIFT);

	/* retry over all memory */
	if (!ret_paddr)
		ret_paddr = __memblock_alloc_base(size, align, memblock_end_of_DRAM());

	if (!ret_paddr)
		panic("numa.c: cannot allocate %lu bytes for node %d",
		      size, nid);

	ret = __va(ret_paddr);

	/*
	 * We initialize the nodes in numeric order: 0, 1, 2...
	 * and hand over control from the MEMBLOCK allocator to the
	 * bootmem allocator.  If this function is called for
	 * node 5, then we know that all nodes <5 are using the
	 * bootmem allocator instead of the MEMBLOCK allocator.
	 *
	 * So, check the nid from which this allocation came
	 * and double check to see if we need to use bootmem
	 * instead of the MEMBLOCK.  We don't free the MEMBLOCK memory
	 * since it would be useless.
	 */
	new_nid = early_pfn_to_nid(ret_paddr >> PAGE_SHIFT);
	if (new_nid < nid) {
		ret = __alloc_bootmem_node(NODE_DATA(new_nid),
				size, align, 0);

		dbg("alloc_bootmem %p %lx\n", ret, size);
	}

	memset(ret, 0, size);
	return ret;
}

static struct notifier_block __cpuinitdata ppc64_numa_nb = {
	.notifier_call = cpu_numa_callback,
	.priority = 1 /* Must run before sched domains notifier. */
};

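/*
 * Clip each memblock.reserved region to this node's active ranges so
 * the overlapping portions can be reserved for the node.
 */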
static void mark_reserved_regions_for_nid(int nid)
{
	struct pglist_data *node = NODE_DATA(nid);
	struct memblock_region *reg;

	for_each_memblock(reserved, reg) {
		unsigned long physbase = reg->base;
		unsigned long size = reg->size;
		unsigned long start_pfn = physbase >> PAGE_SHIFT;
		unsigned long end_pfn = PFN_UP(physbase + size);
		struct node_active_region node_ar;
		unsigned long node_end_pfn = node->node_start_pfn +
					     node->node_spanned_pages;

		/*
Yinghai Lu's avatar
Yinghai Lu committed
977
		 * Check to make sure that this memblock.reserved area is
		 * within the bounds of the node that we care about.
		 * Checking the nid of the start and end points is not
		 * sufficient because the reserved area could span the
		 * entire node.
		 */
		if (end_pfn <= node->node_start_pfn ||
		    start_pfn >= node_end_pfn)
			continue;

		get_node_active_region(start_pfn, &node_ar);
		while (start_pfn < end_pfn &&
			node_ar.start_pfn < node_ar.end_pfn) {
			unsigned long reserve_size = size;
			/*
			 * if reserved region extends past active region
			 * then trim size to active region
			 */
			if (end_pfn > node_ar.end_pfn)
				reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
					- physbase;
			/*
			 * Only worry about *this* node, others may not
			 * yet have valid NODE_DATA().