/*
 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/threads.h>
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/lmb.h>
#include <asm/sparsemem.h>
#include <asm/prom.h>
#include <asm/system.h>
#include <asm/smp.h>
static int numa_enabled = 1;

static char *cmdline __initdata;

static int numa_debug;
#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }
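
/*
 * numa_cpu_lookup_table maps each logical cpu to its node id, and
 * numa_cpumask_lookup_table holds the reverse mapping: the mask of
 * cpus that belong to each node.  Both are exported for other users
 * of the NUMA topology.
 */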
int numa_cpu_lookup_table[NR_CPUS];
cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
struct pglist_data *node_data[MAX_NUMNODES];

EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(numa_cpumask_lookup_table);
EXPORT_SYMBOL(node_data);

static bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
static int min_common_depth;
static int n_mem_addr_cells, n_mem_size_cells;
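
/*
 * Split memory into artificial nodes at the boundaries passed on the
 * "numa=fake=" command line (comma separated sizes parsed with
 * memparse()).  For example, assuming region boundaries line up,
 * "numa=fake=1G,2G,4G" places memory below 1G in node 0, [1G,2G) in
 * node 1, [2G,4G) in node 2 and everything above 4G in node 3.  The
 * function is called with each region's end pfn and bumps *nid
 * whenever a region crosses the next unconsumed boundary.
 */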
static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
						unsigned int *nid)
{
	unsigned long long mem;
	char *p = cmdline;
	static unsigned int fake_nid;
	static unsigned long long curr_boundary;

	/*
	 * Modify node id, iff we started creating NUMA nodes
	 * We want to continue from where we left off the last time
	 */
	if (fake_nid)
		*nid = fake_nid;
	/*
	 * In case there are no more arguments to parse, the
	 * node_id should be the same as the last fake node id
	 * (we've handled this above).
	 */
	if (!p)
		return 0;

	mem = memparse(p, &p);
	if (!mem)
		return 0;

	if (mem < curr_boundary)
		return 0;

	curr_boundary = mem;

	if ((end_pfn << PAGE_SHIFT) > mem) {
		/*
		 * Skip commas and spaces
		 */
		while (*p == ',' || *p == ' ' || *p == '\t')
			p++;

		cmdline = p;
		fake_nid++;
		*nid = fake_nid;
		dbg("created new fake_node with id %d\n", fake_nid);
		return 1;
	}
	return 0;
}
static void __cpuinit map_cpu_to_node(int cpu, int node)
{
	numa_cpu_lookup_table[cpu] = node;

	dbg("adding cpu %d to node %d\n", cpu, node);

	if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node])))
		cpu_set(cpu, numa_cpumask_lookup_table[node]);
}
#ifdef CONFIG_HOTPLUG_CPU
static void unmap_cpu_from_node(unsigned long cpu)
{
	int node = numa_cpu_lookup_table[cpu];

	dbg("removing cpu %lu from node %d\n", cpu, node);

	if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
		cpu_clear(cpu, numa_cpumask_lookup_table[node]);
	} else {
		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
		       cpu, node);
	}
}
#endif /* CONFIG_HOTPLUG_CPU */
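
/*
 * Find the device tree node for a logical cpu by matching its hardware
 * id against either the ibm,ppc-interrupt-server#s property (one entry
 * per hardware thread) or, failing that, the "reg" property.  Returns
 * the node with a reference held, or NULL if there is no match.
 */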
static struct device_node * __cpuinit find_cpu_node(unsigned int cpu)
{
	unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
	struct device_node *cpu_node = NULL;
	const unsigned int *interrupt_server, *reg;
	int len;

	while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
		/* Try interrupt server first */
		interrupt_server = of_get_property(cpu_node,
					"ibm,ppc-interrupt-server#s", &len);

		len = len / sizeof(u32);

		if (interrupt_server && (len > 0)) {
			while (len--) {
				if (interrupt_server[len] == hw_cpuid)
					return cpu_node;
			}
		} else {
			reg = of_get_property(cpu_node, "reg", &len);
			if (reg && (len > 0) && (reg[0] == hw_cpuid))
				return cpu_node;
		}
	}

	return NULL;
}
/* must hold reference to node during call */
static const int *of_get_associativity(struct device_node *dev)
{
	return of_get_property(dev, "ibm,associativity", NULL);
}
/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
 * information is found.
 */
static int of_node_to_nid_single(struct device_node *device)
{
	int nid = -1;
	const unsigned int *tmp;

	if (min_common_depth == -1)
		goto out;

	tmp = of_get_associativity(device);
	if (!tmp)
		goto out;

	if (tmp[0] >= min_common_depth)
		nid = tmp[min_common_depth];

	/* POWER4 LPAR uses 0xffff as invalid node */
	if (nid == 0xffff || nid >= MAX_NUMNODES)
		nid = -1;
out:
	return nid;
}
/* Walk the device tree upwards, looking for an associativity id */
int of_node_to_nid(struct device_node *device)
{
	struct device_node *tmp;
	int nid = -1;

	of_node_get(device);
	while (device) {
		nid = of_node_to_nid_single(device);
		if (nid != -1)
			break;

		tmp = device;
		device = of_get_parent(tmp);
		of_node_put(tmp);
	}
	of_node_put(device);

	return nid;
}
EXPORT_SYMBOL_GPL(of_node_to_nid);
/*
 * In theory, the "ibm,associativity" property may contain multiple
 * associativity lists because a resource may be multiply connected
 * into the machine.  This resource then has different associativity
 * characteristics relative to its multiple connections.  We ignore
 * this for now.  We also assume that all cpu and memory sets have
 * their distances represented at a common level.  This won't be
 * true for hierarchical NUMA.
 *
 * In any case the ibm,associativity-reference-points should give
 * the correct depth for a normal NUMA system.
 *
 * - Dave Hansen <haveblue@us.ibm.com>
 */
static int __init find_min_common_depth(void)
{
	int depth;
	const unsigned int *ref_points;
	struct device_node *rtas_root;
	int len;

	rtas_root = of_find_node_by_path("/rtas");

	if (!rtas_root)
		return -1;

	/*
	 * this property is 2 32-bit integers, each representing a level of
	 * depth in the associativity nodes.  The first is for an SMP
	 * configuration (should be all 0's) and the second is for a normal
	 * NUMA configuration.
	 */
	ref_points = of_get_property(rtas_root,
			"ibm,associativity-reference-points", &len);

	if ((len >= 1) && ref_points) {
		depth = ref_points[1];
	} else {
		dbg("NUMA: ibm,associativity-reference-points not found.\n");
		depth = -1;
	}
	of_node_put(rtas_root);

	return depth;
}
static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
{
	struct device_node *memory = NULL;

	memory = of_find_node_by_type(memory, "memory");
	if (!memory)
		panic("numa.c: No memory nodes found!");

	*n_addr_cells = of_n_addr_cells(memory);
	*n_size_cells = of_n_size_cells(memory);
	of_node_put(memory);
}
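
/*
 * Read an n-cell value from *buf, most significant cell first, and
 * advance the buffer pointer past the cells consumed.
 */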
static unsigned long __devinit read_n_cells(int n, const unsigned int **buf)
{
	unsigned long result = 0;

	while (n--) {
		result = (result << 32) | **buf;
		(*buf)++;
	}
	return result;
}
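
/*
 * Typical use, mirroring the "reg" parsing further down:
 *	base = read_n_cells(n_mem_addr_cells, &memcell_buf);
 *	size = read_n_cells(n_mem_size_cells, &memcell_buf);
 * consumes one (base, size) pair and leaves memcell_buf at the next.
 */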
/*
 * Figure out to which domain a cpu belongs and stick it there.
 * Return the id of the domain used.
 */
static int __cpuinit numa_setup_cpu(unsigned long lcpu)
{
	int nid = 0;
	struct device_node *cpu = find_cpu_node(lcpu);

	if (!cpu) {
		WARN_ON(1);
		goto out;
	}

	nid = of_node_to_nid_single(cpu);

	if (nid < 0 || !node_online(nid))
		nid = any_online_node(NODE_MASK_ALL);
out:
	map_cpu_to_node(lcpu, nid);

	of_node_put(cpu);

	return nid;
}
static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
			     unsigned long action,
			     void *hcpu)
{
	unsigned long lcpu = (unsigned long)hcpu;
	int ret = NOTIFY_DONE;

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		numa_setup_cpu(lcpu);
		ret = NOTIFY_OK;
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
		unmap_cpu_from_node(lcpu);
		ret = NOTIFY_OK;
		break;
#endif
	}
	return ret;
}
/*
 * Check and possibly modify a memory region to enforce the memory limit.
 *
 * Returns the size the region should have to enforce the memory limit.
 * This will either be the original value of size, a truncated value,
 * or zero. If the returned value of size is 0 the region should be
 * discarded as it lies wholly above the memory limit.
 */
static unsigned long __init numa_enforce_memory_limit(unsigned long start,
						      unsigned long size)
{
	/*
	 * We use lmb_end_of_DRAM() in here instead of memory_limit because
	 * we've already adjusted it for the limit and it takes care of
	 * having memory holes below the limit.
	 */

	if (start + size <= lmb_end_of_DRAM())
		return size;

	if (start >= lmb_end_of_DRAM())
		return 0;

	return lmb_end_of_DRAM() - start;
}
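
/*
 * As the checks below assume, ibm,dynamic-memory starts with a cell
 * holding the number of LMBs, followed by one entry per LMB made up of
 * the LMB start address (n_mem_addr_cells cells) plus four more cells,
 * the last two being the associativity index and the flags word.
 * ibm,associativity-lookup-arrays starts with the number of arrays and
 * the length of each, followed by the arrays themselves.
 */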
/*
 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
 * node.  This assumes n_mem_{addr,size}_cells have been set.
 */
static void __init parse_drconf_memory(struct device_node *memory)
{
	const unsigned int *lm, *dm, *aa;
	unsigned int ls, ld, la;
	unsigned int n, aam, aalen;
	unsigned long lmb_size, size, start;
	int nid, default_nid = 0;
	unsigned int ai, flags;

	lm = of_get_property(memory, "ibm,lmb-size", &ls);
	dm = of_get_property(memory, "ibm,dynamic-memory", &ld);
	aa = of_get_property(memory, "ibm,associativity-lookup-arrays", &la);
	if (!lm || !dm || !aa ||
	    ls < sizeof(unsigned int) || ld < sizeof(unsigned int) ||
	    la < 2 * sizeof(unsigned int))
		return;

	lmb_size = read_n_cells(n_mem_size_cells, &lm);
	n = *dm++;		/* number of LMBs */
	aam = *aa++;		/* number of associativity lists */
	aalen = *aa++;		/* length of each associativity list */
	if (ld < (n * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int) ||
	    la < (aam * aalen + 2) * sizeof(unsigned int))
		return;

	for (; n != 0; --n) {
		start = read_n_cells(n_mem_addr_cells, &dm);
		ai = dm[2];
		flags = dm[3];
		dm += 4;
		/* 0x80 == reserved, 0x8 = assigned to us */
		if ((flags & 0x80) || !(flags & 0x8))
			continue;
		nid = default_nid;
		/* flags & 0x40 means associativity index is invalid */
		if (min_common_depth > 0 && min_common_depth <= aalen &&
		    (flags & 0x40) == 0 && ai < aam) {
			/* this is like of_node_to_nid_single */
			nid = aa[ai * aalen + min_common_depth - 1];
			if (nid == 0xffff || nid >= MAX_NUMNODES)
				nid = default_nid;
		}

		fake_numa_create_new_node(((start + lmb_size) >> PAGE_SHIFT),
						&nid);
		node_set_online(nid);

		size = numa_enforce_memory_limit(start, lmb_size);
		if (!size)
			continue;

		add_active_range(nid, start >> PAGE_SHIFT,
				 (start >> PAGE_SHIFT) + (size >> PAGE_SHIFT));
	}
}
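
/*
 * Walk the device tree: derive min_common_depth, online a node for
 * every present cpu, then assign each "memory" node's (base, size)
 * ranges (and any drconf LMBs) to their nodes.  Returns non-zero if
 * NUMA is disabled or undiscoverable, in which case the caller falls
 * back to setup_nonnuma().
 */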
static int __init parse_numa_properties(void)
{
	struct device_node *cpu = NULL;
	struct device_node *memory = NULL;
	int default_nid = 0;
	unsigned long i;

	if (numa_enabled == 0) {
		printk(KERN_WARNING "NUMA disabled by user\n");
		return -1;
	}

	min_common_depth = find_min_common_depth();

	if (min_common_depth < 0)
		return min_common_depth;

	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);

	/*
	 * Even though we connect cpus to numa domains later in SMP
	 * init, we need to know the node ids now. This is because
	 * each node to be onlined must have NODE_DATA etc backing it.
	 */
	for_each_present_cpu(i) {
		int nid;

		cpu = find_cpu_node(i);
		BUG_ON(!cpu);
		nid = of_node_to_nid_single(cpu);
		of_node_put(cpu);

		/*
		 * Don't fall back to default_nid yet -- we will plug
		 * cpus into nodes once the memory scan has discovered
		 * the topology.
		 */
		if (nid < 0)
			continue;
		node_set_online(nid);
	}

	get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
	memory = NULL;
	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
		unsigned long start;
		unsigned long size;
		int nid;
		int ranges;
		const unsigned int *memcell_buf;
		int len;

		memcell_buf = of_get_property(memory,
			"linux,usable-memory", &len);
		if (!memcell_buf || len <= 0)
			memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
new_range:
		/* these are order-sensitive, and modify the buffer pointer */
		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
		size = read_n_cells(n_mem_size_cells, &memcell_buf);

		/*
		 * Assumption: either all memory nodes or none will
		 * have associativity properties.  If none, then
		 * everything goes to default_nid.
		 */
		nid = of_node_to_nid_single(memory);
		if (nid < 0)
			nid = default_nid;

		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
		node_set_online(nid);

		if (!(size = numa_enforce_memory_limit(start, size))) {
			if (--ranges)
				goto new_range;
			else
				continue;
		}

		add_active_range(nid, start >> PAGE_SHIFT,
				(start >> PAGE_SHIFT) + (size >> PAGE_SHIFT));

		if (--ranges)
			goto new_range;
	}

	/*
	 * Now do the same thing for each LMB listed in the ibm,dynamic-memory
	 * property in the ibm,dynamic-reconfiguration-memory node.
	 */
	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory)
		parse_drconf_memory(memory);

	return 0;
}
static void __init setup_nonnuma(void)
{
	unsigned long top_of_ram = lmb_end_of_DRAM();
	unsigned long total_ram = lmb_phys_mem_size();
	unsigned long start_pfn, end_pfn;
	unsigned int i, nid = 0;

	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
	       top_of_ram, total_ram);
	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
	       (top_of_ram - total_ram) >> 20);

	for (i = 0; i < lmb.memory.cnt; ++i) {
		start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT;
		end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i);

		fake_numa_create_new_node(end_pfn, &nid);
		add_active_range(nid, start_pfn, end_pfn);
		node_set_online(nid);
	}
}
void __init dump_numa_cpu_topology(void)
{
	unsigned int node;
	unsigned int cpu, count;

	if (min_common_depth == -1 || !numa_enabled)
		return;

	for_each_online_node(node) {
		printk(KERN_DEBUG "Node %d CPUs:", node);

		count = 0;
		/*
		 * If we used a CPU iterator here we would miss printing
		 * the holes in the cpumap.
		 */
		for (cpu = 0; cpu < NR_CPUS; cpu++) {
			if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
				if (count == 0)
					printk(" %u", cpu);
				++count;
			} else {
				if (count > 1)
					printk("-%u", cpu - 1);
				count = 0;
			}
		}

		if (count > 1)
			printk("-%u", NR_CPUS - 1);
		printk("\n");
	}
}
static void __init dump_numa_memory_topology(void)
{
	unsigned int node;
	unsigned int count;

	if (min_common_depth == -1 || !numa_enabled)
		return;

	for_each_online_node(node) {
		unsigned long i;

		printk(KERN_DEBUG "Node %d Memory:", node);

		count = 0;

		for (i = 0; i < lmb_end_of_DRAM();
		     i += (1 << SECTION_SIZE_BITS)) {
			if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) {
				if (count == 0)
					printk(" 0x%lx", i);
				++count;
			} else {
				if (count > 0)
					printk("-0x%lx", i);
				count = 0;
			}
		}

		if (count > 0)
			printk("-0x%lx", i);
		printk("\n");
	}
}
/*
 * Allocate some memory, satisfying the lmb or bootmem allocator where
 * required. nid is the preferred node and end is the physical address of
 * the highest address in the node.
 *
 * Returns the physical address of the memory.
 */
static void __init *careful_allocation(int nid, unsigned long size,
				       unsigned long align,
				       unsigned long end_pfn)
{
	int new_nid;
	unsigned long ret = __lmb_alloc_base(size, align, end_pfn << PAGE_SHIFT);

	/* retry over all memory */
	if (!ret)
		ret = __lmb_alloc_base(size, align, lmb_end_of_DRAM());

	if (!ret)
		panic("numa.c: cannot allocate %lu bytes on node %d",
		      size, nid);

	/*
	 * If the memory came from a previously allocated node, we must
	 * retry with the bootmem allocator.
	 */
	new_nid = early_pfn_to_nid(ret >> PAGE_SHIFT);
	if (new_nid < nid) {
		ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(new_nid),
				size, align, 0);

		if (!ret)
			panic("numa.c: cannot allocate %lu bytes on node %d",
			      size, new_nid);

		ret = __pa(ret);

		dbg("alloc_bootmem %lx %lx\n", ret, size);
	}

	return (void *)ret;
}
static struct notifier_block __cpuinitdata ppc64_numa_nb = {
	.notifier_call = cpu_numa_callback,
	.priority = 1 /* Must run before sched domains notifier. */
};
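
/*
 * Boot-time memory setup: discover the NUMA topology (or fall back to
 * a single node), register the cpu notifier, then for each online node
 * allocate its pglist_data and bootmem bitmap as node-locally as
 * possible, free its active ranges into bootmem and re-reserve any lmb
 * reserved regions that intersect the node.
 */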
void __init do_init_bootmem(void)
{
	int nid;
	unsigned int i;

	min_low_pfn = 0;
	max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
	max_pfn = max_low_pfn;

	if (parse_numa_properties())
		setup_nonnuma();
	else
		dump_numa_memory_topology();

	register_cpu_notifier(&ppc64_numa_nb);
	cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
			  (void *)(unsigned long)boot_cpuid);

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;
		unsigned long bootmem_paddr;
		unsigned long bootmap_pages;

		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);

		/* Allocate the node structure node local if possible */
		NODE_DATA(nid) = careful_allocation(nid,
					sizeof(struct pglist_data),
					SMP_CACHE_BYTES, end_pfn);
		NODE_DATA(nid) = __va(NODE_DATA(nid));
		memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));

		dbg("node %d\n", nid);
		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));

		NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
		NODE_DATA(nid)->node_start_pfn = start_pfn;
		NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;

		if (NODE_DATA(nid)->node_spanned_pages == 0)
			continue;

		dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT);
		dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT);

		bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
		bootmem_paddr = (unsigned long)careful_allocation(nid,
					bootmap_pages << PAGE_SHIFT,
					PAGE_SIZE, end_pfn);
		memset(__va(bootmem_paddr), 0, bootmap_pages << PAGE_SHIFT);

		dbg("bootmap_paddr = %lx\n", bootmem_paddr);

		init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
				  start_pfn, end_pfn);

		free_bootmem_with_active_regions(nid, end_pfn);

		/* Mark reserved regions on this node */
		for (i = 0; i < lmb.reserved.cnt; i++) {
			unsigned long physbase = lmb.reserved.region[i].base;
			unsigned long size = lmb.reserved.region[i].size;
			unsigned long start_paddr = start_pfn << PAGE_SHIFT;
			unsigned long end_paddr = end_pfn << PAGE_SHIFT;

			if (early_pfn_to_nid(physbase >> PAGE_SHIFT) != nid &&
			    early_pfn_to_nid((physbase+size-1) >> PAGE_SHIFT) != nid)
				continue;

			if (physbase < end_paddr &&
			    (physbase+size) > start_paddr) {
				/* overlaps */
				if (physbase < start_paddr) {
					size -= start_paddr - physbase;
					physbase = start_paddr;
				}

				if (size > end_paddr - physbase)
					size = end_paddr - physbase;

				dbg("reserve_bootmem %lx %lx\n", physbase,
				    size);
				reserve_bootmem_node(NODE_DATA(nid), physbase,
						     size, BOOTMEM_DEFAULT);
			}
		}

		sparse_memory_present_with_active_regions(nid);
	}
}
void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];
	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = lmb_end_of_DRAM() >> PAGE_SHIFT;
	free_area_init_nodes(max_zone_pfns);
}
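
/*
 * Handle the "numa=" early parameter: "off" disables NUMA, "debug"
 * enables the dbg() messages above, and "fake=<size>[,<size>...]"
 * hands the boundary list to fake_numa_create_new_node().
 */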
static int __init early_numa(char *p)
{
	if (!p)
		return 0;

	if (strstr(p, "off"))
		numa_enabled = 0;

	if (strstr(p, "debug"))
		numa_debug = 1;

	p = strstr(p, "fake=");
	if (p)
		cmdline = p + strlen("fake=");

	return 0;
}
early_param("numa", early_numa);
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Find the node associated with a hot added memory section.  Section
 * corresponds to a SPARSEMEM section, not an LMB.  It is assumed that
 * sections are fully contained within a single LMB.
 */
int hot_add_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory = NULL;
	nodemask_t nodes;
	int default_nid = any_online_node(NODE_MASK_ALL);
	int nid;

	if (!numa_enabled || (min_common_depth < 0))
		return default_nid;

	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
		unsigned long start, size;
		int ranges;
		const unsigned int *memcell_buf;
		int len;

		memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
ha_new_range:
		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
		size = read_n_cells(n_mem_size_cells, &memcell_buf);
		nid = of_node_to_nid_single(memory);

		/* Domains not present at boot default to 0 */
		if (nid < 0 || !node_online(nid))
			nid = default_nid;

		if ((scn_addr >= start) && (scn_addr < (start + size))) {
			of_node_put(memory);
			goto got_nid;
		}

		if (--ranges)		/* process all ranges in cell */
			goto ha_new_range;
	}
	BUG();	/* section address should be found above */
	return 0;

	/* Temporary code to ensure that returned node is not empty */
got_nid:
	if (!node_online(nid)) {
		nodes_setall(nodes);
		while (NODE_DATA(nid)->node_spanned_pages == 0) {
			node_clear(nid, nodes);
			nid = any_online_node(nodes);
		}
		node_set_online(nid);
	}
	return nid;
}
#endif /* CONFIG_MEMORY_HOTPLUG */