2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 #include <linux/kernel.h>
7 #include <linux/string.h>
8 #include <linux/init.h>
9 #include <linux/bootmem.h>
10 #include <linux/mmzone.h>
11 #include <linux/ctype.h>
12 #include <linux/module.h>
13 #include <linux/nodemask.h>
16 #include <asm/proto.h>
/*
 * File-scope NUMA tables. node_data, memnodemap, cpu_to_node and
 * node_to_cpumask are exported with EXPORT_SYMBOL further down in this file.
 */
/* Per-node pglist_data pointer, indexed by node id; set in setup_node_bootmem(). */
25 struct pglist_data
*node_data
[MAX_NUMNODES
];
/* Per-node bootmem state; setup_node_bootmem() points each node's ->bdata here. */
26 bootmem_data_t plat_node_bdata
[MAX_NUMNODES
];
/* Lookup table indexed by (addr >> shift); filled by compute_hash_shift(). */
29 u8 memnodemap
[NODEMAPSIZE
];
/* CPU -> node table; every entry starts out as NUMA_NO_NODE. */
31 unsigned char cpu_to_node
[NR_CPUS
] = { [0 ... NR_CPUS
-1] = NUMA_NO_NODE
};
/* Node -> CPU mask; bits are set in numa_init_array() and numa_add_cpu(). */
32 cpumask_t node_to_cpumask
[MAX_NUMNODES
];
/*
 * When non-zero, numa_initmem_init() skips the ACPI and K8 node scans.
 * NOTE(review): the statement that sets this is not visible in this listing.
 */
34 int numa_off __initdata
;
/*
 * Populate memnodemap so that (addr >> shift) indexes the node owning addr.
 *
 * nodes:    array of node ranges, each with .start and .end.
 * numnodes: number of entries in nodes[] to consider.
 *
 * Visible behaviour: memnodemap is reset to 0xff (the "slot unused" marker),
 * then each node whose start differs from its end is walked from .start in
 * steps of (1UL << shift). A slot already holding a different node index is
 * reported as a conflict; otherwise the slot is set to the node index.
 * On the trailing path the whole map is cleared to 0.
 *
 * NOTE(review): this listing has lost lines: the local declarations, the
 * code that chooses and advances shift, the inner loop's end condition and
 * the return statements. numa_emulation() stores the result in memnode_shift
 * and treats a negative value as failure; confirm the rest against the
 * complete file.
 */
36 int __init
compute_hash_shift(struct node
*nodes
, int numnodes
)
42 /* When in doubt use brute force. */
/* 0xff marks a slot that no node has claimed yet. */
44 memset(memnodemap
,0xff,sizeof(*memnodemap
) * NODEMAPSIZE
);
45 for (i
= 0; i
< numnodes
; i
++) {
/* start == end: node spans no memory (the action taken is not visible here). */
46 if (nodes
[i
].start
== nodes
[i
].end
)
48 for (addr
= nodes
[i
].start
;
50 addr
+= (1UL << shift
)) {
/* Slot already claimed by a different node: this shift cannot separate them. */
51 if (memnodemap
[addr
>> shift
] != 0xff &&
52 memnodemap
[addr
>> shift
] != i
) {
54 "node %d shift %d addr %Lx conflict %d\n",
55 i
, shift
, addr
, memnodemap
[addr
>>shift
]);
58 memnodemap
[addr
>> shift
] = i
;
/* Trailing path: wipe the map to all zeroes. */
65 memset(memnodemap
,0,sizeof(*memnodemap
) * NODEMAPSIZE
);
69 #ifdef CONFIG_SPARSEMEM
/*
 * Return the node id for page frame number pfn: the pfn is converted to a
 * physical address (pfn << PAGE_SHIFT) and handed to phys_to_nid().
 * Only compiled when CONFIG_SPARSEMEM is defined.
 */
70 int early_pfn_to_nid(unsigned long pfn
)
72 return phys_to_nid(pfn
<< PAGE_SHIFT
);
76 /* Initialize bootmem allocator for a node */
/*
 * nodeid: node to set up.
 * start:  first physical address of the node (rounded up to ZONE_ALIGN here).
 * end:    physical end address of the node.
 *
 * Visible steps: register the pfn range with memory_present(); find room for
 * the node's pg_data_t via find_e820_area() and zero it; record the bootmem
 * data pointer, start pfn and spanned pages; find room for the bootmem bitmap
 * after the pg_data_t; initialise the node's bootmem allocator; release the
 * e820 memory of the range; reserve the pg_data_t and bitmap areas again; and
 * finally mark the node online. Either find_e820_area() failure panics.
 *
 * NOTE(review): the listing has lost lines, including the remaining
 * arguments of the init_bootmem_node() call.
 */
77 void __init
setup_node_bootmem(int nodeid
, unsigned long start
, unsigned long end
)
79 unsigned long start_pfn
, end_pfn
, bootmap_pages
, bootmap_size
, bootmap_start
;
80 unsigned long nodedata_phys
;
/* Size of the pg_data_t rounded up to whole pages. */
81 const int pgdat_size
= round_up(sizeof(pg_data_t
), PAGE_SIZE
);
83 start
= round_up(start
, ZONE_ALIGN
);
85 printk("Bootmem setup node %d %016lx-%016lx\n", nodeid
, start
, end
);
87 start_pfn
= start
>> PAGE_SHIFT
;
88 end_pfn
= end
>> PAGE_SHIFT
;
90 memory_present(nodeid
, start_pfn
, end_pfn
);
/* Look for pgdat_size bytes for the pg_data_t inside [start, end). */
91 nodedata_phys
= find_e820_area(start
, end
, pgdat_size
);
/* -1L is the value tested for as "no area found". */
92 if (nodedata_phys
== -1L)
93 panic("Cannot find memory pgdat in node %d\n", nodeid
);
95 Dprintk("nodedata_phys %lx\n", nodedata_phys
);
97 node_data
[nodeid
] = phys_to_virt(nodedata_phys
);
98 memset(NODE_DATA(nodeid
), 0, sizeof(pg_data_t
));
99 NODE_DATA(nodeid
)->bdata
= &plat_node_bdata
[nodeid
];
100 NODE_DATA(nodeid
)->node_start_pfn
= start_pfn
;
101 NODE_DATA(nodeid
)->node_spanned_pages
= end_pfn
- start_pfn
;
103 /* Find a place for the bootmem map */
104 bootmap_pages
= bootmem_bootmap_pages(end_pfn
- start_pfn
);
/* Start the search just past the pg_data_t so the two areas do not overlap. */
105 bootmap_start
= round_up(nodedata_phys
+ pgdat_size
, PAGE_SIZE
);
106 bootmap_start
= find_e820_area(bootmap_start
, end
, bootmap_pages
<<PAGE_SHIFT
);
107 if (bootmap_start
== -1L)
108 panic("Not enough continuous space for bootmap on node %d", nodeid
);
109 Dprintk("bootmap start %lu pages %lu\n", bootmap_start
, bootmap_pages
);
111 bootmap_size
= init_bootmem_node(NODE_DATA(nodeid
),
112 bootmap_start
>> PAGE_SHIFT
,
115 e820_bootmem_free(NODE_DATA(nodeid
), start
, end
);
/* Keep the pg_data_t and the bootmem bitmap out of the allocatable pool. */
117 reserve_bootmem_node(NODE_DATA(nodeid
), nodedata_phys
, pgdat_size
);
118 reserve_bootmem_node(NODE_DATA(nodeid
), bootmap_start
, bootmap_pages
<<PAGE_SHIFT
);
119 node_set_online(nodeid
);
122 /* Initialize final allocator for a zone */
/*
 * nodeid: node whose zones are sized.
 *
 * Builds a zeroed zones[] array of per-zone page counts from the node's
 * start/end pfn and passes it to free_area_init_node(). When the node starts
 * below the DMA limit, ZONE_DMA gets the pages up to that limit and
 * ZONE_NORMAL the remainder; the other visible assignment gives ZONE_NORMAL
 * the node's whole span.
 *
 * NOTE(review): the braces/else around the second ZONE_NORMAL assignment and
 * the remaining free_area_init_node() arguments are missing from this
 * listing. Also, end_pfn - dma_end_pfn is unsigned and would wrap if a node
 * ended below the DMA limit; confirm that cannot happen.
 */
123 void __init
setup_node_zones(int nodeid
)
125 unsigned long start_pfn
, end_pfn
;
126 unsigned long zones
[MAX_NR_ZONES
];
127 unsigned long dma_end_pfn
;
129 memset(zones
, 0, sizeof(unsigned long) * MAX_NR_ZONES
);
131 start_pfn
= node_start_pfn(nodeid
);
132 end_pfn
= node_end_pfn(nodeid
);
134 Dprintk(KERN_INFO
"setting up node %d %lx-%lx\n", nodeid
, start_pfn
, end_pfn
);
136 /* All nodes > 0 have a zero length zone DMA */
/* pfn corresponding to the physical address of MAX_DMA_ADDRESS. */
137 dma_end_pfn
= __pa(MAX_DMA_ADDRESS
) >> PAGE_SHIFT
;
138 if (start_pfn
< dma_end_pfn
) {
139 zones
[ZONE_DMA
] = dma_end_pfn
- start_pfn
;
140 zones
[ZONE_NORMAL
] = end_pfn
- dma_end_pfn
;
142 zones
[ZONE_NORMAL
] = end_pfn
- start_pfn
;
145 free_area_init_node(nodeid
, NODE_DATA(nodeid
), zones
,
/*
 * Walk all NR_CPUS entries of cpu_to_node, checking each against
 * NUMA_NO_NODE, and advance the round-robin cursor rr through
 * node_online_map: next_node() steps it, and when it reaches MAX_NUMNODES it
 * wraps back to first_node(). Finally bit 0 is set in the cpumask of CPU 0's
 * node.
 *
 * NOTE(review): the local declarations, the initial value of rr, the action
 * taken for already-assigned CPUs and the assignment that stores rr into
 * cpu_to_node[] are not visible in this listing.
 */
149 void __init
numa_init_array(void)
152 /* There are unfortunately some poorly designed mainboards around
153 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
154 mapping. To avoid this fill in the mapping for all possible
155 CPUs, as the number of CPUs is not known yet.
156 We round robin the existing nodes. */
158 for (i
= 0; i
< NR_CPUS
; i
++) {
159 if (cpu_to_node
[i
] != NUMA_NO_NODE
)
161 rr
= next_node(rr
, node_online_map
);
/* Ran off the end of the online map: wrap to the first online node. */
162 if (rr
== MAX_NUMNODES
)
163 rr
= first_node(node_online_map
);
/* Record CPU 0 in the cpumask of whichever node it maps to. */
168 set_bit(0, &node_to_cpumask
[cpu_to_node(0)]);
171 #ifdef CONFIG_NUMA_EMU
/*
 * Number of fake nodes requested; 0 leaves emulation off (numa_initmem_init()
 * only calls numa_emulation() when this is non-zero). Set from the "fake="
 * option in numa_setup().
 */
172 int numa_fake __initdata
= 0;
/*
 * Split the physical range [start_pfn, end_pfn) into numa_fake fake nodes.
 *
 * start_pfn, end_pfn: page frame range to divide.
 *
 * Visible steps: compute the per-node size sz in bytes; if sz has more than
 * one bit set (hweight64(sz) > 1) it is adjusted using a loop on (x << 1) < sz;
 * fill nodes[] with consecutive start/end ranges, the last node being
 * stretched to end exactly at end_pfn << PAGE_SHIFT; compute memnode_shift
 * via compute_hash_shift() and log an error if it is negative; then call
 * setup_node_bootmem() for every online node.
 *
 * The division by numa_fake relies on the caller checking that numa_fake is
 * non-zero, which numa_initmem_init() does.
 *
 * NOTE(review): local declarations, the body of the rounding loop, the
 * statement guarded by "i != numa_fake-1", the code that brings nodes online
 * and all return statements are missing from this listing. The caller treats
 * a zero return as success.
 */
175 static int numa_emulation(unsigned long start_pfn
, unsigned long end_pfn
)
178 struct node nodes
[MAX_NUMNODES
];
/* Bytes per fake node: total span in bytes divided evenly. */
179 unsigned long sz
= ((end_pfn
- start_pfn
)<<PAGE_SHIFT
) / numa_fake
;
181 /* Kludge needed for the hash function */
182 if (hweight64(sz
) > 1) {
184 while ((x
<< 1) < sz
)
187 printk("Numa emulation unbalanced. Complain to maintainer\n");
191 memset(&nodes
,0,sizeof(nodes
));
192 for (i
= 0; i
< numa_fake
; i
++) {
193 nodes
[i
].start
= (start_pfn
<<PAGE_SHIFT
) + i
*sz
;
/* Last node absorbs whatever remains up to the end of the range. */
194 if (i
== numa_fake
-1)
195 sz
= (end_pfn
<<PAGE_SHIFT
) - nodes
[i
].start
;
196 nodes
[i
].end
= nodes
[i
].start
+ sz
;
197 if (i
!= numa_fake
-1)
199 printk(KERN_INFO
"Faking node %d at %016Lx-%016Lx (%LuMB)\n",
201 nodes
[i
].start
, nodes
[i
].end
,
202 (nodes
[i
].end
- nodes
[i
].start
) >> 20);
205 memnode_shift
= compute_hash_shift(nodes
, numa_fake
);
/* Negative shift: no usable address-to-node mapping was found. */
206 if (memnode_shift
< 0) {
208 printk(KERN_ERR
"No NUMA hash function found. Emulation disabled.\n");
211 for_each_online_node(i
)
212 setup_node_bootmem(i
, nodes
[i
].start
, nodes
[i
].end
);
/*
 * Top-level NUMA memory setup for the range [start_pfn, end_pfn).
 *
 * Tries the configured discovery methods in order: fake-node emulation
 * (CONFIG_NUMA_EMU, only when numa_fake is non-zero), the ACPI scan
 * (CONFIG_ACPI_NUMA) and the K8 scan (CONFIG_K8_NUMA); the latter two are
 * skipped when numa_off is set. Each test is for a zero return from the
 * respective routine.
 *
 * The visible fallback logs why, clears node_online_map, loops over all CPUs,
 * assigns CPU 0 to node 0's cpumask and sets up node 0 to cover the whole
 * range.
 *
 * NOTE(review): the statements executed when a method succeeds (presumably
 * an early return), the #endif lines, the body of the per-CPU loop and other
 * fallback statements are missing from this listing.
 */
218 void __init
numa_initmem_init(unsigned long start_pfn
, unsigned long end_pfn
)
222 #ifdef CONFIG_NUMA_EMU
223 if (numa_fake
&& !numa_emulation(start_pfn
, end_pfn
))
227 #ifdef CONFIG_ACPI_NUMA
228 if (!numa_off
&& !acpi_scan_nodes(start_pfn
<< PAGE_SHIFT
,
229 end_pfn
<< PAGE_SHIFT
))
233 #ifdef CONFIG_K8_NUMA
234 if (!numa_off
&& !k8_scan_nodes(start_pfn
<<PAGE_SHIFT
, end_pfn
<<PAGE_SHIFT
))
237 printk(KERN_INFO
"%s\n",
238 numa_off
? "NUMA turned off" : "No NUMA configuration found");
240 printk(KERN_INFO
"Faking a node at %016lx-%016lx\n",
241 start_pfn
<< PAGE_SHIFT
,
242 end_pfn
<< PAGE_SHIFT
);
243 /* setup dummy node covering all memory */
246 nodes_clear(node_online_map
);
248 for (i
= 0; i
< NR_CPUS
; i
++)
250 node_to_cpumask
[0] = cpumask_of_cpu(0);
251 setup_node_bootmem(0, start_pfn
<< PAGE_SHIFT
, end_pfn
<< PAGE_SHIFT
);
/*
 * Record cpu in the cpumask of the node it maps to: sets bit `cpu` in
 * node_to_cpumask[cpu_to_node(cpu)].
 *
 * cpu: CPU number to add.
 */
254 __cpuinit
void numa_add_cpu(int cpu
)
256 /* BP is initialized elsewhere */
258 set_bit(cpu
, &node_to_cpumask
[cpu_to_node(cpu
)]);
/*
 * Call free_all_bootmem_node() for every online node and accumulate the
 * values it returns in `pages`.
 *
 * NOTE(review): the declaration of i and the return statement are not
 * visible in this listing; presumably the accumulated total is returned.
 */
261 unsigned long __init
numa_free_all_bootmem(void)
264 unsigned long pages
= 0;
265 for_each_online_node(i
) {
266 pages
+= free_all_bootmem_node(NODE_DATA(i
));
/*
 * Iterate over every online node.
 *
 * NOTE(review): the loop body is missing from this listing, so the per-node
 * work cannot be described from here; setup_node_zones() is defined in this
 * file and is a likely callee, to be confirmed.
 */
271 void __init
paging_init(void)
274 for_each_online_node(i
) {
/*
 * Parse a NUMA option string.
 *
 * opt: option text; matched by prefix against "off", "fake=" (only with
 *      CONFIG_NUMA_EMU) and "noacpi" (only with CONFIG_ACPI_NUMA).
 *
 * For "fake=" the number following the '=' is parsed with simple_strtoul()
 * (base argument 0) into numa_fake and capped at MAX_NUMNODES.
 *
 * NOTE(review): the statements executed for "off" and "noacpi", the #endif
 * lines and the return statement are missing from this listing.
 */
280 __init
int numa_setup(char *opt
)
282 if (!strncmp(opt
,"off",3))
284 #ifdef CONFIG_NUMA_EMU
285 if(!strncmp(opt
, "fake=", 5)) {
286 numa_fake
= simple_strtoul(opt
+5,NULL
,0); ;
/* Cap the request at the number of nodes the tables can hold. */
287 if (numa_fake
>= MAX_NUMNODES
)
288 numa_fake
= MAX_NUMNODES
;
291 #ifdef CONFIG_ACPI_NUMA
292 if (!strncmp(opt
,"noacpi",6))
/*
 * Symbols exported to modules: the CPU/node lookup tables, the address hash
 * shift and map, and the per-node data pointers.
 */
298 EXPORT_SYMBOL(cpu_to_node
);
299 EXPORT_SYMBOL(node_to_cpumask
);
300 EXPORT_SYMBOL(memnode_shift
);
301 EXPORT_SYMBOL(memnodemap
);
302 EXPORT_SYMBOL(node_data
);