2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 #include <linux/kernel.h>
7 #include <linux/string.h>
8 #include <linux/init.h>
9 #include <linux/bootmem.h>
10 #include <linux/mmzone.h>
11 #include <linux/ctype.h>
12 #include <linux/module.h>
13 #include <linux/nodemask.h>
16 #include <asm/proto.h>
/* Per-node pglist_data pointers; setup_node_bootmem() fills in the entry of the node it sets up. */
25 struct pglist_data
*node_data
[MAX_NUMNODES
];
/* Per-node bootmem state; setup_node_bootmem() points NODE_DATA(nodeid)->bdata at its entry. */
26 bootmem_data_t plat_node_bdata
[MAX_NUMNODES
];
/*
 * Lookup table filled by compute_hash_shift(): it is indexed there by
 * (addr >> shift) and each entry holds a node index, with 0xff used as the
 * "slot not claimed yet" marker while the table is being built.
 */
29 u8 memnodemap
[NODEMAPSIZE
];
/* CPU -> node table; every entry starts out as NUMA_NO_NODE. */
31 unsigned char cpu_to_node
[NR_CPUS
] = { [0 ... NR_CPUS
-1] = NUMA_NO_NODE
};
/* Per-node CPU mask; numa_add_cpu() sets the bit of the CPU it is given. */
32 cpumask_t node_to_cpumask
[MAX_NUMNODES
];
/* When non-zero, numa_initmem_init() skips the ACPI and K8 node scans. */
34 int numa_off __initdata
;
/*
 * Build the memnodemap[] address -> node lookup table for the given ranges.
 *
 * nodes:    array of per-node start/end addresses
 * numnodes: number of entries of nodes[] to process
 *
 * NOTE(review): the local declarations, the code that selects `shift` and the
 * return statements are not visible in this copy of the function; the
 * comments below describe only the statements that are.
 */
36 int __init
compute_hash_shift(struct node
*nodes
, int numnodes
)
42 /* When in doubt use brute force. */
/* Start from a map in which every slot carries the 0xff "unclaimed" marker. */
44 memset(memnodemap
,0xff,sizeof(*memnodemap
) * NODEMAPSIZE
);
45 for (i
= 0; i
< numnodes
; i
++) {
/* Detect a zero-length node (start == end); presumably such nodes are skipped - TODO confirm. */
46 if (nodes
[i
].start
== nodes
[i
].end
)
/* Walk the node's addresses in steps of (1UL << shift), i.e. one map slot at a time. */
48 for (addr
= nodes
[i
].start
;
50 addr
+= (1UL << shift
)) {
/* A slot that is already claimed by a different node index is a conflict. */
51 if (memnodemap
[addr
>> shift
] != 0xff &&
52 memnodemap
[addr
>> shift
] != i
) {
54 "node %d shift %d addr %Lx conflict %d\n",
55 i
, shift
, addr
, memnodemap
[addr
>>shift
]);
/* Record node i as the owner of this slot. */
58 memnodemap
[addr
>> shift
] = i
;
/* Overwrite every map entry with 0. */
65 memset(memnodemap
,0,sizeof(*memnodemap
) * NODEMAPSIZE
);
69 /* Initialize bootmem allocator for a node */
/*
 * nodeid: index of the node being set up
 * start:  first physical address of the node (rounded up to ZONE_ALIGN below)
 * end:    end physical address of the node
 *
 * Places the node's pg_data_t and its bootmem bitmap in e820 memory found
 * inside [start, end], initialises the node's bootmem allocator, hands the
 * range to e820_bootmem_free(), reserves the two areas it allocated and marks
 * the node online.  Panics if either area cannot be found.
 */
70 void __init
setup_node_bootmem(int nodeid
, unsigned long start
, unsigned long end
)
72 unsigned long start_pfn
, end_pfn
, bootmap_pages
, bootmap_size
, bootmap_start
;
73 unsigned long nodedata_phys
;
/* Size of the node descriptor, rounded up to whole pages. */
74 const int pgdat_size
= round_up(sizeof(pg_data_t
), PAGE_SIZE
);
76 start
= round_up(start
, ZONE_ALIGN
);
78 printk("Bootmem setup node %d %016lx-%016lx\n", nodeid
, start
, end
);
/* Convert the byte addresses to page frame numbers. */
80 start_pfn
= start
>> PAGE_SHIFT
;
81 end_pfn
= end
>> PAGE_SHIFT
;
/* Find room for the pg_data_t inside the node's own range; -1L signals failure. */
83 nodedata_phys
= find_e820_area(start
, end
, pgdat_size
);
84 if (nodedata_phys
== -1L)
85 panic("Cannot find memory pgdat in node %d\n", nodeid
);
87 Dprintk("nodedata_phys %lx\n", nodedata_phys
);
/* Publish the descriptor, zero it and fill in the fields known at this point. */
89 node_data
[nodeid
] = phys_to_virt(nodedata_phys
);
90 memset(NODE_DATA(nodeid
), 0, sizeof(pg_data_t
));
91 NODE_DATA(nodeid
)->bdata
= &plat_node_bdata
[nodeid
];
92 NODE_DATA(nodeid
)->node_start_pfn
= start_pfn
;
93 NODE_DATA(nodeid
)->node_spanned_pages
= end_pfn
- start_pfn
;
95 /* Find a place for the bootmem map */
/* The search starts at the first page boundary after the pg_data_t. */
96 bootmap_pages
= bootmem_bootmap_pages(end_pfn
- start_pfn
);
97 bootmap_start
= round_up(nodedata_phys
+ pgdat_size
, PAGE_SIZE
);
98 bootmap_start
= find_e820_area(bootmap_start
, end
, bootmap_pages
<<PAGE_SHIFT
);
99 if (bootmap_start
== -1L)
100 panic("Not enough continuous space for bootmap on node %d", nodeid
);
101 Dprintk("bootmap start %lu pages %lu\n", bootmap_start
, bootmap_pages
);
/* NOTE(review): the remaining arguments of this call are not visible in this copy. */
103 bootmap_size
= init_bootmem_node(NODE_DATA(nodeid
),
104 bootmap_start
>> PAGE_SHIFT
,
/* Presumably releases the usable e820 RAM of the range to the allocator - confirm against e820_bootmem_free(). */
107 e820_bootmem_free(NODE_DATA(nodeid
), start
, end
);
/* Reserve the pg_data_t and the bootmem bitmap themselves. */
109 reserve_bootmem_node(NODE_DATA(nodeid
), nodedata_phys
, pgdat_size
);
110 reserve_bootmem_node(NODE_DATA(nodeid
), bootmap_start
, bootmap_pages
<<PAGE_SHIFT
);
111 node_set_online(nodeid
);
114 /* Initialize final allocator for a zone */
/*
 * nodeid: index of the node whose zones are sized.
 *
 * Computes the per-zone page counts of the node (ZONE_DMA and ZONE_NORMAL)
 * from its start/end page frame numbers and passes them to
 * free_area_init_node().
 */
115 void __init
setup_node_zones(int nodeid
)
117 unsigned long start_pfn
, end_pfn
;
118 unsigned long zones
[MAX_NR_ZONES
];
119 unsigned long dma_end_pfn
;
/* Every zone starts out with a size of zero pages. */
121 memset(zones
, 0, sizeof(unsigned long) * MAX_NR_ZONES
);
123 start_pfn
= node_start_pfn(nodeid
);
124 end_pfn
= node_end_pfn(nodeid
);
126 Dprintk(KERN_INFO
"setting up node %d %lx-%lx\n", nodeid
, start_pfn
, end_pfn
);
128 /* All nodes > 0 have a zero length zone DMA */
/* Page frame number at which the DMA zone ends. */
129 dma_end_pfn
= __pa(MAX_DMA_ADDRESS
) >> PAGE_SHIFT
;
/*
 * A node starting below the DMA boundary is split into a DMA part and a
 * NORMAL part.
 * NOTE(review): end_pfn - dma_end_pfn is unsigned and would wrap for a node
 * that also ends below the boundary - confirm this cannot happen.
 */
130 if (start_pfn
< dma_end_pfn
) {
131 zones
[ZONE_DMA
] = dma_end_pfn
- start_pfn
;
132 zones
[ZONE_NORMAL
] = end_pfn
- dma_end_pfn
;
/* Presumably the else branch: the whole node is ZONE_NORMAL. */
134 zones
[ZONE_NORMAL
] = end_pfn
- start_pfn
;
/* NOTE(review): the remaining arguments of this call are not visible in this copy. */
137 free_area_init_node(nodeid
, NODE_DATA(nodeid
), zones
,
/*
 * Walk all NR_CPUS entries of cpu_to_node[] and cycle through the online
 * nodes while doing so, then set CPU 0's bit in the cpumask of its node.
 *
 * NOTE(review): the local declarations and the statement that stores the
 * chosen node are not visible in this copy of the function.
 */
141 void __init
numa_init_array(void)
144 /* There are unfortunately some poorly designed mainboards around
145 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
146 mapping. To avoid this fill in the mapping for all possible
147 CPUs, as the number of CPUs is not known yet.
148 We round robin the existing nodes. */
150 for (i
= 0; i
< NR_CPUS
; i
++) {
/* Entries that already name a node are tested for here; presumably they are left alone - TODO confirm. */
151 if (cpu_to_node
[i
] != NUMA_NO_NODE
)
/* Advance to the next online node, wrapping to the first one at the end of the map. */
153 rr
= next_node(rr
, node_online_map
);
154 if (rr
== MAX_NUMNODES
)
155 rr
= first_node(node_online_map
);
/* Put CPU 0 into the cpumask of the node it is mapped to. */
160 set_bit(0, &node_to_cpumask
[cpu_to_node(0)]);
163 #ifdef CONFIG_NUMA_EMU
/* Number of emulated nodes; set by numa_setup() from the "fake=" option, 0 by default. */
164 int numa_fake __initdata
= 0;
/*
 * Split the page frame range [start_pfn, end_pfn] into numa_fake emulated
 * nodes, build the address -> node map for them and set up bootmem on every
 * online node.
 *
 * The visible caller (numa_initmem_init) only calls this when numa_fake is
 * non-zero, which the division below relies on.
 *
 * NOTE(review): the local declarations, the body of the size adjustment and
 * the return statements are not visible in this copy of the function.
 */
167 static int numa_emulation(unsigned long start_pfn
, unsigned long end_pfn
)
170 struct node nodes
[MAX_NUMNODES
];
/* Bytes per emulated node. */
171 unsigned long sz
= ((end_pfn
- start_pfn
)<<PAGE_SHIFT
) / numa_fake
;
173 /* Kludge needed for the hash function */
/* Taken when hweight64() reports more than one for sz; presumably sz is then adjusted to a power of two - TODO confirm. */
174 if (hweight64(sz
) > 1) {
176 while ((x
<< 1) < sz
)
179 printk("Numa emulation unbalanced. Complain to maintainer\n");
183 memset(&nodes
,0,sizeof(nodes
));
184 for (i
= 0; i
< numa_fake
; i
++) {
/* Node i starts i * sz bytes into the range. */
185 nodes
[i
].start
= (start_pfn
<<PAGE_SHIFT
) + i
*sz
;
/* The last node takes whatever is left up to the end of the range. */
186 if (i
== numa_fake
-1)
187 sz
= (end_pfn
<<PAGE_SHIFT
) - nodes
[i
].start
;
188 nodes
[i
].end
= nodes
[i
].start
+ sz
;
189 if (i
!= numa_fake
-1)
191 printk(KERN_INFO
"Faking node %d at %016Lx-%016Lx (%LuMB)\n",
193 nodes
[i
].start
, nodes
[i
].end
,
194 (nodes
[i
].end
- nodes
[i
].start
) >> 20);
/* Build the lookup table for the fake nodes; a negative result is treated as failure. */
197 memnode_shift
= compute_hash_shift(nodes
, numa_fake
);
198 if (memnode_shift
< 0) {
200 printk(KERN_ERR
"No NUMA hash function found. Emulation disabled.\n");
/* Set up the bootmem allocator of every online node with its fake range. */
203 for_each_online_node(i
)
204 setup_node_bootmem(i
, nodes
[i
].start
, nodes
[i
].end
);
/*
 * Set up the NUMA memory layout for the page frame range
 * [start_pfn, end_pfn].
 *
 * The configured detection methods are tried in this order: NUMA emulation
 * (CONFIG_NUMA_EMU, only if numa_fake is set), ACPI (CONFIG_ACPI_NUMA) and K8
 * (CONFIG_K8_NUMA); the latter two are skipped when numa_off is set.  The code
 * after them sets up a single node 0 covering the whole range.
 *
 * NOTE(review): what happens after a successful detection (presumably an
 * early return), the closing #endif lines and the body of the NR_CPUS loop are
 * not visible in this copy of the function.
 */
210 void __init
numa_initmem_init(unsigned long start_pfn
, unsigned long end_pfn
)
214 #ifdef CONFIG_NUMA_EMU
/* numa_emulation() returning 0 is the case tested for here. */
215 if (numa_fake
&& !numa_emulation(start_pfn
, end_pfn
))
219 #ifdef CONFIG_ACPI_NUMA
/* The scan functions take byte addresses, hence the shifts by PAGE_SHIFT. */
220 if (!numa_off
&& !acpi_scan_nodes(start_pfn
<< PAGE_SHIFT
,
221 end_pfn
<< PAGE_SHIFT
))
225 #ifdef CONFIG_K8_NUMA
226 if (!numa_off
&& !k8_scan_nodes(start_pfn
<<PAGE_SHIFT
, end_pfn
<<PAGE_SHIFT
))
/* Fallback path: report why no NUMA layout is in use. */
229 printk(KERN_INFO
"%s\n",
230 numa_off
? "NUMA turned off" : "No NUMA configuration found");
232 printk(KERN_INFO
"Faking a node at %016lx-%016lx\n",
233 start_pfn
<< PAGE_SHIFT
,
234 end_pfn
<< PAGE_SHIFT
);
235 /* setup dummy node covering all memory */
/* Drop any nodes marked online so far; setup_node_bootmem() below marks node 0 online again. */
238 nodes_clear(node_online_map
);
240 for (i
= 0; i
< NR_CPUS
; i
++)
/* Node 0's cpumask initially contains only CPU 0. */
242 node_to_cpumask
[0] = cpumask_of_cpu(0);
243 setup_node_bootmem(0, start_pfn
<< PAGE_SHIFT
, end_pfn
<< PAGE_SHIFT
);
/*
 * Add a CPU to the cpumask of the node it is mapped to.
 *
 * cpu: index of the CPU whose bit is set in node_to_cpumask[cpu_to_node(cpu)].
 */
246 __init
void numa_add_cpu(int cpu
)
248 /* BP is initialized elsewhere */
250 set_bit(cpu
, &node_to_cpumask
[cpu_to_node(cpu
)]);
/*
 * Call free_all_bootmem_node() for every online node and add up the values
 * it returns in `pages`.
 *
 * NOTE(review): the return statement is not visible in this copy; presumably
 * the accumulated total is returned - TODO confirm.
 */
253 unsigned long __init
numa_free_all_bootmem(void)
256 unsigned long pages
= 0;
257 for_each_online_node(i
) {
258 pages
+= free_all_bootmem_node(NODE_DATA(i
));
/*
 * Iterate over every online node.
 *
 * NOTE(review): the loop body is not visible in this copy of the function, so
 * the per-node work is not described here.
 */
263 void __init
paging_init(void)
266 for_each_online_node(i
) {
/*
 * Parse a NUMA option string.
 *
 * opt: option text; the prefixes "off", "fake=" (CONFIG_NUMA_EMU only) and
 *      "noacpi" (CONFIG_ACPI_NUMA only) are recognised.  Only the leading
 *      characters are compared, so longer strings with these prefixes match
 *      as well.
 *
 * NOTE(review): the actions taken for "off" and "noacpi", the closing #endif
 * lines and the return statement are not visible in this copy.
 */
272 __init
int numa_setup(char *opt
)
274 if (!strncmp(opt
,"off",3))
276 #ifdef CONFIG_NUMA_EMU
/* "fake=<n>": number of emulated nodes; base 0 lets simple_strtoul pick the radix from the prefix. */
277 if(!strncmp(opt
, "fake=", 5)) {
278 numa_fake
= simple_strtoul(opt
+5,NULL
,0); ;
/* Cap the request at MAX_NUMNODES. */
279 if (numa_fake
>= MAX_NUMNODES
)
280 numa_fake
= MAX_NUMNODES
;
283 #ifdef CONFIG_ACPI_NUMA
284 if (!strncmp(opt
,"noacpi",6))
/* Export the CPU/node mapping tables and the address -> node lookup data. */
290 EXPORT_SYMBOL(cpu_to_node
);
291 EXPORT_SYMBOL(node_to_cpumask
);
292 EXPORT_SYMBOL(memnode_shift
);
293 EXPORT_SYMBOL(memnodemap
);
294 EXPORT_SYMBOL(node_data
);