// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/array_size.h>
#include <linux/sort.h>
#include <linux/printk.h>
#include <linux/memblock.h>
#include <linux/numa.h>
#include <linux/numa_memblks.h>

static int numa_distance_cnt;
static u8 *numa_distance;

nodemask_t numa_nodes_parsed __initdata;

static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;
/*
 * Set nodes, which have memory in @mi, in *@nodemask.
 */
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
					      const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
		if (mi->blk[i].start != mi->blk[i].end &&
		    mi->blk[i].nid != NUMA_NO_NODE)
			node_set(mi->blk[i].nid, *nodemask);
}
/**
 * numa_reset_distance - Reset NUMA distance table
 *
 * The current table is freed.  The next numa_set_distance() call will
 * create a new one.
 */
void __init numa_reset_distance(void)
{
	size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);

	/* numa_distance could be 1LU marking allocation failure, test cnt */
	if (numa_distance_cnt)
		memblock_free(numa_distance, size);
	numa_distance_cnt = 0;
	numa_distance = NULL;	/* enable table creation */
}
static int __init numa_alloc_distance(void)
{
	nodemask_t nodes_parsed;
	size_t size;
	int i, j, cnt = 0;

	/* size the new table and allocate it */
	nodes_parsed = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);

	for_each_node_mask(i, nodes_parsed)
		cnt = i;
	cnt++;
	size = cnt * cnt * sizeof(numa_distance[0]);

	numa_distance = memblock_alloc(size, PAGE_SIZE);
	if (!numa_distance) {
		pr_warn("Warning: can't allocate distance table!\n");
		/* don't retry until explicitly reset */
		numa_distance = (void *)1LU;
		return -ENOMEM;
	}

	numa_distance_cnt = cnt;

	/* fill with the default distances */
	for (i = 0; i < cnt; i++)
		for (j = 0; j < cnt; j++)
			numa_distance[i * cnt + j] = i == j ?
				LOCAL_DISTANCE : REMOTE_DISTANCE;
	printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);

	return 0;
}
/**
 * numa_set_distance - Set NUMA distance from one NUMA to another
 * @from: the 'from' node to set distance
 * @to: the 'to' node to set distance
 * @distance: NUMA distance
 *
 * Set the distance from node @from to @to to @distance.  If distance table
 * doesn't exist, one which is large enough to accommodate all the currently
 * known nodes will be created.
 *
 * If such table cannot be allocated, a warning is printed and further
 * calls are ignored until the distance table is reset with
 * numa_reset_distance().
 *
 * If @from or @to is higher than the highest known node or lower than zero
 * at the time of table creation or @distance doesn't make sense, the call
 * is ignored.
 * This is to allow simplification of specific NUMA config implementations.
 */
void __init numa_set_distance(int from, int to, int distance)
{
	if (!numa_distance && numa_alloc_distance() < 0)
		return;

	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
			from < 0 || to < 0) {
		pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	if ((u8)distance != distance ||
	    (from == to && distance != LOCAL_DISTANCE)) {
		pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	numa_distance[from * numa_distance_cnt + to] = distance;
}
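
/*
 * Illustrative sketch (not part of the original file): a platform's
 * firmware NUMA parser would typically feed a full locality matrix into
 * the table above, one numa_set_distance() call per (from, to) pair.
 * nr_nodes and locality[][] below are hypothetical stand-ins for
 * whatever the firmware tables provide:
 *
 *	for (from = 0; from < nr_nodes; from++)
 *		for (to = 0; to < nr_nodes; to++)
 *			numa_set_distance(from, to, locality[from][to]);
 *
 * Out-of-range nodes and nonsensical distances are dropped with a
 * one-time warning, so callers do not need to pre-validate entries.
 */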
int __node_distance(int from, int to)
{
	if (from >= numa_distance_cnt || to >= numa_distance_cnt)
		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
	return numa_distance[from * numa_distance_cnt + to];
}
EXPORT_SYMBOL(__node_distance);
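
/*
 * Illustrative sketch (not part of the original file): on architectures
 * that wire node_distance() to __node_distance(), a lookup is a single
 * index into the flat cnt x cnt table, and nodes outside the table fall
 * back to LOCAL_DISTANCE/REMOTE_DISTANCE:
 *
 *	int d = node_distance(from, to);  // numa_distance[from * cnt + to]
 *
 *	if (d > LOCAL_DISTANCE)
 *		;	// remote node, e.g. prefer a closer candidate
 */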
static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
				     struct numa_meminfo *mi)
{
	/* ignore zero length blks */
	if (start == end)
		return 0;

	/* whine about and ignore invalid blks */
	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
		pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
			nid, start, end - 1);
		return 0;
	}

	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("too many memblk ranges\n");
		return -EINVAL;
	}

	mi->blk[mi->nr_blks].start = start;
	mi->blk[mi->nr_blks].end = end;
	mi->blk[mi->nr_blks].nid = nid;
	mi->nr_blks++;
	return 0;
}
/**
 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
 * @idx: Index of memblk to remove
 * @mi: numa_meminfo to remove memblk from
 *
 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
 * decrementing @mi->nr_blks.
 */
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
{
	mi->nr_blks--;
	memmove(&mi->blk[idx], &mi->blk[idx + 1],
		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
}
/**
 * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
 * @dst: numa_meminfo to append block to
 * @idx: Index of memblk to remove
 * @src: numa_meminfo to remove memblk from
 */
static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
					 struct numa_meminfo *src)
{
	dst->blk[dst->nr_blks++] = src->blk[idx];
	numa_remove_memblk_from(idx, src);
}
/**
 * numa_add_memblk - Add one numa_memblk to numa_meminfo
 * @nid: NUMA node ID of the new memblk
 * @start: Start address of the new memblk
 * @end: End address of the new memblk
 *
 * Add a new memblk to the default numa_meminfo.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_add_memblk(int nid, u64 start, u64 end)
{
	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
}
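
/*
 * Illustrative sketch (not part of the original file): an architecture's
 * early NUMA parser (for example an ACPI SRAT memory-affinity handler)
 * is expected to call numa_add_memblk() once per discovered range and to
 * record the node in numa_nodes_parsed.  nid, start and end below stand
 * for values extracted from the firmware tables:
 *
 *	if (numa_add_memblk(nid, start, end) < 0)
 *		return -EINVAL;
 *	node_set(nid, numa_nodes_parsed);
 */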
/**
 * numa_cleanup_meminfo - Cleanup a numa_meminfo
 * @mi: numa_meminfo to clean up
 *
 * Sanitize @mi by merging and removing unnecessary memblks.  Also check for
 * conflicts and clear unused memblks.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
{
	const u64 low = memblock_start_of_DRAM();
	const u64 high = memblock_end_of_DRAM();
	int i, j, k;

	/* first, trim all entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		/* move / save reserved memory ranges */
		if (!memblock_overlaps_region(&memblock.memory,
					bi->start, bi->end - bi->start)) {
			numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
			continue;
		}

		/* make sure all non-reserved blocks are inside the limits */
		bi->start = max(bi->start, low);

		/* preserve info for non-RAM areas above 'max_pfn': */
		if (bi->end > high) {
			numa_add_memblk_to(bi->nid, high, bi->end,
					   &numa_reserved_meminfo);
			bi->end = high;
		}

		/* and there's no empty block */
		if (bi->start >= bi->end)
			numa_remove_memblk_from(i--, mi);
	}
	/* merge neighboring / overlapping entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		for (j = i + 1; j < mi->nr_blks; j++) {
			struct numa_memblk *bj = &mi->blk[j];
			u64 start, end;

			/*
			 * See whether there are overlapping blocks.  Whine
			 * about but allow overlaps of the same nid.  They
			 * will be merged below.
			 */
			if (bi->end > bj->start && bi->start < bj->end) {
				if (bi->nid != bj->nid) {
					pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
					       bi->nid, bi->start, bi->end - 1,
					       bj->nid, bj->start, bj->end - 1);
					return -EINVAL;
				}
				pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
					bi->nid, bi->start, bi->end - 1,
					bj->start, bj->end - 1);
			}

			/*
			 * Join together blocks on the same node, holes
			 * between which don't overlap with memory on other
			 * nodes.
			 */
			if (bi->nid != bj->nid)
				continue;
			start = min(bi->start, bj->start);
			end = max(bi->end, bj->end);
			for (k = 0; k < mi->nr_blks; k++) {
				struct numa_memblk *bk = &mi->blk[k];

				if (bi->nid == bk->nid)
					continue;
				if (start < bk->end && end > bk->start)
					break;
			}
			if (k < mi->nr_blks)
				continue;
			pr_info("NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
				bi->nid, bi->start, bi->end - 1, bj->start,
				bj->end - 1, start, end - 1);
			bi->start = start;
			bi->end = end;
			numa_remove_memblk_from(j--, mi);
		}
	}
	/* clear unused ones */
	for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
		mi->blk[i].start = mi->blk[i].end = 0;
		mi->blk[i].nid = NUMA_NO_NODE;
	}

	return 0;
}
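
/*
 * Illustrative example (not part of the original file) of the cleanup
 * above on a hypothetical meminfo: two node-0 blocks [0x0-0x1000) and
 * [0x1000-0x2000) are merged into a single [0x0-0x2000) block, a block
 * with no backing in memblock.memory is moved to numa_reserved_meminfo
 * rather than dropped, and any block left empty after trimming to the
 * DRAM limits is removed.
 */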
/*
 * Mark all currently memblock-reserved physical memory (which covers the
 * kernel's own memory ranges) as hot-unswappable.
 */
static void __init numa_clear_kernel_node_hotplug(void)
{
	nodemask_t reserved_nodemask = NODE_MASK_NONE;
	struct memblock_region *mb_region;
	int i;

	/*
	 * We have to do some preprocessing of memblock regions, to
	 * make them suitable for reservation.
	 *
	 * At this time, all memory regions reserved by memblock are
	 * used by the kernel, but those regions are not split up
	 * along node boundaries yet, and don't necessarily have their
	 * node ID set yet either.
	 *
	 * So iterate over all parsed memory blocks and use those ranges to
	 * set the nid in memblock.reserved.  This will split up the
	 * memblock regions along node boundaries and will set the node IDs
	 * as well.
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;
		int ret;

		ret = memblock_set_node(mb->start, mb->end - mb->start,
					&memblock.reserved, mb->nid);
		WARN_ON_ONCE(ret);
	}

	/*
	 * Now go over all reserved memblock regions, to construct a
	 * node mask of all kernel reserved memory areas.
	 *
	 * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
	 *   numa_meminfo might not include all memblock.reserved
	 *   memory ranges, because quirks such as trim_snb_memory()
	 *   reserve specific pages for Sandy Bridge graphics. ]
	 */
	for_each_reserved_mem_region(mb_region) {
		int nid = memblock_get_region_node(mb_region);

		if (numa_valid_node(nid))
			node_set(nid, reserved_nodemask);
	}

	/*
	 * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
	 * belonging to the reserved node mask.
	 *
	 * Note that this will include memory regions that reside
	 * on nodes that contain kernel memory - entire nodes
	 * become hot-unpluggable:
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;

		if (!node_isset(mb->nid, reserved_nodemask))
			continue;

		memblock_clear_hotplug(mb->start, mb->end - mb->start);
	}
}
static int __init numa_register_meminfo(struct numa_meminfo *mi)
{
	int i;

	/* Account for nodes with cpus and no memory */
	node_possible_map = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&node_possible_map, mi);
	if (WARN_ON(nodes_empty(node_possible_map)))
		return -EINVAL;

	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *mb = &mi->blk[i];

		memblock_set_node(mb->start, mb->end - mb->start,
				  &memblock.memory, mb->nid);
	}

	/*
	 * At very early time, the kernel has to use some memory such as
	 * loading the kernel image. We cannot prevent this anyway. So any
	 * node the kernel resides in should be un-hotpluggable.
	 *
	 * And when we come here, alloc node data won't fail.
	 */
	numa_clear_kernel_node_hotplug();

	/*
	 * If the sections array is going to be used for pfn -> nid mapping,
	 * check whether its granularity is fine enough.
	 */
	if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) {
		unsigned long pfn_align = node_map_pfn_alignment();

		if (pfn_align && pfn_align < PAGES_PER_SECTION) {
			unsigned long node_align_mb = PFN_PHYS(pfn_align) >> 20;
			unsigned long sect_align_mb = PFN_PHYS(PAGES_PER_SECTION) >> 20;

			pr_warn("Node alignment %luMB < min %luMB, rejecting NUMA config\n",
				node_align_mb, sect_align_mb);
			return -EINVAL;
		}
	}

	return 0;
}
int __init numa_memblks_init(int (*init_func)(void),
			     bool memblock_force_top_down)
{
	phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
	int ret;

	nodes_clear(numa_nodes_parsed);
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
	WARN_ON(memblock_set_node(0, max_addr, &memblock.memory, NUMA_NO_NODE));
	WARN_ON(memblock_set_node(0, max_addr, &memblock.reserved,
				  NUMA_NO_NODE));
	/* In case that parsing SRAT failed. */
	WARN_ON(memblock_clear_hotplug(0, max_addr));
	numa_reset_distance();

	ret = init_func();
	if (ret < 0)
		return ret;

	/*
	 * We reset memblock back to the top-down direction
	 * here because if we configured ACPI_NUMA, we have
	 * parsed SRAT in init_func(). It is ok to have the
	 * reset here even if we didn't configure ACPI_NUMA
	 * or acpi numa init fails and falls back to dummy
	 * numa init.
	 */
	if (memblock_force_top_down)
		memblock_set_bottom_up(false);

	ret = numa_cleanup_meminfo(&numa_meminfo);
	if (ret < 0)
		return ret;

	numa_emulation(&numa_meminfo, numa_distance_cnt);

	return numa_register_meminfo(&numa_meminfo);
}
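
/*
 * Illustrative sketch (not part of the original file): an architecture's
 * NUMA bring-up is expected to hand its firmware parser to
 * numa_memblks_init() and fall back to a dummy single-node setup if that
 * fails.  my_arch_numa_init() and my_dummy_numa_init() below are
 * hypothetical parsers that populate numa_nodes_parsed/numa_meminfo via
 * numa_add_memblk() and numa_set_distance():
 *
 *	if (!numa_memblks_init(my_arch_numa_init, true))
 *		return 0;
 *	return numa_memblks_init(my_dummy_numa_init, true);
 */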
static int __init cmp_memblk(const void *a, const void *b)
{
	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
	const struct numa_memblk *mb = *(const struct numa_memblk **)b;

	return (ma->start > mb->start) - (ma->start < mb->start);
}
static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;
/**
 * numa_fill_memblks - Fill gaps in numa_meminfo memblks
 * @start: address to begin fill
 * @end: address to end fill
 *
 * Find and extend numa_meminfo memblks to cover the physical
 * address range @start-@end
 *
 * RETURNS:
 * 0		  : Success
 * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end
 */
int __init numa_fill_memblks(u64 start, u64 end)
{
	struct numa_memblk **blk = &numa_memblk_list[0];
	struct numa_meminfo *mi = &numa_meminfo;
	int count = 0;
	u64 prev_end;

	/*
	 * Create a list of pointers to numa_meminfo memblks that
	 * overlap start, end. The list is used to make in-place
	 * changes that fill out the numa_meminfo memblks.
	 */
	for (int i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		if (memblock_addrs_overlap(start, end - start, bi->start,
					   bi->end - bi->start)) {
			blk[count] = &mi->blk[i];
			count++;
		}
	}
	if (!count)
		return NUMA_NO_MEMBLK;

	/* Sort the list of pointers in memblk->start order */
	sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL);

	/* Make sure the first/last memblks include start/end */
	blk[0]->start = min(blk[0]->start, start);
	blk[count - 1]->end = max(blk[count - 1]->end, end);

	/*
	 * Fill any gaps by tracking the previous memblks
	 * end address and backfilling to it if needed.
	 */
	prev_end = blk[0]->end;
	for (int i = 1; i < count; i++) {
		struct numa_memblk *curr = blk[i];

		if (prev_end >= curr->start) {
			if (prev_end < curr->end)
				prev_end = curr->end;
		} else {
			curr->start = prev_end;
			prev_end = curr->end;
		}
	}
	return 0;
}
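
/*
 * Illustrative sketch (not part of the original file): callers that know
 * a physical window belongs to a single proximity domain (for example an
 * ACPI CFMWS / CXL window parser) can ask for the overlapping memblks to
 * be extended across any holes in that window.  window_start and
 * window_end below are hypothetical bounds of such a window:
 *
 *	if (numa_fill_memblks(window_start, window_end) == NUMA_NO_MEMBLK)
 *		;	// nothing overlaps - register a fresh memblk instead
 */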
#ifdef CONFIG_NUMA_KEEP_MEMINFO
static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
{
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].start <= start && mi->blk[i].end > start)
			return mi->blk[i].nid;

	return NUMA_NO_NODE;
}

int phys_to_target_node(u64 start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	/*
	 * Prefer online nodes, but if reserved memory might be
	 * hot-added continue the search with reserved ranges.
	 */
	if (nid != NUMA_NO_NODE)
		return nid;

	return meminfo_to_nid(&numa_reserved_meminfo, start);
}
EXPORT_SYMBOL_GPL(phys_to_target_node);

int memory_add_physaddr_to_nid(u64 start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	if (nid == NUMA_NO_NODE)
		nid = numa_meminfo.blk[0].nid;
	return nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);

#endif /* CONFIG_NUMA_KEEP_MEMINFO */