// SPDX-License-Identifier: GPL-2.0
/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/xarray.h>

#include <linux/atomic.h>
#include <linux/uaccess.h>

#define MEMORY_CLASS_NAME	"memory"
static const char *const online_type_to_str[] = {
	[MMOP_OFFLINE] = "offline",
	[MMOP_ONLINE] = "online",
	[MMOP_ONLINE_KERNEL] = "online_kernel",
	[MMOP_ONLINE_MOVABLE] = "online_movable",
};

int mhp_online_type_from_str(const char *str)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
		if (sysfs_streq(str, online_type_to_str[i]))
			return i;
	}
	return -EINVAL;
}
#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

static int sections_per_block;

static inline unsigned long memory_block_id(unsigned long section_nr)
{
	return section_nr / sections_per_block;
}

static inline unsigned long pfn_to_block_id(unsigned long pfn)
{
	return memory_block_id(pfn_to_section_nr(pfn));
}

static inline unsigned long phys_to_block_id(unsigned long phys)
{
	return pfn_to_block_id(PFN_DOWN(phys));
}
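
/*
 * Illustrative numbers for the helpers above (hypothetical, arch dependent):
 * with 4 KiB pages, 128 MiB sections and a 2 GiB memory block size,
 * sections_per_block == 16. Physical address 0x100000000 (4 GiB) then is
 * PFN 0x100000, section number 32, and memory block id 32 / 16 == 2.
 */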
static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

static const struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
	.online = memory_subsys_online,
	.offline = memory_subsys_offline,
};

/*
 * Memory blocks are cached in a local radix tree to avoid
 * a costly linear search for the corresponding device on
 * the memory bus.
 */
static DEFINE_XARRAY(memory_blocks);

/*
 * Memory groups, indexed by memory group id (mgid).
 */
static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC);
#define MEMORY_GROUP_MARK_DYNAMIC	XA_MARK_1

static BLOCKING_NOTIFIER_HEAD(memory_chain);
int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);
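
/*
 * Illustrative sketch, not part of this driver: a minimal memory notifier as
 * an external user might register it. The callback, the helper
 * example_range_in_use() and the -EBUSY policy are hypothetical; the MEM_*
 * actions and struct memory_notify are the real payload delivered through
 * this chain.
 *
 *	static int example_mem_cb(struct notifier_block *nb,
 *				  unsigned long action, void *data)
 *	{
 *		struct memory_notify *mn = data;
 *
 *		switch (action) {
 *		case MEM_GOING_OFFLINE:
 *			// Veto offlining of a range the user still needs.
 *			if (example_range_in_use(mn->start_pfn, mn->nr_pages))
 *				return notifier_from_errno(-EBUSY);
 *			break;
 *		case MEM_ONLINE:
 *		case MEM_OFFLINE:
 *			// React to a completed transition.
 *			break;
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block example_mem_nb = {
 *		.notifier_call = example_mem_cb,
 *	};
 *
 * registered via register_memory_notifier(&example_mem_nb) and removed again
 * with unregister_memory_notifier(&example_mem_nb).
 */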
static void memory_block_release(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	/* Verify that the altmap is freed */
	WARN_ON(mem->altmap);
	kfree(mem);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}
EXPORT_SYMBOL_GPL(memory_block_size_bytes);
/* Show the memory block ID, relative to the memory block size */
static ssize_t phys_index_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);

	return sysfs_emit(buf, "%08lx\n", memory_block_id(mem->start_section_nr));
}
/*
 * Legacy interface that we cannot remove. Always indicate "removable"
 * with CONFIG_MEMORY_HOTREMOVE - bad heuristic.
 */
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	return sysfs_emit(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
}
/*
 * online, offline, going offline, etc.
 */
static ssize_t state_show(struct device *dev, struct device_attribute *attr,
			  char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	const char *output;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		output = "online";
		break;
	case MEM_OFFLINE:
		output = "offline";
		break;
	case MEM_GOING_OFFLINE:
		output = "going-offline";
		break;
	default:
		WARN_ON(1);
		return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
	}

	return sysfs_emit(buf, "%s\n", output);
}
int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}
#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
static unsigned long memblk_nr_poison(struct memory_block *mem);
#else
static inline unsigned long memblk_nr_poison(struct memory_block *mem)
{
	return 0;
}
#endif
/*
 * Must acquire mem_hotplug_lock in write mode.
 */
static int memory_block_online(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = 0;
	struct memory_notify arg;
	struct zone *zone;
	int ret;

	if (memblk_nr_poison(mem))
		return -EHWPOISON;

	zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
				  start_pfn, nr_pages);

	/*
	 * Although vmemmap pages have a different lifecycle than the pages
	 * they describe (they remain until the memory is unplugged), doing
	 * their initialization and accounting at memory onlining/offlining
	 * stage helps to keep accounting easier to follow - e.g vmemmaps
	 * belong to the same zone as the memory they backed.
	 */
	if (mem->altmap)
		nr_vmemmap_pages = mem->altmap->free;

	arg.altmap_start_pfn = start_pfn;
	arg.altmap_nr_pages = nr_vmemmap_pages;
	arg.start_pfn = start_pfn + nr_vmemmap_pages;
	arg.nr_pages = nr_pages - nr_vmemmap_pages;

	ret = memory_notify(MEM_PREPARE_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	if (nr_vmemmap_pages) {
		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages,
						zone, mem->altmap->inaccessible);
		if (ret)
			goto out;
	}

	ret = online_pages(start_pfn + nr_vmemmap_pages,
			   nr_pages - nr_vmemmap_pages, zone, mem->group);
	if (ret) {
		if (nr_vmemmap_pages)
			mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
		goto out;
	}

	/*
	 * Account once onlining succeeded. If the zone was unpopulated, it is
	 * now already properly populated.
	 */
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  nr_vmemmap_pages);

	mem->zone = zone;
	return ret;
out:
	memory_notify(MEM_FINISH_OFFLINE, &arg);
	return ret;
}
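
/*
 * Worked example for the altmap split above (hypothetical numbers): assuming
 * 4 KiB pages, a 64-byte struct page and a 128 MiB block (nr_pages == 32768),
 * a "memmap_on_memory" altmap needs 32768 * 64 bytes == 2 MiB == 512 pages
 * taken from the start of the block. The notifier and online_pages() then
 * operate on arg.start_pfn == start_pfn + 512 and
 * arg.nr_pages == 32768 - 512 usable pages.
 */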
/*
 * Must acquire mem_hotplug_lock in write mode.
 */
static int memory_block_offline(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = 0;
	struct memory_notify arg;
	int ret;

	if (!mem->zone)
		return -EINVAL;

	/*
	 * Unaccount before offlining, such that unpopulated zone and kthreads
	 * can properly be torn down in offline_pages().
	 */
	if (mem->altmap)
		nr_vmemmap_pages = mem->altmap->free;

	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  -nr_vmemmap_pages);

	ret = offline_pages(start_pfn + nr_vmemmap_pages,
			    nr_pages - nr_vmemmap_pages, mem->zone, mem->group);
	if (ret) {
		/* offline_pages() failed. Account back. */
		if (nr_vmemmap_pages)
			adjust_present_page_count(pfn_to_page(start_pfn),
						  mem->group, nr_vmemmap_pages);
		return ret;
	}

	if (nr_vmemmap_pages)
		mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);

	arg.altmap_start_pfn = start_pfn;
	arg.altmap_nr_pages = nr_vmemmap_pages;
	arg.start_pfn = start_pfn + nr_vmemmap_pages;
	arg.nr_pages = nr_pages - nr_vmemmap_pages;
	memory_notify(MEM_FINISH_OFFLINE, &arg);

	mem->zone = NULL;
	return ret;
}
/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int memory_block_action(struct memory_block *mem, unsigned long action)
{
	int ret;

	switch (action) {
	case MEM_ONLINE:
		ret = memory_block_online(mem);
		break;
	case MEM_OFFLINE:
		ret = memory_block_offline(mem);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, mem->start_section_nr, action, action);
		ret = -EINVAL;
	}

	return ret;
}
static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	if (mem->state != from_state_req)
		return -EINVAL;

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem, to_state);
	mem->state = ret ? from_state_req : to_state;

	return ret;
}
/* The device lock serializes operations on memory_subsys_[online|offline] */
static int memory_subsys_online(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (mem->state == MEM_ONLINE)
		return 0;

	/*
	 * When called via device_online() without configuring the online_type,
	 * we want to default to MMOP_ONLINE.
	 */
	if (mem->online_type == MMOP_OFFLINE)
		mem->online_type = MMOP_ONLINE;

	ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	mem->online_type = MMOP_OFFLINE;

	return ret;
}
static int memory_subsys_offline(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	if (mem->state == MEM_OFFLINE)
		return 0;

	return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}
static ssize_t state_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (online_type < 0)
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	switch (online_type) {
	case MMOP_ONLINE_KERNEL:
	case MMOP_ONLINE_MOVABLE:
	case MMOP_ONLINE:
		/* mem->online_type is protected by device_hotplug_lock */
		mem->online_type = online_type;
		ret = device_online(&mem->dev);
		break;
	case MMOP_OFFLINE:
		ret = device_offline(&mem->dev);
		break;
	default:
		ret = -EINVAL; /* should never happen */
	}

	unlock_device_hotplug();

	if (ret < 0)
		return ret;
	if (ret)
		return -EINVAL;

	return count;
}
/*
 * Legacy interface that we cannot remove: s390x exposes the storage increment
 * covered by a memory block, allowing for identifying which memory blocks
 * comprise a storage increment. Since a memory block spans complete
 * storage increments nowadays, this interface is basically unused. Other
 * archs never exposed != 0.
 */
static ssize_t phys_device_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);

	return sysfs_emit(buf, "%d\n",
			  arch_get_memory_phys_device(start_pfn));
}
#ifdef CONFIG_MEMORY_HOTREMOVE
static int print_allowed_zone(char *buf, int len, int nid,
			      struct memory_group *group,
			      unsigned long start_pfn, unsigned long nr_pages,
			      int online_type, struct zone *default_zone)
{
	struct zone *zone;

	zone = zone_for_pfn_range(online_type, nid, group, start_pfn, nr_pages);
	if (zone == default_zone)
		return 0;

	return sysfs_emit_at(buf, len, " %s", zone->name);
}
static ssize_t valid_zones_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct memory_group *group = mem->group;
	struct zone *default_zone;
	int nid = mem->nid;
	int len = 0;

	/*
	 * Check the existing zone. Make sure that we do that only on the
	 * online nodes otherwise the page_zone is not reliable
	 */
	if (mem->state == MEM_ONLINE) {
		/*
		 * If !mem->zone, the memory block spans multiple zones and
		 * cannot get offlined.
		 */
		default_zone = mem->zone;
		if (!default_zone)
			return sysfs_emit(buf, "%s\n", "none");
		len += sysfs_emit_at(buf, len, "%s", default_zone->name);
		goto out;
	}

	default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, group,
					  start_pfn, nr_pages);

	len += sysfs_emit_at(buf, len, "%s", default_zone->name);
	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
				  MMOP_ONLINE_KERNEL, default_zone);
	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
				  MMOP_ONLINE_MOVABLE, default_zone);
out:
	len += sysfs_emit_at(buf, len, "\n");
	return len;
}
static DEVICE_ATTR_RO(valid_zones);
#endif
static DEVICE_ATTR_RO(phys_index);
static DEVICE_ATTR_RW(state);
static DEVICE_ATTR_RO(phys_device);
static DEVICE_ATTR_RO(removable);
/*
 * Show the memory block size (shared by all memory blocks).
 */
static ssize_t block_size_bytes_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lx\n", memory_block_size_bytes());
}

static DEVICE_ATTR_RO(block_size_bytes);
/*
 * Memory auto online policy.
 */
static ssize_t auto_online_blocks_show(struct device *dev,
				       struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  online_type_to_str[mhp_default_online_type]);
}

static ssize_t auto_online_blocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);

	if (online_type < 0)
		return -EINVAL;

	mhp_default_online_type = online_type;
	return count;
}

static DEVICE_ATTR_RW(auto_online_blocks);
#ifdef CONFIG_CRASH_HOTPLUG
#include <linux/kexec.h>
static ssize_t crash_hotplug_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%d\n", crash_check_hotplug_support());
}
static DEVICE_ATTR_RO(crash_hotplug);
#endif
/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace. The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	ret = kstrtoull(buf, 0, &phys_addr);
	if (ret)
		return ret;

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	nid = memory_add_physaddr_to_nid(phys_addr);
	ret = __add_memory(nid, phys_addr,
			   MIN_MEMORY_BLOCK_SIZE * sections_per_block,
			   MHP_NONE);
	if (ret)
		goto out;

	ret = count;
out:
	unlock_device_hotplug();
	return ret;
}

static DEVICE_ATTR_WO(probe);
#endif
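
/*
 * Illustrative use from userspace (hypothetical, physical address made up and
 * must be aligned to the block size reported by block_size_bytes):
 *
 *	echo 0x40000000 > /sys/devices/system/memory/probe
 *
 * ends up in probe_store() above and hot-adds one memory block starting at
 * that physical address.
 */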
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t soft_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = soft_offline_page(pfn, 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t hard_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, MF_SW_SIMULATED);
	if (ret == -EOPNOTSUPP)
		ret = 0;
	return ret ? ret : count;
}

static DEVICE_ATTR_WO(soft_offline_page);
static DEVICE_ATTR_WO(hard_offline_page);
#endif
/* See phys_device_show(). */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}
/*
 * A reference for the returned memory block device is acquired.
 *
 * Called under device_hotplug_lock.
 */
static struct memory_block *find_memory_block_by_id(unsigned long block_id)
{
	struct memory_block *mem;

	mem = xa_load(&memory_blocks, block_id);
	if (mem)
		get_device(&mem->dev);
	return mem;
}

/*
 * Called under device_hotplug_lock.
 */
struct memory_block *find_memory_block(unsigned long section_nr)
{
	unsigned long block_id = memory_block_id(section_nr);

	return find_memory_block_by_id(block_id);
}
static struct attribute *memory_memblk_attrs[] = {
	&dev_attr_phys_index.attr,
	&dev_attr_state.attr,
	&dev_attr_phys_device.attr,
	&dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
	&dev_attr_valid_zones.attr,
#endif
	NULL
};

static const struct attribute_group memory_memblk_attr_group = {
	.attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
	&memory_memblk_attr_group,
	NULL,
};
static int __add_memory_block(struct memory_block *memory)
{
	int ret;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
	memory->dev.release = memory_block_release;
	memory->dev.groups = memory_memblk_attr_groups;
	memory->dev.offline = memory->state == MEM_OFFLINE;

	ret = device_register(&memory->dev);
	if (ret) {
		put_device(&memory->dev);
		return ret;
	}
	ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
			      GFP_KERNEL));
	if (ret)
		device_unregister(&memory->dev);

	return ret;
}
static struct zone *early_node_zone_for_memory_block(struct memory_block *mem,
						     int nid)
{
	const unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	const unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct zone *zone, *matching_zone = NULL;
	pg_data_t *pgdat = NODE_DATA(nid);
	int i;

	/*
	 * This logic only works for early memory, when the applicable zones
	 * already span the memory block. We don't expect overlapping zones on
	 * a single node for early memory. So if we're told that some PFNs
	 * of a node fall into this memory block, we can assume that all node
	 * zones that intersect with the memory block are actually applicable.
	 * No need to look at the memmap.
	 */
	for (i = 0; i < MAX_NR_ZONES; i++) {
		zone = pgdat->node_zones + i;
		if (!populated_zone(zone))
			continue;
		if (!zone_intersects(zone, start_pfn, nr_pages))
			continue;
		if (!matching_zone) {
			matching_zone = zone;
			continue;
		}
		/* Spans multiple zones ... */
		matching_zone = NULL;
		break;
	}
	return matching_zone;
}
#ifdef CONFIG_NUMA
/**
 * memory_block_add_nid() - Indicate that system RAM falling into this memory
 *			    block device (partially) belongs to the given node.
 * @mem: The memory block device.
 * @nid: The node id.
 * @context: The memory initialization context.
 *
 * Indicate that system RAM falling into this memory block (partially) belongs
 * to the given node. If the context indicates ("early") that we are adding the
 * node during node device subsystem initialization, this will also properly
 * set/adjust mem->zone based on the zone ranges of the given node.
 */
void memory_block_add_nid(struct memory_block *mem, int nid,
			  enum meminit_context context)
{
	if (context == MEMINIT_EARLY && mem->nid != nid) {
		/*
		 * For early memory we have to determine the zone when setting
		 * the node id and handle multiple nodes spanning a single
		 * memory block by indicate via zone == NULL that we're not
		 * dealing with a single zone. So if we're setting the node id
		 * the first time, determine if there is a single zone. If we're
		 * setting the node id a second time to a different node,
		 * invalidate the single detected zone.
		 */
		if (mem->nid == NUMA_NO_NODE)
			mem->zone = early_node_zone_for_memory_block(mem, nid);
		else
			mem->zone = NULL;
	}

	/*
	 * If this memory block spans multiple nodes, we only indicate
	 * the last processed node. If we span multiple nodes (not applicable
	 * to hotplugged memory), zone == NULL will prohibit memory offlining
	 * and consequently unplug.
	 */
	mem->nid = nid;
}
#endif
static int add_memory_block(unsigned long block_id, unsigned long state,
			    struct vmem_altmap *altmap,
			    struct memory_group *group)
{
	struct memory_block *mem;
	int ret = 0;

	mem = find_memory_block_by_id(block_id);
	if (mem) {
		put_device(&mem->dev);
		return -EEXIST;
	}
	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	mem->start_section_nr = block_id * sections_per_block;
	mem->state = state;
	mem->nid = NUMA_NO_NODE;
	mem->altmap = altmap;
	INIT_LIST_HEAD(&mem->group_next);

#ifndef CONFIG_NUMA
	if (state == MEM_ONLINE)
		/*
		 * MEM_ONLINE at this point implies early memory. With NUMA,
		 * we'll determine the zone when setting the node id via
		 * memory_block_add_nid(). Memory hotplug updated the zone
		 * manually when memory onlining/offlining succeeds.
		 */
		mem->zone = early_node_zone_for_memory_block(mem, NUMA_NO_NODE);
#endif /* CONFIG_NUMA */

	ret = __add_memory_block(mem);
	if (ret)
		return ret;

	if (group) {
		mem->group = group;
		list_add(&mem->group_next, &group->memory_blocks);
	}

	return 0;
}
static int __init add_boot_memory_block(unsigned long base_section_nr)
{
	int section_count = 0;
	unsigned long nr;

	for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
	     nr++)
		if (present_section_nr(nr))
			section_count++;

	if (section_count == 0)
		return 0;
	return add_memory_block(memory_block_id(base_section_nr),
				MEM_ONLINE, NULL, NULL);
}
static int add_hotplug_memory_block(unsigned long block_id,
				    struct vmem_altmap *altmap,
				    struct memory_group *group)
{
	return add_memory_block(block_id, MEM_OFFLINE, altmap, group);
}
static void remove_memory_block(struct memory_block *memory)
{
	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
		return;

	WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);

	if (memory->group) {
		list_del(&memory->group_next);
		memory->group = NULL;
	}

	/* drop the ref. we got via find_memory_block() */
	put_device(&memory->dev);
	device_unregister(&memory->dev);
}
/*
 * Create memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * will be initialized as offline.
 *
 * Called under device_hotplug_lock.
 */
int create_memory_block_devices(unsigned long start, unsigned long size,
				struct vmem_altmap *altmap,
				struct memory_group *group)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return -EINVAL;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		ret = add_hotplug_memory_block(block_id, altmap, group);
		if (ret)
			break;
	}
	if (ret) {
		end_block_id = block_id;
		for (block_id = start_block_id; block_id != end_block_id;
		     block_id++) {
			mem = find_memory_block_by_id(block_id);
			if (WARN_ON_ONCE(!mem))
				continue;
			remove_memory_block(mem);
		}
	}
	return ret;
}
/*
 * Remove memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * have to be offline.
 *
 * Called under device_hotplug_lock.
 */
void remove_memory_block_devices(unsigned long start, unsigned long size)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (WARN_ON_ONCE(!mem))
			continue;
		num_poisoned_pages_sub(-1UL, memblk_nr_poison(mem));
		unregister_memory_block_under_nodes(mem);
		remove_memory_block(mem);
	}
}
static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
	&dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
	&dev_attr_soft_offline_page.attr,
	&dev_attr_hard_offline_page.attr,
#endif

	&dev_attr_block_size_bytes.attr,
	&dev_attr_auto_online_blocks.attr,
#ifdef CONFIG_CRASH_HOTPLUG
	&dev_attr_crash_hotplug.attr,
#endif
	NULL
};

static const struct attribute_group memory_root_attr_group = {
	.attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};
/*
 * Initialize the sysfs support for memory devices. At the time this function
 * is called, we cannot have concurrent creation/deletion of memory block
 * devices, the device_hotplug_lock is not needed.
 */
void __init memory_dev_init(void)
{
	int ret;
	unsigned long block_sz, nr;

	/* Validate the configured memory block size */
	block_sz = memory_block_size_bytes();
	if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
		panic("Memory block size not suitable: 0x%lx\n", block_sz);
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
	if (ret)
		panic("%s() failed to register subsystem: %d\n", __func__, ret);

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	for (nr = 0; nr <= __highest_present_section_nr;
	     nr += sections_per_block) {
		ret = add_boot_memory_block(nr);
		if (ret)
			panic("%s() failed to add memory block: %d\n", __func__,
			      ret);
	}
}
/**
 * walk_memory_blocks - walk through all present memory blocks overlapped
 *			by the range [start, start + size)
 *
 * @start: start address of the memory range
 * @size: size of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory section walked
 *
 * This function walks through all present memory blocks overlapped by the
 * range [start, start + size), calling func on each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 *
 * Called under device_hotplug_lock.
 */
int walk_memory_blocks(unsigned long start, unsigned long size,
		       void *arg, walk_memory_blocks_func_t func)
{
	const unsigned long start_block_id = phys_to_block_id(start);
	const unsigned long end_block_id = phys_to_block_id(start + size - 1);
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (!size)
		return 0;

	for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (!mem)
			continue;

		ret = func(mem, arg);
		put_device(&mem->dev);
		if (ret)
			break;
	}
	return ret;
}
struct for_each_memory_block_cb_data {
	walk_memory_blocks_func_t func;
	void *arg;
};

static int for_each_memory_block_cb(struct device *dev, void *data)
{
	struct memory_block *mem = to_memory_block(dev);
	struct for_each_memory_block_cb_data *cb_data = data;

	return cb_data->func(mem, cb_data->arg);
}

/**
 * for_each_memory_block - walk through all present memory blocks
 *
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks, calling func on
 * each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 */
int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
{
	struct for_each_memory_block_cb_data cb_data = {
		.func = func,
		.arg = arg,
	};

	return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
				for_each_memory_block_cb);
}
/*
 * This is an internal helper to unify allocation and initialization of
 * memory groups. Note that the passed memory group will be copied to a
 * dynamically allocated memory group. After this call, the passed
 * memory group should no longer be used.
 */
static int memory_group_register(struct memory_group group)
{
	struct memory_group *new_group;
	uint32_t mgid;
	int ret;

	if (!node_possible(group.nid))
		return -EINVAL;

	new_group = kzalloc(sizeof(group), GFP_KERNEL);
	if (!new_group)
		return -ENOMEM;
	*new_group = group;
	INIT_LIST_HEAD(&new_group->memory_blocks);

	ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b,
		       GFP_KERNEL);
	if (ret) {
		kfree(new_group);
		return ret;
	} else if (group.is_dynamic) {
		xa_set_mark(&memory_groups, mgid, MEMORY_GROUP_MARK_DYNAMIC);
	}
	return mgid;
}
/**
 * memory_group_register_static() - Register a static memory group.
 * @nid: The node id.
 * @max_pages: The maximum number of pages we'll have in this static memory
 *	       group.
 *
 * Register a new static memory group and return the memory group id.
 * All memory in the group belongs to a single unit, such as a DIMM. All
 * memory belonging to a static memory group is added in one go to be removed
 * in one go -- it's static.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if max_pages is invalid (0). Otherwise,
 * returns the new memory group id.
 */
int memory_group_register_static(int nid, unsigned long max_pages)
{
	struct memory_group group = {
		.nid = nid,
		.s = {
			.max_pages = max_pages,
		},
	};

	if (!max_pages)
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_static);
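
/*
 * Illustrative sketch, not part of this file, loosely following how a driver
 * such as dax/kmem uses static memory groups; the variable names are
 * hypothetical:
 *
 *	int mgid = memory_group_register_static(nid, PFN_UP(total_size));
 *
 *	if (mgid < 0)
 *		return mgid;
 *
 * The returned group id is then handed to the hotplug core (e.g. via
 * add_memory_driver_managed() with MHP_NID_IS_MGID) and released again with
 * memory_group_unregister(mgid) on teardown.
 */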
/**
 * memory_group_register_dynamic() - Register a dynamic memory group.
 * @nid: The node id.
 * @unit_pages: Unit in pages in which is memory added/removed in this dynamic
 *		memory group.
 *
 * Register a new dynamic memory group and return the memory group id.
 * Memory within a dynamic memory group is added/removed dynamically
 * in unit_pages.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if unit_pages is invalid (0, not a
 * power of two, smaller than a single memory block). Otherwise, returns the
 * new memory group id.
 */
int memory_group_register_dynamic(int nid, unsigned long unit_pages)
{
	struct memory_group group = {
		.nid = nid,
		.is_dynamic = true,
		.d = {
			.unit_pages = unit_pages,
		},
	};

	if (!unit_pages || !is_power_of_2(unit_pages) ||
	    unit_pages < PHYS_PFN(memory_block_size_bytes()))
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_dynamic);
/**
 * memory_group_unregister() - Unregister a memory group.
 * @mgid: the memory group id
 *
 * Unregister a memory group. If any memory block still belongs to this
 * memory group, unregistering will fail.
 *
 * Returns -EINVAL if the memory group id is invalid, returns -EBUSY if some
 * memory blocks still belong to this memory group and returns 0 if
 * unregistering succeeded.
 */
int memory_group_unregister(int mgid)
{
	struct memory_group *group;

	if (mgid < 0)
		return -EINVAL;

	group = xa_load(&memory_groups, mgid);
	if (!group)
		return -EINVAL;
	if (!list_empty(&group->memory_blocks))
		return -EBUSY;
	xa_erase(&memory_groups, mgid);
	kfree(group);
	return 0;
}
EXPORT_SYMBOL_GPL(memory_group_unregister);
/*
 * This is an internal helper only to be used in core memory hotplug code to
 * lookup a memory group. We don't care about locking, as we don't expect a
 * memory group to get unregistered while adding memory to it -- because
 * the group and the memory is managed by the same driver.
 */
struct memory_group *memory_group_find_by_id(int mgid)
{
	return xa_load(&memory_groups, mgid);
}
/*
 * This is an internal helper only to be used in core memory hotplug code to
 * walk all dynamic memory groups excluding a given memory group, either
 * belonging to a specific node, or belonging to any node.
 */
int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
			       struct memory_group *excluded, void *arg)
{
	struct memory_group *group;
	unsigned long index;
	int ret = 0;

	xa_for_each_marked(&memory_groups, index, group,
			   MEMORY_GROUP_MARK_DYNAMIC) {
		if (group == excluded)
			continue;
#ifdef CONFIG_NUMA
		if (nid != NUMA_NO_NODE && group->nid != nid)
			continue;
#endif /* CONFIG_NUMA */
		ret = func(group, arg);
		if (ret)
			break;
	}
	return ret;
}
#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
void memblk_nr_poison_inc(unsigned long pfn)
{
	const unsigned long block_id = pfn_to_block_id(pfn);
	struct memory_block *mem = find_memory_block_by_id(block_id);

	if (mem)
		atomic_long_inc(&mem->nr_hwpoison);
}

void memblk_nr_poison_sub(unsigned long pfn, long i)
{
	const unsigned long block_id = pfn_to_block_id(pfn);
	struct memory_block *mem = find_memory_block_by_id(block_id);

	if (mem)
		atomic_long_sub(i, &mem->nr_hwpoison);
}

static unsigned long memblk_nr_poison(struct memory_block *mem)
{
	return atomic_long_read(&mem
->nr_hwpoison
);