// SPDX-License-Identifier: GPL-2.0
/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/xarray.h>

#include <linux/atomic.h>
#include <linux/uaccess.h>

#define MEMORY_CLASS_NAME	"memory"

static const char *const online_type_to_str[] = {
	[MMOP_OFFLINE] = "offline",
	[MMOP_ONLINE] = "online",
	[MMOP_ONLINE_KERNEL] = "online_kernel",
	[MMOP_ONLINE_MOVABLE] = "online_movable",
};

int memhp_online_type_from_str(const char *str)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
		if (sysfs_streq(str, online_type_to_str[i]))
			return i;
	}
	return -EINVAL;
}

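/*
 * For example, a sysfs write of "online_movable\n" maps to
 * MMOP_ONLINE_MOVABLE here (sysfs_streq() ignores the trailing newline);
 * any unrecognized string yields -EINVAL.
 */
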
#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

static int sections_per_block;

static inline unsigned long base_memory_block_id(unsigned long section_nr)
{
	return section_nr / sections_per_block;
}

static inline unsigned long pfn_to_block_id(unsigned long pfn)
{
	return base_memory_block_id(pfn_to_section_nr(pfn));
}

static inline unsigned long phys_to_block_id(unsigned long phys)
{
	return pfn_to_block_id(PFN_DOWN(phys));
}

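/*
 * Worked example (illustrative numbers, not fixed by this file): with
 * 128 MiB (2^27 byte) sections and a 2 GiB block size, sections_per_block
 * is 16; physical address 0x100000000 (4 GiB) is PFN 0x100000, which is
 * section 32 and thus block 2.
 */
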
static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

static struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
	.online = memory_subsys_online,
	.offline = memory_subsys_offline,
};

/*
 * Memory blocks are cached in a local xarray to avoid
 * a costly linear search for the corresponding device on
 * the subsystem bus.
 */
static DEFINE_XARRAY(memory_blocks);

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);

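/*
 * Usage sketch (hypothetical caller, names invented for illustration):
 *
 *	static int foo_memory_callback(struct notifier_block *nb,
 *				       unsigned long action, void *arg)
 *	{
 *		struct memory_notify *mn = arg;
 *
 *		if (action == MEM_ONLINE)
 *			pr_info("onlined %lu pages at PFN %lu\n",
 *				mn->nr_pages, mn->start_pfn);
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block foo_nb = {
 *		.notifier_call = foo_memory_callback,
 *	};
 *	...
 *	register_memory_notifier(&foo_nb);
 */
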
static void memory_block_release(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	kfree(mem);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}
EXPORT_SYMBOL_GPL(memory_block_size_bytes);

/*
 * Show the first physical section index (number) of this memory block.
 */
static ssize_t phys_index_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long phys_index;

	phys_index = mem->start_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

/*
 * Legacy interface that we cannot remove. Always indicate "removable"
 * with CONFIG_MEMORY_HOTREMOVE - bad heuristic.
 */
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	return sprintf(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t state_show(struct device *dev, struct device_attribute *attr,
			  char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	ssize_t len = 0;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		len = sprintf(buf, "online\n");
		break;
	case MEM_OFFLINE:
		len = sprintf(buf, "offline\n");
		break;
	case MEM_GOING_OFFLINE:
		len = sprintf(buf, "going-offline\n");
		break;
	default:
		len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
			      mem->state);
		WARN_ON(1);
		break;
	}

	return len;
}

int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(unsigned long start_section_nr, unsigned long action,
		    int online_type, int nid)
{
	unsigned long start_pfn;
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	int ret;

	start_pfn = section_nr_to_pfn(start_section_nr);

	switch (action) {
	case MEM_ONLINE:
		ret = online_pages(start_pfn, nr_pages, online_type, nid);
		break;
	case MEM_OFFLINE:
		ret = offline_pages(start_pfn, nr_pages);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, start_section_nr, action, action);
		ret = -EINVAL;
	}

	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	if (mem->state != from_state_req)
		return -EINVAL;

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem->start_section_nr, to_state,
				  mem->online_type, mem->nid);

	mem->state = ret ? from_state_req : to_state;

	return ret;
}

/* The device lock serializes operations on memory_subsys_[online|offline] */
static int memory_subsys_online(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (mem->state == MEM_ONLINE)
		return 0;

	/*
	 * When called via device_online() without configuring the online_type,
	 * we want to default to MMOP_ONLINE.
	 */
	if (mem->online_type == MMOP_OFFLINE)
		mem->online_type = MMOP_ONLINE;

	ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	mem->online_type = MMOP_OFFLINE;

	return ret;
}

static int memory_subsys_offline(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	if (mem->state == MEM_OFFLINE)
		return 0;

	return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}

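/*
 * For reference, the resulting state machine: onlining moves
 * MEM_OFFLINE -> MEM_ONLINE; offlining moves MEM_ONLINE ->
 * MEM_GOING_OFFLINE -> MEM_OFFLINE, reverting to MEM_ONLINE if
 * offline_pages() fails.
 */
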
static ssize_t state_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	const int online_type = memhp_online_type_from_str(buf);
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (online_type < 0)
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	switch (online_type) {
	case MMOP_ONLINE_KERNEL:
	case MMOP_ONLINE_MOVABLE:
	case MMOP_ONLINE:
		/* mem->online_type is protected by device_hotplug_lock */
		mem->online_type = online_type;
		ret = device_online(&mem->dev);
		break;
	case MMOP_OFFLINE:
		ret = device_offline(&mem->dev);
		break;
	default:
		ret = -EINVAL; /* should never happen */
	}

	unlock_device_hotplug();

	if (ret < 0)
		return ret;
	if (ret)
		return -EINVAL;

	return count;
}

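/*
 * Example (sysfs ABI): from userspace,
 *
 *	echo online_movable > /sys/devices/system/memory/memoryN/state
 *
 * lands here with buf == "online_movable" and onlines block N into
 * ZONE_MOVABLE; writing "offline" takes the device_offline() path.
 */
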
/*
 * phys_device is a bad name for this.  What I really want
 * is a way to differentiate between memory ranges that
 * are part of physical devices that constitute
 * a complete removable unit or FRU.
 * i.e. do these ranges belong to the same physical device,
 * s.t. if I offline all of these sections I can then
 * remove the physical device?
 */
static ssize_t phys_device_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	return sprintf(buf, "%d\n", mem->phys_device);
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static void print_allowed_zone(char *buf, int nid, unsigned long start_pfn,
			       unsigned long nr_pages, int online_type,
			       struct zone *default_zone)
{
	struct zone *zone;

	zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
	if (zone != default_zone) {
		strcat(buf, " ");
		strcat(buf, zone->name);
	}
}

static ssize_t valid_zones_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct zone *default_zone;
	int nid;

	/*
	 * Check the existing zone. Make sure that we do that only on the
	 * online nodes, otherwise the page_zone is not reliable.
	 */
	if (mem->state == MEM_ONLINE) {
		/*
		 * A block that contains more than one zone can not be
		 * offlined. This can happen e.g. for ZONE_DMA and ZONE_DMA32.
		 */
		default_zone = test_pages_in_a_zone(start_pfn,
						    start_pfn + nr_pages);
		if (!default_zone)
			return sprintf(buf, "none\n");
		strcat(buf, default_zone->name);
		goto out;
	}

	nid = mem->nid;
	default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, start_pfn,
					  nr_pages);
	strcat(buf, default_zone->name);

	print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL,
			   default_zone);
	print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE,
			   default_zone);
out:
	strcat(buf, "\n");

	return strlen(buf);
}
static DEVICE_ATTR_RO(valid_zones);
#endif

static DEVICE_ATTR_RO(phys_index);
static DEVICE_ATTR_RW(state);
static DEVICE_ATTR_RO(phys_device);
static DEVICE_ATTR_RO(removable);

/*
 * Show the memory block size (shared by all memory blocks).
 */
static ssize_t block_size_bytes_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	return sprintf(buf, "%lx\n", memory_block_size_bytes());
}

static DEVICE_ATTR_RO(block_size_bytes);

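/*
 * Example: "cat /sys/devices/system/memory/block_size_bytes" prints
 * unprefixed hex, e.g. "8000000" for a 128 MiB block size.
 */
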
/*
 * Memory auto online policy.
 */

static ssize_t auto_online_blocks_show(struct device *dev,
				       struct device_attribute *attr, char *buf)
{
	return sprintf(buf, "%s\n",
		       online_type_to_str[memhp_default_online_type]);
}

static ssize_t auto_online_blocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf, size_t count)
{
	const int online_type = memhp_online_type_from_str(buf);

	if (online_type < 0)
		return -EINVAL;

	memhp_default_online_type = online_type;
	return count;
}

static DEVICE_ATTR_RW(auto_online_blocks);

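/*
 * Example (sysfs ABI): "echo online_movable >
 * /sys/devices/system/memory/auto_online_blocks" makes newly hot-added
 * memory blocks get onlined automatically into ZONE_MOVABLE.
 */
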
/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace.  The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	ret = kstrtoull(buf, 0, &phys_addr);
	if (ret)
		return ret;

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	nid = memory_add_physaddr_to_nid(phys_addr);
	ret = __add_memory(nid, phys_addr,
			   MIN_MEMORY_BLOCK_SIZE * sections_per_block);

	if (ret)
		goto out;

	ret = count;
out:
	unlock_device_hotplug();
	return ret;
}

static DEVICE_ATTR_WO(probe);
#endif

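/*
 * Example (sysfs ABI, illustrative address): with the probe interface
 * built in, "echo 0x100000000 > /sys/devices/system/memory/probe"
 * hot-adds one memory block starting at the 4 GiB boundary; the address
 * must be block-aligned or -EINVAL is returned.
 */
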
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t soft_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = soft_offline_page(pfn, 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t hard_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, 0);
	return ret ? ret : count;
}

static DEVICE_ATTR_WO(soft_offline_page);
static DEVICE_ATTR_WO(hard_offline_page);
#endif

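/*
 * Example (testing aid, illustrative address): writing a physical
 * address, e.g. "echo 0x2000000 >
 * /sys/devices/system/memory/soft_offline_page", migrates that page's
 * contents away and removes it from use; hard_offline_page feeds the
 * same input to memory_failure(), which may kill processes mapping the
 * page.
 */
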
/*
 * Note that phys_device is optional.  It is here to allow for
 * differentiation between which *physical* devices each
 * section belongs to...
 */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

/*
 * A reference for the returned memory block device is acquired.
 *
 * Called under device_hotplug_lock.
 */
static struct memory_block *find_memory_block_by_id(unsigned long block_id)
{
	struct memory_block *mem;

	mem = xa_load(&memory_blocks, block_id);
	if (mem)
		get_device(&mem->dev);
	return mem;
}

/*
 * Called under device_hotplug_lock.
 */
struct memory_block *find_memory_block(struct mem_section *section)
{
	unsigned long block_id = base_memory_block_id(__section_nr(section));

	return find_memory_block_by_id(block_id);
}

static struct attribute *memory_memblk_attrs[] = {
	&dev_attr_phys_index.attr,
	&dev_attr_state.attr,
	&dev_attr_phys_device.attr,
	&dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
	&dev_attr_valid_zones.attr,
#endif
	NULL
};

static struct attribute_group memory_memblk_attr_group = {
	.attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
	&memory_memblk_attr_group,
	NULL,
};

/*
 * register_memory - Setup a sysfs device for a memory block
 */
static int register_memory(struct memory_block *memory)
{
	int ret;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
	memory->dev.release = memory_block_release;
	memory->dev.groups = memory_memblk_attr_groups;
	memory->dev.offline = memory->state == MEM_OFFLINE;

	ret = device_register(&memory->dev);
	if (ret) {
		put_device(&memory->dev);
		return ret;
	}
	ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
			      GFP_KERNEL));
	if (ret) {
		put_device(&memory->dev);
		device_unregister(&memory->dev);
	}
	return ret;
}

static int init_memory_block(struct memory_block **memory,
			     unsigned long block_id, unsigned long state)
{
	struct memory_block *mem;
	unsigned long start_pfn;
	int ret = 0;

	mem = find_memory_block_by_id(block_id);
	if (mem) {
		put_device(&mem->dev);
		return -EEXIST;
	}
	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	mem->start_section_nr = block_id * sections_per_block;
	mem->state = state;
	start_pfn = section_nr_to_pfn(mem->start_section_nr);
	mem->phys_device = arch_get_memory_phys_device(start_pfn);
	mem->nid = NUMA_NO_NODE;

	ret = register_memory(mem);

	*memory = mem;
	return ret;
}

static int add_memory_block(unsigned long base_section_nr)
{
	int section_count = 0;
	struct memory_block *mem;
	unsigned long nr;

	for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
	     nr++)
		if (present_section_nr(nr))
			section_count++;

	if (section_count == 0)
		return 0;
	return init_memory_block(&mem, base_memory_block_id(base_section_nr),
				 MEM_ONLINE);
}

static void unregister_memory(struct memory_block *memory)
{
	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
		return;

	WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);

	/* drop the ref. we got via find_memory_block() */
	put_device(&memory->dev);
	device_unregister(&memory->dev);
}

/*
 * Create memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * will be initialized as offline.
 *
 * Called under device_hotplug_lock.
 */
int create_memory_block_devices(unsigned long start, unsigned long size)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return -EINVAL;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		ret = init_memory_block(&mem, block_id, MEM_OFFLINE);
		if (ret)
			break;
	}
	if (ret) {
		end_block_id = block_id;
		for (block_id = start_block_id; block_id != end_block_id;
		     block_id++) {
			mem = find_memory_block_by_id(block_id);
			if (WARN_ON_ONCE(!mem))
				continue;
			unregister_memory(mem);
		}
	}
	return ret;
}

/*
 * Remove memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * have to be offline.
 *
 * Called under device_hotplug_lock.
 */
void remove_memory_block_devices(unsigned long start, unsigned long size)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (WARN_ON_ONCE(!mem))
			continue;
		unregister_memory_block_under_nodes(mem);
		unregister_memory(mem);
	}
}

/* return true if the memory block is offlined, otherwise, return false */
bool is_memblock_offlined(struct memory_block *mem)
{
	return mem->state == MEM_OFFLINE;
}

static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
	&dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
	&dev_attr_soft_offline_page.attr,
	&dev_attr_hard_offline_page.attr,
#endif

	&dev_attr_block_size_bytes.attr,
	&dev_attr_auto_online_blocks.attr,
	NULL
};

static struct attribute_group memory_root_attr_group = {
	.attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};

/*
 * Initialize the sysfs support for memory devices. At the time this function
 * is called, we cannot have concurrent creation/deletion of memory block
 * devices, the device_hotplug_lock is not needed.
 */
void __init memory_dev_init(void)
{
	int ret;
	unsigned long block_sz, nr;

	/* Validate the configured memory block size */
	block_sz = memory_block_size_bytes();
	if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
		panic("Memory block size not suitable: 0x%lx\n", block_sz);
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
	if (ret)
		panic("%s() failed to register subsystem: %d\n", __func__, ret);

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	for (nr = 0; nr <= __highest_present_section_nr;
	     nr += sections_per_block) {
		ret = add_memory_block(nr);
		if (ret)
			panic("%s() failed to add memory block: %d\n", __func__,
			      ret);
	}
}

/**
 * walk_memory_blocks - walk through all present memory blocks overlapped
 *			by the range [start, start + size)
 *
 * @start: start address of the memory range
 * @size: size of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks overlapped by the
 * range [start, start + size), calling func on each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 *
 * Called under device_hotplug_lock.
 */
int walk_memory_blocks(unsigned long start, unsigned long size,
		       void *arg, walk_memory_blocks_func_t func)
{
	const unsigned long start_block_id = phys_to_block_id(start);
	const unsigned long end_block_id = phys_to_block_id(start + size - 1);
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (!size)
		return 0;

	for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (!mem)
			continue;

		ret = func(mem, arg);
		put_device(&mem->dev);
		if (ret)
			break;
	}
	return ret;
}

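/*
 * Usage sketch (hypothetical callback, for illustration): counting the
 * present blocks that overlap a physical range:
 *
 *	static int count_blocks_cb(struct memory_block *mem, void *arg)
 *	{
 *		(*(unsigned long *)arg)++;
 *		return 0;
 *	}
 *	...
 *	unsigned long nr = 0;
 *	walk_memory_blocks(start, size, &nr, count_blocks_cb);
 */
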
struct for_each_memory_block_cb_data {
	walk_memory_blocks_func_t func;
	void *arg;
};

static int for_each_memory_block_cb(struct device *dev, void *data)
{
	struct memory_block *mem = to_memory_block(dev);
	struct for_each_memory_block_cb_data *cb_data = data;

	return cb_data->func(mem, cb_data->arg);
}

/**
 * for_each_memory_block - walk through all present memory blocks
 *
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks, calling func on
 * each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 */
int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
{
	struct for_each_memory_block_cb_data cb_data = {
		.func = func,
		.arg = arg,
	};

	return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
				for_each_memory_block_cb);
}