// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Virtio-mem device driver.
 *
 * Copyright Red Hat, Inc. 2020
 *
 * Author(s): David Hildenbrand <david@redhat.com>
 */

#include <linux/virtio.h>
#include <linux/virtio_mem.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/memory_hotplug.h>
#include <linux/memory.h>
#include <linux/hrtimer.h>
#include <linux/crash_dump.h>
#include <linux/mutex.h>
#include <linux/bitmap.h>
#include <linux/lockdep.h>

#include <acpi/acpi_numa.h>
static bool unplug_online = true;
module_param(unplug_online, bool, 0644);
MODULE_PARM_DESC(unplug_online, "Try to unplug online memory");
enum virtio_mem_mb_state {
        /* Unplugged, not added to Linux. Can be reused later. */
        VIRTIO_MEM_MB_STATE_UNUSED = 0,
        /* (Partially) plugged, not added to Linux. Error on add_memory(). */
        VIRTIO_MEM_MB_STATE_PLUGGED,
        /* Fully plugged, fully added to Linux, offline. */
        VIRTIO_MEM_MB_STATE_OFFLINE,
        /* Partially plugged, fully added to Linux, offline. */
        VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL,
        /* Fully plugged, fully added to Linux, online (!ZONE_MOVABLE). */
        VIRTIO_MEM_MB_STATE_ONLINE,
        /* Partially plugged, fully added to Linux, online (!ZONE_MOVABLE). */
        VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL,
        /*
         * Fully plugged, fully added to Linux, online (ZONE_MOVABLE).
         * We are not allowed to allocate (unplug) parts of this block that
         * are not movable (similar to gigantic pages). We will never allow
         * to online OFFLINE_PARTIAL to ZONE_MOVABLE (as they would contain
         * unmovable parts).
         */
        VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE,
        VIRTIO_MEM_MB_STATE_COUNT
};
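
/*
 * Rough lifecycle sketch (editor's note, derived from the functions below):
 * plugging moves UNUSED -> OFFLINE or OFFLINE_PARTIAL via
 * virtio_mem_mb_plug_and_add(), the memory notifier toggles OFFLINE* <->
 * ONLINE* as blocks get onlined/offlined, and fully unplugged blocks are
 * removed and return to UNUSED. PLUGGED is only entered when add_memory()
 * failed and the subblocks could not be unplugged again.
 */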
struct virtio_mem {
        struct virtio_device *vdev;

        /* We might first have to unplug all memory when starting up. */
        bool unplug_all_required;

        /* Workqueue that processes the plug/unplug requests. */
        struct work_struct wq;
        atomic_t config_changed;

        /* Virtqueue for guest->host requests. */
        struct virtqueue *vq;

        /* Wait for a host response to a guest request. */
        wait_queue_head_t host_resp;

        /* Space for one guest request and the host response. */
        struct virtio_mem_req req;
        struct virtio_mem_resp resp;

        /* The current size of the device. */
        uint64_t plugged_size;
        /* The requested size of the device. */
        uint64_t requested_size;

        /* The device block size (for communicating with the device). */
        uint64_t device_block_size;
        /* The translated node id. NUMA_NO_NODE in case not specified. */
        int nid;
        /* Physical start address of the memory region. */
        uint64_t addr;
        /* Maximum region size in bytes. */
        uint64_t region_size;

        /* The subblock size. */
        uint64_t subblock_size;
        /* The number of subblocks per memory block. */
        uint32_t nb_sb_per_mb;

        /* Id of the first memory block of this device. */
        unsigned long first_mb_id;
        /* Id of the last memory block of this device. */
        unsigned long last_mb_id;
        /* Id of the last usable memory block of this device. */
        unsigned long last_usable_mb_id;
        /* Id of the next memory block to prepare when needed. */
        unsigned long next_mb_id;

        /* The parent resource for all memory added via this device. */
        struct resource *parent_resource;
        /*
         * Copy of "System RAM (virtio_mem)" to be used for
         * add_memory_driver_managed().
         */
        const char *resource_name;

        /* Summary of all memory block states. */
        unsigned long nb_mb_state[VIRTIO_MEM_MB_STATE_COUNT];
#define VIRTIO_MEM_NB_OFFLINE_THRESHOLD         10

        /*
         * One byte state per memory block.
         *
         * Allocated via vmalloc(). When preparing new blocks, resized
         * (alloc+copy+free) when needed (crossing pages with the next mb).
         *
         * With 128MB memory blocks, we have states for 512GB of memory in one
         * page.
         */
        uint8_t *mb_state;

        /*
         * $nb_sb_per_mb bits per memory block. Handled similarly to mb_state.
         *
         * With 4MB subblocks, we manage 128GB of memory in one page.
         */
        unsigned long *sb_bitmap;

        /*
         * Mutex that protects the nb_mb_state, mb_state, and sb_bitmap.
         *
         * When this lock is held the pointers can't change, ONLINE and
         * OFFLINE blocks can't change the state and no subblocks will get
         * plugged/unplugged.
         */
        struct mutex hotplug_mutex;
        bool hotplug_active;

        /* An error occurred we cannot handle - stop processing requests. */
        bool broken;

        /* The driver is being removed. */
        spinlock_t removal_lock;
        bool removing;

        /* Timer for retrying to plug/unplug memory. */
        struct hrtimer retry_timer;
        unsigned int retry_timer_ms;
#define VIRTIO_MEM_RETRY_TIMER_MIN_MS           50000
#define VIRTIO_MEM_RETRY_TIMER_MAX_MS           300000

        /* Memory notifier (online/offline events). */
        struct notifier_block memory_notifier;

        /* Next device in the list of virtio-mem devices. */
        struct list_head next;
};
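
/*
 * Worked sizing example (editor's note, assuming 4 KiB pages): mb_state is
 * one byte per memory block, so one page tracks 4096 blocks - with 128 MiB
 * memory blocks that is 512 GiB. sb_bitmap is one bit per subblock, so one
 * page holds 32768 bits - with 32 subblocks (4 MiB each) per 128 MiB block
 * that covers 1024 blocks, i.e., 128 GiB, matching the comments above.
 */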
/*
 * We have to share a single online_page callback among all virtio-mem
 * devices. We use RCU to iterate the list in the callback.
 */
static DEFINE_MUTEX(virtio_mem_mutex);
static LIST_HEAD(virtio_mem_devices);
static void virtio_mem_online_page_cb(struct page *page, unsigned int order);
/*
 * Register a virtio-mem device so it will be considered for the online_page
 * callback.
 */
static int register_virtio_mem_device(struct virtio_mem *vm)
{
        int rc = 0;

        /* First device registers the callback. */
        mutex_lock(&virtio_mem_mutex);
        if (list_empty(&virtio_mem_devices))
                rc = set_online_page_callback(&virtio_mem_online_page_cb);
        if (!rc)
                list_add_rcu(&vm->next, &virtio_mem_devices);
        mutex_unlock(&virtio_mem_mutex);

        return rc;
}
/*
 * Unregister a virtio-mem device so it will no longer be considered for the
 * online_page callback.
 */
static void unregister_virtio_mem_device(struct virtio_mem *vm)
{
        /* Last device unregisters the callback. */
        mutex_lock(&virtio_mem_mutex);
        list_del_rcu(&vm->next);
        if (list_empty(&virtio_mem_devices))
                restore_online_page_callback(&virtio_mem_online_page_cb);
        mutex_unlock(&virtio_mem_mutex);

        /* wait until callbacks finished */
        synchronize_rcu();
}
/*
 * Calculate the memory block id of a given address.
 */
static unsigned long virtio_mem_phys_to_mb_id(unsigned long addr)
{
        return addr / memory_block_size_bytes();
}
/*
 * Calculate the physical start address of a given memory block id.
 */
static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id)
{
        return mb_id * memory_block_size_bytes();
}
/*
 * Calculate the subblock id of a given address.
 */
static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm,
                                              unsigned long addr)
{
        const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr);
        const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id);

        return (addr - mb_addr) / vm->subblock_size;
}
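
/*
 * Example (editor's note): with 128 MiB (0x8000000) memory blocks and 4 MiB
 * (0x400000) subblocks, addr = 0x10c00000 yields mb_id = 2,
 * mb_addr = 0x10000000 and sb_id = 0xc00000 / 0x400000 = 3.
 */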
/*
 * Set the state of a memory block, taking care of the state counter.
 */
static void virtio_mem_mb_set_state(struct virtio_mem *vm, unsigned long mb_id,
                                    enum virtio_mem_mb_state state)
{
        const unsigned long idx = mb_id - vm->first_mb_id;
        enum virtio_mem_mb_state old_state;

        old_state = vm->mb_state[idx];
        vm->mb_state[idx] = state;

        BUG_ON(vm->nb_mb_state[old_state] == 0);
        vm->nb_mb_state[old_state]--;
        vm->nb_mb_state[state]++;
}
/*
 * Get the state of a memory block.
 */
static enum virtio_mem_mb_state virtio_mem_mb_get_state(struct virtio_mem *vm,
                                                        unsigned long mb_id)
{
        const unsigned long idx = mb_id - vm->first_mb_id;

        return vm->mb_state[idx];
}
/*
 * Prepare the state array for the next memory block.
 */
static int virtio_mem_mb_state_prepare_next_mb(struct virtio_mem *vm)
{
        unsigned long old_bytes = vm->next_mb_id - vm->first_mb_id + 1;
        unsigned long new_bytes = vm->next_mb_id - vm->first_mb_id + 2;
        int old_pages = PFN_UP(old_bytes);
        int new_pages = PFN_UP(new_bytes);
        uint8_t *new_mb_state;

        if (vm->mb_state && old_pages == new_pages)
                return 0;

        new_mb_state = vzalloc(new_pages * PAGE_SIZE);
        if (!new_mb_state)
                return -ENOMEM;

        mutex_lock(&vm->hotplug_mutex);
        if (vm->mb_state)
                memcpy(new_mb_state, vm->mb_state, old_pages * PAGE_SIZE);
        vfree(vm->mb_state);
        vm->mb_state = new_mb_state;
        mutex_unlock(&vm->hotplug_mutex);

        return 0;
}
#define virtio_mem_for_each_mb_state(_vm, _mb_id, _state) \
        for (_mb_id = _vm->first_mb_id; \
             _mb_id < _vm->next_mb_id && _vm->nb_mb_state[_state]; \
             _mb_id++) \
                if (virtio_mem_mb_get_state(_vm, _mb_id) == _state)

#define virtio_mem_for_each_mb_state_rev(_vm, _mb_id, _state) \
        for (_mb_id = _vm->next_mb_id - 1; \
             _mb_id >= _vm->first_mb_id && _vm->nb_mb_state[_state]; \
             _mb_id--) \
                if (virtio_mem_mb_get_state(_vm, _mb_id) == _state)
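
/*
 * Usage sketch (editor's note): the loop body only runs for memory blocks in
 * the requested state, e.g.:
 *
 *      virtio_mem_for_each_mb_state(vm, mb_id, VIRTIO_MEM_MB_STATE_PLUGGED) {
 *              rc = virtio_mem_mb_unplug(vm, mb_id);
 *              ...
 *      }
 *
 * The _vm->nb_mb_state[_state] condition lets the loop terminate early once
 * no block in that state remains.
 */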
/*
 * Mark all selected subblocks plugged.
 *
 * Will not modify the state of the memory block.
 */
static void virtio_mem_mb_set_sb_plugged(struct virtio_mem *vm,
                                         unsigned long mb_id, int sb_id,
                                         int count)
{
        const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;

        __bitmap_set(vm->sb_bitmap, bit, count);
}
/*
 * Mark all selected subblocks unplugged.
 *
 * Will not modify the state of the memory block.
 */
static void virtio_mem_mb_set_sb_unplugged(struct virtio_mem *vm,
                                           unsigned long mb_id, int sb_id,
                                           int count)
{
        const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;

        __bitmap_clear(vm->sb_bitmap, bit, count);
}
/*
 * Test if all selected subblocks are plugged.
 */
static bool virtio_mem_mb_test_sb_plugged(struct virtio_mem *vm,
                                          unsigned long mb_id, int sb_id,
                                          int count)
{
        const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;

        if (count == 1)
                return test_bit(bit, vm->sb_bitmap);

        /* TODO: Helper similar to bitmap_set() */
        return find_next_zero_bit(vm->sb_bitmap, bit + count, bit) >=
               bit + count;
}
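
/*
 * Editor's note on the bitmap trick above: subblocks [sb_id, sb_id + count)
 * map to bits [bit, bit + count). find_next_zero_bit(map, bit + count, bit)
 * only scans that window; a result >= bit + count means no zero bit was
 * found, i.e., all selected subblocks are plugged.
 */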
/*
 * Test if all selected subblocks are unplugged.
 */
static bool virtio_mem_mb_test_sb_unplugged(struct virtio_mem *vm,
                                            unsigned long mb_id, int sb_id,
                                            int count)
{
        const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;

        /* TODO: Helper similar to bitmap_set() */
        return find_next_bit(vm->sb_bitmap, bit + count, bit) >= bit + count;
}
/*
 * Find the first unplugged subblock. Returns vm->nb_sb_per_mb in case there is
 * none.
 */
static int virtio_mem_mb_first_unplugged_sb(struct virtio_mem *vm,
                                            unsigned long mb_id)
{
        const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb;

        return find_next_zero_bit(vm->sb_bitmap, bit + vm->nb_sb_per_mb, bit) -
               bit;
}
/*
 * Prepare the subblock bitmap for the next memory block.
 */
static int virtio_mem_sb_bitmap_prepare_next_mb(struct virtio_mem *vm)
{
        const unsigned long old_nb_mb = vm->next_mb_id - vm->first_mb_id;
        const unsigned long old_nb_bits = old_nb_mb * vm->nb_sb_per_mb;
        const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->nb_sb_per_mb;
        int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long));
        int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long));
        unsigned long *new_sb_bitmap, *old_sb_bitmap;

        if (vm->sb_bitmap && old_pages == new_pages)
                return 0;

        new_sb_bitmap = vzalloc(new_pages * PAGE_SIZE);
        if (!new_sb_bitmap)
                return -ENOMEM;

        mutex_lock(&vm->hotplug_mutex);
        if (vm->sb_bitmap)
                memcpy(new_sb_bitmap, vm->sb_bitmap, old_pages * PAGE_SIZE);

        old_sb_bitmap = vm->sb_bitmap;
        vm->sb_bitmap = new_sb_bitmap;
        mutex_unlock(&vm->hotplug_mutex);

        vfree(old_sb_bitmap);
        return 0;
}
/*
 * Try to add a memory block to Linux. This will usually only fail
 * if out of memory.
 *
 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 * onlining code).
 *
 * Will not modify the state of the memory block.
 */
static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id)
{
        const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
        int nid = vm->nid;

        if (nid == NUMA_NO_NODE)
                nid = memory_add_physaddr_to_nid(addr);

        /*
         * When force-unloading the driver and we still have memory added to
         * Linux, the resource name has to stay.
         */
        if (!vm->resource_name) {
                vm->resource_name = kstrdup_const("System RAM (virtio_mem)",
                                                  GFP_KERNEL);
                if (!vm->resource_name)
                        return -ENOMEM;
        }

        dev_dbg(&vm->vdev->dev, "adding memory block: %lu\n", mb_id);
        return add_memory_driver_managed(nid, addr, memory_block_size_bytes(),
                                         vm->resource_name);
}
/*
 * Try to remove a memory block from Linux. Will only fail if the memory block
 * is not offline.
 *
 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 * onlining code).
 *
 * Will not modify the state of the memory block.
 */
static int virtio_mem_mb_remove(struct virtio_mem *vm, unsigned long mb_id)
{
        const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
        int nid = vm->nid;

        if (nid == NUMA_NO_NODE)
                nid = memory_add_physaddr_to_nid(addr);

        dev_dbg(&vm->vdev->dev, "removing memory block: %lu\n", mb_id);
        return remove_memory(nid, addr, memory_block_size_bytes());
}
/*
 * Try to offline and remove a memory block from Linux.
 *
 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 * onlining code).
 *
 * Will not modify the state of the memory block.
 */
static int virtio_mem_mb_offline_and_remove(struct virtio_mem *vm,
                                            unsigned long mb_id)
{
        const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
        int nid = vm->nid;

        if (nid == NUMA_NO_NODE)
                nid = memory_add_physaddr_to_nid(addr);

        dev_dbg(&vm->vdev->dev, "offlining and removing memory block: %lu\n",
                mb_id);
        return offline_and_remove_memory(nid, addr, memory_block_size_bytes());
}
/*
 * Trigger the workqueue so the device can perform its magic.
 */
static void virtio_mem_retry(struct virtio_mem *vm)
{
        unsigned long flags;

        spin_lock_irqsave(&vm->removal_lock, flags);
        if (!vm->removing)
                queue_work(system_freezable_wq, &vm->wq);
        spin_unlock_irqrestore(&vm->removal_lock, flags);
}
static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id)
{
        int node = NUMA_NO_NODE;

#if defined(CONFIG_ACPI_NUMA)
        if (virtio_has_feature(vm->vdev, VIRTIO_MEM_F_ACPI_PXM))
                node = pxm_to_node(node_id);
#endif
        return node;
}
/*
 * Test if a virtio-mem device overlaps with the given range. Can be called
 * from (notifier) callbacks lockless.
 */
static bool virtio_mem_overlaps_range(struct virtio_mem *vm,
                                      unsigned long start, unsigned long size)
{
        unsigned long dev_start = virtio_mem_mb_id_to_phys(vm->first_mb_id);
        unsigned long dev_end = virtio_mem_mb_id_to_phys(vm->last_mb_id) +
                                memory_block_size_bytes();

        return start < dev_end && dev_start < start + size;
}
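
/*
 * Editor's note: this is the usual half-open interval overlap test - the
 * device spans [dev_start, dev_end) and intersects [start, start + size)
 * iff start < dev_end && dev_start < start + size.
 */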
/*
 * Test if a virtio-mem device owns a memory block. Can be called from
 * (notifier) callbacks lockless.
 */
static bool virtio_mem_owned_mb(struct virtio_mem *vm, unsigned long mb_id)
{
        return mb_id >= vm->first_mb_id && mb_id <= vm->last_mb_id;
}
static int virtio_mem_notify_going_online(struct virtio_mem *vm,
                                          unsigned long mb_id,
                                          enum zone_type zone)
{
        switch (virtio_mem_mb_get_state(vm, mb_id)) {
        case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL:
                /*
                 * We won't allow to online a partially plugged memory block
                 * to the MOVABLE zone - it would contain unmovable parts.
                 */
                if (zone == ZONE_MOVABLE) {
                        dev_warn_ratelimited(&vm->vdev->dev,
                                             "memory block has holes, MOVABLE not supported\n");
                        return NOTIFY_BAD;
                }
                return NOTIFY_OK;
        case VIRTIO_MEM_MB_STATE_OFFLINE:
                return NOTIFY_OK;
        default:
                break;
        }
        dev_warn_ratelimited(&vm->vdev->dev,
                             "memory block onlining denied\n");
        return NOTIFY_BAD;
}
static void virtio_mem_notify_offline(struct virtio_mem *vm,
                                      unsigned long mb_id)
{
        switch (virtio_mem_mb_get_state(vm, mb_id)) {
        case VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL:
                virtio_mem_mb_set_state(vm, mb_id,
                                        VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL);
                break;
        case VIRTIO_MEM_MB_STATE_ONLINE:
        case VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE:
                virtio_mem_mb_set_state(vm, mb_id,
                                        VIRTIO_MEM_MB_STATE_OFFLINE);
                break;
        default:
                BUG();
                break;
        }

        /*
         * Trigger the workqueue, maybe we can now unplug memory. Also,
         * when we offline and remove a memory block, this will re-trigger
         * us immediately - which is often nice because the removal of
         * the memory block (e.g., memmap) might have freed up memory
         * on other memory blocks we manage.
         */
        virtio_mem_retry(vm);
}
static void virtio_mem_notify_online(struct virtio_mem *vm, unsigned long mb_id,
                                     enum zone_type zone)
{
        unsigned long nb_offline;

        switch (virtio_mem_mb_get_state(vm, mb_id)) {
        case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL:
                BUG_ON(zone == ZONE_MOVABLE);
                virtio_mem_mb_set_state(vm, mb_id,
                                        VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL);
                break;
        case VIRTIO_MEM_MB_STATE_OFFLINE:
                if (zone == ZONE_MOVABLE)
                        virtio_mem_mb_set_state(vm, mb_id,
                                        VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE);
                else
                        virtio_mem_mb_set_state(vm, mb_id,
                                                VIRTIO_MEM_MB_STATE_ONLINE);
                break;
        default:
                BUG();
                break;
        }

        nb_offline = vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] +
                     vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL];

        /* see if we can add new blocks now that we onlined one block */
        if (nb_offline == VIRTIO_MEM_NB_OFFLINE_THRESHOLD - 1)
                virtio_mem_retry(vm);
}
static void virtio_mem_notify_going_offline(struct virtio_mem *vm,
                                            unsigned long mb_id)
{
        const unsigned long nr_pages = PFN_DOWN(vm->subblock_size);
        struct page *page;
        unsigned long pfn;
        int sb_id, i;

        for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) {
                if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
                        continue;
                /*
                 * Drop our reference to the pages so the memory can get
                 * offlined and add the unplugged pages to the managed
                 * page counters (so offlining code can correctly subtract
                 * them again).
                 */
                pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
                               sb_id * vm->subblock_size);
                adjust_managed_page_count(pfn_to_page(pfn), nr_pages);
                for (i = 0; i < nr_pages; i++) {
                        page = pfn_to_page(pfn + i);
                        if (WARN_ON(!page_ref_dec_and_test(page)))
                                dump_page(page, "unplugged page referenced");
                }
        }
}
static void virtio_mem_notify_cancel_offline(struct virtio_mem *vm,
                                             unsigned long mb_id)
{
        const unsigned long nr_pages = PFN_DOWN(vm->subblock_size);
        unsigned long pfn;
        int sb_id, i;

        for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) {
                if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
                        continue;
                /*
                 * Get the reference we dropped when going offline and
                 * subtract the unplugged pages from the managed page
                 * counters.
                 */
                pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
                               sb_id * vm->subblock_size);
                adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
                for (i = 0; i < nr_pages; i++)
                        page_ref_inc(pfn_to_page(pfn + i));
        }
}
/*
 * This callback will either be called synchronously from add_memory() or
 * asynchronously (e.g., triggered via user space). We have to be careful
 * with locking when calling add_memory().
 */
static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
                                         unsigned long action, void *arg)
{
        struct virtio_mem *vm = container_of(nb, struct virtio_mem,
                                             memory_notifier);
        struct memory_notify *mhp = arg;
        const unsigned long start = PFN_PHYS(mhp->start_pfn);
        const unsigned long size = PFN_PHYS(mhp->nr_pages);
        const unsigned long mb_id = virtio_mem_phys_to_mb_id(start);
        enum zone_type zone;
        int rc = NOTIFY_OK;

        if (!virtio_mem_overlaps_range(vm, start, size))
                return NOTIFY_DONE;

        /*
         * Memory is onlined/offlined in memory block granularity. We cannot
         * cross virtio-mem device boundaries and memory block boundaries. Bail
         * out if this ever changes.
         */
        if (WARN_ON_ONCE(size != memory_block_size_bytes() ||
                         !IS_ALIGNED(start, memory_block_size_bytes())))
                return NOTIFY_BAD;

        /*
         * Avoid circular locking lockdep warnings. We lock the mutex
         * e.g., in MEM_GOING_ONLINE and unlock it in MEM_ONLINE. The
         * blocking_notifier_call_chain() has its own lock, which gets unlocked
         * between both notifier calls and will bail out. False positive.
         */
        lockdep_off();

        switch (action) {
        case MEM_GOING_OFFLINE:
                mutex_lock(&vm->hotplug_mutex);
                if (vm->removing) {
                        rc = notifier_from_errno(-EBUSY);
                        mutex_unlock(&vm->hotplug_mutex);
                        break;
                }
                vm->hotplug_active = true;
                virtio_mem_notify_going_offline(vm, mb_id);
                break;
        case MEM_GOING_ONLINE:
                mutex_lock(&vm->hotplug_mutex);
                if (vm->removing) {
                        rc = notifier_from_errno(-EBUSY);
                        mutex_unlock(&vm->hotplug_mutex);
                        break;
                }
                vm->hotplug_active = true;
                zone = page_zonenum(pfn_to_page(mhp->start_pfn));
                rc = virtio_mem_notify_going_online(vm, mb_id, zone);
                break;
        case MEM_OFFLINE:
                virtio_mem_notify_offline(vm, mb_id);
                vm->hotplug_active = false;
                mutex_unlock(&vm->hotplug_mutex);
                break;
        case MEM_ONLINE:
                zone = page_zonenum(pfn_to_page(mhp->start_pfn));
                virtio_mem_notify_online(vm, mb_id, zone);
                vm->hotplug_active = false;
                mutex_unlock(&vm->hotplug_mutex);
                break;
        case MEM_CANCEL_OFFLINE:
                if (!vm->hotplug_active)
                        break;
                virtio_mem_notify_cancel_offline(vm, mb_id);
                vm->hotplug_active = false;
                mutex_unlock(&vm->hotplug_mutex);
                break;
        case MEM_CANCEL_ONLINE:
                if (!vm->hotplug_active)
                        break;
                vm->hotplug_active = false;
                mutex_unlock(&vm->hotplug_mutex);
                break;
        default:
                break;
        }

        lockdep_on();

        return rc;
}
/*
 * Set a range of pages PG_offline. Remember pages that were never onlined
 * (via generic_online_page()) using PageDirty().
 */
static void virtio_mem_set_fake_offline(unsigned long pfn,
                                        unsigned int nr_pages, bool onlined)
{
        for (; nr_pages--; pfn++) {
                struct page *page = pfn_to_page(pfn);

                __SetPageOffline(page);
                if (!onlined) {
                        SetPageDirty(page);
                        /* FIXME: remove after cleanups */
                        ClearPageReserved(page);
                }
        }
}
/*
 * Clear PG_offline from a range of pages. If the pages were never onlined
 * (via generic_online_page()), clear PageDirty().
 */
static void virtio_mem_clear_fake_offline(unsigned long pfn,
                                          unsigned int nr_pages, bool onlined)
{
        for (; nr_pages--; pfn++) {
                struct page *page = pfn_to_page(pfn);

                __ClearPageOffline(page);
                if (!onlined)
                        ClearPageDirty(page);
        }
}
/*
 * Release a range of fake-offline pages to the buddy, effectively
 * fake-onlining them.
 */
static void virtio_mem_fake_online(unsigned long pfn, unsigned int nr_pages)
{
        const int order = MAX_ORDER - 1;
        int i;

        /*
         * We are always called with subblock granularity, which is at least
         * aligned to MAX_ORDER - 1.
         */
        for (i = 0; i < nr_pages; i += 1 << order) {
                struct page *page = pfn_to_page(pfn + i);

                /*
                 * If the page is PageDirty(), it was kept fake-offline when
                 * onlining the memory block. Otherwise, it was allocated
                 * using alloc_contig_range(). All pages in a subblock are
                 * alike.
                 */
                if (PageDirty(page)) {
                        virtio_mem_clear_fake_offline(pfn + i, 1 << order,
                                                      false);
                        generic_online_page(page, order);
                } else {
                        virtio_mem_clear_fake_offline(pfn + i, 1 << order,
                                                      true);
                        free_contig_range(pfn + i, 1 << order);
                        adjust_managed_page_count(page, 1 << order);
                }
        }
}
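
/*
 * Worked example (editor's note, assuming 4 KiB pages and MAX_ORDER = 11):
 * order = MAX_ORDER - 1 = 10, so every iteration handles 1024 pages = 4 MiB,
 * which matches the minimum subblock size calculated in virtio_mem_init().
 */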
static void virtio_mem_online_page_cb(struct page *page, unsigned int order)
{
        const unsigned long addr = page_to_phys(page);
        const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr);
        struct virtio_mem *vm;
        int sb_id;

        /*
         * We exploit here that subblocks have at least MAX_ORDER - 1
         * size/alignment and that this callback is called with such a
         * size/alignment. So we cannot cross subblocks and therefore
         * also not memory blocks.
         */
        rcu_read_lock();
        list_for_each_entry_rcu(vm, &virtio_mem_devices, next) {
                if (!virtio_mem_owned_mb(vm, mb_id))
                        continue;

                sb_id = virtio_mem_phys_to_sb_id(vm, addr);
                /*
                 * If plugged, online the pages, otherwise, set them fake
                 * offline (PageOffline).
                 */
                if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
                        generic_online_page(page, order);
                else
                        virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order,
                                                    false);
                rcu_read_unlock();
                return;
        }
        rcu_read_unlock();

        /* not virtio-mem memory, but e.g., a DIMM. online it */
        generic_online_page(page, order);
}
static uint64_t virtio_mem_send_request(struct virtio_mem *vm,
                                        const struct virtio_mem_req *req)
{
        struct scatterlist *sgs[2], sg_req, sg_resp;
        unsigned int len;
        int rc;

        /* don't use the request residing on the stack (vaddr) */
        vm->req = *req;

        /* out: buffer for request */
        sg_init_one(&sg_req, &vm->req, sizeof(vm->req));
        sgs[0] = &sg_req;

        /* in: buffer for response */
        sg_init_one(&sg_resp, &vm->resp, sizeof(vm->resp));
        sgs[1] = &sg_resp;

        rc = virtqueue_add_sgs(vm->vq, sgs, 1, 1, vm, GFP_KERNEL);
        if (rc < 0)
                return rc;

        virtqueue_kick(vm->vq);

        /* wait for a response */
        wait_event(vm->host_resp, virtqueue_get_buf(vm->vq, &len));

        return virtio16_to_cpu(vm->vdev, vm->resp.type);
}
static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr,
                                        uint64_t size)
{
        const uint64_t nb_vm_blocks = size / vm->device_block_size;
        const struct virtio_mem_req req = {
                .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_PLUG),
                .u.plug.addr = cpu_to_virtio64(vm->vdev, addr),
                .u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
        };

        if (atomic_read(&vm->config_changed))
                return -EAGAIN;

        switch (virtio_mem_send_request(vm, &req)) {
        case VIRTIO_MEM_RESP_ACK:
                vm->plugged_size += size;
                return 0;
        case VIRTIO_MEM_RESP_NACK:
                return -EAGAIN;
        case VIRTIO_MEM_RESP_BUSY:
                return -ETXTBSY;
        case VIRTIO_MEM_RESP_ERROR:
                return -EINVAL;
        default:
                break;
        }

        return -ENOMEM;
}
static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr,
                                          uint64_t size)
{
        const uint64_t nb_vm_blocks = size / vm->device_block_size;
        const struct virtio_mem_req req = {
                .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG),
                .u.unplug.addr = cpu_to_virtio64(vm->vdev, addr),
                .u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
        };

        if (atomic_read(&vm->config_changed))
                return -EAGAIN;

        switch (virtio_mem_send_request(vm, &req)) {
        case VIRTIO_MEM_RESP_ACK:
                vm->plugged_size -= size;
                return 0;
        case VIRTIO_MEM_RESP_BUSY:
                return -ETXTBSY;
        case VIRTIO_MEM_RESP_ERROR:
                return -EINVAL;
        default:
                break;
        }

        return -ENOMEM;
}
static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
{
        const struct virtio_mem_req req = {
                .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL),
        };

        switch (virtio_mem_send_request(vm, &req)) {
        case VIRTIO_MEM_RESP_ACK:
                vm->unplug_all_required = false;
                vm->plugged_size = 0;
                /* usable region might have shrunk */
                atomic_set(&vm->config_changed, 1);
                return 0;
        case VIRTIO_MEM_RESP_BUSY:
                return -ETXTBSY;
        default:
                break;
        }

        return -ENOMEM;
}
/*
 * Plug selected subblocks. Updates the plugged state, but not the state
 * of the memory block.
 */
static int virtio_mem_mb_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
                                 int sb_id, int count)
{
        const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
                              sb_id * vm->subblock_size;
        const uint64_t size = count * vm->subblock_size;
        int rc;

        dev_dbg(&vm->vdev->dev, "plugging memory block: %lu : %i - %i\n", mb_id,
                sb_id, sb_id + count - 1);

        rc = virtio_mem_send_plug_request(vm, addr, size);
        if (!rc)
                virtio_mem_mb_set_sb_plugged(vm, mb_id, sb_id, count);
        return rc;
}
/*
 * Unplug selected subblocks. Updates the plugged state, but not the state
 * of the memory block.
 */
static int virtio_mem_mb_unplug_sb(struct virtio_mem *vm, unsigned long mb_id,
                                   int sb_id, int count)
{
        const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
                              sb_id * vm->subblock_size;
        const uint64_t size = count * vm->subblock_size;
        int rc;

        dev_dbg(&vm->vdev->dev, "unplugging memory block: %lu : %i - %i\n",
                mb_id, sb_id, sb_id + count - 1);

        rc = virtio_mem_send_unplug_request(vm, addr, size);
        if (!rc)
                virtio_mem_mb_set_sb_unplugged(vm, mb_id, sb_id, count);
        return rc;
}
/*
 * Unplug the desired number of plugged subblocks of an offline or not-added
 * memory block. Will fail if any subblock cannot get unplugged (instead of
 * skipping it).
 *
 * Will not modify the state of the memory block.
 *
 * Note: can fail after some subblocks were unplugged.
 */
static int virtio_mem_mb_unplug_any_sb(struct virtio_mem *vm,
                                       unsigned long mb_id, uint64_t *nb_sb)
{
        int sb_id, count;
        int rc;

        sb_id = vm->nb_sb_per_mb - 1;
        while (*nb_sb) {
                /* Find the next candidate subblock */
                while (sb_id >= 0 &&
                       virtio_mem_mb_test_sb_unplugged(vm, mb_id, sb_id, 1))
                        sb_id--;
                if (sb_id < 0)
                        break;
                /* Try to unplug multiple subblocks at a time */
                count = 1;
                while (count < *nb_sb && sb_id > 0 &&
                       virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) {
                        count++;
                        sb_id--;
                }

                rc = virtio_mem_mb_unplug_sb(vm, mb_id, sb_id, count);
                if (rc)
                        return rc;
                *nb_sb -= count;
                sb_id--;
        }

        return 0;
}
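
/*
 * Example (editor's sketch): with nb_sb_per_mb = 8, plugged bitmap 0b00111100
 * (subblocks 2..5 plugged) and *nb_sb = 3, the scan from sb_id = 7 first
 * skips the unplugged subblocks 7 and 6, then groups at most *nb_sb
 * consecutively plugged subblocks (5, 4, 3) and unplugs them with a single
 * request.
 */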
/*
 * Unplug all plugged subblocks of an offline or not-added memory block.
 *
 * Will not modify the state of the memory block.
 *
 * Note: can fail after some subblocks were unplugged.
 */
static int virtio_mem_mb_unplug(struct virtio_mem *vm, unsigned long mb_id)
{
        uint64_t nb_sb = vm->nb_sb_per_mb;

        return virtio_mem_mb_unplug_any_sb(vm, mb_id, &nb_sb);
}
/*
 * Prepare tracking data for the next memory block.
 */
static int virtio_mem_prepare_next_mb(struct virtio_mem *vm,
                                      unsigned long *mb_id)
{
        int rc;

        if (vm->next_mb_id > vm->last_usable_mb_id)
                return -ENOSPC;

        /* Resize the state array if required. */
        rc = virtio_mem_mb_state_prepare_next_mb(vm);
        if (rc)
                return rc;

        /* Resize the subblock bitmap if required. */
        rc = virtio_mem_sb_bitmap_prepare_next_mb(vm);
        if (rc)
                return rc;

        vm->nb_mb_state[VIRTIO_MEM_MB_STATE_UNUSED]++;
        *mb_id = vm->next_mb_id++;
        return 0;
}
/*
 * Don't add too many blocks that are not onlined yet to avoid running OOM.
 */
static bool virtio_mem_too_many_mb_offline(struct virtio_mem *vm)
{
        unsigned long nb_offline;

        nb_offline = vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] +
                     vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL];
        return nb_offline >= VIRTIO_MEM_NB_OFFLINE_THRESHOLD;
}
/*
 * Try to plug the desired number of subblocks and add the memory block
 * to Linux.
 *
 * Will modify the state of the memory block.
 */
static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm,
                                      unsigned long mb_id,
                                      uint64_t *nb_sb)
{
        const int count = min_t(int, *nb_sb, vm->nb_sb_per_mb);
        int rc, rc2;

        if (WARN_ON_ONCE(!count))
                return -EINVAL;

        /*
         * Plug the requested number of subblocks before adding it to linux,
         * so that onlining will directly online all plugged subblocks.
         */
        rc = virtio_mem_mb_plug_sb(vm, mb_id, 0, count);
        if (rc)
                return rc;

        /*
         * Mark the block properly offline before adding it to Linux,
         * so the memory notifiers will find the block in the right state.
         */
        if (count == vm->nb_sb_per_mb)
                virtio_mem_mb_set_state(vm, mb_id,
                                        VIRTIO_MEM_MB_STATE_OFFLINE);
        else
                virtio_mem_mb_set_state(vm, mb_id,
                                        VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL);

        /* Add the memory block to linux - if that fails, try to unplug. */
        rc = virtio_mem_mb_add(vm, mb_id);
        if (rc) {
                enum virtio_mem_mb_state new_state = VIRTIO_MEM_MB_STATE_UNUSED;

                dev_err(&vm->vdev->dev,
                        "adding memory block %lu failed with %d\n", mb_id, rc);
                rc2 = virtio_mem_mb_unplug_sb(vm, mb_id, 0, count);

                /*
                 * TODO: Linux MM does not properly clean up yet in all cases
                 * where adding of memory failed - especially on -ENOMEM.
                 */
                if (rc2)
                        new_state = VIRTIO_MEM_MB_STATE_PLUGGED;
                virtio_mem_mb_set_state(vm, mb_id, new_state);
                return rc;
        }

        *nb_sb -= count;
        return 0;
}
/*
 * Try to plug the desired number of subblocks of a memory block that
 * is already added to Linux.
 *
 * Will modify the state of the memory block.
 *
 * Note: Can fail after some subblocks were successfully plugged.
 */
static int virtio_mem_mb_plug_any_sb(struct virtio_mem *vm, unsigned long mb_id,
                                     uint64_t *nb_sb, bool online)
{
        unsigned long pfn, nr_pages;
        int sb_id, count;
        int rc;

        if (WARN_ON_ONCE(!*nb_sb))
                return -EINVAL;

        while (*nb_sb) {
                sb_id = virtio_mem_mb_first_unplugged_sb(vm, mb_id);
                if (sb_id >= vm->nb_sb_per_mb)
                        break;
                count = 1;
                while (count < *nb_sb &&
                       sb_id + count < vm->nb_sb_per_mb &&
                       !virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id + count,
                                                      1))
                        count++;

                rc = virtio_mem_mb_plug_sb(vm, mb_id, sb_id, count);
                if (rc)
                        return rc;
                *nb_sb -= count;
                if (!online)
                        continue;

                /* fake-online the pages if the memory block is online */
                pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
                               sb_id * vm->subblock_size);
                nr_pages = PFN_DOWN(count * vm->subblock_size);
                virtio_mem_fake_online(pfn, nr_pages);
        }

        if (virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) {
                if (online)
                        virtio_mem_mb_set_state(vm, mb_id,
                                                VIRTIO_MEM_MB_STATE_ONLINE);
                else
                        virtio_mem_mb_set_state(vm, mb_id,
                                                VIRTIO_MEM_MB_STATE_OFFLINE);
        }

        return 0;
}
/*
 * Try to plug the requested amount of memory.
 */
static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
{
        uint64_t nb_sb = diff / vm->subblock_size;
        unsigned long mb_id;
        int rc;

        if (!nb_sb)
                return 0;

        /* Don't race with onlining/offlining */
        mutex_lock(&vm->hotplug_mutex);

        /* Try to plug subblocks of partially plugged online blocks. */
        virtio_mem_for_each_mb_state(vm, mb_id,
                                     VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL) {
                rc = virtio_mem_mb_plug_any_sb(vm, mb_id, &nb_sb, true);
                if (rc || !nb_sb)
                        goto out_unlock;
                cond_resched();
        }

        /* Try to plug subblocks of partially plugged offline blocks. */
        virtio_mem_for_each_mb_state(vm, mb_id,
                                     VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) {
                rc = virtio_mem_mb_plug_any_sb(vm, mb_id, &nb_sb, false);
                if (rc || !nb_sb)
                        goto out_unlock;
                cond_resched();
        }

        /*
         * We won't be working on online/offline memory blocks from this point,
         * so we can't race with memory onlining/offlining. Drop the mutex.
         */
        mutex_unlock(&vm->hotplug_mutex);

        /* Try to plug and add unused blocks */
        virtio_mem_for_each_mb_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED) {
                if (virtio_mem_too_many_mb_offline(vm))
                        return -ENOSPC;

                rc = virtio_mem_mb_plug_and_add(vm, mb_id, &nb_sb);
                if (rc || !nb_sb)
                        return rc;
                cond_resched();
        }

        /* Try to prepare, plug and add new blocks */
        while (nb_sb) {
                if (virtio_mem_too_many_mb_offline(vm))
                        return -ENOSPC;

                rc = virtio_mem_prepare_next_mb(vm, &mb_id);
                if (rc)
                        return rc;
                rc = virtio_mem_mb_plug_and_add(vm, mb_id, &nb_sb);
                if (rc)
                        return rc;
                cond_resched();
        }

        return 0;
out_unlock:
        mutex_unlock(&vm->hotplug_mutex);
        return rc;
}
/*
 * Unplug the desired number of plugged subblocks of an offline memory block.
 * Will fail if any subblock cannot get unplugged (instead of skipping it).
 *
 * Will modify the state of the memory block. Might temporarily drop the
 * hotplug_mutex.
 *
 * Note: Can fail after some subblocks were successfully unplugged.
 */
static int virtio_mem_mb_unplug_any_sb_offline(struct virtio_mem *vm,
                                               unsigned long mb_id,
                                               uint64_t *nb_sb)
{
        int rc;

        rc = virtio_mem_mb_unplug_any_sb(vm, mb_id, nb_sb);

        /* some subblocks might have been unplugged even on failure */
        if (!virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb))
                virtio_mem_mb_set_state(vm, mb_id,
                                        VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL);
        if (rc)
                return rc;

        if (virtio_mem_mb_test_sb_unplugged(vm, mb_id, 0, vm->nb_sb_per_mb)) {
                /*
                 * Remove the block from Linux - this should never fail.
                 * Hinder the block from getting onlined by marking it
                 * unplugged. Temporarily drop the mutex, so
                 * any pending GOING_ONLINE requests can be serviced/rejected.
                 */
                virtio_mem_mb_set_state(vm, mb_id,
                                        VIRTIO_MEM_MB_STATE_UNUSED);

                mutex_unlock(&vm->hotplug_mutex);
                rc = virtio_mem_mb_remove(vm, mb_id);
                BUG_ON(rc);
                mutex_lock(&vm->hotplug_mutex);
        }
        return 0;
}
/*
 * Unplug the given plugged subblocks of an online memory block.
 *
 * Will modify the state of the memory block.
 */
static int virtio_mem_mb_unplug_sb_online(struct virtio_mem *vm,
                                          unsigned long mb_id, int sb_id,
                                          int count)
{
        const unsigned long nr_pages = PFN_DOWN(vm->subblock_size) * count;
        unsigned long start_pfn;
        int rc;

        start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
                             sb_id * vm->subblock_size);
        rc = alloc_contig_range(start_pfn, start_pfn + nr_pages,
                                MIGRATE_MOVABLE, GFP_KERNEL);
        if (rc == -ENOMEM)
                /* whoops, out of memory */
                return rc;
        if (rc)
                return -EBUSY;

        /* Mark it as fake-offline before unplugging it */
        virtio_mem_set_fake_offline(start_pfn, nr_pages, true);
        adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);

        /* Try to unplug the allocated memory */
        rc = virtio_mem_mb_unplug_sb(vm, mb_id, sb_id, count);
        if (rc) {
                /* Return the memory to the buddy. */
                virtio_mem_fake_online(start_pfn, nr_pages);
                return rc;
        }

        virtio_mem_mb_set_state(vm, mb_id,
                                VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL);
        return 0;
}
/*
 * Unplug the desired number of plugged subblocks of an online memory block.
 * Will skip subblocks that are busy.
 *
 * Will modify the state of the memory block. Might temporarily drop the
 * hotplug_mutex.
 *
 * Note: Can fail after some subblocks were successfully unplugged. Can
 *       return 0 even if subblocks were busy and could not get unplugged.
 */
static int virtio_mem_mb_unplug_any_sb_online(struct virtio_mem *vm,
                                              unsigned long mb_id,
                                              uint64_t *nb_sb)
{
        int rc, sb_id;

        /* If possible, try to unplug the complete block in one shot. */
        if (*nb_sb >= vm->nb_sb_per_mb &&
            virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) {
                rc = virtio_mem_mb_unplug_sb_online(vm, mb_id, 0,
                                                    vm->nb_sb_per_mb);
                if (!rc) {
                        *nb_sb -= vm->nb_sb_per_mb;
                        goto unplugged;
                } else if (rc != -EBUSY)
                        return rc;
        }

        /* Fallback to single subblocks. */
        for (sb_id = vm->nb_sb_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) {
                /* Find the next candidate subblock */
                while (sb_id >= 0 &&
                       !virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
                        sb_id--;
                if (sb_id < 0)
                        break;

                rc = virtio_mem_mb_unplug_sb_online(vm, mb_id, sb_id, 1);
                if (rc == -EBUSY)
                        continue;
                else if (rc)
                        return rc;
                *nb_sb -= 1;
        }

unplugged:
        /*
         * Once all subblocks of a memory block were unplugged, offline and
         * remove it. This will usually not fail, as no memory is in use
         * anymore - however some other notifiers might NACK the request.
         */
        if (virtio_mem_mb_test_sb_unplugged(vm, mb_id, 0, vm->nb_sb_per_mb)) {
                mutex_unlock(&vm->hotplug_mutex);
                rc = virtio_mem_mb_offline_and_remove(vm, mb_id);
                mutex_lock(&vm->hotplug_mutex);
                if (!rc)
                        virtio_mem_mb_set_state(vm, mb_id,
                                                VIRTIO_MEM_MB_STATE_UNUSED);
        }

        return 0;
}
/*
 * Try to unplug the requested amount of memory.
 */
static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
{
        uint64_t nb_sb = diff / vm->subblock_size;
        unsigned long mb_id;
        int rc;

        if (!nb_sb)
                return 0;

        /*
         * We'll drop the mutex a couple of times when it is safe to do so.
         * This might result in some blocks switching the state (online/offline)
         * and we could miss them in this run - we will retry again later.
         */
        mutex_lock(&vm->hotplug_mutex);

        /* Try to unplug subblocks of partially plugged offline blocks. */
        virtio_mem_for_each_mb_state_rev(vm, mb_id,
                                         VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) {
                rc = virtio_mem_mb_unplug_any_sb_offline(vm, mb_id,
                                                         &nb_sb);
                if (rc || !nb_sb)
                        goto out_unlock;
                cond_resched();
        }

        /* Try to unplug subblocks of plugged offline blocks. */
        virtio_mem_for_each_mb_state_rev(vm, mb_id,
                                         VIRTIO_MEM_MB_STATE_OFFLINE) {
                rc = virtio_mem_mb_unplug_any_sb_offline(vm, mb_id,
                                                         &nb_sb);
                if (rc || !nb_sb)
                        goto out_unlock;
                cond_resched();
        }

        if (!unplug_online) {
                mutex_unlock(&vm->hotplug_mutex);
                return nb_sb ? -EBUSY : 0;
        }

        /* Try to unplug subblocks of partially plugged online blocks. */
        virtio_mem_for_each_mb_state_rev(vm, mb_id,
                                         VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL) {
                rc = virtio_mem_mb_unplug_any_sb_online(vm, mb_id,
                                                        &nb_sb);
                if (rc || !nb_sb)
                        goto out_unlock;
                mutex_unlock(&vm->hotplug_mutex);
                cond_resched();
                mutex_lock(&vm->hotplug_mutex);
        }

        /* Try to unplug subblocks of plugged online blocks. */
        virtio_mem_for_each_mb_state_rev(vm, mb_id,
                                         VIRTIO_MEM_MB_STATE_ONLINE) {
                rc = virtio_mem_mb_unplug_any_sb_online(vm, mb_id,
                                                        &nb_sb);
                if (rc || !nb_sb)
                        goto out_unlock;
                mutex_unlock(&vm->hotplug_mutex);
                cond_resched();
                mutex_lock(&vm->hotplug_mutex);
        }

        mutex_unlock(&vm->hotplug_mutex);
        return nb_sb ? -EBUSY : 0;
out_unlock:
        mutex_unlock(&vm->hotplug_mutex);
        return rc;
}
/*
 * Try to unplug all blocks that couldn't be unplugged before, for example,
 * because the hypervisor was busy.
 */
static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm)
{
        unsigned long mb_id;
        int rc;

        virtio_mem_for_each_mb_state(vm, mb_id, VIRTIO_MEM_MB_STATE_PLUGGED) {
                rc = virtio_mem_mb_unplug(vm, mb_id);
                if (rc)
                        return rc;
                virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED);
        }

        return 0;
}
/*
 * Update all parts of the config that could have changed.
 */
static void virtio_mem_refresh_config(struct virtio_mem *vm)
{
        const uint64_t phys_limit = 1UL << MAX_PHYSMEM_BITS;
        uint64_t new_plugged_size, usable_region_size, end_addr;

        /* the plugged_size is just a reflection of what _we_ did previously */
        virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
                        &new_plugged_size);
        if (WARN_ON_ONCE(new_plugged_size != vm->plugged_size))
                vm->plugged_size = new_plugged_size;

        /* calculate the last usable memory block id */
        virtio_cread_le(vm->vdev, struct virtio_mem_config,
                        usable_region_size, &usable_region_size);
        end_addr = vm->addr + usable_region_size;
        end_addr = min(end_addr, phys_limit);
        vm->last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr) - 1;

        /* see if there is a request to change the size */
        virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size,
                        &vm->requested_size);

        dev_info(&vm->vdev->dev, "plugged size: 0x%llx", vm->plugged_size);
        dev_info(&vm->vdev->dev, "requested size: 0x%llx", vm->requested_size);
}
/*
 * Workqueue function for handling plug/unplug requests and config updates.
 */
static void virtio_mem_run_wq(struct work_struct *work)
{
        struct virtio_mem *vm = container_of(work, struct virtio_mem, wq);
        uint64_t diff;
        int rc;

        hrtimer_cancel(&vm->retry_timer);

        if (vm->broken)
                return;

retry:
        rc = 0;

        /* Make sure we start with a clean state if there are leftovers. */
        if (unlikely(vm->unplug_all_required))
                rc = virtio_mem_send_unplug_all_request(vm);

        if (atomic_read(&vm->config_changed)) {
                atomic_set(&vm->config_changed, 0);
                virtio_mem_refresh_config(vm);
        }

        /* Unplug any leftovers from previous runs */
        if (!rc)
                rc = virtio_mem_unplug_pending_mb(vm);

        if (!rc && vm->requested_size != vm->plugged_size) {
                if (vm->requested_size > vm->plugged_size) {
                        diff = vm->requested_size - vm->plugged_size;
                        rc = virtio_mem_plug_request(vm, diff);
                } else {
                        diff = vm->plugged_size - vm->requested_size;
                        rc = virtio_mem_unplug_request(vm, diff);
                }
        }

        switch (rc) {
        case 0:
                vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
                break;
        case -ENOSPC:
                /*
                 * We cannot add any more memory (alignment, physical limit)
                 * or we have too many offline memory blocks.
                 */
                break;
        case -ETXTBSY:
                /*
                 * The hypervisor cannot process our request right now
                 * (e.g., out of memory, migrating).
                 */
        case -EBUSY:
                /*
                 * We cannot free up any memory to unplug it (all plugged memory
                 * is busy).
                 */
        case -ENOMEM:
                /* Out of memory, try again later. */
                hrtimer_start(&vm->retry_timer, ms_to_ktime(vm->retry_timer_ms),
                              HRTIMER_MODE_REL);
                break;
        case -EAGAIN:
                /* Retry immediately (e.g., the config changed). */
                goto retry;
        default:
                /* Unknown error, mark as broken */
                dev_err(&vm->vdev->dev,
                        "unknown error, marking device broken: %d\n", rc);
                vm->broken = true;
        }
}
static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer)
{
        struct virtio_mem *vm = container_of(timer, struct virtio_mem,
                                             retry_timer);

        virtio_mem_retry(vm);
        vm->retry_timer_ms = min_t(unsigned int, vm->retry_timer_ms * 2,
                                   VIRTIO_MEM_RETRY_TIMER_MAX_MS);
        return HRTIMER_NORESTART;
}
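
/*
 * Editor's note: together with virtio_mem_run_wq() this implements
 * exponential backoff - retries start at VIRTIO_MEM_RETRY_TIMER_MIN_MS (50s)
 * and double on every expiry up to the 300s cap: 50s, 100s, 200s, 300s, ...
 */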
static void virtio_mem_handle_response(struct virtqueue *vq)
{
        struct virtio_mem *vm = vq->vdev->priv;

        wake_up(&vm->host_resp);
}
static int virtio_mem_init_vq(struct virtio_mem *vm)
{
        struct virtqueue *vq;

        vq = virtio_find_single_vq(vm->vdev, virtio_mem_handle_response,
                                   "guest-request");
        if (IS_ERR(vq))
                return PTR_ERR(vq);
        vm->vq = vq;

        return 0;
}
static int virtio_mem_init(struct virtio_mem *vm)
{
        const uint64_t phys_limit = 1UL << MAX_PHYSMEM_BITS;
        uint16_t node_id;

        if (!vm->vdev->config->get) {
                dev_err(&vm->vdev->dev, "config access disabled\n");
                return -EINVAL;
        }

        /*
         * We don't want to (un)plug or reuse any memory when in kdump. The
         * memory is still accessible (but not mapped).
         */
        if (is_kdump_kernel()) {
                dev_warn(&vm->vdev->dev, "disabled in kdump kernel\n");
                return -EBUSY;
        }

        /* Fetch all properties that can't change. */
        virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
                        &vm->plugged_size);
        virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size,
                        &vm->device_block_size);
        virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id,
                        &node_id);
        vm->nid = virtio_mem_translate_node_id(vm, node_id);
        virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr);
        virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size,
                        &vm->region_size);

        /*
         * We always hotplug memory in memory block granularity. This way,
         * we have to wait for exactly one memory block to online.
         */
        if (vm->device_block_size > memory_block_size_bytes()) {
                dev_err(&vm->vdev->dev,
                        "The block size is not supported (too big).\n");
                return -EINVAL;
        }

        /* bad device setup - warn only */
        if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
                dev_warn(&vm->vdev->dev,
                         "The alignment of the physical start address can make some memory unusable.\n");
        if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes()))
                dev_warn(&vm->vdev->dev,
                         "The alignment of the physical end address can make some memory unusable.\n");
        if (vm->addr + vm->region_size > phys_limit)
                dev_warn(&vm->vdev->dev,
                         "Some memory is not addressable. This can make some memory unusable.\n");

        /*
         * Calculate the subblock size:
         * - At least MAX_ORDER - 1 / pageblock_order.
         * - At least the device block size.
         * In the worst case, a single subblock per memory block.
         */
        vm->subblock_size = PAGE_SIZE * 1ul << max_t(uint32_t, MAX_ORDER - 1,
                                                     pageblock_order);
        vm->subblock_size = max_t(uint64_t, vm->device_block_size,
                                  vm->subblock_size);
        vm->nb_sb_per_mb = memory_block_size_bytes() / vm->subblock_size;

        /* Round up to the next full memory block */
        vm->first_mb_id = virtio_mem_phys_to_mb_id(vm->addr - 1 +
                                                   memory_block_size_bytes());
        vm->next_mb_id = vm->first_mb_id;
        vm->last_mb_id = virtio_mem_phys_to_mb_id(vm->addr +
                                                  vm->region_size) - 1;

        dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr);
        dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size);
        dev_info(&vm->vdev->dev, "device block size: 0x%llx",
                 (unsigned long long)vm->device_block_size);
        dev_info(&vm->vdev->dev, "memory block size: 0x%lx",
                 memory_block_size_bytes());
        dev_info(&vm->vdev->dev, "subblock size: 0x%llx",
                 (unsigned long long)vm->subblock_size);
        if (vm->nid != NUMA_NO_NODE)
                dev_info(&vm->vdev->dev, "nid: %d", vm->nid);

        return 0;
}
static int virtio_mem_create_resource(struct virtio_mem *vm)
{
        /*
         * When force-unloading the driver and removing the device, we
         * could have a garbage pointer. Duplicate the string.
         */
        const char *name = kstrdup(dev_name(&vm->vdev->dev), GFP_KERNEL);

        if (!name)
                return -ENOMEM;

        vm->parent_resource = __request_mem_region(vm->addr, vm->region_size,
                                                   name, IORESOURCE_SYSTEM_RAM);
        if (!vm->parent_resource) {
                kfree(name);
                dev_warn(&vm->vdev->dev, "could not reserve device region\n");
                dev_info(&vm->vdev->dev,
                         "reloading the driver is not supported\n");
                return -EBUSY;
        }

        /* The memory is not actually busy - make add_memory() work. */
        vm->parent_resource->flags &= ~IORESOURCE_BUSY;
        return 0;
}
static void virtio_mem_delete_resource(struct virtio_mem *vm)
{
        const char *name;

        if (!vm->parent_resource)
                return;

        name = vm->parent_resource->name;
        release_resource(vm->parent_resource);
        kfree(vm->parent_resource);
        kfree(name);
        vm->parent_resource = NULL;
}
static int virtio_mem_probe(struct virtio_device *vdev)
{
        struct virtio_mem *vm;
        int rc;

        BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24);
        BUILD_BUG_ON(sizeof(struct virtio_mem_resp) != 10);

        vdev->priv = vm = kzalloc(sizeof(*vm), GFP_KERNEL);
        if (!vm)
                return -ENOMEM;

        init_waitqueue_head(&vm->host_resp);
        vm->vdev = vdev;
        INIT_WORK(&vm->wq, virtio_mem_run_wq);
        mutex_init(&vm->hotplug_mutex);
        INIT_LIST_HEAD(&vm->next);
        spin_lock_init(&vm->removal_lock);
        hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        vm->retry_timer.function = virtio_mem_timer_expired;
        vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;

        /* register the virtqueue */
        rc = virtio_mem_init_vq(vm);
        if (rc)
                goto out_free_vm;

        /* initialize the device by querying the config */
        rc = virtio_mem_init(vm);
        if (rc)
                goto out_del_vq;

        /* create the parent resource for all memory */
        rc = virtio_mem_create_resource(vm);
        if (rc)
                goto out_del_vq;

        /*
         * If we still have memory plugged, we have to unplug all memory first.
         * Registering our parent resource makes sure that this memory isn't
         * actually in use (e.g., trying to reload the driver).
         */
        if (vm->plugged_size) {
                vm->unplug_all_required = 1;
                dev_info(&vm->vdev->dev, "unplugging all memory is required\n");
        }

        /* register callbacks */
        vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb;
        rc = register_memory_notifier(&vm->memory_notifier);
        if (rc)
                goto out_del_resource;
        rc = register_virtio_mem_device(vm);
        if (rc)
                goto out_unreg_mem;

        virtio_device_ready(vdev);

        /* trigger a config update to start processing the requested_size */
        atomic_set(&vm->config_changed, 1);
        queue_work(system_freezable_wq, &vm->wq);

        return 0;
out_unreg_mem:
        unregister_memory_notifier(&vm->memory_notifier);
out_del_resource:
        virtio_mem_delete_resource(vm);
out_del_vq:
        vdev->config->del_vqs(vdev);
out_free_vm:
        kfree(vm);
        vdev->priv = NULL;

        return rc;
}
static void virtio_mem_remove(struct virtio_device *vdev)
{
        struct virtio_mem *vm = vdev->priv;
        unsigned long mb_id;
        int rc;

        /*
         * Make sure the workqueue won't be triggered anymore and no memory
         * blocks can be onlined/offlined until we're finished here.
         */
        mutex_lock(&vm->hotplug_mutex);
        spin_lock_irq(&vm->removal_lock);
        vm->removing = true;
        spin_unlock_irq(&vm->removal_lock);
        mutex_unlock(&vm->hotplug_mutex);

        /* wait until the workqueue stopped */
        cancel_work_sync(&vm->wq);
        hrtimer_cancel(&vm->retry_timer);

        /*
         * After we unregistered our callbacks, user space can online partially
         * plugged offline blocks. Make sure to remove them.
         */
        virtio_mem_for_each_mb_state(vm, mb_id,
                                     VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) {
                rc = virtio_mem_mb_remove(vm, mb_id);
                BUG_ON(rc);
                virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED);
        }
        /*
         * After we unregistered our callbacks, user space can no longer
         * offline partially plugged online memory blocks. No need to worry
         * about them.
         */

        /* unregister callbacks */
        unregister_virtio_mem_device(vm);
        unregister_memory_notifier(&vm->memory_notifier);

        /*
         * There is no way we could reliably remove all memory we have added to
         * the system. And there is no way to stop the driver/device from going
         * away. Warn at least.
         */
        if (vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] ||
            vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL] ||
            vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE] ||
            vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL] ||
            vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE]) {
                dev_warn(&vdev->dev, "device still has system memory added\n");
        } else {
                virtio_mem_delete_resource(vm);
                kfree_const(vm->resource_name);
        }

        /* remove all tracking data - no locking needed */
        vfree(vm->mb_state);
        vfree(vm->sb_bitmap);

        /* reset the device and cleanup the queues */
        vdev->config->reset(vdev);
        vdev->config->del_vqs(vdev);

        kfree(vm);
        vdev->priv = NULL;
}
static void virtio_mem_config_changed(struct virtio_device *vdev)
{
        struct virtio_mem *vm = vdev->priv;

        atomic_set(&vm->config_changed, 1);
        virtio_mem_retry(vm);
}
#ifdef CONFIG_PM_SLEEP
static int virtio_mem_freeze(struct virtio_device *vdev)
{
        /*
         * When restarting the VM, all memory is usually unplugged. Don't
         * allow to suspend/hibernate.
         */
        dev_err(&vdev->dev, "save/restore not supported.\n");
        return -EPERM;
}

static int virtio_mem_restore(struct virtio_device *vdev)
{
        return -EPERM;
}
#endif
static unsigned int virtio_mem_features[] = {
#if defined(CONFIG_NUMA) && defined(CONFIG_ACPI_NUMA)
        VIRTIO_MEM_F_ACPI_PXM,
#endif
};
static struct virtio_device_id virtio_mem_id_table[] = {
        { VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID },
        { 0 },
};
static struct virtio_driver virtio_mem_driver = {
        .feature_table = virtio_mem_features,
        .feature_table_size = ARRAY_SIZE(virtio_mem_features),
        .driver.name = KBUILD_MODNAME,
        .driver.owner = THIS_MODULE,
        .id_table = virtio_mem_id_table,
        .probe = virtio_mem_probe,
        .remove = virtio_mem_remove,
        .config_changed = virtio_mem_config_changed,
#ifdef CONFIG_PM_SLEEP
        .freeze = virtio_mem_freeze,
        .restore = virtio_mem_restore,
#endif
};
module_virtio_driver(virtio_mem_driver);
MODULE_DEVICE_TABLE(virtio, virtio_mem_id_table);
MODULE_AUTHOR("David Hildenbrand <david@redhat.com>");
MODULE_DESCRIPTION("Virtio-mem driver");
MODULE_LICENSE("GPL");