/*
 * Memory Device Interface
 *
 * Copyright ProfitBricks GmbH 2012
 * Copyright (C) 2014 Red Hat Inc
 * Copyright (c) 2018 Red Hat Inc
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include "qemu/error-report.h"
#include "hw/mem/memory-device.h"
#include "qapi/error.h"
#include "hw/boards.h"
#include "qemu/range.h"
#include "hw/virtio/vhost.h"
#include "sysemu/kvm.h"
#include "exec/address-spaces.h"
#include "trace.h"
static bool memory_device_is_empty(const MemoryDeviceState *md)
{
    const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
    Error *local_err = NULL;
    MemoryRegion *mr;

    /* dropping const here is fine as we don't touch the memory region */
    mr = mdc->get_memory_region((MemoryDeviceState *)md, &local_err);
    if (local_err) {
        /* Not empty, we'll report errors later when containing the MR again. */
        error_free(local_err);
        return false;
    }
    return !mr;
}
static gint memory_device_addr_sort(gconstpointer a, gconstpointer b)
{
    const MemoryDeviceState *md_a = MEMORY_DEVICE(a);
    const MemoryDeviceState *md_b = MEMORY_DEVICE(b);
    const MemoryDeviceClass *mdc_a = MEMORY_DEVICE_GET_CLASS(a);
    const MemoryDeviceClass *mdc_b = MEMORY_DEVICE_GET_CLASS(b);
    const uint64_t addr_a = mdc_a->get_addr(md_a);
    const uint64_t addr_b = mdc_b->get_addr(md_b);

    if (addr_a > addr_b) {
        return 1;
    } else if (addr_a < addr_b) {
        return -1;
    }
    return 0;
}
static int memory_device_build_list(Object *obj, void *opaque)
{
    GSList **list = opaque;

    if (object_dynamic_cast(obj, TYPE_MEMORY_DEVICE)) {
        DeviceState *dev = DEVICE(obj);
        if (dev->realized) { /* only realized memory devices matter */
            *list = g_slist_insert_sorted(*list, dev, memory_device_addr_sort);
        }
    }

    object_child_foreach(obj, memory_device_build_list, opaque);
    return 0;
}
static unsigned int memory_device_get_memslots(MemoryDeviceState *md)
{
    const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);

    if (mdc->get_memslots) {
        return mdc->get_memslots(md);
    }
    return 1;
}
/*
 * Memslots that are reserved by memory devices (required but still reported
 * as free from KVM / vhost).
 */
static unsigned int get_reserved_memslots(MachineState *ms)
{
    if (ms->device_memory->used_memslots >
        ms->device_memory->required_memslots) {
        /* This is unexpected, and we warned already in the memory notifier. */
        return 0;
    }
    return ms->device_memory->required_memslots -
           ms->device_memory->used_memslots;
}
unsigned int memory_devices_get_reserved_memslots(void)
{
    if (!current_machine->device_memory) {
        return 0;
    }
    return get_reserved_memslots(current_machine);
}
bool memory_devices_memslot_auto_decision_active(void)
{
    if (!current_machine->device_memory) {
        return false;
    }

    return current_machine->device_memory->memslot_auto_decision_active;
}
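
/*
 * Cap the number of memslots a device that auto-decides its memslot count
 * may consume: stay within what KVM/vhost can still provide, stay below the
 * soft limit across all memory devices, and distribute the remaining budget
 * proportionally to the device's share of the still-available device memory.
 * Rough example (assuming a soft limit of 256): with 6 memslots already
 * required, a 16 GiB device and 64 GiB still available for memory devices,
 * the limit would be about (256 - 6) * 16 / 64 = 62 memslots.
 */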
static unsigned int memory_device_memslot_decision_limit(MachineState *ms,
                                                         MemoryRegion *mr)
{
    const unsigned int reserved = get_reserved_memslots(ms);
    const uint64_t size = memory_region_size(mr);
    unsigned int max = vhost_get_max_memslots();
    unsigned int free = vhost_get_free_memslots();
    uint64_t available_space;
    unsigned int memslots;

    if (kvm_enabled()) {
        max = MIN(max, kvm_get_max_memslots());
        free = MIN(free, kvm_get_free_memslots());
    }

    /*
     * If we have fewer overall memslots than what we consider reasonable,
     * just keep it to a minimum.
     */
    if (max < MEMORY_DEVICES_SAFE_MAX_MEMSLOTS) {
        return 1;
    }

    /*
     * Consider our soft-limit across all memory devices. We don't really
     * expect to exceed this limit in reasonable configurations.
     */
    if (MEMORY_DEVICES_SOFT_MEMSLOT_LIMIT <=
        ms->device_memory->required_memslots) {
        return 1;
    }
    memslots = MEMORY_DEVICES_SOFT_MEMSLOT_LIMIT -
               ms->device_memory->required_memslots;

    /*
     * Consider the actually still free memslots. This is only relevant if
     * other memslot consumers would consume *significantly* more memslots than
     * what we prepared for (> 253). Unlikely, but let's just handle it
     * cleanly.
     */
    memslots = MIN(memslots, free - reserved);
    if (memslots < 1 || unlikely(free < reserved)) {
        return 1;
    }

    /* We cannot have any other memory devices? So give all to this device. */
    if (size == ms->maxram_size - ms->ram_size) {
        return memslots;
    }

    /*
     * Simple heuristic: equally distribute the memslots over the space
     * still available for memory devices.
     */
    available_space = ms->maxram_size - ms->ram_size -
                      ms->device_memory->used_region_size;
    memslots = (double)memslots * size / available_space;
    return memslots < 1 ? 1 : memslots;
}
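
/*
 * Sanity-check that the device can still be plugged: KVM and all used vhost
 * backends must have enough free memslots left (on top of the slots already
 * reserved by other memory devices), and the device memory area
 * (maxram_size - ram_size) must still have enough room for the device's
 * memory region.
 */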
static void memory_device_check_addable(MachineState *ms, MemoryDeviceState *md,
                                        MemoryRegion *mr, Error **errp)
{
    const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
    const uint64_t used_region_size = ms->device_memory->used_region_size;
    const uint64_t size = memory_region_size(mr);
    const unsigned int reserved_memslots = get_reserved_memslots(ms);
    unsigned int required_memslots, memslot_limit;

    /*
     * Instruct the device to decide how many memslots to use, if applicable,
     * before we query the number of required memslots the first time.
     */
    if (mdc->decide_memslots) {
        memslot_limit = memory_device_memslot_decision_limit(ms, mr);
        mdc->decide_memslots(md, memslot_limit);
    }
    required_memslots = memory_device_get_memslots(md);

    /* we will need memory slots for kvm and vhost */
    if (kvm_enabled() &&
        kvm_get_free_memslots() < required_memslots + reserved_memslots) {
        error_setg(errp, "hypervisor has not enough free memory slots left");
        return;
    }
    if (vhost_get_free_memslots() < required_memslots + reserved_memslots) {
        error_setg(errp, "a used vhost backend has not enough free memory slots left");
        return;
    }

    /* will we exceed the total amount of memory specified */
    if (used_region_size + size < used_region_size ||
        used_region_size + size > ms->maxram_size - ms->ram_size) {
        error_setg(errp, "not enough space, currently 0x%" PRIx64
                   " in use of total space for memory devices 0x" RAM_ADDR_FMT,
                   used_region_size, ms->maxram_size - ms->ram_size);
        return;
    }
}
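
/*
 * Find a start address for the new memory region within the device memory
 * region. If the user provided an address hint, only validate it against the
 * usable range and against already plugged devices; otherwise, walk all
 * realized memory devices in address order and place the region into the
 * first sufficiently large, suitably aligned gap.
 */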
static uint64_t memory_device_get_free_addr(MachineState *ms,
                                            const uint64_t *hint,
                                            uint64_t align, uint64_t size,
                                            Error **errp)
{
    GSList *list = NULL, *item;
    Range as, new = range_empty;

    range_init_nofail(&as, ms->device_memory->base,
                      memory_region_size(&ms->device_memory->mr));

    /* start of address space indicates the maximum alignment we expect */
    if (!QEMU_IS_ALIGNED(range_lob(&as), align)) {
        warn_report("the alignment (0x%" PRIx64 ") exceeds the expected"
                    " maximum alignment, memory will get fragmented and not"
                    " all 'maxmem' might be usable for memory devices.",
                    align);
    }

    if (hint && !QEMU_IS_ALIGNED(*hint, align)) {
        error_setg(errp, "address must be aligned to 0x%" PRIx64 " bytes",
                   align);
        return 0;
    }

    if (hint) {
        if (range_init(&new, *hint, size) || !range_contains_range(&as, &new)) {
            error_setg(errp, "can't add memory device [0x%" PRIx64 ":0x%" PRIx64
                       "], usable range for memory devices [0x%" PRIx64 ":0x%"
                       PRIx64 "]", *hint, size, range_lob(&as),
                       range_size(&as));
            return 0;
        }
    } else {
        if (range_init(&new, QEMU_ALIGN_UP(range_lob(&as), align), size)) {
            error_setg(errp, "can't add memory device, device too big");
            return 0;
        }
    }

    /* find address range that will fit new memory device */
    object_child_foreach(OBJECT(ms), memory_device_build_list, &list);
    for (item = list; item; item = g_slist_next(item)) {
        const MemoryDeviceState *md = item->data;
        const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(OBJECT(md));
        uint64_t next_addr;
        Range tmp;

        if (memory_device_is_empty(md)) {
            continue;
        }

        range_init_nofail(&tmp, mdc->get_addr(md),
                          memory_device_get_region_size(md, &error_abort));

        if (range_overlaps_range(&tmp, &new)) {
            if (hint) {
                const DeviceState *d = DEVICE(md);
                error_setg(errp, "address range conflicts with memory device"
                           " id='%s'", d->id ? d->id : "(unnamed)");
                goto out;
            }

            next_addr = QEMU_ALIGN_UP(range_upb(&tmp) + 1, align);
            if (!next_addr || range_init(&new, next_addr, range_size(&new))) {
                range_make_empty(&new);
                break;
            }
        } else if (range_lob(&tmp) > range_upb(&new)) {
            break;
        }
    }

    if (!range_contains_range(&as, &new)) {
        error_setg(errp, "could not find position in guest address space for "
                   "memory device - memory fragmented due to alignments");
    }
out:
    g_slist_free(list);
    return range_lob(&new);
}
MemoryDeviceInfoList *qmp_memory_device_list(void)
{
    GSList *devices = NULL, *item;
    MemoryDeviceInfoList *list = NULL, **tail = &list;

    object_child_foreach(qdev_get_machine(), memory_device_build_list,
                         &devices);

    for (item = devices; item; item = g_slist_next(item)) {
        const MemoryDeviceState *md = MEMORY_DEVICE(item->data);
        const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(item->data);
        MemoryDeviceInfo *info = g_new0(MemoryDeviceInfo, 1);

        /* Let's query information even for empty memory devices. */
        mdc->fill_device_info(md, info);

        QAPI_LIST_APPEND(tail, info);
    }

    g_slist_free(devices);

    return list;
}
static int memory_device_plugged_size(Object *obj, void *opaque)
{
    uint64_t *size = opaque;

    if (object_dynamic_cast(obj, TYPE_MEMORY_DEVICE)) {
        const DeviceState *dev = DEVICE(obj);
        const MemoryDeviceState *md = MEMORY_DEVICE(obj);
        const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(obj);

        if (dev->realized && !memory_device_is_empty(md)) {
            *size += mdc->get_plugged_size(md, &error_abort);
        }
    }

    object_child_foreach(obj, memory_device_plugged_size, opaque);
    return 0;
}

uint64_t get_plugged_memory_size(void)
{
    uint64_t size = 0;

    memory_device_plugged_size(qdev_get_machine(), &size);

    return size;
}
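
/*
 * Pre-plug handling: sanity-check the device against the machine's device
 * memory area, assign an address (unless the user specified one) and store
 * it via mdc->set_addr(). The memory region only gets mapped later, in
 * memory_device_plug().
 */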
void memory_device_pre_plug(MemoryDeviceState *md, MachineState *ms,
                            Error **errp)
{
    const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
    Error *local_err = NULL;
    uint64_t addr, align = 0;
    MemoryRegion *mr;

    /* We support empty memory devices even without device memory. */
    if (memory_device_is_empty(md)) {
        return;
    }

    if (!ms->device_memory) {
        error_setg(errp, "the configuration is not prepared for memory devices"
                         " (e.g., for memory hotplug), consider specifying the"
                         " maxmem option");
        return;
    }

    mr = mdc->get_memory_region(md, &local_err);
    if (local_err) {
        goto out;
    }

    memory_device_check_addable(ms, md, mr, &local_err);
    if (local_err) {
        goto out;
    }

    /*
     * We always want the memory region size to be multiples of the memory
     * region alignment: for example, DIMMs with 1G+1byte size don't make
     * any sense. Note that we don't check that the size is multiples
     * of any additional alignment requirements the memory device might
     * have when it comes to the address in physical address space.
     */
    if (!QEMU_IS_ALIGNED(memory_region_size(mr),
                         memory_region_get_alignment(mr))) {
        error_setg(errp, "backend memory size must be multiple of 0x%"
                   PRIx64, memory_region_get_alignment(mr));
        return;
    }

    if (mdc->get_min_alignment) {
        align = mdc->get_min_alignment(md);
    }
    align = MAX(align, memory_region_get_alignment(mr));
    addr = mdc->get_addr(md);
    addr = memory_device_get_free_addr(ms, !addr ? NULL : &addr, align,
                                       memory_region_size(mr), &local_err);
    if (local_err) {
        goto out;
    }
    mdc->set_addr(md, addr, &local_err);
    if (!local_err) {
        trace_memory_device_pre_plug(DEVICE(md)->id ? DEVICE(md)->id : "",
                                     addr);
    }
out:
    error_propagate(errp, local_err);
}
void memory_device_plug(MemoryDeviceState *md, MachineState *ms)
{
    const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
    unsigned int memslots;
    uint64_t addr;
    MemoryRegion *mr;

    if (memory_device_is_empty(md)) {
        return;
    }

    memslots = memory_device_get_memslots(md);
    addr = mdc->get_addr(md);

    /*
     * We expect that a previous call to memory_device_pre_plug() succeeded, so
     * it can't fail at this point.
     */
    mr = mdc->get_memory_region(md, &error_abort);
    g_assert(ms->device_memory);

    ms->device_memory->used_region_size += memory_region_size(mr);
    ms->device_memory->required_memslots += memslots;
    if (mdc->decide_memslots && memslots > 1) {
        ms->device_memory->memslot_auto_decision_active++;
    }

    memory_region_add_subregion(&ms->device_memory->mr,
                                addr - ms->device_memory->base, mr);
    trace_memory_device_plug(DEVICE(md)->id ? DEVICE(md)->id : "", addr);
}
void memory_device_unplug(MemoryDeviceState *md, MachineState *ms)
{
    const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
    const unsigned int memslots = memory_device_get_memslots(md);
    MemoryRegion *mr;

    if (memory_device_is_empty(md)) {
        return;
    }

    /*
     * We expect that a previous call to memory_device_pre_plug() succeeded, so
     * it can't fail at this point.
     */
    mr = mdc->get_memory_region(md, &error_abort);
    g_assert(ms->device_memory);

    memory_region_del_subregion(&ms->device_memory->mr, mr);

    if (mdc->decide_memslots && memslots > 1) {
        ms->device_memory->memslot_auto_decision_active--;
    }
    ms->device_memory->used_region_size -= memory_region_size(mr);
    ms->device_memory->required_memslots -= memslots;
    trace_memory_device_unplug(DEVICE(md)->id ? DEVICE(md)->id : "",
                               mdc->get_addr(md));
}
uint64_t memory_device_get_region_size(const MemoryDeviceState *md,
                                       Error **errp)
{
    const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
    MemoryRegion *mr;

    /* dropping const here is fine as we don't touch the memory region */
    mr = mdc->get_memory_region((MemoryDeviceState *)md, errp);
    if (!mr) {
        return 0;
    }

    return memory_region_size(mr);
}
static void memory_devices_region_mod(MemoryListener *listener,
                                      MemoryRegionSection *mrs, bool add)
{
    DeviceMemoryState *dms = container_of(listener, DeviceMemoryState,
                                          listener);

    if (!memory_region_is_ram(mrs->mr)) {
        warn_report("Unexpected memory region mapped into device memory region.");
        return;
    }

    /*
     * The expectation is that each distinct RAM memory region section in
     * our region for memory devices consumes exactly one memslot in KVM
     * and in vhost. For vhost, this is true, except:
     * * ROM memory regions don't consume a memslot. These get used very
     *   rarely for memory devices (R/O NVDIMMs).
     * * Memslots without a fd (memory-backend-ram) don't necessarily
     *   consume a memslot. Such setups are quite rare and possibly bogus:
     *   the memory would be inaccessible by such vhost devices.
     *
     * So for vhost, in corner cases we might over-estimate the number of
     * memslots that are currently used or that might still be reserved
     * (required - used).
     */
    dms->used_memslots += add ? 1 : -1;

    if (dms->used_memslots > dms->required_memslots) {
        warn_report("Memory devices use more memory slots than indicated as required.");
    }
}
static void memory_devices_region_add(MemoryListener *listener,
                                      MemoryRegionSection *mrs)
{
    return memory_devices_region_mod(listener, mrs, true);
}

static void memory_devices_region_del(MemoryListener *listener,
                                      MemoryRegionSection *mrs)
{
    return memory_devices_region_mod(listener, mrs, false);
}
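
/*
 * Set up the device memory area of a machine. Machine code typically calls
 * this once during init with the chosen base address and the size reserved
 * for memory devices, e.g. (illustrative only, values are machine-specific):
 *
 *   machine_memory_devices_init(ms, device_mem_base,
 *                               ms->maxram_size - ms->ram_size);
 */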
void machine_memory_devices_init(MachineState *ms, hwaddr base, uint64_t size)
{
    g_assert(size);
    g_assert(!ms->device_memory);
    ms->device_memory = g_new0(DeviceMemoryState, 1);
    ms->device_memory->base = base;

    memory_region_init(&ms->device_memory->mr, OBJECT(ms), "device-memory",
                       size);
    address_space_init(&ms->device_memory->as, &ms->device_memory->mr,
                       "device-memory");
    memory_region_add_subregion(get_system_memory(), ms->device_memory->base,
                                &ms->device_memory->mr);

    /* Track the number of memslots used by memory devices. */
    ms->device_memory->listener.region_add = memory_devices_region_add;
    ms->device_memory->listener.region_del = memory_devices_region_del;
    memory_listener_register(&ms->device_memory->listener,
                             &ms->device_memory->as);
}
static const TypeInfo memory_device_info = {
    .name          = TYPE_MEMORY_DEVICE,
    .parent        = TYPE_INTERFACE,
    .class_size    = sizeof(MemoryDeviceClass),
};

static void memory_device_register_types(void)
{
    type_register_static(&memory_device_info);
}

type_init(memory_device_register_types)