// SPDX-License-Identifier: GPL-2.0
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/kvm_host.h>
#include <linux/pagemap.h>
#include <linux/anon_inodes.h>

#include "kvm_mm.h"
struct kvm_gmem {
        struct kvm *kvm;
        struct xarray bindings;
        struct list_head entry;
};
/**
 * folio_file_pfn - like folio_file_page, but return a pfn.
 * @folio: The folio which contains this index.
 * @index: The index we want to look up.
 *
 * Return: The pfn for this index.
 */
static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
{
        return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
}
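
/*
 * Give the architecture a chance to prepare the folio's backing memory for
 * the guest (a no-op unless CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE is selected).
 */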
static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
                                    pgoff_t index, struct folio *folio)
{
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
        kvm_pfn_t pfn = folio_file_pfn(folio, index);
        gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff;
        int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio));

        if (rc) {
                pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
                                    index, gfn, pfn, rc);
                return rc;
        }
#endif

        return 0;
}
static inline void kvm_gmem_mark_prepared(struct folio *folio)
{
        folio_mark_uptodate(folio);
}
/*
 * Process @folio, which contains @gfn, so that the guest can use it.
 * The folio must be locked and the gfn must be contained in @slot.
 * On successful return the guest sees a zero page so as to avoid
 * leaking host data and the up-to-date flag is set.
 */
static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
                                  gfn_t gfn, struct folio *folio)
{
        unsigned long nr_pages, i;
        pgoff_t index;
        int r;

        nr_pages = folio_nr_pages(folio);
        for (i = 0; i < nr_pages; i++)
                clear_highpage(folio_page(folio, i));

        /*
         * Preparing huge folios should always be safe, since it should
         * be possible to split them later if needed.
         *
         * Right now the folio order is always going to be zero, but the
         * code is ready for huge folios.  The only assumption is that
         * the base pgoff of memslots is naturally aligned with the
         * requested page order, ensuring that huge folios can also use
         * huge page table entries for GPA->HPA mapping.
         *
         * The order will be passed when creating the guest_memfd, and
         * checked when creating memslots.
         */
        WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio)));
        index = gfn - slot->base_gfn + slot->gmem.pgoff;
        index = ALIGN_DOWN(index, 1 << folio_order(folio));
        r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
        if (!r)
                kvm_gmem_mark_prepared(folio);

        return r;
}
/*
 * Returns a locked folio on success.  The caller is responsible for
 * setting the up-to-date flag before the memory is mapped into the guest.
 * There is no backing storage for the memory, so the folio will remain
 * up-to-date until it's removed.
 *
 * Ignore accessed, referenced, and dirty flags.  The memory is
 * unevictable and there is no storage to write back to.
 */
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
        /* TODO: Support huge pages. */
        return filemap_grab_folio(inode->i_mapping, index);
}
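
/*
 * Begin invalidating the range [start, end) of this guest_memfd: unmap the
 * GFNs of every memslot bound to the range and start an MMU invalidation.
 * Must be paired with kvm_gmem_invalidate_end().
 */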
static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
                                      pgoff_t end)
{
        bool flush = false, found_memslot = false;
        struct kvm_memory_slot *slot;
        struct kvm *kvm = gmem->kvm;
        unsigned long index;

        xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
                pgoff_t pgoff = slot->gmem.pgoff;

                struct kvm_gfn_range gfn_range = {
                        .start = slot->base_gfn + max(pgoff, start) - pgoff,
                        .end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
                        .slot = slot,
                        .may_block = true,
                };

                if (!found_memslot) {
                        found_memslot = true;

                        KVM_MMU_LOCK(kvm);
                        kvm_mmu_invalidate_begin(kvm);
                }

                flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
        }

        if (flush)
                kvm_flush_remote_tlbs(kvm);

        if (found_memslot)
                KVM_MMU_UNLOCK(kvm);
}
static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
                                    pgoff_t end)
{
        struct kvm *kvm = gmem->kvm;

        if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
                KVM_MMU_LOCK(kvm);
                kvm_mmu_invalidate_end(kvm);
                KVM_MMU_UNLOCK(kvm);
        }
}
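
/*
 * Handle FALLOC_FL_PUNCH_HOLE: invalidate every binding that overlaps the
 * hole, then drop the folios backing the punched range.
 */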
static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
        struct list_head *gmem_list = &inode->i_mapping->i_private_list;
        pgoff_t start = offset >> PAGE_SHIFT;
        pgoff_t end = (offset + len) >> PAGE_SHIFT;
        struct kvm_gmem *gmem;

        /*
         * Bindings must be stable across invalidation to ensure the start+end
         * are balanced.
         */
        filemap_invalidate_lock(inode->i_mapping);

        list_for_each_entry(gmem, gmem_list, entry)
                kvm_gmem_invalidate_begin(gmem, start, end);

        truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);

        list_for_each_entry(gmem, gmem_list, entry)
                kvm_gmem_invalidate_end(gmem, start, end);

        filemap_invalidate_unlock(inode->i_mapping);

        return 0;
}
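
/*
 * Preallocate folios for the range [offset, offset + len), bailing out early
 * on allocation failure or a pending signal.
 */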
static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
{
        struct address_space *mapping = inode->i_mapping;
        pgoff_t start, index, end;
        int r;

        /* Dedicated guest is immutable by default. */
        if (offset + len > i_size_read(inode))
                return -EINVAL;

        filemap_invalidate_lock_shared(mapping);

        start = offset >> PAGE_SHIFT;
        end = (offset + len) >> PAGE_SHIFT;

        r = 0;
        for (index = start; index < end; ) {
                struct folio *folio;

                if (signal_pending(current)) {
                        r = -EINTR;
                        break;
                }

                folio = kvm_gmem_get_folio(inode, index);
                if (IS_ERR(folio)) {
                        r = PTR_ERR(folio);
                        break;
                }

                index = folio_next_index(folio);

                folio_unlock(folio);
                folio_put(folio);

                /* 64-bit only, wrapping the index should be impossible. */
                if (WARN_ON_ONCE(!index))
                        break;

                cond_resched();
        }

        filemap_invalidate_unlock_shared(mapping);

        return r;
}
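
/*
 * fallocate() handler: only preallocation and hole punching are supported,
 * both with FALLOC_FL_KEEP_SIZE and at page granularity.
 */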
static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
                               loff_t len)
{
        int ret;

        if (!(mode & FALLOC_FL_KEEP_SIZE))
                return -EOPNOTSUPP;

        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
                return -EOPNOTSUPP;

        if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
                return -EINVAL;

        if (mode & FALLOC_FL_PUNCH_HOLE)
                ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
        else
                ret = kvm_gmem_allocate(file_inode(file), offset, len);

        if (!ret)
                file_modified(file);
        return ret;
}
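
/*
 * ->release() handler for the guest_memfd file: sever all memslot bindings,
 * zap any remaining SPTEs, and drop the file's reference on the VM.
 */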
static int kvm_gmem_release(struct inode *inode, struct file *file)
{
        struct kvm_gmem *gmem = file->private_data;
        struct kvm_memory_slot *slot;
        struct kvm *kvm = gmem->kvm;
        unsigned long index;

        /*
         * Prevent concurrent attempts to *unbind* a memslot.  This is the last
         * reference to the file and thus no new bindings can be created, but
         * dereferencing the slot for existing bindings needs to be protected
         * against memslot updates, specifically so that unbind doesn't race
         * and free the memslot (kvm_gmem_get_file() will return NULL).
         */
        mutex_lock(&kvm->slots_lock);

        filemap_invalidate_lock(inode->i_mapping);

        xa_for_each(&gmem->bindings, index, slot)
                rcu_assign_pointer(slot->gmem.file, NULL);

        synchronize_rcu();

        /*
         * All in-flight operations are gone and new bindings can be created.
         * Zap all SPTEs pointed at by this file.  Do not free the backing
         * memory, as its lifetime is associated with the inode, not the file.
         */
        kvm_gmem_invalidate_begin(gmem, 0, -1ul);
        kvm_gmem_invalidate_end(gmem, 0, -1ul);

        list_del(&gmem->entry);

        filemap_invalidate_unlock(inode->i_mapping);

        mutex_unlock(&kvm->slots_lock);

        xa_destroy(&gmem->bindings);
        kfree(gmem);

        kvm_put_kvm(kvm);

        return 0;
}
static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
{
        /*
         * Do not return slot->gmem.file if it has already been closed;
         * there might be some time between the last fput() and when
         * kvm_gmem_release() clears slot->gmem.file, and you do not
         * want to spin in the meanwhile.
         */
        return get_file_active(&slot->gmem.file);
}
static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
{
        return gfn - slot->base_gfn + slot->gmem.pgoff;
}
static struct file_operations kvm_gmem_fops = {
        .open           = generic_file_open,
        .release        = kvm_gmem_release,
        .fallocate      = kvm_gmem_fallocate,
};
void kvm_gmem_init(struct module *module)
{
        kvm_gmem_fops.owner = module;
}
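
/* guest_memfd folios are unmovable; reject any attempt to migrate them. */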
static int kvm_gmem_migrate_folio(struct address_space *mapping,
                                  struct folio *dst, struct folio *src,
                                  enum migrate_mode mode)
{
        WARN_ON_ONCE(1);
        return -EINVAL;
}
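
/*
 * Memory failure handler: zap the mappings of the poisoned folio, but leave
 * the folio itself in place so that userspace can decide how to react.
 */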
static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
{
        struct list_head *gmem_list = &mapping->i_private_list;
        struct kvm_gmem *gmem;
        pgoff_t start, end;

        filemap_invalidate_lock_shared(mapping);

        start = folio->index;
        end = start + folio_nr_pages(folio);

        list_for_each_entry(gmem, gmem_list, entry)
                kvm_gmem_invalidate_begin(gmem, start, end);

        /*
         * Do not truncate the range, what action is taken in response to the
         * error is userspace's decision (assuming the architecture supports
         * gracefully handling memory errors).  If/when the guest attempts to
         * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
         * at which point KVM can either terminate the VM or propagate the
         * error to userspace.
         */

        list_for_each_entry(gmem, gmem_list, entry)
                kvm_gmem_invalidate_end(gmem, start, end);

        filemap_invalidate_unlock_shared(mapping);

        return MF_DELAYED;
}
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
static void kvm_gmem_free_folio(struct folio *folio)
{
        struct page *page = folio_page(folio, 0);
        kvm_pfn_t pfn = page_to_pfn(page);
        int order = folio_order(folio);

        kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
}
#endif
static const struct address_space_operations kvm_gmem_aops = {
        .dirty_folio = noop_dirty_folio,
        .migrate_folio  = kvm_gmem_migrate_folio,
        .error_remove_folio = kvm_gmem_error_folio,
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
        .free_folio = kvm_gmem_free_folio,
#endif
};
static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path,
                            struct kstat *stat, u32 request_mask,
                            unsigned int query_flags)
{
        struct inode *inode = path->dentry->d_inode;

        generic_fillattr(idmap, request_mask, inode, stat);
        return 0;
}
static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                            struct iattr *attr)
{
        return -EINVAL;
}
static const struct inode_operations kvm_gmem_iops = {
        .getattr        = kvm_gmem_getattr,
        .setattr        = kvm_gmem_setattr,
};
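
/*
 * Create the anonymous inode and file backing a guest_memfd instance and
 * install it in a new file descriptor, which is returned on success.
 */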
static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
        const char *anon_name = "[kvm-gmem]";
        struct kvm_gmem *gmem;
        struct inode *inode;
        struct file *file;
        int fd, err;

        fd = get_unused_fd_flags(0);
        if (fd < 0)
                return fd;

        gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
        if (!gmem) {
                err = -ENOMEM;
                goto err_fd;
        }

        file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem,
                                         O_RDWR, NULL);
        if (IS_ERR(file)) {
                err = PTR_ERR(file);
                goto err_gmem;
        }

        file->f_flags |= O_LARGEFILE;

        inode = file->f_inode;
        WARN_ON(file->f_mapping != inode->i_mapping);

        inode->i_private = (void *)(unsigned long)flags;
        inode->i_op = &kvm_gmem_iops;
        inode->i_mapping->a_ops = &kvm_gmem_aops;
        inode->i_mode |= S_IFREG;
        inode->i_size = size;
        mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
        mapping_set_inaccessible(inode->i_mapping);
        /* Unmovable mappings are supposed to be marked unevictable as well. */
        WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));

        kvm_get_kvm(kvm);
        gmem->kvm = kvm;
        xa_init(&gmem->bindings);
        list_add(&gmem->entry, &inode->i_mapping->i_private_list);

        fd_install(fd, file);
        return fd;

err_gmem:
        kfree(gmem);
err_fd:
        put_unused_fd(fd);
        return err;
}
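
/*
 * Handler for the KVM_CREATE_GUEST_MEMFD ioctl: validate the requested flags
 * and size, then create the guest_memfd file.
 */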
int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
{
        loff_t size = args->size;
        u64 flags = args->flags;
        u64 valid_flags = 0;

        if (flags & ~valid_flags)
                return -EINVAL;

        if (size <= 0 || !PAGE_ALIGNED(size))
                return -EINVAL;

        return __kvm_gmem_create(kvm, size, flags);
}
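
/*
 * Bind @slot to the range [offset, offset + size) of the guest_memfd given by
 * @fd.  Ranges that overlap an existing binding are rejected.
 */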
int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
                  unsigned int fd, loff_t offset)
{
        loff_t size = slot->npages << PAGE_SHIFT;
        unsigned long start, end;
        struct kvm_gmem *gmem;
        struct inode *inode;
        struct file *file;
        int r = -EINVAL;

        BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));

        file = fget(fd);
        if (!file)
                return -EBADF;

        if (file->f_op != &kvm_gmem_fops)
                goto err;

        gmem = file->private_data;
        if (gmem->kvm != kvm)
                goto err;

        inode = file_inode(file);

        if (offset < 0 || !PAGE_ALIGNED(offset) ||
            offset + size > i_size_read(inode))
                goto err;

        filemap_invalidate_lock(inode->i_mapping);

        start = offset >> PAGE_SHIFT;
        end = start + slot->npages;

        if (!xa_empty(&gmem->bindings) &&
            xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
                filemap_invalidate_unlock(inode->i_mapping);
                goto err;
        }

        /*
         * No synchronize_rcu() needed, any in-flight readers are guaranteed to
         * see either a NULL file or this new file, no need for them to go
         * away.
         */
        rcu_assign_pointer(slot->gmem.file, file);
        slot->gmem.pgoff = start;

        xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
        filemap_invalidate_unlock(inode->i_mapping);

        /*
         * Drop the reference to the file, even on success.  The file pins KVM,
         * not the other way 'round.  Active bindings are invalidated if the
         * file is closed before memslots are destroyed.
         */
        r = 0;
err:
        fput(file);
        return r;
}
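
/*
 * Undo kvm_gmem_bind(): drop @slot's binding from the guest_memfd's xarray
 * and clear slot->gmem.file so that readers see the memslot going away.
 */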
void kvm_gmem_unbind(struct kvm_memory_slot *slot)
{
        unsigned long start = slot->gmem.pgoff;
        unsigned long end = start + slot->npages;
        struct kvm_gmem *gmem;
        struct file *file;

        /*
         * Nothing to do if the underlying file was already closed (or is being
         * closed right now), kvm_gmem_release() invalidates all bindings.
         */
        file = kvm_gmem_get_file(slot);
        if (!file)
                return;

        gmem = file->private_data;

        filemap_invalidate_lock(file->f_mapping);
        xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
        rcu_assign_pointer(slot->gmem.file, NULL);
        synchronize_rcu();
        filemap_invalidate_unlock(file->f_mapping);

        fput(file);
}
/* Returns a locked folio on success.  */
static struct folio *__kvm_gmem_get_pfn(struct file *file,
                                        struct kvm_memory_slot *slot,
                                        pgoff_t index, kvm_pfn_t *pfn,
                                        bool *is_prepared, int *max_order)
{
        struct kvm_gmem *gmem = file->private_data;
        struct folio *folio;

        if (file != slot->gmem.file) {
                WARN_ON_ONCE(slot->gmem.file);
                return ERR_PTR(-EFAULT);
        }

        gmem = file->private_data;
        if (xa_load(&gmem->bindings, index) != slot) {
                WARN_ON_ONCE(xa_load(&gmem->bindings, index));
                return ERR_PTR(-EIO);
        }

        folio = kvm_gmem_get_folio(file_inode(file), index);
        if (IS_ERR(folio))
                return folio;

        if (folio_test_hwpoison(folio)) {
                folio_unlock(folio);
                folio_put(folio);
                return ERR_PTR(-EHWPOISON);
        }

        *pfn = folio_file_pfn(folio, index);
        if (max_order)
                *max_order = 0;

        *is_prepared = folio_test_uptodate(folio);
        return folio;
}
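
/*
 * Resolve @gfn to the pfn and page backing it in the bound guest_memfd,
 * preparing the folio for the guest if it has not been prepared yet.  On
 * success the page reference is handed to the caller via @page.
 */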
int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
                     gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
                     int *max_order)
{
        pgoff_t index = kvm_gmem_get_index(slot, gfn);
        struct file *file = kvm_gmem_get_file(slot);
        struct folio *folio;
        bool is_prepared = false;
        int r = 0;

        if (!file)
                return -EFAULT;

        folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
        if (IS_ERR(folio)) {
                r = PTR_ERR(folio);
                goto out;
        }

        if (!is_prepared)
                r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);

        folio_unlock(folio);

        if (!r)
                *page = folio_file_page(folio, index);
        else
                folio_put(folio);

out:
        fput(file);
        return r;
}
EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM
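/*
 * Populate up to @npages pages starting at @start_gfn, invoking
 * @post_populate on each prepared range.  Returns the number of pages
 * processed, or a negative error if nothing was populated.
 */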
long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
                       kvm_gmem_populate_cb post_populate, void *opaque)
{
        struct file *file;
        struct kvm_memory_slot *slot;
        void __user *p;

        int ret = 0, max_order;
        long i;

        lockdep_assert_held(&kvm->slots_lock);
        if (npages < 0)
                return -EINVAL;

        slot = gfn_to_memslot(kvm, start_gfn);
        if (!kvm_slot_can_be_private(slot))
                return -EINVAL;

        file = kvm_gmem_get_file(slot);
        if (!file)
                return -EFAULT;

        filemap_invalidate_lock(file->f_mapping);

        npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
        for (i = 0; i < npages; i += (1 << max_order)) {
                struct folio *folio;
                gfn_t gfn = start_gfn + i;
                pgoff_t index = kvm_gmem_get_index(slot, gfn);
                bool is_prepared = false;
                kvm_pfn_t pfn;

                if (signal_pending(current)) {
                        ret = -EINTR;
                        break;
                }

                folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order);
                if (IS_ERR(folio)) {
                        ret = PTR_ERR(folio);
                        break;
                }

                if (is_prepared) {
                        folio_unlock(folio);
                        folio_put(folio);
                        ret = -EEXIST;
                        break;
                }

                folio_unlock(folio);
                WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
                        (npages - i) < (1 << max_order));

                ret = -EINVAL;
                while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order),
                                                        KVM_MEMORY_ATTRIBUTE_PRIVATE,
                                                        KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
                        if (!max_order)
                                goto put_folio_and_exit;
                        max_order--;
                }

                p = src ? src + i * PAGE_SIZE : NULL;
                ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
                if (!ret)
                        kvm_gmem_mark_prepared(folio);

put_folio_and_exit:
                folio_put(folio);
                if (ret)
                        break;
        }

        filemap_invalidate_unlock(file->f_mapping);

        fput(file);
        return ret && !i ? ret : i;
}
EXPORT_SYMBOL_GPL(kvm_gmem_populate);
#endif