// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables kernel and guest-mode vCPU access to guest physical
 * memory with suitable invalidation mechanisms.
 *
 * Copyright © 2021 Amazon.com, Inc. or its affiliates.
 *
 * Authors:
 *   David Woodhouse <dwmw2@infradead.org>
 */

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/errno.h>

#include "kvm_mm.h"
/*
 * MMU notifier 'invalidate_range_start' hook.
 */
void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
                                       unsigned long end)
{
        struct gfn_to_pfn_cache *gpc;

        spin_lock(&kvm->gpc_lock);
        list_for_each_entry(gpc, &kvm->gpc_list, list) {
                read_lock_irq(&gpc->lock);

                /* Only a single page so no need to care about length */
                if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
                    gpc->uhva >= start && gpc->uhva < end) {
                        read_unlock_irq(&gpc->lock);

                        /*
                         * There is a small window here where the cache could
                         * be modified, and invalidation would no longer be
                         * necessary. Hence check again whether invalidation
                         * is still necessary once the write lock has been
                         * acquired.
                         */

                        write_lock_irq(&gpc->lock);
                        if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
                            gpc->uhva >= start && gpc->uhva < end)
                                gpc->valid = false;
                        write_unlock_irq(&gpc->lock);
                        continue;
                }

                read_unlock_irq(&gpc->lock);
        }
        spin_unlock(&kvm->gpc_lock);
}
static bool kvm_gpc_is_valid_len(gpa_t gpa, unsigned long uhva,
                                 unsigned long len)
{
        unsigned long offset = kvm_is_error_gpa(gpa) ? offset_in_page(uhva) :
                                                       offset_in_page(gpa);

        /*
         * The cached access must fit within a single page. The 'len' argument
         * to activate() and refresh() exists only to enforce that.
         */
        return offset + len <= PAGE_SIZE;
}
bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len)
{
        struct kvm_memslots *slots = kvm_memslots(gpc->kvm);

        if (!gpc->active)
                return false;

        /*
         * If the page was cached from a memslot, make sure the memslots have
         * not been re-configured.
         */
        if (!kvm_is_error_gpa(gpc->gpa) && gpc->generation != slots->generation)
                return false;

        if (kvm_is_error_hva(gpc->uhva))
                return false;

        if (!kvm_gpc_is_valid_len(gpc->gpa, gpc->uhva, len))
                return false;

        if (!gpc->valid)
                return false;

        return true;
}
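
/*
 * Usage sketch, not lifted verbatim from any one caller: consumers are
 * expected to pair kvm_gpc_check() with kvm_gpc_refresh() while holding
 * gpc->lock for read, retrying until the check succeeds.  "struct my_data"
 * is a hypothetical guest-shared structure used purely for illustration.
 *
 *      read_lock_irqsave(&gpc->lock, flags);
 *      while (!kvm_gpc_check(gpc, sizeof(struct my_data))) {
 *              read_unlock_irqrestore(&gpc->lock, flags);
 *
 *              if (kvm_gpc_refresh(gpc, sizeof(struct my_data)))
 *                      goto out;       // or propagate the error
 *
 *              read_lock_irqsave(&gpc->lock, flags);
 *      }
 *
 *      ...access the page through gpc->khva...
 *
 *      read_unlock_irqrestore(&gpc->lock, flags);
 */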
static void *gpc_map(kvm_pfn_t pfn)
{
        if (pfn_valid(pfn))
                return kmap(pfn_to_page(pfn));

#ifdef CONFIG_HAS_IOMEM
        return memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
#else
        return NULL;
#endif
}
static void gpc_unmap(kvm_pfn_t pfn, void *khva)
{
        /* Unmap the old pfn/page if it was mapped before. */
        if (is_error_noslot_pfn(pfn) || !khva)
                return;

        if (pfn_valid(pfn)) {
                kunmap(pfn_to_page(pfn));
                return;
        }

#ifdef CONFIG_HAS_IOMEM
        memunmap(khva);
#endif
}
static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_seq)
{
        /*
         * mn_active_invalidate_count acts for all intents and purposes
         * like mmu_invalidate_in_progress here; but the latter cannot
         * be used here because the invalidation of caches in the
         * mmu_notifier event occurs _before_ mmu_invalidate_in_progress
         * is elevated.
         *
         * Note, it does not matter that mn_active_invalidate_count
         * is not protected by gpc->lock.  It is guaranteed to
         * be elevated before the mmu_notifier acquires gpc->lock, and
         * isn't dropped until after mmu_invalidate_seq is updated.
         */
        if (kvm->mn_active_invalidate_count)
                return true;

        /*
         * Ensure mn_active_invalidate_count is read before
         * mmu_invalidate_seq.  This pairs with the smp_wmb() in
         * mmu_notifier_invalidate_range_end() to guarantee either the
         * old (non-zero) value of mn_active_invalidate_count or the
         * new (incremented) value of mmu_invalidate_seq is observed.
         */
        smp_rmb();
        return kvm->mmu_invalidate_seq != mmu_seq;
}
static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
{
        /* Note, the new page offset may be different than the old! */
        void *old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva);
        kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT;
        void *new_khva = NULL;
        unsigned long mmu_seq;
        struct page *page;

        struct kvm_follow_pfn kfp = {
                .slot = gpc->memslot,
                .gfn = gpa_to_gfn(gpc->gpa),
                .flags = FOLL_WRITE,
                .hva = gpc->uhva,
                .refcounted_page = &page,
        };

        lockdep_assert_held(&gpc->refresh_lock);

        lockdep_assert_held_write(&gpc->lock);

        /*
         * Invalidate the cache prior to dropping gpc->lock, the gpa=>uhva
         * assets have already been updated and so a concurrent check() from a
         * different task may not fail the gpa/uhva/generation checks.
         */
        gpc->valid = false;

        do {
                mmu_seq = gpc->kvm->mmu_invalidate_seq;
                smp_rmb();

                write_unlock_irq(&gpc->lock);

                /*
                 * If the previous iteration "failed" due to an mmu_notifier
                 * event, release the pfn and unmap the kernel virtual address
                 * from the previous attempt.  Unmapping might sleep, so this
                 * needs to be done after dropping the lock.  Opportunistically
                 * check for resched while the lock isn't held.
                 */
                if (new_pfn != KVM_PFN_ERR_FAULT) {
                        /*
                         * Keep the mapping if the previous iteration reused
                         * the existing mapping and didn't create a new one.
                         */
                        if (new_khva != old_khva)
                                gpc_unmap(new_pfn, new_khva);

                        kvm_release_page_unused(page);

                        cond_resched();
                }

                new_pfn = hva_to_pfn(&kfp);
                if (is_error_noslot_pfn(new_pfn))
                        goto out_error;

                /*
                 * Obtain a new kernel mapping if KVM itself will access the
                 * pfn.  Note, kmap() and memremap() can both sleep, so this
                 * too must be done outside of gpc->lock!
                 */
                if (new_pfn == gpc->pfn)
                        new_khva = old_khva;
                else
                        new_khva = gpc_map(new_pfn);

                if (!new_khva) {
                        kvm_release_page_unused(page);
                        goto out_error;
                }

                write_lock_irq(&gpc->lock);

                /*
                 * Other tasks must wait for _this_ refresh to complete before
                 * attempting to refresh.
                 */
                WARN_ON_ONCE(gpc->valid);
        } while (mmu_notifier_retry_cache(gpc->kvm, mmu_seq));

        gpc->valid = true;
        gpc->pfn = new_pfn;
        gpc->khva = new_khva + offset_in_page(gpc->uhva);

        /*
         * Put the reference to the _new_ page.  The page is now tracked by the
         * cache and can be safely migrated, swapped, etc... as the cache will
         * invalidate any mappings in response to relevant mmu_notifier events.
         */
        kvm_release_page_clean(page);

        return 0;

out_error:
        write_lock_irq(&gpc->lock);

        return -EFAULT;
}
static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long uhva)
{
        unsigned long page_offset;
        bool unmap_old = false;
        unsigned long old_uhva;
        kvm_pfn_t old_pfn;
        bool hva_change = false;
        void *old_khva;
        int ret;

        /* Either gpa or uhva must be valid, but not both */
        if (WARN_ON_ONCE(kvm_is_error_gpa(gpa) == kvm_is_error_hva(uhva)))
                return -EINVAL;

        lockdep_assert_held(&gpc->refresh_lock);

        write_lock_irq(&gpc->lock);

        if (!gpc->active) {
                ret = -EINVAL;
                goto out_unlock;
        }

        old_pfn = gpc->pfn;
        old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva);
        old_uhva = PAGE_ALIGN_DOWN(gpc->uhva);

        if (kvm_is_error_gpa(gpa)) {
                page_offset = offset_in_page(uhva);

                gpc->gpa = INVALID_GPA;
                gpc->memslot = NULL;
                gpc->uhva = PAGE_ALIGN_DOWN(uhva);

                if (gpc->uhva != old_uhva)
                        hva_change = true;
        } else {
                struct kvm_memslots *slots = kvm_memslots(gpc->kvm);

                page_offset = offset_in_page(gpa);

                if (gpc->gpa != gpa || gpc->generation != slots->generation ||
                    kvm_is_error_hva(gpc->uhva)) {
                        gfn_t gfn = gpa_to_gfn(gpa);

                        gpc->gpa = gpa;
                        gpc->generation = slots->generation;
                        gpc->memslot = __gfn_to_memslot(slots, gfn);
                        gpc->uhva = gfn_to_hva_memslot(gpc->memslot, gfn);

                        if (kvm_is_error_hva(gpc->uhva)) {
                                ret = -EFAULT;
                                goto out;
                        }

                        /*
                         * Even if the GPA and/or the memslot generation changed, the
                         * HVA may still be the same.
                         */
                        if (gpc->uhva != old_uhva)
                                hva_change = true;
                } else {
                        gpc->uhva = old_uhva;
                }
        }

        /* Note: the offset must be correct before calling hva_to_pfn_retry() */
        gpc->uhva += page_offset;

        /*
         * If the userspace HVA changed or the PFN was already invalid,
         * drop the lock and do the HVA to PFN lookup again.
         */
        if (!gpc->valid || hva_change) {
                ret = hva_to_pfn_retry(gpc);
        } else {
                /*
                 * If the HVA→PFN mapping was already valid, don't unmap it.
                 * But do update gpc->khva because the offset within the page
                 * may have changed.
                 */
                gpc->khva = old_khva + page_offset;
                ret = 0;
                goto out_unlock;
        }

out:
        /*
         * Invalidate the cache and purge the pfn/khva if the refresh failed.
         * Some/all of the uhva, gpa, and memslot generation info may still be
         * valid, leave it as is.
         */
        if (ret) {
                gpc->valid = false;
                gpc->pfn = KVM_PFN_ERR_FAULT;
                gpc->khva = NULL;
        }

        /* Detect a pfn change before dropping the lock! */
        unmap_old = (old_pfn != gpc->pfn);

out_unlock:
        write_unlock_irq(&gpc->lock);

        if (unmap_old)
                gpc_unmap(old_pfn, old_khva);

        return ret;
}
int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len)
{
        unsigned long uhva;

        guard(mutex)(&gpc->refresh_lock);

        if (!kvm_gpc_is_valid_len(gpc->gpa, gpc->uhva, len))
                return -EINVAL;

        /*
         * If the GPA is valid then ignore the HVA, as a cache can be GPA-based
         * or HVA-based, not both.  For GPA-based caches, the HVA will be
         * recomputed during refresh if necessary.
         */
        uhva = kvm_is_error_gpa(gpc->gpa) ? gpc->uhva : KVM_HVA_ERR_BAD;

        return __kvm_gpc_refresh(gpc, gpc->gpa, uhva);
}
void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm)
{
        rwlock_init(&gpc->lock);
        mutex_init(&gpc->refresh_lock);

        gpc->kvm = kvm;
        gpc->pfn = KVM_PFN_ERR_FAULT;
        gpc->gpa = INVALID_GPA;
        gpc->uhva = KVM_HVA_ERR_BAD;
        gpc->active = gpc->valid = false;
}
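
/*
 * Usage sketch, with hypothetical structure and field names: a cache is
 * normally embedded in a longer-lived object and initialized exactly once,
 * before any activation or refresh.  Initialization itself maps nothing;
 * the cache starts out inactive and invalid.
 *
 *      struct my_vcpu_state {
 *              struct gfn_to_pfn_cache shared_info_cache;
 *      };
 *
 *      kvm_gpc_init(&my_state->shared_info_cache, vcpu->kvm);
 */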
static int __kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long uhva,
                              unsigned long len)
{
        struct kvm *kvm = gpc->kvm;

        if (!kvm_gpc_is_valid_len(gpa, uhva, len))
                return -EINVAL;

        guard(mutex)(&gpc->refresh_lock);

        if (!gpc->active) {
                if (KVM_BUG_ON(gpc->valid, kvm))
                        return -EIO;

                spin_lock(&kvm->gpc_lock);
                list_add(&gpc->list, &kvm->gpc_list);
                spin_unlock(&kvm->gpc_lock);

                /*
                 * Activate the cache after adding it to the list, a concurrent
                 * refresh must not establish a mapping until the cache is
                 * reachable by mmu_notifier events.
                 */
                write_lock_irq(&gpc->lock);
                gpc->active = true;
                write_unlock_irq(&gpc->lock);
        }
        return __kvm_gpc_refresh(gpc, gpa, uhva);
}
int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len)
{
        /*
         * Explicitly disallow INVALID_GPA so that the magic value can be used
         * by KVM to differentiate between GPA-based and HVA-based caches.
         */
        if (WARN_ON_ONCE(kvm_is_error_gpa(gpa)))
                return -EINVAL;

        return __kvm_gpc_activate(gpc, gpa, KVM_HVA_ERR_BAD, len);
}
int kvm_gpc_activate_hva(struct gfn_to_pfn_cache *gpc, unsigned long uhva, unsigned long len)
{
        if (!access_ok((void __user *)uhva, len))
                return -EINVAL;

        return __kvm_gpc_activate(gpc, INVALID_GPA, uhva, len);
}
void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc)
{
        struct kvm *kvm = gpc->kvm;
        kvm_pfn_t old_pfn;
        void *old_khva;

        guard(mutex)(&gpc->refresh_lock);

        if (gpc->active) {
                /*
                 * Deactivate the cache before removing it from the list, KVM
                 * must stall mmu_notifier events until all users go away, i.e.
                 * until gpc->lock is dropped and refresh is guaranteed to fail.
                 */
                write_lock_irq(&gpc->lock);
                gpc->active = false;
                gpc->valid = false;

                /*
                 * Leave the GPA => uHVA cache intact, it's protected by the
                 * memslot generation.  The PFN lookup needs to be redone every
                 * time as mmu_notifier protection is lost when the cache is
                 * removed from the VM's gpc_list.
                 */
                old_khva = gpc->khva - offset_in_page(gpc->khva);
                gpc->khva = NULL;

                old_pfn = gpc->pfn;
                gpc->pfn = KVM_PFN_ERR_FAULT;
                write_unlock_irq(&gpc->lock);

                spin_lock(&kvm->gpc_lock);
                list_del(&gpc->list);
                spin_unlock(&kvm->gpc_lock);

                gpc_unmap(old_pfn, old_khva);
        }
}
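
/*
 * Lifecycle sketch for a GPA-based cache, with a hypothetical guest_gpa and
 * payload type; the check/refresh loop itself is shown above kvm_gpc_check():
 *
 *      kvm_gpc_init(gpc, kvm);
 *      ...
 *      ret = kvm_gpc_activate(gpc, guest_gpa, sizeof(struct my_data));
 *      ...
 *      check/refresh and access gpc->khva under read_lock(&gpc->lock)
 *      ...
 *      kvm_gpc_deactivate(gpc);
 */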