// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
#include <linux/kernel.h>
#include <linux/highmem.h>
#include <linux/psp.h>
#include <linux/psp-sev.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/misc_cgroup.h>
#include <linux/processor.h>
#include <linux/trace_events.h>
#include <uapi/linux/sev-guest.h>

#include <asm/trapnr.h>
#include <asm/fpu/xcr.h>
#include <asm/fpu/xstate.h>
#include <asm/debugreg.h>
#define GHCB_VERSION_MAX	2ULL
#define GHCB_VERSION_DEFAULT	2ULL
#define GHCB_VERSION_MIN	1ULL

#define GHCB_HV_FT_SUPPORTED	(GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION)
/* enable/disable SEV support */
static bool sev_enabled = true;
module_param_named(sev, sev_enabled, bool, 0444);

/* enable/disable SEV-ES support */
static bool sev_es_enabled = true;
module_param_named(sev_es, sev_es_enabled, bool, 0444);

/* enable/disable SEV-SNP support */
static bool sev_snp_enabled = true;
module_param_named(sev_snp, sev_snp_enabled, bool, 0444);

/* enable/disable SEV-ES DebugSwap support */
static bool sev_es_debug_swap_enabled = true;
module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444);

static u64 sev_supported_vmsa_features;
#define AP_RESET_HOLD_NONE		0
#define AP_RESET_HOLD_NAE_EVENT		1
#define AP_RESET_HOLD_MSR_PROTO		2
/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */
#define SNP_POLICY_MASK_API_MINOR	GENMASK_ULL(7, 0)
#define SNP_POLICY_MASK_API_MAJOR	GENMASK_ULL(15, 8)
#define SNP_POLICY_MASK_SMT		BIT_ULL(16)
#define SNP_POLICY_MASK_RSVD_MBO	BIT_ULL(17)
#define SNP_POLICY_MASK_DEBUG		BIT_ULL(19)
#define SNP_POLICY_MASK_SINGLE_SOCKET	BIT_ULL(20)

#define SNP_POLICY_MASK_VALID		(SNP_POLICY_MASK_API_MINOR	| \
					 SNP_POLICY_MASK_API_MAJOR	| \
					 SNP_POLICY_MASK_SMT		| \
					 SNP_POLICY_MASK_RSVD_MBO	| \
					 SNP_POLICY_MASK_DEBUG		| \
					 SNP_POLICY_MASK_SINGLE_SOCKET)

#define INITIAL_VMSA_GPA 0xFFFFFFFFF000
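/*
 * Illustrative decomposition (hypothetical policy value): 0x30000 has
 * API_MINOR = 0x00, API_MAJOR = 0x00, bit 16 (SMT) set and bit 17 (the
 * reserved, must-be-one bit) set, with DEBUG and SINGLE_SOCKET clear; this is
 * the minimal policy accepted by snp_launch_start() below.
 */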
static u8 sev_enc_bit;
static DECLARE_RWSEM(sev_deactivate_lock);
static DEFINE_MUTEX(sev_bitmap_lock);
unsigned int max_sev_asid;
static unsigned int min_sev_asid;
static unsigned long sev_me_mask;
static unsigned int nr_asids;
static unsigned long *sev_asid_bitmap;
static unsigned long *sev_reclaim_asid_bitmap;
static int snp_decommission_context(struct kvm *kvm);

	struct list_head list;
/* Called with the sev_bitmap_lock held, or on shutdown */
static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid)
{
	int ret, error = 0;
	unsigned int asid;

	/* Check if there are any ASIDs to reclaim before performing a flush */
	asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid);
	if (asid > max_asid)
		return -EBUSY;

	/*
	 * DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail,
	 * so it must be guarded.
	 */
	down_write(&sev_deactivate_lock);

	wbinvd_on_all_cpus();

	if (sev_snp_enabled)
		ret = sev_do_cmd(SEV_CMD_SNP_DF_FLUSH, NULL, &error);
	else
		ret = sev_guest_df_flush(&error);

	up_write(&sev_deactivate_lock);

	if (ret)
		pr_err("SEV%s: DF_FLUSH failed, ret=%d, error=%#x\n",
		       sev_snp_enabled ? "-SNP" : "", ret, error);

	return ret;
}
static inline bool is_mirroring_enc_context(struct kvm *kvm)
{
	return !!to_kvm_sev_info(kvm)->enc_context_owner;
}

static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;

	return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP;
}
/* Must be called with the sev_bitmap_lock held */
static bool __sev_recycle_asids(unsigned int min_asid, unsigned int max_asid)
{
	if (sev_flush_asids(min_asid, max_asid))
		return false;

	/* The flush process will flush all reclaimable SEV and SEV-ES ASIDs */
	bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap,
		   nr_asids);
	bitmap_zero(sev_reclaim_asid_bitmap, nr_asids);

	return true;
}
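/*
 * Illustrative effect of the recycle path above (hypothetical bitmaps): if
 * sev_asid_bitmap has ASIDs {3, 5} set and sev_reclaim_asid_bitmap has {5},
 * a successful flush XORs the reclaim bitmap into the allocation bitmap,
 * leaving {3} allocated, and then clears the reclaim bitmap so ASID 5 can be
 * handed out again by sev_asid_new().
 */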
static int sev_misc_cg_try_charge(struct kvm_sev_info *sev)
{
	enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
	return misc_cg_try_charge(type, sev->misc_cg, 1);
}

static void sev_misc_cg_uncharge(struct kvm_sev_info *sev)
{
	enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
	misc_cg_uncharge(type, sev->misc_cg, 1);
}
static int sev_asid_new(struct kvm_sev_info *sev)
{
	/*
	 * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid.
	 * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1.
	 * Note: min ASID can end up larger than the max if basic SEV support is
	 * effectively disabled by disallowing use of ASIDs for SEV guests.
	 */
	unsigned int min_asid = sev->es_active ? 1 : min_sev_asid;
	unsigned int max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid;
	unsigned int asid;
	bool retry = true;
	int ret;

	if (min_asid > max_asid)
		return -ENOTTY;

	WARN_ON(sev->misc_cg);
	sev->misc_cg = get_current_misc_cg();
	ret = sev_misc_cg_try_charge(sev);
	if (ret) {
		put_misc_cg(sev->misc_cg);
		sev->misc_cg = NULL;
		return ret;
	}

	mutex_lock(&sev_bitmap_lock);

again:
	asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid);
	if (asid > max_asid) {
		if (retry && __sev_recycle_asids(min_asid, max_asid)) {
			retry = false;
			goto again;
		}
		mutex_unlock(&sev_bitmap_lock);
		ret = -EBUSY;
		goto e_uncharge;
	}

	__set_bit(asid, sev_asid_bitmap);

	mutex_unlock(&sev_bitmap_lock);

	sev->asid = asid;
	return 0;
e_uncharge:
	sev_misc_cg_uncharge(sev);
	put_misc_cg(sev->misc_cg);
	sev->misc_cg = NULL;
	return ret;
}
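/*
 * Illustrative example (values are hypothetical, not read from hardware): with
 * min_sev_asid = 100 and max_sev_asid = 509, plain SEV guests allocate ASIDs
 * from the range 100..509 while SEV-ES/SEV-SNP guests allocate from 1..99. If
 * the platform reserves every ASID for SEV-ES (min_sev_asid = 1), the plain
 * SEV range becomes empty (min > max) and sev_asid_new() fails for SEV guests.
 */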
static unsigned int sev_get_asid(struct kvm *kvm)
{
	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;

	return sev->asid;
}
static void sev_asid_free(struct kvm_sev_info *sev)
{
	struct svm_cpu_data *sd;
	int cpu;

	mutex_lock(&sev_bitmap_lock);

	__set_bit(sev->asid, sev_reclaim_asid_bitmap);

	for_each_possible_cpu(cpu) {
		sd = per_cpu_ptr(&svm_data, cpu);
		sd->sev_vmcbs[sev->asid] = NULL;
	}

	mutex_unlock(&sev_bitmap_lock);

	sev_misc_cg_uncharge(sev);
	put_misc_cg(sev->misc_cg);
	sev->misc_cg = NULL;
}
static void sev_decommission(unsigned int handle)
{
	struct sev_data_decommission decommission;

	if (!handle)
		return;

	decommission.handle = handle;
	sev_guest_decommission(&decommission, NULL);
}
/*
 * Transition a page to hypervisor-owned/shared state in the RMP table. This
 * should not fail under normal conditions, but leak the page should that
 * happen since it will no longer be usable by the host due to RMP protections.
 */
static int kvm_rmp_make_shared(struct kvm *kvm, u64 pfn, enum pg_level level)
{
	if (KVM_BUG_ON(rmp_make_shared(pfn, level), kvm)) {
		snp_leak_pages(pfn, page_level_size(level) >> PAGE_SHIFT);
		return -EIO;
	}

	return 0;
}

/*
 * Certain page-states, such as Pre-Guest and Firmware pages (as documented
 * in Chapter 5 of the SEV-SNP Firmware ABI under "Page States") cannot be
 * directly transitioned back to normal/hypervisor-owned state via RMPUPDATE
 * unless they are reclaimed first.
 *
 * Until they are reclaimed and subsequently transitioned via RMPUPDATE, they
 * might not be usable by the host due to being set as immutable or still
 * being associated with a guest ASID.
 *
 * Bug the VM and leak the page if reclaim fails, or if the RMP entry can't be
 * converted back to shared, as the page is no longer usable due to RMP
 * protections, and it's infeasible for the guest to continue on.
 */
static int snp_page_reclaim(struct kvm *kvm, u64 pfn)
{
	struct sev_data_snp_page_reclaim data = {0};
	int fw_err, rc;

	data.paddr = __sme_set(pfn << PAGE_SHIFT);
	rc = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &fw_err);
	if (KVM_BUG(rc, kvm, "Failed to reclaim PFN %llx, rc %d fw_err %d", pfn, rc, fw_err)) {
		snp_leak_pages(pfn, 1);
		return -EIO;
	}

	if (kvm_rmp_make_shared(kvm, pfn, PG_LEVEL_4K))
		return -EIO;

	return 0;
}
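/*
 * Illustrative state flow (hypothetical PFN, based on the comment above): a
 * page left in Firmware state after a failed SNP_LAUNCH_UPDATE is first
 * reclaimed via SEV_CMD_SNP_PAGE_RECLAIM (Firmware to reclaimed/Default), and
 * only then can its RMP entry be rewritten to hypervisor-owned/shared via
 * kvm_rmp_make_shared(). Skipping the reclaim step would leave RMPUPDATE
 * ineffective and the page unusable by the host.
 */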
static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
{
	struct sev_data_deactivate deactivate;

	if (!handle)
		return;

	deactivate.handle = handle;

	/* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */
	down_read(&sev_deactivate_lock);
	sev_guest_deactivate(&deactivate, NULL);
	up_read(&sev_deactivate_lock);

	sev_decommission(handle);
}
/*
 * This sets up bounce buffers/firmware pages to handle SNP Guest Request
 * messages (e.g. attestation requests). See "SNP Guest Request" in the GHCB
 * 2.0 specification for more details.
 *
 * Technically, when an SNP Guest Request is issued, the guest will provide its
 * own request/response pages, which could in theory be passed along directly
 * to firmware rather than using bounce pages. However, these pages would need
 * special care:
 *
 *   - Both pages are from shared guest memory, so they need to be protected
 *     from migration/etc. occurring while firmware reads/writes to them. At a
 *     minimum, this requires elevating the ref counts and potentially needing
 *     an explicit pinning of the memory. This places additional restrictions
 *     on what type of memory backends userspace can use for shared guest
 *     memory since there is some reliance on using refcounted pages.
 *
 *   - The response page needs to be switched to Firmware-owned[1] state
 *     before the firmware can write to it, which can lead to potential
 *     host RMP #PFs if the guest is misbehaved and hands the host a
 *     guest page that KVM might write to for other reasons (e.g. virtio
 *     buffers/etc.).
 *
 * Both of these issues can be avoided completely by using separately-allocated
 * bounce pages for both the request/response pages and passing those to
 * firmware instead. So that's what is being set up here.
 *
 * Guest requests rely on message sequence numbers to ensure requests are
 * issued to firmware in the order the guest issues them, so concurrent guest
 * requests generally shouldn't happen. But a misbehaved guest could issue
 * concurrent guest requests in theory, so a mutex is used to serialize
 * access to the bounce buffers.
 *
 * [1] See the "Page States" section of the SEV-SNP Firmware ABI for more
 *     details on Firmware-owned pages, along with "RMP and VMPL Access Checks"
 *     in the APM for details on the related RMP restrictions.
 */
static int snp_guest_req_init(struct kvm *kvm)
{
	struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
	struct page *req_page;

	req_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (!req_page)
		return -ENOMEM;

	sev->guest_resp_buf = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (!sev->guest_resp_buf) {
		__free_page(req_page);
		return -EIO;
	}

	sev->guest_req_buf = page_address(req_page);
	mutex_init(&sev->guest_req_mutex);

	return 0;
}
static void snp_guest_req_cleanup(struct kvm *kvm)
{
	struct kvm_sev_info *sev = to_kvm_sev_info(kvm);

	if (sev->guest_resp_buf)
		snp_free_firmware_page(sev->guest_resp_buf);

	if (sev->guest_req_buf)
		__free_page(virt_to_page(sev->guest_req_buf));

	sev->guest_req_buf = NULL;
	sev->guest_resp_buf = NULL;
}
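/*
 * Sketch of how the bounce buffers above are used when servicing an SNP Guest
 * Request (simplified; the actual handling lives in the guest-request GHCB
 * exit path, not here): under sev->guest_req_mutex, the guest's request page
 * is copied into sev->guest_req_buf, the firmware command is issued against
 * the bounce pages, and the firmware's output in sev->guest_resp_buf is then
 * copied back into the guest-supplied response page. The guest's own shared
 * pages are never handed to firmware directly, avoiding the pinning and
 * Firmware-owned state-transition issues described above.
 */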
static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
			    struct kvm_sev_init *data,
			    unsigned long vm_type)
{
	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
	struct sev_platform_init_args init_args = {0};
	bool es_active = vm_type != KVM_X86_SEV_VM;
	u64 valid_vmsa_features = es_active ? sev_supported_vmsa_features : 0;
	int ret;

	if (kvm->created_vcpus)
		return -EINVAL;

	if (data->vmsa_features & ~valid_vmsa_features)
		return -EINVAL;

	if (data->ghcb_version > GHCB_VERSION_MAX || (!es_active && data->ghcb_version))
		return -EINVAL;

	if (unlikely(sev->active))
		return -EINVAL;

	sev->active = true;
	sev->es_active = es_active;
	sev->vmsa_features = data->vmsa_features;
	sev->ghcb_version = data->ghcb_version;

	/*
	 * Currently KVM supports the full range of mandatory features defined
	 * by version 2 of the GHCB protocol, so default to that for SEV-ES
	 * guests created via KVM_SEV_INIT2.
	 */
	if (sev->es_active && !sev->ghcb_version)
		sev->ghcb_version = GHCB_VERSION_DEFAULT;

	if (vm_type == KVM_X86_SNP_VM)
		sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE;

	ret = sev_asid_new(sev);
	if (ret)
		goto e_no_asid;

	init_args.probe = false;
	ret = sev_platform_init(&init_args);
	if (ret)
		goto e_free;

	/* This needs to happen after SEV/SNP firmware initialization. */
	if (vm_type == KVM_X86_SNP_VM) {
		ret = snp_guest_req_init(kvm);
		if (ret)
			goto e_free;
	}

	INIT_LIST_HEAD(&sev->regions_list);
	INIT_LIST_HEAD(&sev->mirror_vms);
	sev->need_init = false;

	kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV);

	return 0;

e_free:
	argp->error = init_args.error;
	sev_asid_free(sev);
	sev->asid = 0;
e_no_asid:
	sev->vmsa_features = 0;
	sev->es_active = false;
	return ret;
}
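/*
 * Illustrative userspace usage of the KVM_SEV_INIT2 path handled above (a
 * sketch, not taken from this file; the exact ioctl plumbing lives in the
 * generic SEV command dispatch and in userspace):
 *
 *	struct kvm_sev_init init = {
 *		.vmsa_features = 0,	// subset of KVM_X86_SEV_VMSA_FEATURES
 *		.ghcb_version  = 0,	// 0 lets KVM pick GHCB_VERSION_DEFAULT for ES
 *	};
 *	struct kvm_sev_cmd cmd = {
 *		.id     = KVM_SEV_INIT2,
 *		.data   = (__u64)(unsigned long)&init,
 *		.sev_fd = sev_fd,	// /dev/sev file descriptor
 *	};
 *	ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
 *
 * The VM must already have been created with a SEV/SEV-ES/SNP vm_type and
 * must not have any vCPUs yet, per the checks in __sev_guest_init().
 */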
static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
	struct kvm_sev_init data = {
		.vmsa_features = 0,
		.ghcb_version = 0,
	};
	unsigned long vm_type;

	if (kvm->arch.vm_type != KVM_X86_DEFAULT_VM)
		return -EINVAL;

	vm_type = (argp->id == KVM_SEV_INIT ? KVM_X86_SEV_VM : KVM_X86_SEV_ES_VM);

	/*
	 * KVM_SEV_ES_INIT has been deprecated by KVM_SEV_INIT2, so it will
	 * continue to only ever support the minimal GHCB protocol version.
	 */
	if (vm_type == KVM_X86_SEV_ES_VM)
		data.ghcb_version = GHCB_VERSION_MIN;

	return __sev_guest_init(kvm, argp, &data, vm_type);
}
static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
	struct kvm_sev_init data;

	if (!sev->need_init)
		return -EINVAL;

	if (kvm->arch.vm_type != KVM_X86_SEV_VM &&
	    kvm->arch.vm_type != KVM_X86_SEV_ES_VM &&
	    kvm->arch.vm_type != KVM_X86_SNP_VM)
		return -EINVAL;

	if (copy_from_user(&data, u64_to_user_ptr(argp->data), sizeof(data)))
		return -EFAULT;

	return __sev_guest_init(kvm, argp, &data, kvm->arch.vm_type);
}
static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
{
	unsigned int asid = sev_get_asid(kvm);
	struct sev_data_activate activate;
	int ret;

	/* activate ASID on the given handle */
	activate.handle = handle;
	activate.asid = asid;
	ret = sev_guest_activate(&activate, error);

	return ret;
}
static int __sev_issue_cmd(int fd, int id, void *data, int *error)
{
	CLASS(fd, f)(fd);

	if (fd_empty(f))
		return -EBADF;

	return sev_issue_cmd_external_user(fd_file(f), id, data, error);
}

static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
{
	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;

	return __sev_issue_cmd(sev->fd, id, data, error);
}
551 static int sev_launch_start(struct kvm
*kvm
, struct kvm_sev_cmd
*argp
)
553 struct kvm_sev_info
*sev
= &to_kvm_svm(kvm
)->sev_info
;
554 struct sev_data_launch_start start
;
555 struct kvm_sev_launch_start params
;
556 void *dh_blob
, *session_blob
;
557 int *error
= &argp
->error
;
563 if (copy_from_user(¶ms
, u64_to_user_ptr(argp
->data
), sizeof(params
)))
566 memset(&start
, 0, sizeof(start
));
569 if (params
.dh_uaddr
) {
570 dh_blob
= psp_copy_user_blob(params
.dh_uaddr
, params
.dh_len
);
572 return PTR_ERR(dh_blob
);
574 start
.dh_cert_address
= __sme_set(__pa(dh_blob
));
575 start
.dh_cert_len
= params
.dh_len
;
579 if (params
.session_uaddr
) {
580 session_blob
= psp_copy_user_blob(params
.session_uaddr
, params
.session_len
);
581 if (IS_ERR(session_blob
)) {
582 ret
= PTR_ERR(session_blob
);
586 start
.session_address
= __sme_set(__pa(session_blob
));
587 start
.session_len
= params
.session_len
;
590 start
.handle
= params
.handle
;
591 start
.policy
= params
.policy
;
593 /* create memory encryption context */
594 ret
= __sev_issue_cmd(argp
->sev_fd
, SEV_CMD_LAUNCH_START
, &start
, error
);
598 /* Bind ASID to this guest */
599 ret
= sev_bind_asid(kvm
, start
.handle
, error
);
601 sev_decommission(start
.handle
);
605 /* return handle to userspace */
606 params
.handle
= start
.handle
;
607 if (copy_to_user(u64_to_user_ptr(argp
->data
), ¶ms
, sizeof(params
))) {
608 sev_unbind_asid(kvm
, start
.handle
);
613 sev
->handle
= start
.handle
;
614 sev
->fd
= argp
->sev_fd
;
623 static struct page
**sev_pin_memory(struct kvm
*kvm
, unsigned long uaddr
,
624 unsigned long ulen
, unsigned long *n
,
627 struct kvm_sev_info
*sev
= &to_kvm_svm(kvm
)->sev_info
;
628 unsigned long npages
, size
;
630 unsigned long locked
, lock_limit
;
632 unsigned long first
, last
;
635 lockdep_assert_held(&kvm
->lock
);
637 if (ulen
== 0 || uaddr
+ ulen
< uaddr
)
638 return ERR_PTR(-EINVAL
);
	/* Calculate number of pages. */
	first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
	last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
	npages = (last - first + 1);
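	/*
	 * Worked example (hypothetical values): for uaddr = 0x10000ff0 and
	 * ulen = 0x20, the range spans two pages: first = 0x10000 and
	 * last = 0x10001, so npages = 2 even though only 0x20 bytes were
	 * requested.
	 */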
645 locked
= sev
->pages_locked
+ npages
;
646 lock_limit
= rlimit(RLIMIT_MEMLOCK
) >> PAGE_SHIFT
;
647 if (locked
> lock_limit
&& !capable(CAP_IPC_LOCK
)) {
648 pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked
, lock_limit
);
649 return ERR_PTR(-ENOMEM
);
652 if (WARN_ON_ONCE(npages
> INT_MAX
))
653 return ERR_PTR(-EINVAL
);
655 /* Avoid using vmalloc for smaller buffers. */
656 size
= npages
* sizeof(struct page
*);
657 if (size
> PAGE_SIZE
)
658 pages
= __vmalloc(size
, GFP_KERNEL_ACCOUNT
);
660 pages
= kmalloc(size
, GFP_KERNEL_ACCOUNT
);
663 return ERR_PTR(-ENOMEM
);
665 /* Pin the user virtual address. */
666 npinned
= pin_user_pages_fast(uaddr
, npages
, write
? FOLL_WRITE
: 0, pages
);
667 if (npinned
!= npages
) {
668 pr_err("SEV: Failure locking %lu pages.\n", npages
);
674 sev
->pages_locked
= locked
;
680 unpin_user_pages(pages
, npinned
);
686 static void sev_unpin_memory(struct kvm
*kvm
, struct page
**pages
,
687 unsigned long npages
)
689 struct kvm_sev_info
*sev
= &to_kvm_svm(kvm
)->sev_info
;
691 unpin_user_pages(pages
, npages
);
693 sev
->pages_locked
-= npages
;
696 static void sev_clflush_pages(struct page
*pages
[], unsigned long npages
)
698 uint8_t *page_virtual
;
701 if (this_cpu_has(X86_FEATURE_SME_COHERENT
) || npages
== 0 ||
705 for (i
= 0; i
< npages
; i
++) {
706 page_virtual
= kmap_local_page(pages
[i
]);
707 clflush_cache_range(page_virtual
, PAGE_SIZE
);
708 kunmap_local(page_virtual
);
static unsigned long get_num_contig_pages(unsigned long idx,
					  struct page **inpages, unsigned long npages)
{
	unsigned long paddr, next_paddr;
	unsigned long i = idx + 1, pages = 1;

	/* find the number of contiguous pages starting from idx */
	paddr = __sme_page_pa(inpages[idx]);
	while (i < npages) {
		next_paddr = __sme_page_pa(inpages[i++]);
		if ((paddr + PAGE_SIZE) == next_paddr) {
			pages++;
			paddr = next_paddr;
			continue;
		}
		break;
	}

	return pages;
}
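/*
 * Example (hypothetical physical addresses): if the pinned pages starting at
 * idx map to 0x100000, 0x101000, 0x102000, 0x104000, the first three are
 * contiguous and get_num_contig_pages() returns 3; the gap before 0x104000
 * ends the run.
 */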
734 static int sev_launch_update_data(struct kvm
*kvm
, struct kvm_sev_cmd
*argp
)
736 unsigned long vaddr
, vaddr_end
, next_vaddr
, npages
, pages
, size
, i
;
737 struct kvm_sev_info
*sev
= &to_kvm_svm(kvm
)->sev_info
;
738 struct kvm_sev_launch_update_data params
;
739 struct sev_data_launch_update_data data
;
740 struct page
**inpages
;
746 if (copy_from_user(¶ms
, u64_to_user_ptr(argp
->data
), sizeof(params
)))
749 vaddr
= params
.uaddr
;
751 vaddr_end
= vaddr
+ size
;
753 /* Lock the user memory. */
754 inpages
= sev_pin_memory(kvm
, vaddr
, size
, &npages
, 1);
756 return PTR_ERR(inpages
);
759 * Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in
760 * place; the cache may contain the data that was written unencrypted.
762 sev_clflush_pages(inpages
, npages
);
765 data
.handle
= sev
->handle
;
767 for (i
= 0; vaddr
< vaddr_end
; vaddr
= next_vaddr
, i
+= pages
) {
		/*
		 * If the user buffer is not page-aligned, calculate the offset
		 * within the page.
		 */
		offset = vaddr & (PAGE_SIZE - 1);

		/* Calculate the number of pages that can be encrypted in one go. */
		pages = get_num_contig_pages(i, inpages, npages);

		len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
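		/*
		 * Worked example (hypothetical values): with vaddr offset 0x200
		 * into the first page, 3 physically contiguous pages and plenty
		 * of remaining size, one LAUNCH_UPDATE_DATA call covers
		 * len = 3 * 4096 - 0x200 = 11776 bytes starting at that offset.
		 */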
782 data
.address
= __sme_page_pa(inpages
[i
]) + offset
;
783 ret
= sev_issue_cmd(kvm
, SEV_CMD_LAUNCH_UPDATE_DATA
, &data
, &argp
->error
);
788 next_vaddr
= vaddr
+ len
;
792 /* content of memory is updated, mark pages dirty */
793 for (i
= 0; i
< npages
; i
++) {
794 set_page_dirty_lock(inpages
[i
]);
795 mark_page_accessed(inpages
[i
]);
797 /* unlock the user pages */
798 sev_unpin_memory(kvm
, inpages
, npages
);
802 static int sev_es_sync_vmsa(struct vcpu_svm
*svm
)
804 struct kvm_vcpu
*vcpu
= &svm
->vcpu
;
805 struct kvm_sev_info
*sev
= &to_kvm_svm(vcpu
->kvm
)->sev_info
;
806 struct sev_es_save_area
*save
= svm
->sev_es
.vmsa
;
807 struct xregs_state
*xsave
;
812 /* Check some debug related fields before encrypting the VMSA */
813 if (svm
->vcpu
.guest_debug
|| (svm
->vmcb
->save
.dr7
& ~DR7_FIXED_1
))
817 * SEV-ES will use a VMSA that is pointed to by the VMCB, not
818 * the traditional VMSA that is part of the VMCB. Copy the
819 * traditional VMSA as it has been built so far (in prep
820 * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state.
822 memcpy(save
, &svm
->vmcb
->save
, sizeof(svm
->vmcb
->save
));
	/* Sync registers */
825 save
->rax
= svm
->vcpu
.arch
.regs
[VCPU_REGS_RAX
];
826 save
->rbx
= svm
->vcpu
.arch
.regs
[VCPU_REGS_RBX
];
827 save
->rcx
= svm
->vcpu
.arch
.regs
[VCPU_REGS_RCX
];
828 save
->rdx
= svm
->vcpu
.arch
.regs
[VCPU_REGS_RDX
];
829 save
->rsp
= svm
->vcpu
.arch
.regs
[VCPU_REGS_RSP
];
830 save
->rbp
= svm
->vcpu
.arch
.regs
[VCPU_REGS_RBP
];
831 save
->rsi
= svm
->vcpu
.arch
.regs
[VCPU_REGS_RSI
];
832 save
->rdi
= svm
->vcpu
.arch
.regs
[VCPU_REGS_RDI
];
834 save
->r8
= svm
->vcpu
.arch
.regs
[VCPU_REGS_R8
];
835 save
->r9
= svm
->vcpu
.arch
.regs
[VCPU_REGS_R9
];
836 save
->r10
= svm
->vcpu
.arch
.regs
[VCPU_REGS_R10
];
837 save
->r11
= svm
->vcpu
.arch
.regs
[VCPU_REGS_R11
];
838 save
->r12
= svm
->vcpu
.arch
.regs
[VCPU_REGS_R12
];
839 save
->r13
= svm
->vcpu
.arch
.regs
[VCPU_REGS_R13
];
840 save
->r14
= svm
->vcpu
.arch
.regs
[VCPU_REGS_R14
];
841 save
->r15
= svm
->vcpu
.arch
.regs
[VCPU_REGS_R15
];
843 save
->rip
= svm
->vcpu
.arch
.regs
[VCPU_REGS_RIP
];
845 /* Sync some non-GPR registers before encrypting */
846 save
->xcr0
= svm
->vcpu
.arch
.xcr0
;
847 save
->pkru
= svm
->vcpu
.arch
.pkru
;
848 save
->xss
= svm
->vcpu
.arch
.ia32_xss
;
849 save
->dr6
= svm
->vcpu
.arch
.dr6
;
851 save
->sev_features
= sev
->vmsa_features
;
854 * Skip FPU and AVX setup with KVM_SEV_ES_INIT to avoid
855 * breaking older measurements.
857 if (vcpu
->kvm
->arch
.vm_type
!= KVM_X86_DEFAULT_VM
) {
858 xsave
= &vcpu
->arch
.guest_fpu
.fpstate
->regs
.xsave
;
859 save
->x87_dp
= xsave
->i387
.rdp
;
860 save
->mxcsr
= xsave
->i387
.mxcsr
;
861 save
->x87_ftw
= xsave
->i387
.twd
;
862 save
->x87_fsw
= xsave
->i387
.swd
;
863 save
->x87_fcw
= xsave
->i387
.cwd
;
864 save
->x87_fop
= xsave
->i387
.fop
;
867 save
->x87_rip
= xsave
->i387
.rip
;
		for (i = 0; i < 8; i++) {
			/*
			 * The format of the x87 save area is undocumented and
			 * definitely not what you would expect. It consists of
			 * an 8*8 bytes area with bytes 0-7, and an 8*2 bytes
			 * area with bytes 8-9 of each register.
			 */
			d = save->fpreg_x87 + i * 8;
			s = ((u8 *)xsave->i387.st_space) + i * 16;
			memcpy(d, s, 8);
			save->fpreg_x87[64 + i * 2] = s[8];
			save->fpreg_x87[64 + i * 2 + 1] = s[9];
		}
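		/*
		 * Layout example for the loop above (illustrative): x87 register
		 * st(1) lives at bytes 16..31 of xsave->i387.st_space (16-byte
		 * stride); its 8-byte mantissa is copied to save->fpreg_x87[8..15]
		 * and its 2-byte sign/exponent (bytes 8-9 of the register) to
		 * save->fpreg_x87[66..67].
		 */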
882 memcpy(save
->fpreg_xmm
, xsave
->i387
.xmm_space
, 256);
884 s
= get_xsave_addr(xsave
, XFEATURE_YMM
);
886 memcpy(save
->fpreg_ymm
, s
, 256);
888 memset(save
->fpreg_ymm
, 0, 256);
891 pr_debug("Virtual Machine Save Area (VMSA):\n");
892 print_hex_dump_debug("", DUMP_PREFIX_NONE
, 16, 1, save
, sizeof(*save
), false);
897 static int __sev_launch_update_vmsa(struct kvm
*kvm
, struct kvm_vcpu
*vcpu
,
900 struct sev_data_launch_update_vmsa vmsa
;
901 struct vcpu_svm
*svm
= to_svm(vcpu
);
904 if (vcpu
->guest_debug
) {
905 pr_warn_once("KVM_SET_GUEST_DEBUG for SEV-ES guest is not supported");
909 /* Perform some pre-encryption checks against the VMSA */
910 ret
= sev_es_sync_vmsa(svm
);
915 * The LAUNCH_UPDATE_VMSA command will perform in-place encryption of
916 * the VMSA memory content (i.e it will write the same memory region
917 * with the guest's key), so invalidate it first.
919 clflush_cache_range(svm
->sev_es
.vmsa
, PAGE_SIZE
);
922 vmsa
.handle
= to_kvm_sev_info(kvm
)->handle
;
923 vmsa
.address
= __sme_pa(svm
->sev_es
.vmsa
);
924 vmsa
.len
= PAGE_SIZE
;
925 ret
= sev_issue_cmd(kvm
, SEV_CMD_LAUNCH_UPDATE_VMSA
, &vmsa
, error
);
930 * SEV-ES guests maintain an encrypted version of their FPU
931 * state which is restored and saved on VMRUN and VMEXIT.
932 * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
933 * do xsave/xrstor on it.
935 fpstate_set_confidential(&vcpu
->arch
.guest_fpu
);
936 vcpu
->arch
.guest_state_protected
= true;
939 * SEV-ES guest mandates LBR Virtualization to be _always_ ON. Enable it
940 * only after setting guest_state_protected because KVM_SET_MSRS allows
941 * dynamic toggling of LBRV (for performance reason) on write access to
942 * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set.
944 svm_enable_lbrv(vcpu
);
948 static int sev_launch_update_vmsa(struct kvm
*kvm
, struct kvm_sev_cmd
*argp
)
950 struct kvm_vcpu
*vcpu
;
954 if (!sev_es_guest(kvm
))
957 kvm_for_each_vcpu(i
, vcpu
, kvm
) {
958 ret
= mutex_lock_killable(&vcpu
->mutex
);
962 ret
= __sev_launch_update_vmsa(kvm
, vcpu
, &argp
->error
);
964 mutex_unlock(&vcpu
->mutex
);
972 static int sev_launch_measure(struct kvm
*kvm
, struct kvm_sev_cmd
*argp
)
974 void __user
*measure
= u64_to_user_ptr(argp
->data
);
975 struct kvm_sev_info
*sev
= &to_kvm_svm(kvm
)->sev_info
;
976 struct sev_data_launch_measure data
;
977 struct kvm_sev_launch_measure params
;
978 void __user
*p
= NULL
;
985 if (copy_from_user(¶ms
, measure
, sizeof(params
)))
988 memset(&data
, 0, sizeof(data
));
990 /* User wants to query the blob length */
994 p
= u64_to_user_ptr(params
.uaddr
);
996 if (params
.len
> SEV_FW_BLOB_MAX_SIZE
)
999 blob
= kzalloc(params
.len
, GFP_KERNEL_ACCOUNT
);
1003 data
.address
= __psp_pa(blob
);
1004 data
.len
= params
.len
;
1008 data
.handle
= sev
->handle
;
1009 ret
= sev_issue_cmd(kvm
, SEV_CMD_LAUNCH_MEASURE
, &data
, &argp
->error
);
	 * If we only queried the measurement blob length, the FW responded with
	 * the expected data and we are done.
1021 if (copy_to_user(p
, blob
, params
.len
))
1026 params
.len
= data
.len
;
1027 if (copy_to_user(measure
, ¶ms
, sizeof(params
)))
1034 static int sev_launch_finish(struct kvm
*kvm
, struct kvm_sev_cmd
*argp
)
1036 struct kvm_sev_info
*sev
= &to_kvm_svm(kvm
)->sev_info
;
1037 struct sev_data_launch_finish data
;
1039 if (!sev_guest(kvm
))
1042 data
.handle
= sev
->handle
;
1043 return sev_issue_cmd(kvm
, SEV_CMD_LAUNCH_FINISH
, &data
, &argp
->error
);
1046 static int sev_guest_status(struct kvm
*kvm
, struct kvm_sev_cmd
*argp
)
1048 struct kvm_sev_info
*sev
= &to_kvm_svm(kvm
)->sev_info
;
1049 struct kvm_sev_guest_status params
;
1050 struct sev_data_guest_status data
;
1053 if (!sev_guest(kvm
))
1056 memset(&data
, 0, sizeof(data
));
1058 data
.handle
= sev
->handle
;
1059 ret
= sev_issue_cmd(kvm
, SEV_CMD_GUEST_STATUS
, &data
, &argp
->error
);
1063 params
.policy
= data
.policy
;
1064 params
.state
= data
.state
;
1065 params
.handle
= data
.handle
;
1067 if (copy_to_user(u64_to_user_ptr(argp
->data
), ¶ms
, sizeof(params
)))
1073 static int __sev_issue_dbg_cmd(struct kvm
*kvm
, unsigned long src
,
1074 unsigned long dst
, int size
,
1075 int *error
, bool enc
)
1077 struct kvm_sev_info
*sev
= &to_kvm_svm(kvm
)->sev_info
;
1078 struct sev_data_dbg data
;
1081 data
.handle
= sev
->handle
;
1082 data
.dst_addr
= dst
;
1083 data
.src_addr
= src
;
1086 return sev_issue_cmd(kvm
,
1087 enc
? SEV_CMD_DBG_ENCRYPT
: SEV_CMD_DBG_DECRYPT
,
static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
			     unsigned long dst_paddr, int sz, int *err)
{
	int offset;

	/*
	 * It's safe to read more than was asked for; the caller should ensure
	 * that the destination has enough space.
	 */
	offset = src_paddr & 15;
	src_paddr = round_down(src_paddr, 16);
	sz = round_up(sz + offset, 16);

	return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false);
}
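/*
 * Worked example (hypothetical values) for the rounding above: decrypting
 * sz = 100 bytes at src_paddr = 0x1234 yields offset = 4, src_paddr rounded
 * down to 0x1230 and sz rounded up to 112, so the DBG_DECRYPT command always
 * operates on 16-byte aligned addresses and lengths.
 */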
1107 static int __sev_dbg_decrypt_user(struct kvm
*kvm
, unsigned long paddr
,
1108 void __user
*dst_uaddr
,
1109 unsigned long dst_paddr
,
1112 struct page
*tpage
= NULL
;
1115 /* if inputs are not 16-byte then use intermediate buffer */
1116 if (!IS_ALIGNED(dst_paddr
, 16) ||
1117 !IS_ALIGNED(paddr
, 16) ||
1118 !IS_ALIGNED(size
, 16)) {
1119 tpage
= (void *)alloc_page(GFP_KERNEL_ACCOUNT
| __GFP_ZERO
);
1123 dst_paddr
= __sme_page_pa(tpage
);
1126 ret
= __sev_dbg_decrypt(kvm
, paddr
, dst_paddr
, size
, err
);
1131 offset
= paddr
& 15;
1132 if (copy_to_user(dst_uaddr
, page_address(tpage
) + offset
, size
))
1143 static int __sev_dbg_encrypt_user(struct kvm
*kvm
, unsigned long paddr
,
1145 unsigned long dst_paddr
,
1146 void __user
*dst_vaddr
,
1147 int size
, int *error
)
1149 struct page
*src_tpage
= NULL
;
1150 struct page
*dst_tpage
= NULL
;
1151 int ret
, len
= size
;
1153 /* If source buffer is not aligned then use an intermediate buffer */
1154 if (!IS_ALIGNED((unsigned long)vaddr
, 16)) {
1155 src_tpage
= alloc_page(GFP_KERNEL_ACCOUNT
);
1159 if (copy_from_user(page_address(src_tpage
), vaddr
, size
)) {
1160 __free_page(src_tpage
);
1164 paddr
= __sme_page_pa(src_tpage
);
1168 * If destination buffer or length is not aligned then do read-modify-write:
1169 * - decrypt destination in an intermediate buffer
1170 * - copy the source buffer in an intermediate buffer
1171 * - use the intermediate buffer as source buffer
1173 if (!IS_ALIGNED((unsigned long)dst_vaddr
, 16) || !IS_ALIGNED(size
, 16)) {
1176 dst_tpage
= alloc_page(GFP_KERNEL_ACCOUNT
);
1182 ret
= __sev_dbg_decrypt(kvm
, dst_paddr
,
1183 __sme_page_pa(dst_tpage
), size
, error
);
		/*
		 * If the source is a kernel buffer then use memcpy(),
		 * otherwise copy_from_user().
		 */
		dst_offset = dst_paddr & 15;
1194 memcpy(page_address(dst_tpage
) + dst_offset
,
1195 page_address(src_tpage
), size
);
1197 if (copy_from_user(page_address(dst_tpage
) + dst_offset
,
1204 paddr
= __sme_page_pa(dst_tpage
);
1205 dst_paddr
= round_down(dst_paddr
, 16);
1206 len
= round_up(size
, 16);
1209 ret
= __sev_issue_dbg_cmd(kvm
, paddr
, dst_paddr
, len
, error
, true);
1213 __free_page(src_tpage
);
1215 __free_page(dst_tpage
);
1219 static int sev_dbg_crypt(struct kvm
*kvm
, struct kvm_sev_cmd
*argp
, bool dec
)
1221 unsigned long vaddr
, vaddr_end
, next_vaddr
;
1222 unsigned long dst_vaddr
;
1223 struct page
**src_p
, **dst_p
;
1224 struct kvm_sev_dbg debug
;
1229 if (!sev_guest(kvm
))
1232 if (copy_from_user(&debug
, u64_to_user_ptr(argp
->data
), sizeof(debug
)))
1235 if (!debug
.len
|| debug
.src_uaddr
+ debug
.len
< debug
.src_uaddr
)
1237 if (!debug
.dst_uaddr
)
1240 vaddr
= debug
.src_uaddr
;
1242 vaddr_end
= vaddr
+ size
;
1243 dst_vaddr
= debug
.dst_uaddr
;
1245 for (; vaddr
< vaddr_end
; vaddr
= next_vaddr
) {
1246 int len
, s_off
, d_off
;
1248 /* lock userspace source and destination page */
1249 src_p
= sev_pin_memory(kvm
, vaddr
& PAGE_MASK
, PAGE_SIZE
, &n
, 0);
1251 return PTR_ERR(src_p
);
1253 dst_p
= sev_pin_memory(kvm
, dst_vaddr
& PAGE_MASK
, PAGE_SIZE
, &n
, 1);
1254 if (IS_ERR(dst_p
)) {
1255 sev_unpin_memory(kvm
, src_p
, n
);
1256 return PTR_ERR(dst_p
);
1260 * Flush (on non-coherent CPUs) before DBG_{DE,EN}CRYPT read or modify
1261 * the pages; flush the destination too so that future accesses do not
1264 sev_clflush_pages(src_p
, 1);
1265 sev_clflush_pages(dst_p
, 1);
1268 * Since user buffer may not be page aligned, calculate the
1269 * offset within the page.
1271 s_off
= vaddr
& ~PAGE_MASK
;
1272 d_off
= dst_vaddr
& ~PAGE_MASK
;
1273 len
= min_t(size_t, (PAGE_SIZE
- s_off
), size
);
1276 ret
= __sev_dbg_decrypt_user(kvm
,
1277 __sme_page_pa(src_p
[0]) + s_off
,
1278 (void __user
*)dst_vaddr
,
1279 __sme_page_pa(dst_p
[0]) + d_off
,
1282 ret
= __sev_dbg_encrypt_user(kvm
,
1283 __sme_page_pa(src_p
[0]) + s_off
,
1284 (void __user
*)vaddr
,
1285 __sme_page_pa(dst_p
[0]) + d_off
,
1286 (void __user
*)dst_vaddr
,
1289 sev_unpin_memory(kvm
, src_p
, n
);
1290 sev_unpin_memory(kvm
, dst_p
, n
);
1295 next_vaddr
= vaddr
+ len
;
1296 dst_vaddr
= dst_vaddr
+ len
;
1303 static int sev_launch_secret(struct kvm
*kvm
, struct kvm_sev_cmd
*argp
)
1305 struct kvm_sev_info
*sev
= &to_kvm_svm(kvm
)->sev_info
;
1306 struct sev_data_launch_secret data
;
1307 struct kvm_sev_launch_secret params
;
1308 struct page
**pages
;
1313 if (!sev_guest(kvm
))
1316 if (copy_from_user(¶ms
, u64_to_user_ptr(argp
->data
), sizeof(params
)))
1319 pages
= sev_pin_memory(kvm
, params
.guest_uaddr
, params
.guest_len
, &n
, 1);
1321 return PTR_ERR(pages
);
1324 * Flush (on non-coherent CPUs) before LAUNCH_SECRET encrypts pages in
1325 * place; the cache may contain the data that was written unencrypted.
1327 sev_clflush_pages(pages
, n
);
	 * The secret must be copied into a contiguous memory region, so verify
	 * that the userspace memory pages are contiguous before issuing the command.
1333 if (get_num_contig_pages(0, pages
, n
) != n
) {
1335 goto e_unpin_memory
;
1338 memset(&data
, 0, sizeof(data
));
1340 offset
= params
.guest_uaddr
& (PAGE_SIZE
- 1);
1341 data
.guest_address
= __sme_page_pa(pages
[0]) + offset
;
1342 data
.guest_len
= params
.guest_len
;
1344 blob
= psp_copy_user_blob(params
.trans_uaddr
, params
.trans_len
);
1346 ret
= PTR_ERR(blob
);
1347 goto e_unpin_memory
;
1350 data
.trans_address
= __psp_pa(blob
);
1351 data
.trans_len
= params
.trans_len
;
1353 hdr
= psp_copy_user_blob(params
.hdr_uaddr
, params
.hdr_len
);
1358 data
.hdr_address
= __psp_pa(hdr
);
1359 data
.hdr_len
= params
.hdr_len
;
1361 data
.handle
= sev
->handle
;
1362 ret
= sev_issue_cmd(kvm
, SEV_CMD_LAUNCH_UPDATE_SECRET
, &data
, &argp
->error
);
1369 /* content of memory is updated, mark pages dirty */
1370 for (i
= 0; i
< n
; i
++) {
1371 set_page_dirty_lock(pages
[i
]);
1372 mark_page_accessed(pages
[i
]);
1374 sev_unpin_memory(kvm
, pages
, n
);
1378 static int sev_get_attestation_report(struct kvm
*kvm
, struct kvm_sev_cmd
*argp
)
1380 void __user
*report
= u64_to_user_ptr(argp
->data
);
1381 struct kvm_sev_info
*sev
= &to_kvm_svm(kvm
)->sev_info
;
1382 struct sev_data_attestation_report data
;
1383 struct kvm_sev_attestation_report params
;
1388 if (!sev_guest(kvm
))
1391 if (copy_from_user(¶ms
, u64_to_user_ptr(argp
->data
), sizeof(params
)))
1394 memset(&data
, 0, sizeof(data
));
1396 /* User wants to query the blob length */
1400 p
= u64_to_user_ptr(params
.uaddr
);
1402 if (params
.len
> SEV_FW_BLOB_MAX_SIZE
)
1405 blob
= kzalloc(params
.len
, GFP_KERNEL_ACCOUNT
);
1409 data
.address
= __psp_pa(blob
);
1410 data
.len
= params
.len
;
1411 memcpy(data
.mnonce
, params
.mnonce
, sizeof(params
.mnonce
));
1414 data
.handle
= sev
->handle
;
1415 ret
= sev_issue_cmd(kvm
, SEV_CMD_ATTESTATION_REPORT
, &data
, &argp
->error
);
	 * If we only queried the report length, the FW responded with the
	 * expected data and we are done.
1426 if (copy_to_user(p
, blob
, params
.len
))
1431 params
.len
= data
.len
;
1432 if (copy_to_user(report
, ¶ms
, sizeof(params
)))
1439 /* Userspace wants to query session length. */
1441 __sev_send_start_query_session_length(struct kvm
*kvm
, struct kvm_sev_cmd
*argp
,
1442 struct kvm_sev_send_start
*params
)
1444 struct kvm_sev_info
*sev
= &to_kvm_svm(kvm
)->sev_info
;
1445 struct sev_data_send_start data
;
1448 memset(&data
, 0, sizeof(data
));
1449 data
.handle
= sev
->handle
;
1450 ret
= sev_issue_cmd(kvm
, SEV_CMD_SEND_START
, &data
, &argp
->error
);
1452 params
->session_len
= data
.session_len
;
1453 if (copy_to_user(u64_to_user_ptr(argp
->data
), params
,
1454 sizeof(struct kvm_sev_send_start
)))
1460 static int sev_send_start(struct kvm
*kvm
, struct kvm_sev_cmd
*argp
)
1462 struct kvm_sev_info
*sev
= &to_kvm_svm(kvm
)->sev_info
;
1463 struct sev_data_send_start data
;
1464 struct kvm_sev_send_start params
;
1465 void *amd_certs
, *session_data
;
1466 void *pdh_cert
, *plat_certs
;
1469 if (!sev_guest(kvm
))
1472 if (copy_from_user(¶ms
, u64_to_user_ptr(argp
->data
),
1473 sizeof(struct kvm_sev_send_start
)))
1476 /* if session_len is zero, userspace wants to query the session length */
1477 if (!params
.session_len
)
1478 return __sev_send_start_query_session_length(kvm
, argp
,
1481 /* some sanity checks */
1482 if (!params
.pdh_cert_uaddr
|| !params
.pdh_cert_len
||
1483 !params
.session_uaddr
|| params
.session_len
> SEV_FW_BLOB_MAX_SIZE
)
1486 /* allocate the memory to hold the session data blob */
1487 session_data
= kzalloc(params
.session_len
, GFP_KERNEL_ACCOUNT
);
1491 /* copy the certificate blobs from userspace */
1492 pdh_cert
= psp_copy_user_blob(params
.pdh_cert_uaddr
,
1493 params
.pdh_cert_len
);
1494 if (IS_ERR(pdh_cert
)) {
1495 ret
= PTR_ERR(pdh_cert
);
1496 goto e_free_session
;
1499 plat_certs
= psp_copy_user_blob(params
.plat_certs_uaddr
,
1500 params
.plat_certs_len
);
1501 if (IS_ERR(plat_certs
)) {
1502 ret
= PTR_ERR(plat_certs
);
1506 amd_certs
= psp_copy_user_blob(params
.amd_certs_uaddr
,
1507 params
.amd_certs_len
);
1508 if (IS_ERR(amd_certs
)) {
1509 ret
= PTR_ERR(amd_certs
);
1510 goto e_free_plat_cert
;
1513 /* populate the FW SEND_START field with system physical address */
1514 memset(&data
, 0, sizeof(data
));
1515 data
.pdh_cert_address
= __psp_pa(pdh_cert
);
1516 data
.pdh_cert_len
= params
.pdh_cert_len
;
1517 data
.plat_certs_address
= __psp_pa(plat_certs
);
1518 data
.plat_certs_len
= params
.plat_certs_len
;
1519 data
.amd_certs_address
= __psp_pa(amd_certs
);
1520 data
.amd_certs_len
= params
.amd_certs_len
;
1521 data
.session_address
= __psp_pa(session_data
);
1522 data
.session_len
= params
.session_len
;
1523 data
.handle
= sev
->handle
;
1525 ret
= sev_issue_cmd(kvm
, SEV_CMD_SEND_START
, &data
, &argp
->error
);
1527 if (!ret
&& copy_to_user(u64_to_user_ptr(params
.session_uaddr
),
1528 session_data
, params
.session_len
)) {
1530 goto e_free_amd_cert
;
1533 params
.policy
= data
.policy
;
1534 params
.session_len
= data
.session_len
;
1535 if (copy_to_user(u64_to_user_ptr(argp
->data
), ¶ms
,
1536 sizeof(struct kvm_sev_send_start
)))
1546 kfree(session_data
);
1550 /* Userspace wants to query either header or trans length. */
1552 __sev_send_update_data_query_lengths(struct kvm
*kvm
, struct kvm_sev_cmd
*argp
,
1553 struct kvm_sev_send_update_data
*params
)
1555 struct kvm_sev_info
*sev
= &to_kvm_svm(kvm
)->sev_info
;
1556 struct sev_data_send_update_data data
;
1559 memset(&data
, 0, sizeof(data
));
1560 data
.handle
= sev
->handle
;
1561 ret
= sev_issue_cmd(kvm
, SEV_CMD_SEND_UPDATE_DATA
, &data
, &argp
->error
);
1563 params
->hdr_len
= data
.hdr_len
;
1564 params
->trans_len
= data
.trans_len
;
1566 if (copy_to_user(u64_to_user_ptr(argp
->data
), params
,
1567 sizeof(struct kvm_sev_send_update_data
)))
1573 static int sev_send_update_data(struct kvm
*kvm
, struct kvm_sev_cmd
*argp
)
1575 struct kvm_sev_info
*sev
= &to_kvm_svm(kvm
)->sev_info
;
1576 struct sev_data_send_update_data data
;
1577 struct kvm_sev_send_update_data params
;
1578 void *hdr
, *trans_data
;
1579 struct page
**guest_page
;
1583 if (!sev_guest(kvm
))
1586 if (copy_from_user(¶ms
, u64_to_user_ptr(argp
->data
),
1587 sizeof(struct kvm_sev_send_update_data
)))
1590 /* userspace wants to query either header or trans length */
1591 if (!params
.trans_len
|| !params
.hdr_len
)
1592 return __sev_send_update_data_query_lengths(kvm
, argp
, ¶ms
);
1594 if (!params
.trans_uaddr
|| !params
.guest_uaddr
||
1595 !params
.guest_len
|| !params
.hdr_uaddr
)
1598 /* Check if we are crossing the page boundary */
1599 offset
= params
.guest_uaddr
& (PAGE_SIZE
- 1);
1600 if (params
.guest_len
> PAGE_SIZE
|| (params
.guest_len
+ offset
) > PAGE_SIZE
)
1603 /* Pin guest memory */
1604 guest_page
= sev_pin_memory(kvm
, params
.guest_uaddr
& PAGE_MASK
,
1606 if (IS_ERR(guest_page
))
1607 return PTR_ERR(guest_page
);
1609 /* allocate memory for header and transport buffer */
1611 hdr
= kzalloc(params
.hdr_len
, GFP_KERNEL_ACCOUNT
);
1615 trans_data
= kzalloc(params
.trans_len
, GFP_KERNEL_ACCOUNT
);
1619 memset(&data
, 0, sizeof(data
));
1620 data
.hdr_address
= __psp_pa(hdr
);
1621 data
.hdr_len
= params
.hdr_len
;
1622 data
.trans_address
= __psp_pa(trans_data
);
1623 data
.trans_len
= params
.trans_len
;
	/* The SEND_UPDATE_DATA command requires C-bit to be always set. */
	data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
	data.guest_address |= sev_me_mask;
	data.guest_len = params.guest_len;
	data.handle = sev->handle;
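	/*
	 * Illustrative example (hypothetical values): with sev_me_mask equal to
	 * bit 47 (the C-bit on many parts) and a guest page at PFN 0x12345 with
	 * offset 0x100, guest_address becomes 0x12345100 | (1ULL << 47), telling
	 * the PSP to access the page through the guest's encrypted mapping.
	 */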
1631 ret
= sev_issue_cmd(kvm
, SEV_CMD_SEND_UPDATE_DATA
, &data
, &argp
->error
);
1634 goto e_free_trans_data
;
1636 /* copy transport buffer to user space */
1637 if (copy_to_user(u64_to_user_ptr(params
.trans_uaddr
),
1638 trans_data
, params
.trans_len
)) {
1640 goto e_free_trans_data
;
1643 /* Copy packet header to userspace. */
1644 if (copy_to_user(u64_to_user_ptr(params
.hdr_uaddr
), hdr
,
1653 sev_unpin_memory(kvm
, guest_page
, n
);
1658 static int sev_send_finish(struct kvm
*kvm
, struct kvm_sev_cmd
*argp
)
1660 struct kvm_sev_info
*sev
= &to_kvm_svm(kvm
)->sev_info
;
1661 struct sev_data_send_finish data
;
1663 if (!sev_guest(kvm
))
1666 data
.handle
= sev
->handle
;
1667 return sev_issue_cmd(kvm
, SEV_CMD_SEND_FINISH
, &data
, &argp
->error
);
1670 static int sev_send_cancel(struct kvm
*kvm
, struct kvm_sev_cmd
*argp
)
1672 struct kvm_sev_info
*sev
= &to_kvm_svm(kvm
)->sev_info
;
1673 struct sev_data_send_cancel data
;
1675 if (!sev_guest(kvm
))
1678 data
.handle
= sev
->handle
;
1679 return sev_issue_cmd(kvm
, SEV_CMD_SEND_CANCEL
, &data
, &argp
->error
);
1682 static int sev_receive_start(struct kvm
*kvm
, struct kvm_sev_cmd
*argp
)
1684 struct kvm_sev_info
*sev
= &to_kvm_svm(kvm
)->sev_info
;
1685 struct sev_data_receive_start start
;
1686 struct kvm_sev_receive_start params
;
1687 int *error
= &argp
->error
;
1692 if (!sev_guest(kvm
))
1695 /* Get parameter from the userspace */
1696 if (copy_from_user(¶ms
, u64_to_user_ptr(argp
->data
),
1697 sizeof(struct kvm_sev_receive_start
)))
1700 /* some sanity checks */
1701 if (!params
.pdh_uaddr
|| !params
.pdh_len
||
1702 !params
.session_uaddr
|| !params
.session_len
)
1705 pdh_data
= psp_copy_user_blob(params
.pdh_uaddr
, params
.pdh_len
);
1706 if (IS_ERR(pdh_data
))
1707 return PTR_ERR(pdh_data
);
1709 session_data
= psp_copy_user_blob(params
.session_uaddr
,
1710 params
.session_len
);
1711 if (IS_ERR(session_data
)) {
1712 ret
= PTR_ERR(session_data
);
1716 memset(&start
, 0, sizeof(start
));
1717 start
.handle
= params
.handle
;
1718 start
.policy
= params
.policy
;
1719 start
.pdh_cert_address
= __psp_pa(pdh_data
);
1720 start
.pdh_cert_len
= params
.pdh_len
;
1721 start
.session_address
= __psp_pa(session_data
);
1722 start
.session_len
= params
.session_len
;
1724 /* create memory encryption context */
1725 ret
= __sev_issue_cmd(argp
->sev_fd
, SEV_CMD_RECEIVE_START
, &start
,
1728 goto e_free_session
;
1730 /* Bind ASID to this guest */
1731 ret
= sev_bind_asid(kvm
, start
.handle
, error
);
1733 sev_decommission(start
.handle
);
1734 goto e_free_session
;
1737 params
.handle
= start
.handle
;
1738 if (copy_to_user(u64_to_user_ptr(argp
->data
),
1739 ¶ms
, sizeof(struct kvm_sev_receive_start
))) {
1741 sev_unbind_asid(kvm
, start
.handle
);
1742 goto e_free_session
;
1745 sev
->handle
= start
.handle
;
1746 sev
->fd
= argp
->sev_fd
;
1749 kfree(session_data
);
1756 static int sev_receive_update_data(struct kvm
*kvm
, struct kvm_sev_cmd
*argp
)
1758 struct kvm_sev_info
*sev
= &to_kvm_svm(kvm
)->sev_info
;
1759 struct kvm_sev_receive_update_data params
;
1760 struct sev_data_receive_update_data data
;
1761 void *hdr
= NULL
, *trans
= NULL
;
1762 struct page
**guest_page
;
1766 if (!sev_guest(kvm
))
1769 if (copy_from_user(¶ms
, u64_to_user_ptr(argp
->data
),
1770 sizeof(struct kvm_sev_receive_update_data
)))
1773 if (!params
.hdr_uaddr
|| !params
.hdr_len
||
1774 !params
.guest_uaddr
|| !params
.guest_len
||
1775 !params
.trans_uaddr
|| !params
.trans_len
)
1778 /* Check if we are crossing the page boundary */
1779 offset
= params
.guest_uaddr
& (PAGE_SIZE
- 1);
1780 if (params
.guest_len
> PAGE_SIZE
|| (params
.guest_len
+ offset
) > PAGE_SIZE
)
1783 hdr
= psp_copy_user_blob(params
.hdr_uaddr
, params
.hdr_len
);
1785 return PTR_ERR(hdr
);
1787 trans
= psp_copy_user_blob(params
.trans_uaddr
, params
.trans_len
);
1788 if (IS_ERR(trans
)) {
1789 ret
= PTR_ERR(trans
);
1793 memset(&data
, 0, sizeof(data
));
1794 data
.hdr_address
= __psp_pa(hdr
);
1795 data
.hdr_len
= params
.hdr_len
;
1796 data
.trans_address
= __psp_pa(trans
);
1797 data
.trans_len
= params
.trans_len
;
1799 /* Pin guest memory */
1800 guest_page
= sev_pin_memory(kvm
, params
.guest_uaddr
& PAGE_MASK
,
1802 if (IS_ERR(guest_page
)) {
1803 ret
= PTR_ERR(guest_page
);
1808 * Flush (on non-coherent CPUs) before RECEIVE_UPDATE_DATA, the PSP
1809 * encrypts the written data with the guest's key, and the cache may
1810 * contain dirty, unencrypted data.
1812 sev_clflush_pages(guest_page
, n
);
1814 /* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */
1815 data
.guest_address
= (page_to_pfn(guest_page
[0]) << PAGE_SHIFT
) + offset
;
1816 data
.guest_address
|= sev_me_mask
;
1817 data
.guest_len
= params
.guest_len
;
1818 data
.handle
= sev
->handle
;
1820 ret
= sev_issue_cmd(kvm
, SEV_CMD_RECEIVE_UPDATE_DATA
, &data
,
1823 sev_unpin_memory(kvm
, guest_page
, n
);
1833 static int sev_receive_finish(struct kvm
*kvm
, struct kvm_sev_cmd
*argp
)
1835 struct kvm_sev_info
*sev
= &to_kvm_svm(kvm
)->sev_info
;
1836 struct sev_data_receive_finish data
;
1838 if (!sev_guest(kvm
))
1841 data
.handle
= sev
->handle
;
1842 return sev_issue_cmd(kvm
, SEV_CMD_RECEIVE_FINISH
, &data
, &argp
->error
);
1845 static bool is_cmd_allowed_from_mirror(u32 cmd_id
)
	 * Allow mirror VMs to call KVM_SEV_LAUNCH_UPDATE_VMSA to enable SEV-ES
	 * on active mirror VMs. Also allow the debugging and status commands.
1851 if (cmd_id
== KVM_SEV_LAUNCH_UPDATE_VMSA
||
1852 cmd_id
== KVM_SEV_GUEST_STATUS
|| cmd_id
== KVM_SEV_DBG_DECRYPT
||
1853 cmd_id
== KVM_SEV_DBG_ENCRYPT
)
1859 static int sev_lock_two_vms(struct kvm
*dst_kvm
, struct kvm
*src_kvm
)
1861 struct kvm_sev_info
*dst_sev
= &to_kvm_svm(dst_kvm
)->sev_info
;
1862 struct kvm_sev_info
*src_sev
= &to_kvm_svm(src_kvm
)->sev_info
;
1865 if (dst_kvm
== src_kvm
)
1869 * Bail if these VMs are already involved in a migration to avoid
1870 * deadlock between two VMs trying to migrate to/from each other.
1872 if (atomic_cmpxchg_acquire(&dst_sev
->migration_in_progress
, 0, 1))
1875 if (atomic_cmpxchg_acquire(&src_sev
->migration_in_progress
, 0, 1))
1879 if (mutex_lock_killable(&dst_kvm
->lock
))
1881 if (mutex_lock_killable_nested(&src_kvm
->lock
, SINGLE_DEPTH_NESTING
))
1886 mutex_unlock(&dst_kvm
->lock
);
1888 atomic_set_release(&src_sev
->migration_in_progress
, 0);
1890 atomic_set_release(&dst_sev
->migration_in_progress
, 0);
1894 static void sev_unlock_two_vms(struct kvm
*dst_kvm
, struct kvm
*src_kvm
)
1896 struct kvm_sev_info
*dst_sev
= &to_kvm_svm(dst_kvm
)->sev_info
;
1897 struct kvm_sev_info
*src_sev
= &to_kvm_svm(src_kvm
)->sev_info
;
1899 mutex_unlock(&dst_kvm
->lock
);
1900 mutex_unlock(&src_kvm
->lock
);
1901 atomic_set_release(&dst_sev
->migration_in_progress
, 0);
1902 atomic_set_release(&src_sev
->migration_in_progress
, 0);
1905 /* vCPU mutex subclasses. */
1906 enum sev_migration_role
{
1907 SEV_MIGRATION_SOURCE
= 0,
1908 SEV_MIGRATION_TARGET
,
1909 SEV_NR_MIGRATION_ROLES
,
1912 static int sev_lock_vcpus_for_migration(struct kvm
*kvm
,
1913 enum sev_migration_role role
)
1915 struct kvm_vcpu
*vcpu
;
1918 kvm_for_each_vcpu(i
, vcpu
, kvm
) {
1919 if (mutex_lock_killable_nested(&vcpu
->mutex
, role
))
1922 #ifdef CONFIG_PROVE_LOCKING
1925 * Reset the role to one that avoids colliding with
1926 * the role used for the first vcpu mutex.
1928 role
= SEV_NR_MIGRATION_ROLES
;
1930 mutex_release(&vcpu
->mutex
.dep_map
, _THIS_IP_
);
1938 kvm_for_each_vcpu(j
, vcpu
, kvm
) {
1942 #ifdef CONFIG_PROVE_LOCKING
1944 mutex_acquire(&vcpu
->mutex
.dep_map
, role
, 0, _THIS_IP_
);
1947 mutex_unlock(&vcpu
->mutex
);
1952 static void sev_unlock_vcpus_for_migration(struct kvm
*kvm
)
1954 struct kvm_vcpu
*vcpu
;
1958 kvm_for_each_vcpu(i
, vcpu
, kvm
) {
1962 mutex_acquire(&vcpu
->mutex
.dep_map
,
1963 SEV_NR_MIGRATION_ROLES
, 0, _THIS_IP_
);
1965 mutex_unlock(&vcpu
->mutex
);
1969 static void sev_migrate_from(struct kvm
*dst_kvm
, struct kvm
*src_kvm
)
1971 struct kvm_sev_info
*dst
= &to_kvm_svm(dst_kvm
)->sev_info
;
1972 struct kvm_sev_info
*src
= &to_kvm_svm(src_kvm
)->sev_info
;
1973 struct kvm_vcpu
*dst_vcpu
, *src_vcpu
;
1974 struct vcpu_svm
*dst_svm
, *src_svm
;
1975 struct kvm_sev_info
*mirror
;
1979 dst
->asid
= src
->asid
;
1980 dst
->handle
= src
->handle
;
1981 dst
->pages_locked
= src
->pages_locked
;
1982 dst
->enc_context_owner
= src
->enc_context_owner
;
1983 dst
->es_active
= src
->es_active
;
1984 dst
->vmsa_features
= src
->vmsa_features
;
1987 src
->active
= false;
1989 src
->pages_locked
= 0;
1990 src
->enc_context_owner
= NULL
;
1991 src
->es_active
= false;
1993 list_cut_before(&dst
->regions_list
, &src
->regions_list
, &src
->regions_list
);
1996 * If this VM has mirrors, "transfer" each mirror's refcount of the
1997 * source to the destination (this KVM). The caller holds a reference
1998 * to the source, so there's no danger of use-after-free.
2000 list_cut_before(&dst
->mirror_vms
, &src
->mirror_vms
, &src
->mirror_vms
);
2001 list_for_each_entry(mirror
, &dst
->mirror_vms
, mirror_entry
) {
2002 kvm_get_kvm(dst_kvm
);
2003 kvm_put_kvm(src_kvm
);
2004 mirror
->enc_context_owner
= dst_kvm
;
2008 * If this VM is a mirror, remove the old mirror from the owners list
2009 * and add the new mirror to the list.
2011 if (is_mirroring_enc_context(dst_kvm
)) {
2012 struct kvm_sev_info
*owner_sev_info
=
2013 &to_kvm_svm(dst
->enc_context_owner
)->sev_info
;
2015 list_del(&src
->mirror_entry
);
2016 list_add_tail(&dst
->mirror_entry
, &owner_sev_info
->mirror_vms
);
2019 kvm_for_each_vcpu(i
, dst_vcpu
, dst_kvm
) {
2020 dst_svm
= to_svm(dst_vcpu
);
2022 sev_init_vmcb(dst_svm
);
2024 if (!dst
->es_active
)
2028 * Note, the source is not required to have the same number of
2029 * vCPUs as the destination when migrating a vanilla SEV VM.
2031 src_vcpu
= kvm_get_vcpu(src_kvm
, i
);
2032 src_svm
= to_svm(src_vcpu
);
2035 * Transfer VMSA and GHCB state to the destination. Nullify and
2036 * clear source fields as appropriate, the state now belongs to
2039 memcpy(&dst_svm
->sev_es
, &src_svm
->sev_es
, sizeof(src_svm
->sev_es
));
2040 dst_svm
->vmcb
->control
.ghcb_gpa
= src_svm
->vmcb
->control
.ghcb_gpa
;
2041 dst_svm
->vmcb
->control
.vmsa_pa
= src_svm
->vmcb
->control
.vmsa_pa
;
2042 dst_vcpu
->arch
.guest_state_protected
= true;
2044 memset(&src_svm
->sev_es
, 0, sizeof(src_svm
->sev_es
));
2045 src_svm
->vmcb
->control
.ghcb_gpa
= INVALID_PAGE
;
2046 src_svm
->vmcb
->control
.vmsa_pa
= INVALID_PAGE
;
2047 src_vcpu
->arch
.guest_state_protected
= false;
2051 static int sev_check_source_vcpus(struct kvm
*dst
, struct kvm
*src
)
2053 struct kvm_vcpu
*src_vcpu
;
2056 if (!sev_es_guest(src
))
2059 if (atomic_read(&src
->online_vcpus
) != atomic_read(&dst
->online_vcpus
))
2062 kvm_for_each_vcpu(i
, src_vcpu
, src
) {
2063 if (!src_vcpu
->arch
.guest_state_protected
)
2070 int sev_vm_move_enc_context_from(struct kvm
*kvm
, unsigned int source_fd
)
2072 struct kvm_sev_info
*dst_sev
= &to_kvm_svm(kvm
)->sev_info
;
2073 struct kvm_sev_info
*src_sev
, *cg_cleanup_sev
;
2074 CLASS(fd
, f
)(source_fd
);
2075 struct kvm
*source_kvm
;
2076 bool charged
= false;
2082 if (!file_is_kvm(fd_file(f
)))
2085 source_kvm
= fd_file(f
)->private_data
;
2086 ret
= sev_lock_two_vms(kvm
, source_kvm
);
2090 if (kvm
->arch
.vm_type
!= source_kvm
->arch
.vm_type
||
2091 sev_guest(kvm
) || !sev_guest(source_kvm
)) {
2096 src_sev
= &to_kvm_svm(source_kvm
)->sev_info
;
2098 dst_sev
->misc_cg
= get_current_misc_cg();
2099 cg_cleanup_sev
= dst_sev
;
2100 if (dst_sev
->misc_cg
!= src_sev
->misc_cg
) {
2101 ret
= sev_misc_cg_try_charge(dst_sev
);
2103 goto out_dst_cgroup
;
2107 ret
= sev_lock_vcpus_for_migration(kvm
, SEV_MIGRATION_SOURCE
);
2109 goto out_dst_cgroup
;
2110 ret
= sev_lock_vcpus_for_migration(source_kvm
, SEV_MIGRATION_TARGET
);
2114 ret
= sev_check_source_vcpus(kvm
, source_kvm
);
2116 goto out_source_vcpu
;
2118 sev_migrate_from(kvm
, source_kvm
);
2119 kvm_vm_dead(source_kvm
);
2120 cg_cleanup_sev
= src_sev
;
2124 sev_unlock_vcpus_for_migration(source_kvm
);
2126 sev_unlock_vcpus_for_migration(kvm
);
2128 /* Operates on the source on success, on the destination on failure. */
2130 sev_misc_cg_uncharge(cg_cleanup_sev
);
2131 put_misc_cg(cg_cleanup_sev
->misc_cg
);
2132 cg_cleanup_sev
->misc_cg
= NULL
;
2134 sev_unlock_two_vms(kvm
, source_kvm
);
2138 int sev_dev_get_attr(u32 group
, u64 attr
, u64
*val
)
2140 if (group
!= KVM_X86_GRP_SEV
)
2144 case KVM_X86_SEV_VMSA_FEATURES
:
2145 *val
= sev_supported_vmsa_features
;
 * The guest context contains all the information, keys and metadata
 * associated with the guest that the firmware tracks to implement SEV
 * and SNP features. The firmware stores the guest context in a
 * hypervisor-provided page via the SNP_GCTX_CREATE command.
2159 static void *snp_context_create(struct kvm
*kvm
, struct kvm_sev_cmd
*argp
)
2161 struct sev_data_snp_addr data
= {};
2165 /* Allocate memory for context page */
2166 context
= snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT
);
2170 data
.address
= __psp_pa(context
);
2171 rc
= __sev_issue_cmd(argp
->sev_fd
, SEV_CMD_SNP_GCTX_CREATE
, &data
, &argp
->error
);
2173 pr_warn("Failed to create SEV-SNP context, rc %d fw_error %d",
2175 snp_free_firmware_page(context
);
2182 static int snp_bind_asid(struct kvm
*kvm
, int *error
)
2184 struct kvm_sev_info
*sev
= &to_kvm_svm(kvm
)->sev_info
;
2185 struct sev_data_snp_activate data
= {0};
2187 data
.gctx_paddr
= __psp_pa(sev
->snp_context
);
2188 data
.asid
= sev_get_asid(kvm
);
2189 return sev_issue_cmd(kvm
, SEV_CMD_SNP_ACTIVATE
, &data
, error
);
2192 static int snp_launch_start(struct kvm
*kvm
, struct kvm_sev_cmd
*argp
)
2194 struct kvm_sev_info
*sev
= &to_kvm_svm(kvm
)->sev_info
;
2195 struct sev_data_snp_launch_start start
= {0};
2196 struct kvm_sev_snp_launch_start params
;
2199 if (!sev_snp_guest(kvm
))
2202 if (copy_from_user(¶ms
, u64_to_user_ptr(argp
->data
), sizeof(params
)))
2205 /* Don't allow userspace to allocate memory for more than 1 SNP context. */
2206 if (sev
->snp_context
)
2212 if (params
.policy
& ~SNP_POLICY_MASK_VALID
)
2215 /* Check for policy bits that must be set */
2216 if (!(params
.policy
& SNP_POLICY_MASK_RSVD_MBO
) ||
2217 !(params
.policy
& SNP_POLICY_MASK_SMT
))
2220 if (params
.policy
& SNP_POLICY_MASK_SINGLE_SOCKET
)
2223 sev
->snp_context
= snp_context_create(kvm
, argp
);
2224 if (!sev
->snp_context
)
2227 start
.gctx_paddr
= __psp_pa(sev
->snp_context
);
2228 start
.policy
= params
.policy
;
2229 memcpy(start
.gosvw
, params
.gosvw
, sizeof(params
.gosvw
));
2230 rc
= __sev_issue_cmd(argp
->sev_fd
, SEV_CMD_SNP_LAUNCH_START
, &start
, &argp
->error
);
2232 pr_debug("%s: SEV_CMD_SNP_LAUNCH_START firmware command failed, rc %d\n",
2234 goto e_free_context
;
2237 sev
->fd
= argp
->sev_fd
;
2238 rc
= snp_bind_asid(kvm
, &argp
->error
);
2240 pr_debug("%s: Failed to bind ASID to SEV-SNP context, rc %d\n",
2242 goto e_free_context
;
2248 snp_decommission_context(kvm
);
2253 struct sev_gmem_populate_args
{
static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pfn,
				  void __user *src, int order, void *opaque)
{
	struct sev_gmem_populate_args *sev_populate_args = opaque;
	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
	int n_private = 0, ret, i;
	int npages = (1 << order);
	gfn_t gfn;

	if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src))
		return -EINVAL;

	for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) {
		struct sev_data_snp_launch_update fw_args = {0};
		bool assigned = false;
		int level;

		ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level);
		if (ret || assigned) {
			pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n",
				 __func__, gfn, ret, assigned);
			ret = ret ? -EINVAL : -EEXIST;
			goto err;
		}

		if (src) {
			void *vaddr = kmap_local_pfn(pfn + i);

			if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) {
				ret = -EFAULT;
				goto err;
			}
			kunmap_local(vaddr);
		}

		ret = rmp_make_private(pfn + i, gfn << PAGE_SHIFT, PG_LEVEL_4K,
				       sev_get_asid(kvm), true);
		if (ret)
			goto err;

		n_private++;

		fw_args.gctx_paddr = __psp_pa(sev->snp_context);
		fw_args.address = __sme_set(pfn_to_hpa(pfn + i));
		fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K);
		fw_args.page_type = sev_populate_args->type;

		ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
				      &fw_args, &sev_populate_args->fw_error);
		if (ret)
			goto fw_err;
	}

	return 0;

fw_err:
	/*
	 * If the firmware command failed handle the reclaim and cleanup of that
	 * PFN specially vs. prior pages which can be cleaned up below without
	 * needing to reclaim in advance.
	 *
	 * Additionally, when invalid CPUID function entries are detected,
	 * firmware writes the expected values into the page and leaves it
	 * unencrypted so it can be used for debugging and error-reporting.
	 *
	 * Copy this page back into the source buffer so userspace can use this
	 * information to provide information on which CPUID leaves/fields
	 * failed CPUID validation.
	 */
	if (!snp_page_reclaim(kvm, pfn + i) &&
	    sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID &&
	    sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) {
		void *vaddr = kmap_local_pfn(pfn + i);

		if (copy_to_user(src + i * PAGE_SIZE, vaddr, PAGE_SIZE))
			pr_debug("Failed to write CPUID page back to userspace\n");

		kunmap_local(vaddr);
	}

	/* pfn + i is hypervisor-owned now, so skip below cleanup for it. */
	n_private--;

err:
	pr_debug("%s: exiting with error ret %d (fw_error %d), restoring %d gmem PFNs to shared.\n",
		 __func__, ret, sev_populate_args->fw_error, n_private);
	for (i = 0; i < n_private; i++)
		kvm_rmp_make_shared(kvm, pfn + i, PG_LEVEL_4K);

	return ret;
}
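/*
 * In outline, each 4K page handled above goes through: verify the RMP entry is
 * still in its initial shared state, optionally copy the payload in from
 * userspace, transition the page to guest-owned via rmp_make_private(), and
 * finally measure/encrypt it with SEV_CMD_SNP_LAUNCH_UPDATE. Any failure rolls
 * previously-converted pages back to shared so guest_memfd is left in a sane
 * state.
 */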
static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
	struct sev_gmem_populate_args sev_populate_args = {0};
	struct kvm_sev_snp_launch_update params;
	struct kvm_memory_slot *memslot;
	long npages, count;
	void __user *src;
	int ret = 0;

	if (!sev_snp_guest(kvm) || !sev->snp_context)
		return -EINVAL;

	if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
		return -EFAULT;

	pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d\n", __func__,
		 params.gfn_start, params.len, params.type, params.flags);

	if (!PAGE_ALIGNED(params.len) || params.flags ||
	    (params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL &&
	     params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO &&
	     params.type != KVM_SEV_SNP_PAGE_TYPE_UNMEASURED &&
	     params.type != KVM_SEV_SNP_PAGE_TYPE_SECRETS &&
	     params.type != KVM_SEV_SNP_PAGE_TYPE_CPUID))
		return -EINVAL;

	npages = params.len / PAGE_SIZE;

	/*
	 * For each GFN that's being prepared as part of the initial guest
	 * state, the following pre-conditions are verified:
	 *
	 *   1) The backing memslot is a valid private memslot.
	 *   2) The GFN has been set to private via KVM_SET_MEMORY_ATTRIBUTES
	 *      beforehand.
	 *   3) The PFN of the guest_memfd has not already been set to private
	 *      in its RMP entry.
	 *
	 * The KVM MMU relies on kvm->mmu_invalidate_seq to retry nested page
	 * faults if there's a race between a fault and an attribute update via
	 * KVM_SET_MEMORY_ATTRIBUTES, and a similar approach could be utilized
	 * here. However, kvm->slots_lock guards against both this as well as
	 * concurrent memslot updates occurring while these checks are being
	 * performed, so use that here to make it easier to reason about the
	 * initial expected state and better guard against unexpected
	 * situations.
	 */
	mutex_lock(&kvm->slots_lock);

	memslot = gfn_to_memslot(kvm, params.gfn_start);
	if (!kvm_slot_can_be_private(memslot)) {
		ret = -EINVAL;
		goto out;
	}

	sev_populate_args.sev_fd = argp->sev_fd;
	sev_populate_args.type = params.type;
	src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? NULL : u64_to_user_ptr(params.uaddr);

	count = kvm_gmem_populate(kvm, params.gfn_start, src, npages,
				  sev_gmem_post_populate, &sev_populate_args);
	if (count < 0) {
		argp->error = sev_populate_args.fw_error;
		pr_debug("%s: kvm_gmem_populate failed, ret %ld (fw_error %d)\n",
			 __func__, count, argp->error);
		ret = -EIO;
	} else {
		params.gfn_start += count;
		params.len -= count * PAGE_SIZE;
		if (params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO)
			params.uaddr += count * PAGE_SIZE;

		ret = 0;
		if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params)))
			ret = -EFAULT;
	}

out:
	mutex_unlock(&kvm->slots_lock);

	return ret;
}
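/*
 * Illustrative (userspace-side, not kernel code) sketch of driving the ioctl
 * above; everything other than the UAPI struct fields shown is hypothetical:
 *
 *	struct kvm_sev_snp_launch_update update = {
 *		.gfn_start = gfn,
 *		.uaddr     = (__u64)buf,
 *		.len       = size,
 *		.type      = KVM_SEV_SNP_PAGE_TYPE_NORMAL,
 *	};
 *
 * Since the handler writes back the remaining gfn_start/len/uaddr on partial
 * progress, a caller would typically retry until update.len reaches 0.
 */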
static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
	struct sev_data_snp_launch_update data = {};
	struct kvm_vcpu *vcpu;
	unsigned long i;
	int ret;

	data.gctx_paddr = __psp_pa(sev->snp_context);
	data.page_type = SNP_PAGE_TYPE_VMSA;

	kvm_for_each_vcpu(i, vcpu, kvm) {
		struct vcpu_svm *svm = to_svm(vcpu);
		u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;

		ret = sev_es_sync_vmsa(svm);
		if (ret)
			return ret;

		/* Transition the VMSA page to a firmware state. */
		ret = rmp_make_private(pfn, INITIAL_VMSA_GPA, PG_LEVEL_4K, sev->asid, true);
		if (ret)
			return ret;

		/* Issue the SNP command to encrypt the VMSA */
		data.address = __sme_pa(svm->sev_es.vmsa);
		ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
				      &data, &argp->error);
		if (ret) {
			snp_page_reclaim(kvm, pfn);
			return ret;
		}

		svm->vcpu.arch.guest_state_protected = true;
		/*
		 * SEV-ES (and thus SNP) guest mandates LBR Virtualization to
		 * be _always_ ON. Enable it only after setting
		 * guest_state_protected because KVM_SET_MSRS allows dynamic
		 * toggling of LBRV (for performance reason) on write access to
		 * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set.
		 */
		svm_enable_lbrv(vcpu);
	}

	return 0;
}
static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
	struct kvm_sev_snp_launch_finish params;
	struct sev_data_snp_launch_finish *data;
	void *id_block = NULL, *id_auth = NULL;
	int ret;

	if (!sev_snp_guest(kvm))
		return -ENOTTY;

	if (!sev->snp_context)
		return -EINVAL;

	if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
		return -EFAULT;

	/* Measure all vCPUs using LAUNCH_UPDATE before finalizing the launch flow. */
	ret = snp_launch_update_vmsa(kvm, argp);
	if (ret)
		return ret;

	data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
	if (!data)
		return -ENOMEM;

	if (params.id_block_en) {
		id_block = psp_copy_user_blob(params.id_block_uaddr, KVM_SEV_SNP_ID_BLOCK_SIZE);
		if (IS_ERR(id_block)) {
			ret = PTR_ERR(id_block);
			goto e_free;
		}

		data->id_block_en = 1;
		data->id_block_paddr = __sme_pa(id_block);

		id_auth = psp_copy_user_blob(params.id_auth_uaddr, KVM_SEV_SNP_ID_AUTH_SIZE);
		if (IS_ERR(id_auth)) {
			ret = PTR_ERR(id_auth);
			goto e_free_id_block;
		}

		data->id_auth_paddr = __sme_pa(id_auth);

		if (params.auth_key_en)
			data->auth_key_en = 1;
	}

	data->vcek_disabled = params.vcek_disabled;

	memcpy(data->host_data, params.host_data, KVM_SEV_SNP_FINISH_DATA_SIZE);
	data->gctx_paddr = __psp_pa(sev->snp_context);
	ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error);

	/*
	 * Now that there will be no more SNP_LAUNCH_UPDATE ioctls, private pages
	 * can be given to the guest simply by marking the RMP entry as private.
	 * This can happen on first access and also with KVM_PRE_FAULT_MEMORY.
	 */
	if (!ret)
		kvm->arch.pre_fault_allowed = true;

	kfree(id_auth);

e_free_id_block:
	kfree(id_block);

e_free:
	kfree(data);

	return ret;
}
int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
{
	struct kvm_sev_cmd sev_cmd;
	int r;

	if (!sev_enabled)
		return -ENOTTY;

	if (!argp)
		return 0;

	if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd)))
		return -EFAULT;

	mutex_lock(&kvm->lock);

	/* Only the enc_context_owner handles some memory enc operations. */
	if (is_mirroring_enc_context(kvm) &&
	    !is_cmd_allowed_from_mirror(sev_cmd.id)) {
		r = -EINVAL;
		goto out;
	}

	/*
	 * Once KVM_SEV_INIT2 initializes a KVM instance as an SNP guest, only
	 * allow the use of SNP-specific commands.
	 */
	if (sev_snp_guest(kvm) && sev_cmd.id < KVM_SEV_SNP_LAUNCH_START) {
		r = -EPERM;
		goto out;
	}

	switch (sev_cmd.id) {
	case KVM_SEV_ES_INIT:
		if (!sev_es_enabled) {
			r = -ENOTTY;
			goto out;
		}
		fallthrough;
	case KVM_SEV_INIT:
		r = sev_guest_init(kvm, &sev_cmd);
		break;
	case KVM_SEV_INIT2:
		r = sev_guest_init2(kvm, &sev_cmd);
		break;
	case KVM_SEV_LAUNCH_START:
		r = sev_launch_start(kvm, &sev_cmd);
		break;
	case KVM_SEV_LAUNCH_UPDATE_DATA:
		r = sev_launch_update_data(kvm, &sev_cmd);
		break;
	case KVM_SEV_LAUNCH_UPDATE_VMSA:
		r = sev_launch_update_vmsa(kvm, &sev_cmd);
		break;
	case KVM_SEV_LAUNCH_MEASURE:
		r = sev_launch_measure(kvm, &sev_cmd);
		break;
	case KVM_SEV_LAUNCH_FINISH:
		r = sev_launch_finish(kvm, &sev_cmd);
		break;
	case KVM_SEV_GUEST_STATUS:
		r = sev_guest_status(kvm, &sev_cmd);
		break;
	case KVM_SEV_DBG_DECRYPT:
		r = sev_dbg_crypt(kvm, &sev_cmd, true);
		break;
	case KVM_SEV_DBG_ENCRYPT:
		r = sev_dbg_crypt(kvm, &sev_cmd, false);
		break;
	case KVM_SEV_LAUNCH_SECRET:
		r = sev_launch_secret(kvm, &sev_cmd);
		break;
	case KVM_SEV_GET_ATTESTATION_REPORT:
		r = sev_get_attestation_report(kvm, &sev_cmd);
		break;
	case KVM_SEV_SEND_START:
		r = sev_send_start(kvm, &sev_cmd);
		break;
	case KVM_SEV_SEND_UPDATE_DATA:
		r = sev_send_update_data(kvm, &sev_cmd);
		break;
	case KVM_SEV_SEND_FINISH:
		r = sev_send_finish(kvm, &sev_cmd);
		break;
	case KVM_SEV_SEND_CANCEL:
		r = sev_send_cancel(kvm, &sev_cmd);
		break;
	case KVM_SEV_RECEIVE_START:
		r = sev_receive_start(kvm, &sev_cmd);
		break;
	case KVM_SEV_RECEIVE_UPDATE_DATA:
		r = sev_receive_update_data(kvm, &sev_cmd);
		break;
	case KVM_SEV_RECEIVE_FINISH:
		r = sev_receive_finish(kvm, &sev_cmd);
		break;
	case KVM_SEV_SNP_LAUNCH_START:
		r = snp_launch_start(kvm, &sev_cmd);
		break;
	case KVM_SEV_SNP_LAUNCH_UPDATE:
		r = snp_launch_update(kvm, &sev_cmd);
		break;
	case KVM_SEV_SNP_LAUNCH_FINISH:
		r = snp_launch_finish(kvm, &sev_cmd);
		break;
	default:
		r = -EINVAL;
		goto out;
	}

	if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd)))
		r = -EFAULT;

out:
	mutex_unlock(&kvm->lock);
	return r;
}
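/*
 * Illustrative sketch of how userspace reaches the handler above (not kernel
 * code): the command payload is wrapped in struct kvm_sev_cmd and passed via
 * the KVM_MEMORY_ENCRYPT_OP ioctl on the VM fd, e.g.:
 *
 *	struct kvm_sev_cmd cmd = {
 *		.id     = KVM_SEV_SNP_LAUNCH_START,
 *		.data   = (__u64)&start,
 *		.sev_fd = sev_fd,
 *	};
 *	ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
 *
 * On return, cmd.error holds any firmware error code copied back above.
 */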
int sev_mem_enc_register_region(struct kvm *kvm,
				struct kvm_enc_region *range)
{
	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
	struct enc_region *region;
	int ret = 0;

	if (!sev_guest(kvm))
		return -ENOTTY;

	/* If kvm is mirroring encryption context it isn't responsible for it */
	if (is_mirroring_enc_context(kvm))
		return -EINVAL;

	if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
		return -EINVAL;

	region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
	if (!region)
		return -ENOMEM;

	mutex_lock(&kvm->lock);
	region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
	if (IS_ERR(region->pages)) {
		ret = PTR_ERR(region->pages);
		mutex_unlock(&kvm->lock);
		goto e_free;
	}

	/*
	 * The guest may change the memory encryption attribute from C=0 -> C=1
	 * or vice versa for this memory range. Let's make sure caches are
	 * flushed to ensure that guest data gets written into memory with
	 * correct C-bit. Note, this must be done before dropping kvm->lock,
	 * as region and its array of pages can be freed by a different task
	 * once kvm->lock is released.
	 */
	sev_clflush_pages(region->pages, region->npages);

	region->uaddr = range->addr;
	region->size = range->size;

	list_add_tail(&region->list, &sev->regions_list);
	mutex_unlock(&kvm->lock);

	return ret;

e_free:
	kfree(region);
	return ret;
}
static struct enc_region *
find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
{
	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
	struct list_head *head = &sev->regions_list;
	struct enc_region *i;

	list_for_each_entry(i, head, list) {
		if (i->uaddr == range->addr &&
		    i->size == range->size)
			return i;
	}

	return NULL;
}

static void __unregister_enc_region_locked(struct kvm *kvm,
					   struct enc_region *region)
{
	sev_unpin_memory(kvm, region->pages, region->npages);
	list_del(&region->list);
	kfree(region);
}
int sev_mem_enc_unregister_region(struct kvm *kvm,
				  struct kvm_enc_region *range)
{
	struct enc_region *region;
	int ret;

	/* If kvm is mirroring encryption context it isn't responsible for it */
	if (is_mirroring_enc_context(kvm))
		return -EINVAL;

	mutex_lock(&kvm->lock);

	if (!sev_guest(kvm)) {
		ret = -ENOTTY;
		goto failed;
	}

	region = find_enc_region(kvm, range);
	if (!region) {
		ret = -EINVAL;
		goto failed;
	}

	/*
	 * Ensure that all guest tagged cache entries are flushed before
	 * releasing the pages back to the system for use. CLFLUSH will
	 * not do this, so issue a WBINVD.
	 */
	wbinvd_on_all_cpus();

	__unregister_enc_region_locked(kvm, region);

	mutex_unlock(&kvm->lock);
	return 0;

failed:
	mutex_unlock(&kvm->lock);
	return ret;
}
int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
{
	CLASS(fd, f)(source_fd);
	struct kvm *source_kvm;
	struct kvm_sev_info *source_sev, *mirror_sev;
	int ret;

	if (!file_is_kvm(fd_file(f)))
		return -EBADF;

	source_kvm = fd_file(f)->private_data;
	ret = sev_lock_two_vms(kvm, source_kvm);
	if (ret)
		return ret;

	/*
	 * Mirrors of mirrors should work, but let's not get silly. Also
	 * disallow out-of-band SEV/SEV-ES init if the target is already an
	 * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being
	 * created after SEV/SEV-ES initialization, e.g. to init intercepts.
	 */
	if (sev_guest(kvm) || !sev_guest(source_kvm) ||
	    is_mirroring_enc_context(source_kvm) || kvm->created_vcpus) {
		ret = -EINVAL;
		goto e_unlock;
	}

	/*
	 * The mirror kvm holds an enc_context_owner ref so its asid can't
	 * disappear until we're done with it
	 */
	source_sev = &to_kvm_svm(source_kvm)->sev_info;
	kvm_get_kvm(source_kvm);
	mirror_sev = &to_kvm_svm(kvm)->sev_info;
	list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);

	/* Set enc_context_owner and copy its encryption context over */
	mirror_sev->enc_context_owner = source_kvm;
	mirror_sev->active = true;
	mirror_sev->asid = source_sev->asid;
	mirror_sev->fd = source_sev->fd;
	mirror_sev->es_active = source_sev->es_active;
	mirror_sev->need_init = false;
	mirror_sev->handle = source_sev->handle;
	INIT_LIST_HEAD(&mirror_sev->regions_list);
	INIT_LIST_HEAD(&mirror_sev->mirror_vms);
	ret = 0;

	/*
	 * Do not copy ap_jump_table. Since the mirror does not share the same
	 * KVM contexts as the original, and they may have different
	 * memory-views.
	 */

e_unlock:
	sev_unlock_two_vms(kvm, source_kvm);
	return ret;
}
static int snp_decommission_context(struct kvm *kvm)
{
	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
	struct sev_data_snp_addr data = {};
	int ret;

	/* If context is not created then do nothing */
	if (!sev->snp_context)
		return 0;

	/* Do the decommission, which will unbind the ASID from the SNP context */
	data.address = __sme_pa(sev->snp_context);
	down_write(&sev_deactivate_lock);
	ret = sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, &data, NULL);
	up_write(&sev_deactivate_lock);

	if (WARN_ONCE(ret, "Failed to release guest context, ret %d", ret))
		return ret;

	snp_free_firmware_page(sev->snp_context);
	sev->snp_context = NULL;

	return 0;
}
void sev_vm_destroy(struct kvm *kvm)
{
	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
	struct list_head *head = &sev->regions_list;
	struct list_head *pos, *q;

	if (!sev_guest(kvm))
		return;

	WARN_ON(!list_empty(&sev->mirror_vms));

	/* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
	if (is_mirroring_enc_context(kvm)) {
		struct kvm *owner_kvm = sev->enc_context_owner;

		mutex_lock(&owner_kvm->lock);
		list_del(&sev->mirror_entry);
		mutex_unlock(&owner_kvm->lock);
		kvm_put_kvm(owner_kvm);
		return;
	}

	/*
	 * Ensure that all guest tagged cache entries are flushed before
	 * releasing the pages back to the system for use. CLFLUSH will
	 * not do this, so issue a WBINVD.
	 */
	wbinvd_on_all_cpus();

	/*
	 * If userspace was terminated before unregistering the memory regions
	 * then let's unpin all the registered memory.
	 */
	if (!list_empty(head)) {
		list_for_each_safe(pos, q, head) {
			__unregister_enc_region_locked(kvm,
						       list_entry(pos, struct enc_region, list));
			cond_resched();
		}
	}

	if (sev_snp_guest(kvm)) {
		snp_guest_req_cleanup(kvm);

		/*
		 * Decommission handles unbinding of the ASID. If it fails for
		 * some unexpected reason, just leak the ASID.
		 */
		if (snp_decommission_context(kvm))
			return;
	} else {
		sev_unbind_asid(kvm, sev->handle);
	}

	sev_asid_free(sev);
}
void __init sev_set_cpu_caps(void)
{
	if (sev_enabled) {
		kvm_cpu_cap_set(X86_FEATURE_SEV);
		kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_VM);
	}
	if (sev_es_enabled) {
		kvm_cpu_cap_set(X86_FEATURE_SEV_ES);
		kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_ES_VM);
	}
	if (sev_snp_enabled) {
		kvm_cpu_cap_set(X86_FEATURE_SEV_SNP);
		kvm_caps.supported_vm_types |= BIT(KVM_X86_SNP_VM);
	}
}
void __init sev_hardware_setup(void)
{
	unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
	bool sev_snp_supported = false;
	bool sev_es_supported = false;
	bool sev_supported = false;

	if (!sev_enabled || !npt_enabled || !nrips)
		goto out;

	/*
	 * SEV must obviously be supported in hardware. Sanity check that the
	 * CPU supports decode assists, which is mandatory for SEV guests to
	 * support instruction emulation. Ditto for flushing by ASID, as SEV
	 * guests are bound to a single ASID, i.e. KVM can't rotate to a new
	 * ASID to effect a TLB flush.
	 */
	if (!boot_cpu_has(X86_FEATURE_SEV) ||
	    WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) ||
	    WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_FLUSHBYASID)))
		goto out;

	/* Retrieve SEV CPUID information */
	cpuid(0x8000001f, &eax, &ebx, &ecx, &edx);

	/* Set encryption bit location for SEV-ES guests */
	sev_enc_bit = ebx & 0x3f;

	/* Maximum number of encrypted guests supported simultaneously */
	max_sev_asid = ecx;
	if (!max_sev_asid)
		goto out;

	/* Minimum ASID value that should be used for SEV guest */
	min_sev_asid = edx;
	sev_me_mask = 1UL << (ebx & 0x3f);

	/*
	 * Initialize SEV ASID bitmaps. Allocate space for ASID 0 in the bitmap,
	 * even though it's never used, so that the bitmap is indexed by the
	 * actual ASID.
	 */
	nr_asids = max_sev_asid + 1;
	sev_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL);
	if (!sev_asid_bitmap)
		goto out;

	sev_reclaim_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL);
	if (!sev_reclaim_asid_bitmap) {
		bitmap_free(sev_asid_bitmap);
		sev_asid_bitmap = NULL;
		goto out;
	}

	if (min_sev_asid <= max_sev_asid) {
		sev_asid_count = max_sev_asid - min_sev_asid + 1;
		WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count));
	}
	sev_supported = true;

	/* SEV-ES support requested? */
	if (!sev_es_enabled)
		goto out;

	/*
	 * SEV-ES requires MMIO caching as KVM doesn't have access to the guest
	 * instruction stream, i.e. can't emulate in response to a #NPF and
	 * instead relies on #NPF(RSVD) being reflected into the guest as #VC
	 * (the guest can then do a #VMGEXIT to request MMIO emulation).
	 */
	if (!enable_mmio_caching)
		goto out;

	/* Does the CPU support SEV-ES? */
	if (!boot_cpu_has(X86_FEATURE_SEV_ES))
		goto out;

	if (!lbrv) {
		WARN_ONCE(!boot_cpu_has(X86_FEATURE_LBRV),
			  "LBRV must be present for SEV-ES support");
		goto out;
	}

	/* Has the system been allocated ASIDs for SEV-ES? */
	if (min_sev_asid == 1)
		goto out;

	sev_es_asid_count = min_sev_asid - 1;
	WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count));
	sev_es_supported = true;
	sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP);

out:
	if (boot_cpu_has(X86_FEATURE_SEV))
		pr_info("SEV %s (ASIDs %u - %u)\n",
			sev_supported ? min_sev_asid <= max_sev_asid ? "enabled" :
								       "unusable" :
								       "disabled",
			min_sev_asid, max_sev_asid);
	if (boot_cpu_has(X86_FEATURE_SEV_ES))
		pr_info("SEV-ES %s (ASIDs %u - %u)\n",
			sev_es_supported ? "enabled" : "disabled",
			min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1);
	if (boot_cpu_has(X86_FEATURE_SEV_SNP))
		pr_info("SEV-SNP %s (ASIDs %u - %u)\n",
			sev_snp_supported ? "enabled" : "disabled",
			min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1);

	sev_enabled = sev_supported;
	sev_es_enabled = sev_es_supported;
	sev_snp_enabled = sev_snp_supported;

	if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) ||
	    !cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP))
		sev_es_debug_swap_enabled = false;

	sev_supported_vmsa_features = 0;
	if (sev_es_debug_swap_enabled)
		sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP;
}
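/*
 * The ASID space reported by CPUID 0x8000001F is effectively split in two:
 * ASIDs 1 through (min_sev_asid - 1) are usable by SEV-ES/SEV-SNP guests,
 * while min_sev_asid through max_sev_asid are usable by plain SEV guests.
 * The misc cgroup capacities set above mirror exactly that split.
 */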
void sev_hardware_unsetup(void)
{
	if (!sev_enabled)
		return;

	/* No need to take sev_bitmap_lock, all VMs have been destroyed. */
	sev_flush_asids(1, max_sev_asid);

	bitmap_free(sev_asid_bitmap);
	bitmap_free(sev_reclaim_asid_bitmap);

	misc_cg_set_capacity(MISC_CG_RES_SEV, 0);
	misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0);
}

int sev_cpu_init(struct svm_cpu_data *sd)
{
	if (!sev_enabled)
		return 0;

	sd->sev_vmcbs = kcalloc(nr_asids, sizeof(void *), GFP_KERNEL);
	if (!sd->sev_vmcbs)
		return -ENOMEM;

	return 0;
}
/*
 * Pages used by hardware to hold guest encrypted state must be flushed before
 * returning them to the system.
 */
static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
{
	unsigned int asid = sev_get_asid(vcpu->kvm);

	/*
	 * Note!  The address must be a kernel address, as regular page walk
	 * checks are performed by VM_PAGE_FLUSH, i.e. operating on a user
	 * address is non-deterministic and unsafe.  This function deliberately
	 * takes a pointer to deter passing in a user address.
	 */
	unsigned long addr = (unsigned long)va;

	/*
	 * If CPU enforced cache coherency for encrypted mappings of the
	 * same physical page is supported, use CLFLUSHOPT instead. NOTE: cache
	 * flush is still needed in order to work properly with DMA devices.
	 */
	if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) {
		clflush_cache_range(va, PAGE_SIZE);
		return;
	}

	/*
	 * VM Page Flush takes a host virtual address and a guest ASID.  Fall
	 * back to WBINVD if this faults so as not to make any problems worse
	 * by leaving stale encrypted data in the cache.
	 */
	if (WARN_ON_ONCE(wrmsrl_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid)))
		goto do_wbinvd;

	return;

do_wbinvd:
	wbinvd_on_all_cpus();
}
void sev_guest_memory_reclaimed(struct kvm *kvm)
{
	/*
	 * With SNP+gmem, private/encrypted memory is unreachable via the
	 * hva-based mmu notifiers, so these events are only actually
	 * pertaining to shared pages where there is no need to perform
	 * the WBINVD to flush associated caches.
	 */
	if (!sev_guest(kvm) || sev_snp_guest(kvm))
		return;

	wbinvd_on_all_cpus();
}

void sev_free_vcpu(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm;

	if (!sev_es_guest(vcpu->kvm))
		return;

	svm = to_svm(vcpu);

	/*
	 * If it's an SNP guest, then the VMSA was marked in the RMP table as
	 * a guest-owned page. Transition the page to hypervisor state before
	 * releasing it back to the system.
	 */
	if (sev_snp_guest(vcpu->kvm)) {
		u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;

		if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K))
			goto skip_vmsa_free;
	}

	if (vcpu->arch.guest_state_protected)
		sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa);

	__free_page(virt_to_page(svm->sev_es.vmsa));

skip_vmsa_free:
	if (svm->sev_es.ghcb_sa_free)
		kvfree(svm->sev_es.ghcb_sa);
}
static void dump_ghcb(struct vcpu_svm *svm)
{
	struct ghcb *ghcb = svm->sev_es.ghcb;
	unsigned int nbits;

	/* Re-use the dump_invalid_vmcb module parameter */
	if (!dump_invalid_vmcb) {
		pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
		return;
	}

	nbits = sizeof(ghcb->save.valid_bitmap) * 8;

	pr_err("GHCB (GPA=%016llx):\n", svm->vmcb->control.ghcb_gpa);
	pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code",
	       ghcb->save.sw_exit_code, ghcb_sw_exit_code_is_valid(ghcb));
	pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1",
	       ghcb->save.sw_exit_info_1, ghcb_sw_exit_info_1_is_valid(ghcb));
	pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2",
	       ghcb->save.sw_exit_info_2, ghcb_sw_exit_info_2_is_valid(ghcb));
	pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch",
	       ghcb->save.sw_scratch, ghcb_sw_scratch_is_valid(ghcb));
	pr_err("%-20s%*pb\n", "valid_bitmap", nbits, ghcb->save.valid_bitmap);
}

static void sev_es_sync_to_ghcb(struct vcpu_svm *svm)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	struct ghcb *ghcb = svm->sev_es.ghcb;

	/*
	 * The GHCB protocol so far allows for the following data
	 * to be returned:
	 *   GPRs RAX, RBX, RCX, RDX
	 *
	 * Copy their values, even if they may not have been written during the
	 * VM-Exit.  It's the guest's responsibility to not consume random data.
	 */
	ghcb_set_rax(ghcb, vcpu->arch.regs[VCPU_REGS_RAX]);
	ghcb_set_rbx(ghcb, vcpu->arch.regs[VCPU_REGS_RBX]);
	ghcb_set_rcx(ghcb, vcpu->arch.regs[VCPU_REGS_RCX]);
	ghcb_set_rdx(ghcb, vcpu->arch.regs[VCPU_REGS_RDX]);
}
static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
{
	struct vmcb_control_area *control = &svm->vmcb->control;
	struct kvm_vcpu *vcpu = &svm->vcpu;
	struct ghcb *ghcb = svm->sev_es.ghcb;
	u64 exit_code;

	/*
	 * The GHCB protocol so far allows for the following data
	 * to be supplied:
	 *   GPRs RAX, RBX, RCX, RDX
	 *   XCR0
	 *   CPL
	 *
	 * VMMCALL allows the guest to provide extra registers. KVM also
	 * expects RSI for hypercalls, so include that, too.
	 *
	 * Copy their values to the appropriate location if supplied.
	 */
	memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));

	BUILD_BUG_ON(sizeof(svm->sev_es.valid_bitmap) != sizeof(ghcb->save.valid_bitmap));
	memcpy(&svm->sev_es.valid_bitmap, &ghcb->save.valid_bitmap, sizeof(ghcb->save.valid_bitmap));

	vcpu->arch.regs[VCPU_REGS_RAX] = kvm_ghcb_get_rax_if_valid(svm, ghcb);
	vcpu->arch.regs[VCPU_REGS_RBX] = kvm_ghcb_get_rbx_if_valid(svm, ghcb);
	vcpu->arch.regs[VCPU_REGS_RCX] = kvm_ghcb_get_rcx_if_valid(svm, ghcb);
	vcpu->arch.regs[VCPU_REGS_RDX] = kvm_ghcb_get_rdx_if_valid(svm, ghcb);
	vcpu->arch.regs[VCPU_REGS_RSI] = kvm_ghcb_get_rsi_if_valid(svm, ghcb);

	svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm, ghcb);

	if (kvm_ghcb_xcr0_is_valid(svm)) {
		vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb);
		kvm_update_cpuid_runtime(vcpu);
	}

	/* Copy the GHCB exit information into the VMCB fields */
	exit_code = ghcb_get_sw_exit_code(ghcb);
	control->exit_code = lower_32_bits(exit_code);
	control->exit_code_hi = upper_32_bits(exit_code);
	control->exit_info_1 = ghcb_get_sw_exit_info_1(ghcb);
	control->exit_info_2 = ghcb_get_sw_exit_info_2(ghcb);
	svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm, ghcb);

	/* Clear the valid entries fields */
	memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
}

static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control)
{
	return (((u64)control->exit_code_hi) << 32) | control->exit_code;
}
static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
{
	struct vmcb_control_area *control = &svm->vmcb->control;
	struct kvm_vcpu *vcpu = &svm->vcpu;
	u64 exit_code;
	u64 reason;

	/*
	 * Retrieve the exit code now even though it may not be marked valid
	 * as it could help with debugging.
	 */
	exit_code = kvm_ghcb_get_sw_exit_code(control);

	/* Only GHCB Usage code 0 is supported */
	if (svm->sev_es.ghcb->ghcb_usage) {
		reason = GHCB_ERR_INVALID_USAGE;
		goto vmgexit_err;
	}

	reason = GHCB_ERR_MISSING_INPUT;

	if (!kvm_ghcb_sw_exit_code_is_valid(svm) ||
	    !kvm_ghcb_sw_exit_info_1_is_valid(svm) ||
	    !kvm_ghcb_sw_exit_info_2_is_valid(svm))
		goto vmgexit_err;

	switch (exit_code) {
	case SVM_EXIT_READ_DR7:
		break;
	case SVM_EXIT_WRITE_DR7:
		if (!kvm_ghcb_rax_is_valid(svm))
			goto vmgexit_err;
		break;
	case SVM_EXIT_RDTSC:
		break;
	case SVM_EXIT_RDPMC:
		if (!kvm_ghcb_rcx_is_valid(svm))
			goto vmgexit_err;
		break;
	case SVM_EXIT_CPUID:
		if (!kvm_ghcb_rax_is_valid(svm) ||
		    !kvm_ghcb_rcx_is_valid(svm))
			goto vmgexit_err;
		if (vcpu->arch.regs[VCPU_REGS_RAX] == 0xd)
			if (!kvm_ghcb_xcr0_is_valid(svm))
				goto vmgexit_err;
		break;
	case SVM_EXIT_INVD:
		break;
	case SVM_EXIT_IOIO:
		if (control->exit_info_1 & SVM_IOIO_STR_MASK) {
			if (!kvm_ghcb_sw_scratch_is_valid(svm))
				goto vmgexit_err;
		} else {
			if (!(control->exit_info_1 & SVM_IOIO_TYPE_MASK))
				if (!kvm_ghcb_rax_is_valid(svm))
					goto vmgexit_err;
		}
		break;
	case SVM_EXIT_MSR:
		if (!kvm_ghcb_rcx_is_valid(svm))
			goto vmgexit_err;
		if (control->exit_info_1) {
			if (!kvm_ghcb_rax_is_valid(svm) ||
			    !kvm_ghcb_rdx_is_valid(svm))
				goto vmgexit_err;
		}
		break;
	case SVM_EXIT_VMMCALL:
		if (!kvm_ghcb_rax_is_valid(svm) ||
		    !kvm_ghcb_cpl_is_valid(svm))
			goto vmgexit_err;
		break;
	case SVM_EXIT_RDTSCP:
		break;
	case SVM_EXIT_WBINVD:
		break;
	case SVM_EXIT_MONITOR:
		if (!kvm_ghcb_rax_is_valid(svm) ||
		    !kvm_ghcb_rcx_is_valid(svm) ||
		    !kvm_ghcb_rdx_is_valid(svm))
			goto vmgexit_err;
		break;
	case SVM_EXIT_MWAIT:
		if (!kvm_ghcb_rax_is_valid(svm) ||
		    !kvm_ghcb_rcx_is_valid(svm))
			goto vmgexit_err;
		break;
	case SVM_VMGEXIT_MMIO_READ:
	case SVM_VMGEXIT_MMIO_WRITE:
		if (!kvm_ghcb_sw_scratch_is_valid(svm))
			goto vmgexit_err;
		break;
	case SVM_VMGEXIT_AP_CREATION:
		if (!sev_snp_guest(vcpu->kvm))
			goto vmgexit_err;
		if (lower_32_bits(control->exit_info_1) != SVM_VMGEXIT_AP_DESTROY)
			if (!kvm_ghcb_rax_is_valid(svm))
				goto vmgexit_err;
		break;
	case SVM_VMGEXIT_NMI_COMPLETE:
	case SVM_VMGEXIT_AP_HLT_LOOP:
	case SVM_VMGEXIT_AP_JUMP_TABLE:
	case SVM_VMGEXIT_UNSUPPORTED_EVENT:
	case SVM_VMGEXIT_HV_FEATURES:
	case SVM_VMGEXIT_TERM_REQUEST:
		break;
	case SVM_VMGEXIT_PSC:
		if (!sev_snp_guest(vcpu->kvm) || !kvm_ghcb_sw_scratch_is_valid(svm))
			goto vmgexit_err;
		break;
	case SVM_VMGEXIT_GUEST_REQUEST:
	case SVM_VMGEXIT_EXT_GUEST_REQUEST:
		if (!sev_snp_guest(vcpu->kvm) ||
		    !PAGE_ALIGNED(control->exit_info_1) ||
		    !PAGE_ALIGNED(control->exit_info_2) ||
		    control->exit_info_1 == control->exit_info_2)
			goto vmgexit_err;
		break;
	default:
		reason = GHCB_ERR_INVALID_EVENT;
		goto vmgexit_err;
	}

	return 0;

vmgexit_err:
	if (reason == GHCB_ERR_INVALID_USAGE) {
		vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n",
			    svm->sev_es.ghcb->ghcb_usage);
	} else if (reason == GHCB_ERR_INVALID_EVENT) {
		vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n",
			    exit_code);
	} else {
		vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n",
			    exit_code);
		dump_ghcb(svm);
	}

	ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, reason);

	/* Resume the guest to "return" the error code. */
	return 1;
}
void sev_es_unmap_ghcb(struct vcpu_svm *svm)
{
	/* Clear any indication that the vCPU is in a type of AP Reset Hold */
	svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NONE;

	if (!svm->sev_es.ghcb)
		return;

	if (svm->sev_es.ghcb_sa_free) {
		/*
		 * The scratch area lives outside the GHCB, so there is a
		 * buffer that, depending on the operation performed, may
		 * need to be synced, then freed.
		 */
		if (svm->sev_es.ghcb_sa_sync) {
			kvm_write_guest(svm->vcpu.kvm,
					svm->sev_es.sw_scratch,
					svm->sev_es.ghcb_sa,
					svm->sev_es.ghcb_sa_len);
			svm->sev_es.ghcb_sa_sync = false;
		}

		kvfree(svm->sev_es.ghcb_sa);
		svm->sev_es.ghcb_sa = NULL;
		svm->sev_es.ghcb_sa_free = false;
	}

	trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->sev_es.ghcb);

	sev_es_sync_to_ghcb(svm);

	kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map);
	svm->sev_es.ghcb = NULL;
}
void pre_sev_run(struct vcpu_svm *svm, int cpu)
{
	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
	unsigned int asid = sev_get_asid(svm->vcpu.kvm);

	/* Assign the asid allocated with this SEV guest */
	svm->asid = asid;

	/*
	 * Flush guest TLB:
	 *
	 * 1) when different VMCB for the same ASID is to be run on the same host CPU.
	 * 2) or this VMCB was executed on different host CPU in previous VMRUNs.
	 */
	if (sd->sev_vmcbs[asid] == svm->vmcb &&
	    svm->vcpu.arch.last_vmentry_cpu == cpu)
		return;

	sd->sev_vmcbs[asid] = svm->vmcb;
	svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
	vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
}
#define GHCB_SCRATCH_AREA_LIMIT		(16ULL * PAGE_SIZE)
static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
{
	struct vmcb_control_area *control = &svm->vmcb->control;
	u64 ghcb_scratch_beg, ghcb_scratch_end;
	u64 scratch_gpa_beg, scratch_gpa_end;
	void *scratch_va;

	scratch_gpa_beg = svm->sev_es.sw_scratch;
	if (!scratch_gpa_beg) {
		pr_err("vmgexit: scratch gpa not provided\n");
		goto e_scratch;
	}

	scratch_gpa_end = scratch_gpa_beg + len;
	if (scratch_gpa_end < scratch_gpa_beg) {
		pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n",
		       len, scratch_gpa_beg);
		goto e_scratch;
	}

	if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) {
		/* Scratch area begins within GHCB */
		ghcb_scratch_beg = control->ghcb_gpa +
				   offsetof(struct ghcb, shared_buffer);
		ghcb_scratch_end = control->ghcb_gpa +
				   offsetof(struct ghcb, reserved_0xff0);

		/*
		 * If the scratch area begins within the GHCB, it must be
		 * completely contained in the GHCB shared buffer area.
		 */
		if (scratch_gpa_beg < ghcb_scratch_beg ||
		    scratch_gpa_end > ghcb_scratch_end) {
			pr_err("vmgexit: scratch area is outside of GHCB shared buffer area (%#llx - %#llx)\n",
			       scratch_gpa_beg, scratch_gpa_end);
			goto e_scratch;
		}

		scratch_va = (void *)svm->sev_es.ghcb;
		scratch_va += (scratch_gpa_beg - control->ghcb_gpa);
	} else {
		/*
		 * The guest memory must be read into a kernel buffer, so
		 * limit the size
		 */
		if (len > GHCB_SCRATCH_AREA_LIMIT) {
			pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n",
			       len, GHCB_SCRATCH_AREA_LIMIT);
			goto e_scratch;
		}
		scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT);
		if (!scratch_va)
			return -ENOMEM;

		if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) {
			/* Unable to copy scratch area from guest */
			pr_err("vmgexit: kvm_read_guest for scratch area failed\n");

			kvfree(scratch_va);
			return -EFAULT;
		}

		/*
		 * The scratch area is outside the GHCB. The operation will
		 * dictate whether the buffer needs to be synced before running
		 * the vCPU next time (i.e. a read was requested so the data
		 * must be written back to the guest memory).
		 */
		svm->sev_es.ghcb_sa_sync = sync;
		svm->sev_es.ghcb_sa_free = true;
	}

	svm->sev_es.ghcb_sa = scratch_va;
	svm->sev_es.ghcb_sa_len = len;

	return 0;

e_scratch:
	ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_SCRATCH_AREA);

	return 1;
}
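/*
 * Summary of the two cases handled above: if the guest places the scratch
 * area inside the GHCB it must fit entirely within the shared buffer and is
 * used in place; otherwise the (size-limited) area is copied into a kernel
 * buffer that sev_es_unmap_ghcb() later syncs back to the guest and frees as
 * needed.
 */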
static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask,
			      unsigned int pos)
{
	svm->vmcb->control.ghcb_gpa &= ~(mask << pos);
	svm->vmcb->control.ghcb_gpa |= (value & mask) << pos;
}

static u64 get_ghcb_msr_bits(struct vcpu_svm *svm, u64 mask, unsigned int pos)
{
	return (svm->vmcb->control.ghcb_gpa >> pos) & mask;
}

static void set_ghcb_msr(struct vcpu_svm *svm, u64 value)
{
	svm->vmcb->control.ghcb_gpa = value;
}

static int snp_rmptable_psmash(kvm_pfn_t pfn)
{
	int ret;

	pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1);

	/*
	 * PSMASH_FAIL_INUSE indicates another processor is modifying the
	 * entry, so retry until that's no longer the case.
	 */
	do {
		ret = psmash(pfn);
	} while (ret == PSMASH_FAIL_INUSE);

	return ret;
}
static int snp_complete_psc_msr(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (vcpu->run->hypercall.ret)
		set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
	else
		set_ghcb_msr(svm, GHCB_MSR_PSC_RESP);

	return 1; /* resume guest */
}

static int snp_begin_psc_msr(struct vcpu_svm *svm, u64 ghcb_msr)
{
	u64 gpa = gfn_to_gpa(GHCB_MSR_PSC_REQ_TO_GFN(ghcb_msr));
	u8 op = GHCB_MSR_PSC_REQ_TO_OP(ghcb_msr);
	struct kvm_vcpu *vcpu = &svm->vcpu;

	if (op != SNP_PAGE_STATE_PRIVATE && op != SNP_PAGE_STATE_SHARED) {
		set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
		return 1; /* resume guest */
	}

	if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) {
		set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
		return 1; /* resume guest */
	}

	vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
	vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
	vcpu->run->hypercall.args[0] = gpa;
	vcpu->run->hypercall.args[1] = 1;
	vcpu->run->hypercall.args[2] = (op == SNP_PAGE_STATE_PRIVATE)
				       ? KVM_MAP_GPA_RANGE_ENCRYPTED
				       : KVM_MAP_GPA_RANGE_DECRYPTED;
	vcpu->run->hypercall.args[2] |= KVM_MAP_GPA_RANGE_PAGE_SZ_4K;

	vcpu->arch.complete_userspace_io = snp_complete_psc_msr;

	return 0; /* forward request to userspace */
}
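/*
 * The MSR-protocol PSC path above covers a single 4K page per request and is
 * surfaced to userspace as a KVM_HC_MAP_GPA_RANGE hypercall exit; the GHCB
 * page-based path below (snp_begin_psc()) instead batches adjacent entries of
 * the guest's PSC buffer into larger KVM_HC_MAP_GPA_RANGE exits.
 */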
struct psc_buffer {
	struct psc_hdr hdr;
	struct psc_entry entries[];
} __packed;

static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc);
static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret)
{
	svm->sev_es.psc_inflight = 0;
	svm->sev_es.psc_idx = 0;
	svm->sev_es.psc_2m = false;
	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, psc_ret);
}

static void __snp_complete_one_psc(struct vcpu_svm *svm)
{
	struct psc_buffer *psc = svm->sev_es.ghcb_sa;
	struct psc_entry *entries = psc->entries;
	struct psc_hdr *hdr = &psc->hdr;
	__u16 idx;

	/*
	 * Everything in-flight has been processed successfully. Update the
	 * corresponding entries in the guest's PSC buffer and zero out the
	 * count of in-flight PSC entries.
	 */
	for (idx = svm->sev_es.psc_idx; svm->sev_es.psc_inflight;
	     svm->sev_es.psc_inflight--, idx++) {
		struct psc_entry *entry = &entries[idx];

		entry->cur_page = entry->pagesize ? 512 : 1;
	}

	hdr->cur_entry = idx;
}

static int snp_complete_one_psc(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct psc_buffer *psc = svm->sev_es.ghcb_sa;

	if (vcpu->run->hypercall.ret) {
		snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
		return 1; /* resume guest */
	}

	__snp_complete_one_psc(svm);

	/* Handle the next range (if any). */
	return snp_begin_psc(svm, psc);
}
static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc)
{
	struct psc_entry *entries = psc->entries;
	struct kvm_vcpu *vcpu = &svm->vcpu;
	struct psc_hdr *hdr = &psc->hdr;
	struct psc_entry entry_start;
	u16 idx, idx_start, idx_end;
	int npages;
	bool huge;
	u64 gfn;

	if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) {
		snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
		return 1;
	}

next_range:
	/* There should be no other PSCs in-flight at this point. */
	if (WARN_ON_ONCE(svm->sev_es.psc_inflight)) {
		snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
		return 1;
	}

	/*
	 * The PSC descriptor buffer can be modified by a misbehaved guest after
	 * validation, so take care to only use validated copies of values used
	 * for things like array indexing.
	 */
	idx_start = hdr->cur_entry;
	idx_end = hdr->end_entry;

	if (idx_end >= VMGEXIT_PSC_MAX_COUNT) {
		snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR);
		return 1;
	}

	/* Find the start of the next range which needs processing. */
	for (idx = idx_start; idx <= idx_end; idx++, hdr->cur_entry++) {
		entry_start = entries[idx];

		gfn = entry_start.gfn;
		huge = entry_start.pagesize;
		npages = huge ? 512 : 1;

		if (entry_start.cur_page > npages || !IS_ALIGNED(gfn, npages)) {
			snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_ENTRY);
			return 1;
		}

		if (entry_start.cur_page) {
			/*
			 * If this is a partially-completed 2M range, force 4K handling
			 * for the remaining pages since they're effectively split at
			 * this point. Subsequent code should ensure this doesn't get
			 * combined with adjacent PSC entries where 2M handling is still
			 * possible.
			 */
			npages -= entry_start.cur_page;
			gfn += entry_start.cur_page;
			huge = false;
		}

		if (npages)
			break;
	}

	if (idx > idx_end) {
		/* Nothing more to process. */
		snp_complete_psc(svm, 0);
		return 1;
	}

	svm->sev_es.psc_2m = huge;
	svm->sev_es.psc_idx = idx;
	svm->sev_es.psc_inflight = 1;

	/*
	 * Find all subsequent PSC entries that contain adjacent GPA
	 * ranges/operations and can be combined into a single
	 * KVM_HC_MAP_GPA_RANGE exit.
	 */
	while (++idx <= idx_end) {
		struct psc_entry entry = entries[idx];

		if (entry.operation != entry_start.operation ||
		    entry.gfn != entry_start.gfn + npages ||
		    entry.cur_page || !!entry.pagesize != huge)
			break;

		svm->sev_es.psc_inflight++;
		npages += huge ? 512 : 1;
	}

	switch (entry_start.operation) {
	case VMGEXIT_PSC_OP_PRIVATE:
	case VMGEXIT_PSC_OP_SHARED:
		vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
		vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
		vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn);
		vcpu->run->hypercall.args[1] = npages;
		vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE
					       ? KVM_MAP_GPA_RANGE_ENCRYPTED
					       : KVM_MAP_GPA_RANGE_DECRYPTED;
		vcpu->run->hypercall.args[2] |= entry_start.pagesize
						? KVM_MAP_GPA_RANGE_PAGE_SZ_2M
						: KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
		vcpu->arch.complete_userspace_io = snp_complete_one_psc;
		return 0; /* forward request to userspace */
	default:
		/*
		 * Only shared/private PSC operations are currently supported, so if the
		 * entire range consists of unsupported operations (e.g. SMASH/UNSMASH),
		 * then consider the entire range completed and avoid exiting to
		 * userspace. In theory snp_complete_psc() can always be called directly
		 * at this point to complete the current range and start the next one,
		 * but that could lead to unexpected levels of recursion.
		 */
		__snp_complete_one_psc(svm);
		goto next_range;
	}

	unreachable();
}
static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	WARN_ON(!mutex_is_locked(&svm->sev_es.snp_vmsa_mutex));

	/* Mark the vCPU as offline and not runnable */
	vcpu->arch.pv.pv_unhalted = false;
	vcpu->arch.mp_state = KVM_MP_STATE_HALTED;

	/* Clear use of the VMSA */
	svm->vmcb->control.vmsa_pa = INVALID_PAGE;

	if (VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) {
		gfn_t gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa);
		struct kvm_memory_slot *slot;
		struct page *page;
		kvm_pfn_t pfn;

		slot = gfn_to_memslot(vcpu->kvm, gfn);
		if (!slot)
			return -EINVAL;

		/*
		 * The new VMSA will be private memory guest memory, so
		 * retrieve the PFN from the gmem backend.
		 */
		if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL))
			return -EINVAL;

		/*
		 * From this point forward, the VMSA will always be a
		 * guest-mapped page rather than the initial one allocated
		 * by KVM in svm->sev_es.vmsa. In theory, svm->sev_es.vmsa
		 * could be free'd and cleaned up here, but that involves
		 * cleanups like wbinvd_on_all_cpus() which would ideally
		 * be handled during teardown rather than guest boot.
		 * Deferring that also allows the existing logic for SEV-ES
		 * VMSAs to be re-used with minimal SNP-specific changes.
		 */
		svm->sev_es.snp_has_guest_vmsa = true;

		/* Use the new VMSA */
		svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn);

		/* Mark the vCPU as runnable */
		vcpu->arch.pv.pv_unhalted = false;
		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;

		svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;

		/*
		 * gmem pages aren't currently migratable, but if this ever
		 * changes then care should be taken to ensure
		 * svm->sev_es.vmsa is pinned through some other means.
		 */
		kvm_release_page_clean(page);
	}

	/*
	 * When replacing the VMSA during SEV-SNP AP creation,
	 * mark the VMCB dirty so that full state is always reloaded.
	 */
	vmcb_mark_all_dirty(svm->vmcb);

	return 0;
}
/*
 * Invoked as part of svm_vcpu_reset() processing of an init event.
 */
void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int ret;

	if (!sev_snp_guest(vcpu->kvm))
		return;

	mutex_lock(&svm->sev_es.snp_vmsa_mutex);

	if (!svm->sev_es.snp_ap_waiting_for_reset)
		goto unlock;

	svm->sev_es.snp_ap_waiting_for_reset = false;

	ret = __sev_snp_update_protected_guest_state(vcpu);
	if (ret)
		vcpu_unimpl(vcpu, "snp: AP state update on init failed\n");

unlock:
	mutex_unlock(&svm->sev_es.snp_vmsa_mutex);
}
static int sev_snp_ap_creation(struct vcpu_svm *svm)
{
	struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
	struct kvm_vcpu *vcpu = &svm->vcpu;
	struct kvm_vcpu *target_vcpu;
	struct vcpu_svm *target_svm;
	unsigned int request;
	unsigned int apic_id;
	bool kick;
	int ret;

	request = lower_32_bits(svm->vmcb->control.exit_info_1);
	apic_id = upper_32_bits(svm->vmcb->control.exit_info_1);

	/* Validate the APIC ID */
	target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id);
	if (!target_vcpu) {
		vcpu_unimpl(vcpu, "vmgexit: invalid AP APIC ID [%#x] from guest\n",
			    apic_id);
		return 0;
	}

	ret = 0;

	target_svm = to_svm(target_vcpu);

	/*
	 * The target vCPU is valid, so the vCPU will be kicked unless the
	 * request is for CREATE_ON_INIT. For any errors at this stage, the
	 * kick will place the vCPU in an non-runnable state.
	 */
	kick = true;

	mutex_lock(&target_svm->sev_es.snp_vmsa_mutex);

	target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
	target_svm->sev_es.snp_ap_waiting_for_reset = true;

	/* Interrupt injection mode shouldn't change for AP creation */
	if (request < SVM_VMGEXIT_AP_DESTROY) {
		u64 sev_features;

		sev_features = vcpu->arch.regs[VCPU_REGS_RAX];
		sev_features ^= sev->vmsa_features;

		if (sev_features & SVM_SEV_FEAT_INT_INJ_MODES) {
			vcpu_unimpl(vcpu, "vmgexit: invalid AP injection mode [%#lx] from guest\n",
				    vcpu->arch.regs[VCPU_REGS_RAX]);
			ret = -EINVAL;
			goto out;
		}
	}

	switch (request) {
	case SVM_VMGEXIT_AP_CREATE_ON_INIT:
		kick = false;
		fallthrough;
	case SVM_VMGEXIT_AP_CREATE:
		if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) {
			vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n",
				    svm->vmcb->control.exit_info_2);
			ret = -EINVAL;
			goto out;
		}

		/*
		 * Malicious guest can RMPADJUST a large page into VMSA which
		 * will hit the SNP erratum where the CPU will incorrectly signal
		 * an RMP violation #PF if a hugepage collides with the RMP entry
		 * of VMSA page, reject the AP CREATE request if VMSA address from
		 * guest is 2M aligned.
		 */
		if (IS_ALIGNED(svm->vmcb->control.exit_info_2, PMD_SIZE)) {
			vcpu_unimpl(vcpu,
				    "vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n",
				    svm->vmcb->control.exit_info_2);
			ret = -EINVAL;
			goto out;
		}

		target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2;
		break;
	case SVM_VMGEXIT_AP_DESTROY:
		break;
	default:
		vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n",
			    request);
		ret = -EINVAL;
		break;
	}

out:
	if (kick) {
		kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu);
		kvm_vcpu_kick(target_vcpu);
	}

	mutex_unlock(&target_svm->sev_es.snp_vmsa_mutex);

	return ret;
}
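/*
 * AP creation is thus split across two vCPUs: the initiating vCPU records the
 * requested VMSA GPA and raises KVM_REQ_UPDATE_PROTECTED_GUEST_STATE on the
 * target, and the target vCPU later switches to the new guest-provided VMSA
 * in __sev_snp_update_protected_guest_state() (when kicked, or on the next
 * INIT via sev_snp_init_protected_guest_state()).
 */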
static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
{
	struct sev_data_snp_guest_request data = {0};
	struct kvm *kvm = svm->vcpu.kvm;
	struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
	sev_ret_code fw_err = 0;
	int ret;

	if (!sev_snp_guest(kvm))
		return -EINVAL;

	mutex_lock(&sev->guest_req_mutex);

	if (kvm_read_guest(kvm, req_gpa, sev->guest_req_buf, PAGE_SIZE)) {
		ret = -EIO;
		goto out_unlock;
	}

	data.gctx_paddr = __psp_pa(sev->snp_context);
	data.req_paddr = __psp_pa(sev->guest_req_buf);
	data.res_paddr = __psp_pa(sev->guest_resp_buf);

	/*
	 * Firmware failures are propagated on to guest, but any other failure
	 * condition along the way should be reported to userspace. E.g. if
	 * the PSP is dead and commands are timing out.
	 */
	ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err);
	if (ret && !fw_err)
		goto out_unlock;

	if (kvm_write_guest(kvm, resp_gpa, sev->guest_resp_buf, PAGE_SIZE)) {
		ret = -EIO;
		goto out_unlock;
	}

	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, SNP_GUEST_ERR(0, fw_err));

	ret = 1; /* resume guest */

out_unlock:
	mutex_unlock(&sev->guest_req_mutex);

	return ret;
}
static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
{
	struct kvm *kvm = svm->vcpu.kvm;
	u8 msg_type;

	if (!sev_snp_guest(kvm))
		return -EINVAL;

	if (kvm_read_guest(kvm, req_gpa + offsetof(struct snp_guest_msg_hdr, msg_type),
			   &msg_type, 1))
		return -EIO;

	/*
	 * As per GHCB spec, requests of type MSG_REPORT_REQ also allow for
	 * additional certificate data to be provided alongside the attestation
	 * report via the guest-provided data pages indicated by RAX/RBX. The
	 * certificate data is optional and requires additional KVM enablement
	 * to provide an interface for userspace to provide it, but KVM still
	 * needs to be able to handle extended guest requests either way. So
	 * provide a stub implementation that will always return an empty
	 * certificate table in the guest-provided data pages.
	 */
	if (msg_type == SNP_MSG_REPORT_REQ) {
		struct kvm_vcpu *vcpu = &svm->vcpu;
		u64 data_gpa;
		u64 data_npages;

		if (!kvm_ghcb_rax_is_valid(svm) || !kvm_ghcb_rbx_is_valid(svm))
			goto request_invalid;

		data_gpa = vcpu->arch.regs[VCPU_REGS_RAX];
		data_npages = vcpu->arch.regs[VCPU_REGS_RBX];

		if (!PAGE_ALIGNED(data_gpa))
			goto request_invalid;

		/*
		 * As per GHCB spec (see "SNP Extended Guest Request"), the
		 * certificate table is terminated by 24-bytes of zeroes.
		 */
		if (data_npages && kvm_clear_guest(kvm, data_gpa, 24))
			return -EIO;
	}

	return snp_handle_guest_req(svm, req_gpa, resp_gpa);

request_invalid:
	ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT);
	return 1; /* resume guest */
}
4119 static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm
*svm
)
4121 struct vmcb_control_area
*control
= &svm
->vmcb
->control
;
4122 struct kvm_vcpu
*vcpu
= &svm
->vcpu
;
4123 struct kvm_sev_info
*sev
= &to_kvm_svm(vcpu
->kvm
)->sev_info
;
4127 ghcb_info
= control
->ghcb_gpa
& GHCB_MSR_INFO_MASK
;
4129 trace_kvm_vmgexit_msr_protocol_enter(svm
->vcpu
.vcpu_id
,
4132 switch (ghcb_info
) {
4133 case GHCB_MSR_SEV_INFO_REQ
:
4134 set_ghcb_msr(svm
, GHCB_MSR_SEV_INFO((__u64
)sev
->ghcb_version
,
4138 case GHCB_MSR_CPUID_REQ
: {
4139 u64 cpuid_fn
, cpuid_reg
, cpuid_value
;
4141 cpuid_fn
= get_ghcb_msr_bits(svm
,
4142 GHCB_MSR_CPUID_FUNC_MASK
,
4143 GHCB_MSR_CPUID_FUNC_POS
);
4145 /* Initialize the registers needed by the CPUID intercept */
4146 vcpu
->arch
.regs
[VCPU_REGS_RAX
] = cpuid_fn
;
4147 vcpu
->arch
.regs
[VCPU_REGS_RCX
] = 0;
4149 ret
= svm_invoke_exit_handler(vcpu
, SVM_EXIT_CPUID
);
4151 /* Error, keep GHCB MSR value as-is */
4155 cpuid_reg
= get_ghcb_msr_bits(svm
,
4156 GHCB_MSR_CPUID_REG_MASK
,
4157 GHCB_MSR_CPUID_REG_POS
);
4159 cpuid_value
= vcpu
->arch
.regs
[VCPU_REGS_RAX
];
4160 else if (cpuid_reg
== 1)
4161 cpuid_value
= vcpu
->arch
.regs
[VCPU_REGS_RBX
];
4162 else if (cpuid_reg
== 2)
4163 cpuid_value
= vcpu
->arch
.regs
[VCPU_REGS_RCX
];
4165 cpuid_value
= vcpu
->arch
.regs
[VCPU_REGS_RDX
];
4167 set_ghcb_msr_bits(svm
, cpuid_value
,
4168 GHCB_MSR_CPUID_VALUE_MASK
,
4169 GHCB_MSR_CPUID_VALUE_POS
);
4171 set_ghcb_msr_bits(svm
, GHCB_MSR_CPUID_RESP
,
4176 case GHCB_MSR_AP_RESET_HOLD_REQ
:
4177 svm
->sev_es
.ap_reset_hold_type
= AP_RESET_HOLD_MSR_PROTO
;
4178 ret
= kvm_emulate_ap_reset_hold(&svm
->vcpu
);
4181 * Preset the result to a non-SIPI return and then only set
4182 * the result to non-zero when delivering a SIPI.
4184 set_ghcb_msr_bits(svm
, 0,
4185 GHCB_MSR_AP_RESET_HOLD_RESULT_MASK
,
4186 GHCB_MSR_AP_RESET_HOLD_RESULT_POS
);
4188 set_ghcb_msr_bits(svm
, GHCB_MSR_AP_RESET_HOLD_RESP
,
4192 case GHCB_MSR_HV_FT_REQ
:
4193 set_ghcb_msr_bits(svm
, GHCB_HV_FT_SUPPORTED
,
4194 GHCB_MSR_HV_FT_MASK
, GHCB_MSR_HV_FT_POS
);
4195 set_ghcb_msr_bits(svm
, GHCB_MSR_HV_FT_RESP
,
4196 GHCB_MSR_INFO_MASK
, GHCB_MSR_INFO_POS
);
4198 case GHCB_MSR_PREF_GPA_REQ
:
4199 if (!sev_snp_guest(vcpu
->kvm
))
4202 set_ghcb_msr_bits(svm
, GHCB_MSR_PREF_GPA_NONE
, GHCB_MSR_GPA_VALUE_MASK
,
4203 GHCB_MSR_GPA_VALUE_POS
);
4204 set_ghcb_msr_bits(svm
, GHCB_MSR_PREF_GPA_RESP
, GHCB_MSR_INFO_MASK
,
4207 case GHCB_MSR_REG_GPA_REQ
: {
4210 if (!sev_snp_guest(vcpu
->kvm
))
4213 gfn
= get_ghcb_msr_bits(svm
, GHCB_MSR_GPA_VALUE_MASK
,
4214 GHCB_MSR_GPA_VALUE_POS
);
4216 svm
->sev_es
.ghcb_registered_gpa
= gfn_to_gpa(gfn
);
4218 set_ghcb_msr_bits(svm
, gfn
, GHCB_MSR_GPA_VALUE_MASK
,
4219 GHCB_MSR_GPA_VALUE_POS
);
4220 set_ghcb_msr_bits(svm
, GHCB_MSR_REG_GPA_RESP
, GHCB_MSR_INFO_MASK
,
4224 case GHCB_MSR_PSC_REQ
:
4225 if (!sev_snp_guest(vcpu
->kvm
))
4228 ret
= snp_begin_psc_msr(svm
, control
->ghcb_gpa
);
4230 case GHCB_MSR_TERM_REQ
: {
4231 u64 reason_set
, reason_code
;
4233 reason_set
= get_ghcb_msr_bits(svm
,
4234 GHCB_MSR_TERM_REASON_SET_MASK
,
4235 GHCB_MSR_TERM_REASON_SET_POS
);
4236 reason_code
= get_ghcb_msr_bits(svm
,
4237 GHCB_MSR_TERM_REASON_MASK
,
4238 GHCB_MSR_TERM_REASON_POS
);
4239 pr_info("SEV-ES guest requested termination: %#llx:%#llx\n",
4240 reason_set
, reason_code
);
4245 /* Error, keep GHCB MSR value as-is */
4249 trace_kvm_vmgexit_msr_protocol_exit(svm
->vcpu
.vcpu_id
,
4250 control
->ghcb_gpa
, ret
);
4255 vcpu
->run
->exit_reason
= KVM_EXIT_SYSTEM_EVENT
;
4256 vcpu
->run
->system_event
.type
= KVM_SYSTEM_EVENT_SEV_TERM
;
4257 vcpu
->run
->system_event
.ndata
= 1;
4258 vcpu
->run
->system_event
.data
[0] = control
->ghcb_gpa
;
4263 int sev_handle_vmgexit(struct kvm_vcpu
*vcpu
)
4265 struct vcpu_svm
*svm
= to_svm(vcpu
);
4266 struct vmcb_control_area
*control
= &svm
->vmcb
->control
;
4267 u64 ghcb_gpa
, exit_code
;
4270 /* Validate the GHCB */
4271 ghcb_gpa
= control
->ghcb_gpa
;
4272 if (ghcb_gpa
& GHCB_MSR_INFO_MASK
)
4273 return sev_handle_vmgexit_msr_protocol(svm
);
4276 vcpu_unimpl(vcpu
, "vmgexit: GHCB gpa is not set\n");
4278 /* Without a GHCB, just return right back to the guest */
4282 if (kvm_vcpu_map(vcpu
, ghcb_gpa
>> PAGE_SHIFT
, &svm
->sev_es
.ghcb_map
)) {
4283 /* Unable to map GHCB from guest */
4284 vcpu_unimpl(vcpu
, "vmgexit: error mapping GHCB [%#llx] from guest\n",
4287 /* Without a GHCB, just return right back to the guest */
4291 svm
->sev_es
.ghcb
= svm
->sev_es
.ghcb_map
.hva
;
4293 trace_kvm_vmgexit_enter(vcpu
->vcpu_id
, svm
->sev_es
.ghcb
);
4295 sev_es_sync_from_ghcb(svm
);
4297 /* SEV-SNP guest requires that the GHCB GPA must be registered */
4298 if (sev_snp_guest(svm
->vcpu
.kvm
) && !ghcb_gpa_is_registered(svm
, ghcb_gpa
)) {
4299 vcpu_unimpl(&svm
->vcpu
, "vmgexit: GHCB GPA [%#llx] is not registered.\n", ghcb_gpa
);
4303 ret
= sev_es_validate_vmgexit(svm
);
4307 ghcb_set_sw_exit_info_1(svm
->sev_es
.ghcb
, 0);
4308 ghcb_set_sw_exit_info_2(svm
->sev_es
.ghcb
, 0);
4310 exit_code
= kvm_ghcb_get_sw_exit_code(control
);
4311 switch (exit_code
) {
4312 case SVM_VMGEXIT_MMIO_READ
:
4313 ret
= setup_vmgexit_scratch(svm
, true, control
->exit_info_2
);
4317 ret
= kvm_sev_es_mmio_read(vcpu
,
4318 control
->exit_info_1
,
4319 control
->exit_info_2
,
4320 svm
->sev_es
.ghcb_sa
);
4322 case SVM_VMGEXIT_MMIO_WRITE
:
4323 ret
= setup_vmgexit_scratch(svm
, false, control
->exit_info_2
);
4327 ret
= kvm_sev_es_mmio_write(vcpu
,
4328 control
->exit_info_1
,
4329 control
->exit_info_2
,
4330 svm
->sev_es
.ghcb_sa
);
4332 case SVM_VMGEXIT_NMI_COMPLETE
:
4333 ++vcpu
->stat
.nmi_window_exits
;
4334 svm
->nmi_masked
= false;
4335 kvm_make_request(KVM_REQ_EVENT
, vcpu
);
4338 case SVM_VMGEXIT_AP_HLT_LOOP
:
4339 svm
->sev_es
.ap_reset_hold_type
= AP_RESET_HOLD_NAE_EVENT
;
4340 ret
= kvm_emulate_ap_reset_hold(vcpu
);
4342 case SVM_VMGEXIT_AP_JUMP_TABLE
: {
4343 struct kvm_sev_info
*sev
= &to_kvm_svm(vcpu
->kvm
)->sev_info
;
4345 switch (control
->exit_info_1
) {
4347 /* Set AP jump table address */
4348 sev
->ap_jump_table
= control
->exit_info_2
;
4351 /* Get AP jump table address */
4352 ghcb_set_sw_exit_info_2(svm
->sev_es
.ghcb
, sev
->ap_jump_table
);
4355 pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n",
4356 control
->exit_info_1
);
4357 ghcb_set_sw_exit_info_1(svm
->sev_es
.ghcb
, 2);
4358 ghcb_set_sw_exit_info_2(svm
->sev_es
.ghcb
, GHCB_ERR_INVALID_INPUT
);
4364 case SVM_VMGEXIT_HV_FEATURES
:
4365 ghcb_set_sw_exit_info_2(svm
->sev_es
.ghcb
, GHCB_HV_FT_SUPPORTED
);
4369 case SVM_VMGEXIT_TERM_REQUEST
:
4370 pr_info("SEV-ES guest requested termination: reason %#llx info %#llx\n",
4371 control
->exit_info_1
, control
->exit_info_2
);
4372 vcpu
->run
->exit_reason
= KVM_EXIT_SYSTEM_EVENT
;
4373 vcpu
->run
->system_event
.type
= KVM_SYSTEM_EVENT_SEV_TERM
;
4374 vcpu
->run
->system_event
.ndata
= 1;
4375 vcpu
->run
->system_event
.data
[0] = control
->ghcb_gpa
;
4377 case SVM_VMGEXIT_PSC
:
4378 ret
= setup_vmgexit_scratch(svm
, true, control
->exit_info_2
);
4382 ret
= snp_begin_psc(svm
, svm
->sev_es
.ghcb_sa
);
4384 case SVM_VMGEXIT_AP_CREATION
:
4385 ret
= sev_snp_ap_creation(svm
);
4387 ghcb_set_sw_exit_info_1(svm
->sev_es
.ghcb
, 2);
4388 ghcb_set_sw_exit_info_2(svm
->sev_es
.ghcb
, GHCB_ERR_INVALID_INPUT
);
4393 case SVM_VMGEXIT_GUEST_REQUEST
:
4394 ret
= snp_handle_guest_req(svm
, control
->exit_info_1
, control
->exit_info_2
);
4396 case SVM_VMGEXIT_EXT_GUEST_REQUEST
:
4397 ret
= snp_handle_ext_guest_req(svm
, control
->exit_info_1
, control
->exit_info_2
);
4399 case SVM_VMGEXIT_UNSUPPORTED_EVENT
:
4401 "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
4402 control
->exit_info_1
, control
->exit_info_2
);
4406 ret
= svm_invoke_exit_handler(vcpu
, exit_code
);
int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
{
	int count;
	int bytes;
	int r;

	if (svm->vmcb->control.exit_info_2 > INT_MAX)
		return -EINVAL;

	count = svm->vmcb->control.exit_info_2;
	if (unlikely(check_mul_overflow(count, size, &bytes)))
		return -EINVAL;

	r = setup_vmgexit_scratch(svm, in, bytes);
	if (r)
		return r;

	return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa,
				    count, in);
}
static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;

	if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) {
		bool v_tsc_aux = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
				 guest_cpuid_has(vcpu, X86_FEATURE_RDPID);

		set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux);
	}

	/*
	 * For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if
	 * the host/guest supports its use.
	 *
	 * guest_can_use() checks a number of requirements on the host/guest to
	 * ensure that MSR_IA32_XSS is available, but it might report true even
	 * if X86_FEATURE_XSAVES isn't configured in the guest to ensure host
	 * MSR_IA32_XSS is always properly restored. For SEV-ES, it is better
	 * to further check that the guest CPUID actually supports
	 * X86_FEATURE_XSAVES so that accesses to MSR_IA32_XSS by misbehaved
	 * guests will still get intercepted and caught in the normal
	 * kvm_emulate_rdmsr()/kvm_emulate_wrmsr() paths.
	 */
	if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
	    guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 1, 1);
	else
		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 0, 0);
}
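/*
 * Note for sev_es_vcpu_after_set_cpuid() above: the last two arguments to
 * set_msr_interception() act as read/write pass-through flags, so (1, 1)
 * leaves the MSR unintercepted for the guest while (0, 0) keeps both
 * accesses intercepted, which is what routes misbehaved MSR_IA32_XSS
 * accesses into the rdmsr/wrmsr emulation paths mentioned above.
 */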
void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	struct kvm_cpuid_entry2 *best;

	/* For sev guests, the memory encryption bit is not reserved in CR3. */
	best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
	if (best)
		vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));

	if (sev_es_guest(svm->vcpu.kvm))
		sev_es_vcpu_after_set_cpuid(svm);
}
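/*
 * Example for sev_vcpu_after_set_cpuid() above: CPUID 0x8000001F.EBX[5:0]
 * reports the position of the memory encryption bit (C-bit). If that is,
 * say, bit 51, the mask above clears bit 51 from reserved_gpa_bits so a
 * guest CR3 (or other GPA) with the C-bit set is not rejected as having
 * reserved bits set.
 */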
static void sev_es_init_vmcb(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = svm->vmcb01.ptr;
	struct kvm_vcpu *vcpu = &svm->vcpu;

	svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE;

	/*
	 * An SEV-ES guest requires a VMSA area that is separate from the
	 * VMCB page. Do not include the encryption mask on the VMSA physical
	 * address since hardware will access it using the guest key. Note,
	 * the VMSA will be NULL if this vCPU is the destination for intrahost
	 * migration, and will be copied later.
	 */
	if (svm->sev_es.vmsa && !svm->sev_es.snp_has_guest_vmsa)
		svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);

	/* Can't intercept CR register access, HV can't modify CR registers */
	svm_clr_intercept(svm, INTERCEPT_CR0_READ);
	svm_clr_intercept(svm, INTERCEPT_CR4_READ);
	svm_clr_intercept(svm, INTERCEPT_CR8_READ);
	svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
	svm_clr_intercept(svm, INTERCEPT_CR4_WRITE);
	svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);

	svm_clr_intercept(svm, INTERCEPT_SELECTIVE_CR0);

	/* Track EFER/CR register changes */
	svm_set_intercept(svm, TRAP_EFER_WRITE);
	svm_set_intercept(svm, TRAP_CR0_WRITE);
	svm_set_intercept(svm, TRAP_CR4_WRITE);
	svm_set_intercept(svm, TRAP_CR8_WRITE);

	vmcb->control.intercepts[INTERCEPT_DR] = 0;
	if (!sev_vcpu_has_debug_swap(svm)) {
		vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
		vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
		recalc_intercepts(svm);
	} else {
		/*
		 * Disable #DB intercept iff DebugSwap is enabled. KVM doesn't
		 * allow debugging SEV-ES guests, and enables DebugSwap iff
		 * NO_NESTED_DATA_BP is supported, so there's no reason to
		 * intercept #DB when DebugSwap is enabled. For simplicity
		 * with respect to guest debug, intercept #DB for other VMs
		 * even if NO_NESTED_DATA_BP is supported, i.e. even if the
		 * guest can't DoS the CPU with infinite #DB vectoring.
		 */
		clr_exception_intercept(svm, DB_VECTOR);
	}

	/* Can't intercept XSETBV, HV can't modify XCR0 directly */
	svm_clr_intercept(svm, INTERCEPT_XSETBV);

	/* Clear intercepts on selected MSRs */
	set_msr_interception(vcpu, svm->msrpm, MSR_EFER, 1, 1);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_CR_PAT, 1, 1);
}
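/*
 * Note for sev_es_init_vmcb() above: for SEV-ES the hypervisor cannot read
 * or modify most guest register state, so the usual CR0/CR4/CR8 and XSETBV
 * intercepts are cleared; the TRAP_* write traps fire after the register has
 * been updated, which lets KVM track EFER/CR changes without being able to
 * alter them.
 */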
void sev_init_vmcb(struct vcpu_svm *svm)
{
	svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
	clr_exception_intercept(svm, UD_VECTOR);

	/*
	 * Don't intercept #GP for SEV guests, e.g. for the VMware backdoor, as
	 * KVM can't decrypt guest memory to decode the faulting instruction.
	 */
	clr_exception_intercept(svm, GP_VECTOR);

	if (sev_es_guest(svm->vcpu.kvm))
		sev_es_init_vmcb(svm);
}
void sev_es_vcpu_reset(struct vcpu_svm *svm)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;

	/*
	 * Set the GHCB MSR value as per the GHCB specification when emulating
	 * vCPU RESET for an SEV-ES guest.
	 */
	set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
					    GHCB_VERSION_MIN,
					    sev_enc_bit));

	mutex_init(&svm->sev_es.snp_vmsa_mutex);
}
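/*
 * Note for sev_es_vcpu_reset() above: the GHCB MSR's SEV_INFO response
 * advertises the supported GHCB protocol range and the C-bit position to the
 * guest, here built from the per-VM ghcb_version (maximum), GHCB_VERSION_MIN
 * (minimum) and sev_enc_bit.
 */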
void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa)
{
	/*
	 * All host state for SEV-ES guests is categorized into three swap types
	 * based on how it is handled by hardware during a world switch:
	 *
	 * A: VMRUN:   Host state saved in host save area
	 *    VMEXIT:  Host state loaded from host save area
	 *
	 * B: VMRUN:   Host state _NOT_ saved in host save area
	 *    VMEXIT:  Host state loaded from host save area
	 *
	 * C: VMRUN:   Host state _NOT_ saved in host save area
	 *    VMEXIT:  Host state initialized to default(reset) values
	 *
	 * Manually save type-B state, i.e. state that is loaded by VMEXIT but
	 * isn't saved by VMRUN, that isn't already saved by VMSAVE (performed
	 * by common SVM code).
	 */
	hostsa->xcr0 = kvm_host.xcr0;
	hostsa->pkru = read_pkru();
	hostsa->xss = kvm_host.xss;

	/*
	 * If DebugSwap is enabled, debug registers are loaded but NOT saved by
	 * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both
	 * saves and loads debug registers (Type-A).
	 */
	if (sev_vcpu_has_debug_swap(svm)) {
		hostsa->dr0 = native_get_debugreg(0);
		hostsa->dr1 = native_get_debugreg(1);
		hostsa->dr2 = native_get_debugreg(2);
		hostsa->dr3 = native_get_debugreg(3);
		hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0);
		hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1);
		hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2);
		hostsa->dr3_addr_mask = amd_get_dr_addr_mask(3);
	}
}
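/*
 * Example of Type-B state for sev_es_prepare_switch_to_guest() above: XCR0
 * is loaded from the host save area on VMEXIT but never written there by
 * VMRUN, so KVM must refresh hostsa->xcr0 (and likewise pkru/xss) itself
 * before switching into the guest.
 */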
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/* First SIPI: Use the values as initially set by the VMM */
	if (!svm->sev_es.received_first_sipi) {
		svm->sev_es.received_first_sipi = true;
		return;
	}

	/* Subsequent SIPI */
	switch (svm->sev_es.ap_reset_hold_type) {
	case AP_RESET_HOLD_NAE_EVENT:
		/*
		 * Return from an AP Reset Hold VMGEXIT, where the guest will
		 * set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value.
		 */
		ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
		break;
	case AP_RESET_HOLD_MSR_PROTO:
		/*
		 * Return from an AP Reset Hold VMGEXIT, where the guest will
		 * set the CS and RIP. Set GHCB data field to a non-zero value.
		 */
		set_ghcb_msr_bits(svm, 1,
				  GHCB_MSR_AP_RESET_HOLD_RESULT_MASK,
				  GHCB_MSR_AP_RESET_HOLD_RESULT_POS);

		set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP,
				  GHCB_MSR_INFO_MASK,
				  GHCB_MSR_INFO_POS);
		break;
	default:
		break;
	}
}
struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
{
	unsigned long pfn;
	struct page *p;

	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
		return alloc_pages_node(node, gfp | __GFP_ZERO, 0);

	/*
	 * Allocate an SNP-safe page to work around the SNP erratum where
	 * the CPU will incorrectly signal an RMP violation #PF if a
	 * hugepage (2MB or 1GB) collides with the RMP entry of a
	 * 2MB-aligned VMCB, VMSA, or AVIC backing page.
	 *
	 * Allocate one extra page, choose a page which is not
	 * 2MB-aligned, and free the other.
	 */
	p = alloc_pages_node(node, gfp | __GFP_ZERO, 1);
	if (!p)
		return NULL;

	split_page(p, 1);

	pfn = page_to_pfn(p);
	if (IS_ALIGNED(pfn, PTRS_PER_PMD))
		__free_page(p++);
	else
		__free_page(p + 1);

	return p;
}
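/*
 * Example for snp_safe_alloc_page_node() above: the order-1 allocation
 * returns two consecutive pages, at most one of which can be 2MB-aligned.
 * If page_to_pfn(p) is a multiple of PTRS_PER_PMD (512 with 4K pages), the
 * aligned first page is freed and the second returned; otherwise the second
 * page is freed and the first returned.
 */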
void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code)
{
	struct kvm_memory_slot *slot;
	struct kvm *kvm = vcpu->kvm;
	int order, rmp_level, ret;
	struct page *page;
	bool assigned;
	kvm_pfn_t pfn;
	gfn_t gfn;

	gfn = gpa >> PAGE_SHIFT;

	/*
	 * The only time RMP faults occur for shared pages is when the guest is
	 * triggering an RMP fault for an implicit page-state change from
	 * shared->private. Implicit page-state changes are forwarded to
	 * userspace via KVM_EXIT_MEMORY_FAULT events, however, so RMP faults
	 * for shared pages should not end up here.
	 */
	if (!kvm_mem_is_private(kvm, gfn)) {
		pr_warn_ratelimited("SEV: Unexpected RMP fault for non-private GPA 0x%llx\n",
				    gpa);
		return;
	}

	slot = gfn_to_memslot(kvm, gfn);
	if (!kvm_slot_can_be_private(slot)) {
		pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n",
				    gpa);
		return;
	}

	ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &page, &order);
	if (ret) {
		pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n",
				    gpa);
		return;
	}

	ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
	if (ret || !assigned) {
		pr_warn_ratelimited("SEV: Unexpected RMP fault, no assigned RMP entry found for GPA 0x%llx PFN 0x%llx error %d\n",
				    gpa, pfn, ret);
		goto out_no_trace;
	}

	/*
	 * There are 2 cases where a PSMASH may be needed to resolve an #NPF
	 * with PFERR_GUEST_RMP_BIT set:
	 *
	 * 1) RMPADJUST/PVALIDATE can trigger an #NPF with PFERR_GUEST_SIZEM
	 *    bit set if the guest issues them with a smaller granularity than
	 *    what is indicated by the page-size bit in the 2MB RMP entry for
	 *    the PFN that backs the GPA.
	 *
	 * 2) Guest access via NPT can trigger an #NPF if the NPT mapping is
	 *    smaller than what is indicated by the 2MB RMP entry for the PFN
	 *    that backs the GPA.
	 *
	 * In both these cases, the corresponding 2M RMP entry needs to
	 * be PSMASH'd to 512 4K RMP entries. If the RMP entry is already
	 * split into 4K RMP entries, then this is likely a spurious case which
	 * can occur when there are concurrent accesses by the guest to a 2MB
	 * GPA range that is backed by a 2MB-aligned PFN whose RMP entry is in
	 * the process of being PSMASH'd into 4K entries. These cases should
	 * resolve automatically on subsequent accesses, so just ignore them
	 * here.
	 */
	if (rmp_level == PG_LEVEL_4K)
		goto out;

	ret = snp_rmptable_psmash(pfn);
	if (ret) {
		/*
		 * Look it up again. If it's 4K now then the PSMASH may have
		 * raced with another process and the issue has already resolved
		 * itself.
		 */
		if (!snp_lookup_rmpentry(pfn, &assigned, &rmp_level) &&
		    assigned && rmp_level == PG_LEVEL_4K)
			goto out;

		pr_warn_ratelimited("SEV: Unable to split RMP entry for GPA 0x%llx PFN 0x%llx ret %d\n",
				    gpa, pfn, ret);
	}

	kvm_zap_gfn_range(kvm, gfn, gfn + PTRS_PER_PMD);
out:
	trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret);
out_no_trace:
	kvm_release_page_unused(page);
}
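/*
 * Note for sev_handle_rmp_fault() above: PSMASH converts a single 2M RMP
 * entry into 512 4K entries; the subsequent kvm_zap_gfn_range() forces the
 * NPT mapping covering that 2M region to be rebuilt on the next fault at a
 * size consistent with the now-smaller RMP entries.
 */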
static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end)
{
	kvm_pfn_t pfn = start;

	while (pfn < end) {
		int ret, rmp_level;
		bool assigned;

		ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
		if (ret) {
			pr_warn_ratelimited("SEV: Failed to retrieve RMP entry: PFN 0x%llx GFN start 0x%llx GFN end 0x%llx RMP level %d error %d\n",
					    pfn, start, end, rmp_level, ret);
			return false;
		}

		if (assigned) {
			pr_debug("%s: overlap detected, PFN 0x%llx start 0x%llx end 0x%llx RMP level %d\n",
				 __func__, pfn, start, end, rmp_level);
			return false;
		}

		pfn++;
	}

	return true;
}
static u8 max_level_for_order(int order)
{
	if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
		return PG_LEVEL_2M;

	return PG_LEVEL_4K;
}
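/*
 * Example for max_level_for_order() above: with 4K base pages,
 * KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) is 9, so a folio of order 9 or larger
 * (i.e. at least 2MB) may be mapped at 2M granularity, while smaller orders
 * are limited to 4K.
 */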
static bool is_large_rmp_possible(struct kvm *kvm, kvm_pfn_t pfn, int order)
{
	kvm_pfn_t pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD);

	/*
	 * If this is a large folio, and the entire 2M range containing the
	 * PFN is currently shared, then the entire 2M-aligned range can be
	 * set to private via a single 2M RMP entry.
	 */
	if (max_level_for_order(order) > PG_LEVEL_4K &&
	    is_pfn_range_shared(pfn_aligned, pfn_aligned + PTRS_PER_PMD))
		return true;

	return false;
}
int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order)
{
	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
	kvm_pfn_t pfn_aligned;
	gfn_t gfn_aligned;
	int level, rc;
	bool assigned;

	if (!sev_snp_guest(kvm))
		return 0;

	rc = snp_lookup_rmpentry(pfn, &assigned, &level);
	if (rc) {
		pr_err_ratelimited("SEV: Failed to look up RMP entry: GFN %llx PFN %llx error %d\n",
				   gfn, pfn, rc);
		return -ENOENT;
	}

	if (assigned) {
		pr_debug("%s: already assigned: gfn %llx pfn %llx max_order %d level %d\n",
			 __func__, gfn, pfn, max_order, level);
		return 0;
	}

	if (is_large_rmp_possible(kvm, pfn, max_order)) {
		level = PG_LEVEL_2M;
		pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD);
		gfn_aligned = ALIGN_DOWN(gfn, PTRS_PER_PMD);
	} else {
		level = PG_LEVEL_4K;
		pfn_aligned = pfn;
		gfn_aligned = gfn;
	}

	rc = rmp_make_private(pfn_aligned, gfn_to_gpa(gfn_aligned), level, sev->asid, false);
	if (rc) {
		pr_err_ratelimited("SEV: Failed to update RMP entry: GFN %llx PFN %llx level %d error %d\n",
				   gfn, pfn, level, rc);
		return -EINVAL;
	}

	pr_debug("%s: updated: gfn %llx pfn %llx pfn_aligned %llx max_order %d level %d\n",
		 __func__, gfn, pfn, pfn_aligned, max_order, level);

	return 0;
}
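/*
 * Example for sev_gmem_prepare() above: if the faulting pfn sits in a
 * 2MB-aligned range that is entirely shared and the backing folio is large
 * enough, a single rmp_make_private() call at PG_LEVEL_2M covers all 512
 * pages; otherwise only the single 4K pfn is converted to private.
 */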
void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
{
	kvm_pfn_t pfn;

	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
		return;

	pr_debug("%s: PFN start 0x%llx PFN end 0x%llx\n", __func__, start, end);

	for (pfn = start; pfn < end;) {
		bool use_2m_update = false;
		int rc, rmp_level;
		bool assigned;

		rc = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
		if (rc || !assigned)
			goto next_pfn;

		use_2m_update = IS_ALIGNED(pfn, PTRS_PER_PMD) &&
				end >= (pfn + PTRS_PER_PMD) &&
				rmp_level > PG_LEVEL_4K;

		/*
		 * If an unaligned PFN corresponds to a 2M region assigned as a
		 * large page in the RMP table, PSMASH the region into individual
		 * 4K RMP entries before attempting to convert a 4K sub-page.
		 */
		if (!use_2m_update && rmp_level > PG_LEVEL_4K) {
			/*
			 * This shouldn't fail, but if it does, report it, but
			 * still try to update RMP entry to shared and pray this
			 * was a spurious error that can be addressed later.
			 */
			rc = snp_rmptable_psmash(pfn);
			WARN_ONCE(rc, "SEV: Failed to PSMASH RMP entry for PFN 0x%llx error %d\n",
				  pfn, rc);
		}

		rc = rmp_make_shared(pfn, use_2m_update ? PG_LEVEL_2M : PG_LEVEL_4K);
		if (WARN_ONCE(rc, "SEV: Failed to update RMP entry for PFN 0x%llx error %d\n",
			      pfn, rc))
			goto next_pfn;

		/*
		 * SEV-ES avoids host/guest cache coherency issues through
		 * WBINVD hooks issued via MMU notifiers during run-time, and
		 * KVM's VM destroy path at shutdown. Those MMU notifier events
		 * don't cover gmem since there is no requirement to map pages
		 * to a HVA in order to use them for a running guest. While the
		 * shutdown path would still likely cover things for SNP guests,
		 * userspace may also free gmem pages during run-time via
		 * hole-punching operations on the guest_memfd, so flush the
		 * cache entries for these pages before freeing them back to
		 * the host.
		 */
		clflush_cache_range(__va(pfn_to_hpa(pfn)),
				    use_2m_update ? PMD_SIZE : PAGE_SIZE);
next_pfn:
		pfn += use_2m_update ? PTRS_PER_PMD : 1;
		cond_resched();
	}
}
int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
{
	int level, rc;
	bool assigned;

	if (!sev_snp_guest(kvm))
		return 0;

	rc = snp_lookup_rmpentry(pfn, &assigned, &level);
	if (rc || !assigned)