// SPDX-License-Identifier: GPL-2.0
/*
 * Device driver to expose SGX enclave memory to KVM guests.
 *
 * Copyright(c) 2021 Intel Corporation.
 */
#include <linux/miscdevice.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/xarray.h>
#include <asm/sgx.h>
#include <uapi/asm/sgx.h>

#include "encls.h"
#include "sgx.h"
22 struct xarray page_array
;
27 * Temporary SECS pages that cannot be EREMOVE'd due to having child in other
28 * virtual EPC instances, and the lock to protect it.
30 static struct mutex zombie_secs_pages_lock
;
31 static struct list_head zombie_secs_pages
;
33 static int __sgx_vepc_fault(struct sgx_vepc
*vepc
,
34 struct vm_area_struct
*vma
, unsigned long addr
)
36 struct sgx_epc_page
*epc_page
;
37 unsigned long index
, pfn
;
40 WARN_ON(!mutex_is_locked(&vepc
->lock
));
42 /* Calculate index of EPC page in virtual EPC's page_array */
43 index
= vma
->vm_pgoff
+ PFN_DOWN(addr
- vma
->vm_start
);
45 epc_page
= xa_load(&vepc
->page_array
, index
);
49 epc_page
= sgx_alloc_epc_page(vepc
, false);
51 return PTR_ERR(epc_page
);
53 ret
= xa_err(xa_store(&vepc
->page_array
, index
, epc_page
, GFP_KERNEL
));
57 pfn
= PFN_DOWN(sgx_get_epc_phys_addr(epc_page
));
59 ret
= vmf_insert_pfn(vma
, addr
, pfn
);
60 if (ret
!= VM_FAULT_NOPAGE
) {
68 xa_erase(&vepc
->page_array
, index
);
70 sgx_free_epc_page(epc_page
);
74 static vm_fault_t
sgx_vepc_fault(struct vm_fault
*vmf
)
76 struct vm_area_struct
*vma
= vmf
->vma
;
77 struct sgx_vepc
*vepc
= vma
->vm_private_data
;
80 mutex_lock(&vepc
->lock
);
81 ret
= __sgx_vepc_fault(vepc
, vma
, vmf
->address
);
82 mutex_unlock(&vepc
->lock
);
85 return VM_FAULT_NOPAGE
;
87 if (ret
== -EBUSY
&& (vmf
->flags
& FAULT_FLAG_ALLOW_RETRY
)) {
88 mmap_read_unlock(vma
->vm_mm
);
89 return VM_FAULT_RETRY
;
92 return VM_FAULT_SIGBUS
;
95 static const struct vm_operations_struct sgx_vepc_vm_ops
= {
96 .fault
= sgx_vepc_fault
,
99 static int sgx_vepc_mmap(struct file
*file
, struct vm_area_struct
*vma
)
101 struct sgx_vepc
*vepc
= file
->private_data
;
103 if (!(vma
->vm_flags
& VM_SHARED
))
106 vma
->vm_ops
= &sgx_vepc_vm_ops
;
107 /* Don't copy VMA in fork() */
108 vm_flags_set(vma
, VM_PFNMAP
| VM_IO
| VM_DONTDUMP
| VM_DONTCOPY
);
109 vma
->vm_private_data
= vepc
;
/*
 * Run EREMOVE on @epc_page.
 *
 * Return: the result of __eremove() (0 on success).
 */
static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page)
{
	/*
	 * Take a previously guest-owned EPC page and return it to the
	 * general EPC page pool.
	 *
	 * Guests can not be trusted to have left this page in a good
	 * state, so run EREMOVE on the page unconditionally.  In the
	 * case that a guest properly EREMOVE'd this page, a superfluous
	 * EREMOVE is harmless.
	 */
	return __eremove(sgx_get_epc_virt_addr(epc_page));
}
128 static int sgx_vepc_free_page(struct sgx_epc_page
*epc_page
)
130 int ret
= sgx_vepc_remove_page(epc_page
);
133 * Only SGX_CHILD_PRESENT is expected, which is because of
134 * EREMOVE'ing an SECS still with child, in which case it can
135 * be handled by EREMOVE'ing the SECS again after all pages in
136 * virtual EPC have been EREMOVE'd. See comments in below in
137 * sgx_vepc_release().
139 * The user of virtual EPC (KVM) needs to guarantee there's no
140 * logical processor is still running in the enclave in guest,
141 * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be
144 WARN_ONCE(ret
!= SGX_CHILD_PRESENT
, EREMOVE_ERROR_MESSAGE
,
149 sgx_free_epc_page(epc_page
);
153 static long sgx_vepc_remove_all(struct sgx_vepc
*vepc
)
155 struct sgx_epc_page
*entry
;
159 xa_for_each(&vepc
->page_array
, index
, entry
) {
160 int ret
= sgx_vepc_remove_page(entry
);
162 if (ret
== SGX_CHILD_PRESENT
) {
163 /* The page is a SECS, userspace will retry. */
167 * Report errors due to #GP or SGX_ENCLAVE_ACT; do not
168 * WARN, as userspace can induce said failures by
169 * calling the ioctl concurrently on multiple vEPCs or
170 * while one or more CPUs is running the enclave. Only
171 * a #PF on EREMOVE indicates a kernel/hardware issue.
173 WARN_ON_ONCE(encls_faulted(ret
) &&
174 ENCLS_TRAPNR(ret
) != X86_TRAP_GP
);
182 * Return the number of SECS pages that failed to be removed, so
183 * userspace knows that it has to retry.
188 static int sgx_vepc_release(struct inode
*inode
, struct file
*file
)
190 struct sgx_vepc
*vepc
= file
->private_data
;
191 struct sgx_epc_page
*epc_page
, *tmp
, *entry
;
194 LIST_HEAD(secs_pages
);
196 xa_for_each(&vepc
->page_array
, index
, entry
) {
198 * Remove all normal, child pages. sgx_vepc_free_page()
199 * will fail if EREMOVE fails, but this is OK and expected on
200 * SECS pages. Those can only be EREMOVE'd *after* all their
201 * child pages. Retries below will clean them up.
203 if (sgx_vepc_free_page(entry
))
206 xa_erase(&vepc
->page_array
, index
);
211 * Retry EREMOVE'ing pages. This will clean up any SECS pages that
212 * only had children in this 'epc' area.
214 xa_for_each(&vepc
->page_array
, index
, entry
) {
217 * An EREMOVE failure here means that the SECS page still
218 * has children. But, since all children in this 'sgx_vepc'
219 * have been removed, the SECS page must have a child on
222 if (sgx_vepc_free_page(epc_page
))
223 list_add_tail(&epc_page
->list
, &secs_pages
);
225 xa_erase(&vepc
->page_array
, index
);
230 * SECS pages are "pinned" by child pages, and "unpinned" once all
231 * children have been EREMOVE'd. A child page in this instance
232 * may have pinned an SECS page encountered in an earlier release(),
233 * creating a zombie. Since some children were EREMOVE'd above,
234 * try to EREMOVE all zombies in the hopes that one was unpinned.
236 mutex_lock(&zombie_secs_pages_lock
);
237 list_for_each_entry_safe(epc_page
, tmp
, &zombie_secs_pages
, list
) {
239 * Speculatively remove the page from the list of zombies,
240 * if the page is successfully EREMOVE'd it will be added to
241 * the list of free pages. If EREMOVE fails, throw the page
242 * on the local list, which will be spliced on at the end.
244 list_del(&epc_page
->list
);
246 if (sgx_vepc_free_page(epc_page
))
247 list_add_tail(&epc_page
->list
, &secs_pages
);
251 if (!list_empty(&secs_pages
))
252 list_splice_tail(&secs_pages
, &zombie_secs_pages
);
253 mutex_unlock(&zombie_secs_pages_lock
);
255 xa_destroy(&vepc
->page_array
);
261 static int sgx_vepc_open(struct inode
*inode
, struct file
*file
)
263 struct sgx_vepc
*vepc
;
265 vepc
= kzalloc(sizeof(struct sgx_vepc
), GFP_KERNEL
);
268 mutex_init(&vepc
->lock
);
269 xa_init(&vepc
->page_array
);
271 file
->private_data
= vepc
;
276 static long sgx_vepc_ioctl(struct file
*file
,
277 unsigned int cmd
, unsigned long arg
)
279 struct sgx_vepc
*vepc
= file
->private_data
;
282 case SGX_IOC_VEPC_REMOVE_ALL
:
285 return sgx_vepc_remove_all(vepc
);
292 static const struct file_operations sgx_vepc_fops
= {
293 .owner
= THIS_MODULE
,
294 .open
= sgx_vepc_open
,
295 .unlocked_ioctl
= sgx_vepc_ioctl
,
296 .compat_ioctl
= sgx_vepc_ioctl
,
297 .release
= sgx_vepc_release
,
298 .mmap
= sgx_vepc_mmap
,
301 static struct miscdevice sgx_vepc_dev
= {
302 .minor
= MISC_DYNAMIC_MINOR
,
304 .nodename
= "sgx_vepc",
305 .fops
= &sgx_vepc_fops
,
308 int __init
sgx_vepc_init(void)
310 /* SGX virtualization requires KVM to work */
311 if (!cpu_feature_enabled(X86_FEATURE_VMX
))
314 INIT_LIST_HEAD(&zombie_secs_pages
);
315 mutex_init(&zombie_secs_pages_lock
);
317 return misc_register(&sgx_vepc_dev
);
321 * sgx_virt_ecreate() - Run ECREATE on behalf of guest
322 * @pageinfo: Pointer to PAGEINFO structure
323 * @secs: Userspace pointer to SECS page
324 * @trapnr: trap number injected to guest in case of ECREATE error
326 * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose
327 * of enforcing policies of guest's enclaves, and return the trap number
328 * which should be injected to guest in case of any ECREATE error.
331 * - 0: ECREATE was successful.
334 int sgx_virt_ecreate(struct sgx_pageinfo
*pageinfo
, void __user
*secs
,
340 * @secs is an untrusted, userspace-provided address. It comes from
341 * KVM and is assumed to be a valid pointer which points somewhere in
342 * userspace. This can fault and call SGX or other fault handlers when
343 * userspace mapping @secs doesn't exist.
345 * Add a WARN() to make sure @secs is already valid userspace pointer
346 * from caller (KVM), who should already have handled invalid pointer
347 * case (for instance, made by malicious guest). All other checks,
348 * such as alignment of @secs, are deferred to ENCLS itself.
350 if (WARN_ON_ONCE(!access_ok(secs
, PAGE_SIZE
)))
354 ret
= __ecreate(pageinfo
, (void *)secs
);
357 if (encls_faulted(ret
)) {
358 *trapnr
= ENCLS_TRAPNR(ret
);
362 /* ECREATE doesn't return an error code, it faults or succeeds. */
366 EXPORT_SYMBOL_GPL(sgx_virt_ecreate
);
368 static int __sgx_virt_einit(void __user
*sigstruct
, void __user
*token
,
374 * Make sure all userspace pointers from caller (KVM) are valid.
375 * All other checks deferred to ENCLS itself. Also see comment
376 * for @secs in sgx_virt_ecreate().
378 #define SGX_EINITTOKEN_SIZE 304
379 if (WARN_ON_ONCE(!access_ok(sigstruct
, sizeof(struct sgx_sigstruct
)) ||
380 !access_ok(token
, SGX_EINITTOKEN_SIZE
) ||
381 !access_ok(secs
, PAGE_SIZE
)))
385 ret
= __einit((void *)sigstruct
, (void *)token
, (void *)secs
);
392 * sgx_virt_einit() - Run EINIT on behalf of guest
393 * @sigstruct: Userspace pointer to SIGSTRUCT structure
394 * @token: Userspace pointer to EINITTOKEN structure
395 * @secs: Userspace pointer to SECS page
396 * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values
397 * @trapnr: trap number injected to guest in case of EINIT error
399 * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available
400 * in host, SGX driver may rewrite the hardware values at wish, therefore KVM
401 * needs to update hardware values to guest's virtual MSR values in order to
402 * ensure EINIT is executed with expected hardware values.
405 * - 0: EINIT was successful.
408 int sgx_virt_einit(void __user
*sigstruct
, void __user
*token
,
409 void __user
*secs
, u64
*lepubkeyhash
, int *trapnr
)
413 if (!cpu_feature_enabled(X86_FEATURE_SGX_LC
)) {
414 ret
= __sgx_virt_einit(sigstruct
, token
, secs
);
418 sgx_update_lepubkeyhash(lepubkeyhash
);
420 ret
= __sgx_virt_einit(sigstruct
, token
, secs
);
424 /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */
428 if (encls_faulted(ret
)) {
429 *trapnr
= ENCLS_TRAPNR(ret
);
435 EXPORT_SYMBOL_GPL(sgx_virt_einit
);