drm/panthor: Don't add write fences to the shared BOs
[drm/drm-misc.git] / arch / x86 / kernel / cpu / sgx / encl.c
blob279148e7245965dc59198fa3c2705b26aff23f92
1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright(c) 2016-20 Intel Corporation. */
4 #include <linux/lockdep.h>
5 #include <linux/mm.h>
6 #include <linux/mman.h>
7 #include <linux/shmem_fs.h>
8 #include <linux/suspend.h>
9 #include <linux/sched/mm.h>
10 #include <asm/sgx.h>
11 #include "encl.h"
12 #include "encls.h"
13 #include "sgx.h"
/* Defined near the end of this file; needed earlier by __sgx_encl_eldu(). */
static int sgx_encl_lookup_backing(struct sgx_encl *encl,
				   unsigned long page_index,
				   struct sgx_backing *backing);

/* Number of PCMD entries that fit into one page of backing storage. */
#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd))

/*
 * A PCMD page is shared by 32 PCMD entries. Clearing the PCMD_FIRST_MASK bits
 * of a page index yields the index of the page that owns the first PCMD entry
 * within the same PCMD page.
 */
#define PCMD_FIRST_MASK GENMASK(4, 0)
26 /**
27 * reclaimer_writing_to_pcmd() - Query if any enclave page associated with
28 * a PCMD page is in process of being reclaimed.
29 * @encl: Enclave to which PCMD page belongs
30 * @start_addr: Address of enclave page using first entry within the PCMD page
32 * When an enclave page is reclaimed some Paging Crypto MetaData (PCMD) is
33 * stored. The PCMD data of a reclaimed enclave page contains enough
34 * information for the processor to verify the page at the time
35 * it is loaded back into the Enclave Page Cache (EPC).
37 * The backing storage to which enclave pages are reclaimed is laid out as
38 * follows:
39 * Encrypted enclave pages:SECS page:PCMD pages
41 * Each PCMD page contains the PCMD metadata of
42 * PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages.
44 * A PCMD page can only be truncated if it is (a) empty, and (b) not in the
45 * process of getting data (and thus soon being non-empty). (b) is tested with
46 * a check if an enclave page sharing the PCMD page is in the process of being
47 * reclaimed.
49 * The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it
50 * intends to reclaim that enclave page - it means that the PCMD page
51 * associated with that enclave page is about to get some data and thus
52 * even if the PCMD page is empty, it should not be truncated.
54 * Context: Enclave mutex (&sgx_encl->lock) must be held.
55 * Return: 1 if the reclaimer is about to write to the PCMD page
56 * 0 if the reclaimer has no intention to write to the PCMD page
58 static int reclaimer_writing_to_pcmd(struct sgx_encl *encl,
59 unsigned long start_addr)
61 int reclaimed = 0;
62 int i;
65 * PCMD_FIRST_MASK is based on number of PCMD entries within
66 * PCMD page being 32.
68 BUILD_BUG_ON(PCMDS_PER_PAGE != 32);
70 for (i = 0; i < PCMDS_PER_PAGE; i++) {
71 struct sgx_encl_page *entry;
72 unsigned long addr;
74 addr = start_addr + i * PAGE_SIZE;
77 * Stop when reaching the SECS page - it does not
78 * have a page_array entry and its reclaim is
79 * started and completed with enclave mutex held so
80 * it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED
81 * flag.
83 if (addr == encl->base + encl->size)
84 break;
86 entry = xa_load(&encl->page_array, PFN_DOWN(addr));
87 if (!entry)
88 continue;
91 * VA page slot ID uses same bit as the flag so it is important
92 * to ensure that the page is not already in backing store.
94 if (entry->epc_page &&
95 (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) {
96 reclaimed = 1;
97 break;
101 return reclaimed;
105 * Calculate byte offset of a PCMD struct associated with an enclave page. PCMD's
106 * follow right after the EPC data in the backing storage. In addition to the
107 * visible enclave pages, there's one extra page slot for SECS, before PCMD
108 * structs.
110 static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl,
111 unsigned long page_index)
113 pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs);
115 return epc_end_off + page_index * sizeof(struct sgx_pcmd);
119 * Free a page from the backing storage in the given page index.
121 static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index)
123 struct inode *inode = file_inode(encl->backing);
125 shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1);
129 * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC
130 * Pages" in the SDM.
132 static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
133 struct sgx_epc_page *epc_page,
134 struct sgx_epc_page *secs_page)
136 unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
137 struct sgx_encl *encl = encl_page->encl;
138 pgoff_t page_index, page_pcmd_off;
139 unsigned long pcmd_first_page;
140 struct sgx_pageinfo pginfo;
141 struct sgx_backing b;
142 bool pcmd_page_empty;
143 u8 *pcmd_page;
144 int ret;
146 if (secs_page)
147 page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
148 else
149 page_index = PFN_DOWN(encl->size);
152 * Address of enclave page using the first entry within the PCMD page.
154 pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base;
156 page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
158 ret = sgx_encl_lookup_backing(encl, page_index, &b);
159 if (ret)
160 return ret;
162 pginfo.addr = encl_page->desc & PAGE_MASK;
163 pginfo.contents = (unsigned long)kmap_local_page(b.contents);
164 pcmd_page = kmap_local_page(b.pcmd);
165 pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;
167 if (secs_page)
168 pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page);
169 else
170 pginfo.secs = 0;
172 ret = __eldu(&pginfo, sgx_get_epc_virt_addr(epc_page),
173 sgx_get_epc_virt_addr(encl_page->va_page->epc_page) + va_offset);
174 if (ret) {
175 if (encls_failed(ret))
176 ENCLS_WARN(ret, "ELDU");
178 ret = -EFAULT;
181 memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
182 set_page_dirty(b.pcmd);
185 * The area for the PCMD in the page was zeroed above. Check if the
186 * whole page is now empty meaning that all PCMD's have been zeroed:
188 pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);
190 kunmap_local(pcmd_page);
191 kunmap_local((void *)(unsigned long)pginfo.contents);
193 get_page(b.pcmd);
194 sgx_encl_put_backing(&b);
196 sgx_encl_truncate_backing_page(encl, page_index);
198 if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) {
199 sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
200 pcmd_page = kmap_local_page(b.pcmd);
201 if (memchr_inv(pcmd_page, 0, PAGE_SIZE))
202 pr_warn("PCMD page not empty after truncate.\n");
203 kunmap_local(pcmd_page);
206 put_page(b.pcmd);
208 return ret;
211 static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
212 struct sgx_epc_page *secs_page)
215 unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
216 struct sgx_encl *encl = encl_page->encl;
217 struct sgx_epc_page *epc_page;
218 int ret;
220 epc_page = sgx_alloc_epc_page(encl_page, false);
221 if (IS_ERR(epc_page))
222 return epc_page;
224 ret = __sgx_encl_eldu(encl_page, epc_page, secs_page);
225 if (ret) {
226 sgx_encl_free_epc_page(epc_page);
227 return ERR_PTR(ret);
230 sgx_free_va_slot(encl_page->va_page, va_offset);
231 list_move(&encl_page->va_page->list, &encl->va_pages);
232 encl_page->desc &= ~SGX_ENCL_PAGE_VA_OFFSET_MASK;
233 encl_page->epc_page = epc_page;
235 return epc_page;
239 * Ensure the SECS page is not swapped out. Must be called with encl->lock
240 * to protect the enclave states including SECS and ensure the SECS page is
241 * not swapped out again while being used.
243 static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl)
245 struct sgx_epc_page *epc_page = encl->secs.epc_page;
247 if (!epc_page)
248 epc_page = sgx_encl_eldu(&encl->secs, NULL);
250 return epc_page;
253 static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
254 struct sgx_encl_page *entry)
256 struct sgx_epc_page *epc_page;
258 /* Entry successfully located. */
259 if (entry->epc_page) {
260 if (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)
261 return ERR_PTR(-EBUSY);
263 return entry;
266 epc_page = sgx_encl_load_secs(encl);
267 if (IS_ERR(epc_page))
268 return ERR_CAST(epc_page);
270 epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
271 if (IS_ERR(epc_page))
272 return ERR_CAST(epc_page);
274 encl->secs_child_cnt++;
275 sgx_mark_page_reclaimable(entry->epc_page);
277 return entry;
280 static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl,
281 unsigned long addr,
282 unsigned long vm_flags)
284 unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
285 struct sgx_encl_page *entry;
287 entry = xa_load(&encl->page_array, PFN_DOWN(addr));
288 if (!entry)
289 return ERR_PTR(-EFAULT);
292 * Verify that the page has equal or higher build time
293 * permissions than the VMA permissions (i.e. the subset of {VM_READ,
294 * VM_WRITE, VM_EXECUTE} in vma->vm_flags).
296 if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits)
297 return ERR_PTR(-EFAULT);
299 return __sgx_encl_load_page(encl, entry);
302 struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
303 unsigned long addr)
305 struct sgx_encl_page *entry;
307 entry = xa_load(&encl->page_array, PFN_DOWN(addr));
308 if (!entry)
309 return ERR_PTR(-EFAULT);
311 return __sgx_encl_load_page(encl, entry);
315 * sgx_encl_eaug_page() - Dynamically add page to initialized enclave
316 * @vma: VMA obtained from fault info from where page is accessed
317 * @encl: enclave accessing the page
318 * @addr: address that triggered the page fault
320 * When an initialized enclave accesses a page with no backing EPC page
321 * on a SGX2 system then the EPC can be added dynamically via the SGX2
322 * ENCLS[EAUG] instruction.
324 * Returns: Appropriate vm_fault_t: VM_FAULT_NOPAGE when PTE was installed
325 * successfully, VM_FAULT_SIGBUS or VM_FAULT_OOM as error otherwise.
327 static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
328 struct sgx_encl *encl, unsigned long addr)
330 vm_fault_t vmret = VM_FAULT_SIGBUS;
331 struct sgx_pageinfo pginfo = {0};
332 struct sgx_encl_page *encl_page;
333 struct sgx_epc_page *epc_page;
334 struct sgx_va_page *va_page;
335 unsigned long phys_addr;
336 u64 secinfo_flags;
337 int ret;
339 if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
340 return VM_FAULT_SIGBUS;
343 * Ignore internal permission checking for dynamically added pages.
344 * They matter only for data added during the pre-initialization
345 * phase. The enclave decides the permissions by the means of
346 * EACCEPT, EACCEPTCOPY and EMODPE.
348 secinfo_flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X;
349 encl_page = sgx_encl_page_alloc(encl, addr - encl->base, secinfo_flags);
350 if (IS_ERR(encl_page))
351 return VM_FAULT_OOM;
353 mutex_lock(&encl->lock);
355 epc_page = sgx_encl_load_secs(encl);
356 if (IS_ERR(epc_page)) {
357 if (PTR_ERR(epc_page) == -EBUSY)
358 vmret = VM_FAULT_NOPAGE;
359 goto err_out_unlock;
362 epc_page = sgx_alloc_epc_page(encl_page, false);
363 if (IS_ERR(epc_page)) {
364 if (PTR_ERR(epc_page) == -EBUSY)
365 vmret = VM_FAULT_NOPAGE;
366 goto err_out_unlock;
369 va_page = sgx_encl_grow(encl, false);
370 if (IS_ERR(va_page)) {
371 if (PTR_ERR(va_page) == -EBUSY)
372 vmret = VM_FAULT_NOPAGE;
373 goto err_out_epc;
376 if (va_page)
377 list_add(&va_page->list, &encl->va_pages);
379 ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc),
380 encl_page, GFP_KERNEL);
382 * If ret == -EBUSY then page was created in another flow while
383 * running without encl->lock
385 if (ret)
386 goto err_out_shrink;
388 pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page);
389 pginfo.addr = encl_page->desc & PAGE_MASK;
390 pginfo.metadata = 0;
392 ret = __eaug(&pginfo, sgx_get_epc_virt_addr(epc_page));
393 if (ret)
394 goto err_out;
396 encl_page->encl = encl;
397 encl_page->epc_page = epc_page;
398 encl_page->type = SGX_PAGE_TYPE_REG;
399 encl->secs_child_cnt++;
401 sgx_mark_page_reclaimable(encl_page->epc_page);
403 phys_addr = sgx_get_epc_phys_addr(epc_page);
405 * Do not undo everything when creating PTE entry fails - next #PF
406 * would find page ready for a PTE.
408 vmret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
409 if (vmret != VM_FAULT_NOPAGE) {
410 mutex_unlock(&encl->lock);
411 return VM_FAULT_SIGBUS;
413 mutex_unlock(&encl->lock);
414 return VM_FAULT_NOPAGE;
416 err_out:
417 xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc));
419 err_out_shrink:
420 sgx_encl_shrink(encl, va_page);
421 err_out_epc:
422 sgx_encl_free_epc_page(epc_page);
423 err_out_unlock:
424 mutex_unlock(&encl->lock);
425 kfree(encl_page);
427 return vmret;
430 static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
432 unsigned long addr = (unsigned long)vmf->address;
433 struct vm_area_struct *vma = vmf->vma;
434 struct sgx_encl_page *entry;
435 unsigned long phys_addr;
436 struct sgx_encl *encl;
437 vm_fault_t ret;
439 encl = vma->vm_private_data;
442 * It's very unlikely but possible that allocating memory for the
443 * mm_list entry of a forked process failed in sgx_vma_open(). When
444 * this happens, vm_private_data is set to NULL.
446 if (unlikely(!encl))
447 return VM_FAULT_SIGBUS;
450 * The page_array keeps track of all enclave pages, whether they
451 * are swapped out or not. If there is no entry for this page and
452 * the system supports SGX2 then it is possible to dynamically add
453 * a new enclave page. This is only possible for an initialized
454 * enclave that will be checked for right away.
456 if (cpu_feature_enabled(X86_FEATURE_SGX2) &&
457 (!xa_load(&encl->page_array, PFN_DOWN(addr))))
458 return sgx_encl_eaug_page(vma, encl, addr);
460 mutex_lock(&encl->lock);
462 entry = sgx_encl_load_page_in_vma(encl, addr, vma->vm_flags);
463 if (IS_ERR(entry)) {
464 mutex_unlock(&encl->lock);
466 if (PTR_ERR(entry) == -EBUSY)
467 return VM_FAULT_NOPAGE;
469 return VM_FAULT_SIGBUS;
472 phys_addr = sgx_get_epc_phys_addr(entry->epc_page);
474 ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
475 if (ret != VM_FAULT_NOPAGE) {
476 mutex_unlock(&encl->lock);
478 return VM_FAULT_SIGBUS;
481 sgx_encl_test_and_clear_young(vma->vm_mm, entry);
482 mutex_unlock(&encl->lock);
484 return VM_FAULT_NOPAGE;
487 static void sgx_vma_open(struct vm_area_struct *vma)
489 struct sgx_encl *encl = vma->vm_private_data;
492 * It's possible but unlikely that vm_private_data is NULL. This can
493 * happen in a grandchild of a process, when sgx_encl_mm_add() had
494 * failed to allocate memory in this callback.
496 if (unlikely(!encl))
497 return;
499 if (sgx_encl_mm_add(encl, vma->vm_mm))
500 vma->vm_private_data = NULL;
505 * sgx_encl_may_map() - Check if a requested VMA mapping is allowed
506 * @encl: an enclave pointer
507 * @start: lower bound of the address range, inclusive
508 * @end: upper bound of the address range, exclusive
509 * @vm_flags: VMA flags
511 * Iterate through the enclave pages contained within [@start, @end) to verify
512 * that the permissions requested by a subset of {VM_READ, VM_WRITE, VM_EXEC}
513 * do not contain any permissions that are not contained in the build time
514 * permissions of any of the enclave pages within the given address range.
516 * An enclave creator must declare the strongest permissions that will be
517 * needed for each enclave page. This ensures that mappings have the identical
518 * or weaker permissions than the earlier declared permissions.
520 * Return: 0 on success, -EACCES otherwise
522 int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
523 unsigned long end, unsigned long vm_flags)
525 unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
526 struct sgx_encl_page *page;
527 unsigned long count = 0;
528 int ret = 0;
530 XA_STATE(xas, &encl->page_array, PFN_DOWN(start));
532 /* Disallow mapping outside enclave's address range. */
533 if (test_bit(SGX_ENCL_INITIALIZED, &encl->flags) &&
534 (start < encl->base || end > encl->base + encl->size))
535 return -EACCES;
538 * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might
539 * conflict with the enclave page permissions.
541 if (current->personality & READ_IMPLIES_EXEC)
542 return -EACCES;
544 mutex_lock(&encl->lock);
545 xas_lock(&xas);
546 xas_for_each(&xas, page, PFN_DOWN(end - 1)) {
547 if (~page->vm_max_prot_bits & vm_prot_bits) {
548 ret = -EACCES;
549 break;
552 /* Reschedule on every XA_CHECK_SCHED iteration. */
553 if (!(++count % XA_CHECK_SCHED)) {
554 xas_pause(&xas);
555 xas_unlock(&xas);
556 mutex_unlock(&encl->lock);
558 cond_resched();
560 mutex_lock(&encl->lock);
561 xas_lock(&xas);
564 xas_unlock(&xas);
565 mutex_unlock(&encl->lock);
567 return ret;
570 static int sgx_vma_mprotect(struct vm_area_struct *vma, unsigned long start,
571 unsigned long end, unsigned long newflags)
573 return sgx_encl_may_map(vma->vm_private_data, start, end, newflags);
576 static int sgx_encl_debug_read(struct sgx_encl *encl, struct sgx_encl_page *page,
577 unsigned long addr, void *data)
579 unsigned long offset = addr & ~PAGE_MASK;
580 int ret;
583 ret = __edbgrd(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
584 if (ret)
585 return -EIO;
587 return 0;
590 static int sgx_encl_debug_write(struct sgx_encl *encl, struct sgx_encl_page *page,
591 unsigned long addr, void *data)
593 unsigned long offset = addr & ~PAGE_MASK;
594 int ret;
596 ret = __edbgwr(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
597 if (ret)
598 return -EIO;
600 return 0;
604 * Load an enclave page to EPC if required, and take encl->lock.
606 static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl,
607 unsigned long addr,
608 unsigned long vm_flags)
610 struct sgx_encl_page *entry;
612 for ( ; ; ) {
613 mutex_lock(&encl->lock);
615 entry = sgx_encl_load_page_in_vma(encl, addr, vm_flags);
616 if (PTR_ERR(entry) != -EBUSY)
617 break;
619 mutex_unlock(&encl->lock);
622 if (IS_ERR(entry))
623 mutex_unlock(&encl->lock);
625 return entry;
628 static int sgx_vma_access(struct vm_area_struct *vma, unsigned long addr,
629 void *buf, int len, int write)
631 struct sgx_encl *encl = vma->vm_private_data;
632 struct sgx_encl_page *entry = NULL;
633 char data[sizeof(unsigned long)];
634 unsigned long align;
635 int offset;
636 int cnt;
637 int ret = 0;
638 int i;
641 * If process was forked, VMA is still there but vm_private_data is set
642 * to NULL.
644 if (!encl)
645 return -EFAULT;
647 if (!test_bit(SGX_ENCL_DEBUG, &encl->flags))
648 return -EFAULT;
650 for (i = 0; i < len; i += cnt) {
651 entry = sgx_encl_reserve_page(encl, (addr + i) & PAGE_MASK,
652 vma->vm_flags);
653 if (IS_ERR(entry)) {
654 ret = PTR_ERR(entry);
655 break;
658 align = ALIGN_DOWN(addr + i, sizeof(unsigned long));
659 offset = (addr + i) & (sizeof(unsigned long) - 1);
660 cnt = sizeof(unsigned long) - offset;
661 cnt = min(cnt, len - i);
663 ret = sgx_encl_debug_read(encl, entry, align, data);
664 if (ret)
665 goto out;
667 if (write) {
668 memcpy(data + offset, buf + i, cnt);
669 ret = sgx_encl_debug_write(encl, entry, align, data);
670 if (ret)
671 goto out;
672 } else {
673 memcpy(buf + i, data + offset, cnt);
676 out:
677 mutex_unlock(&encl->lock);
679 if (ret)
680 break;
683 return ret < 0 ? ret : i;
686 const struct vm_operations_struct sgx_vm_ops = {
687 .fault = sgx_vma_fault,
688 .mprotect = sgx_vma_mprotect,
689 .open = sgx_vma_open,
690 .access = sgx_vma_access,
694 * sgx_encl_release - Destroy an enclave instance
695 * @ref: address of a kref inside &sgx_encl
697 * Used together with kref_put(). Frees all the resources associated with the
698 * enclave and the instance itself.
700 void sgx_encl_release(struct kref *ref)
702 struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount);
703 unsigned long max_page_index = PFN_DOWN(encl->base + encl->size - 1);
704 struct sgx_va_page *va_page;
705 struct sgx_encl_page *entry;
706 unsigned long count = 0;
708 XA_STATE(xas, &encl->page_array, PFN_DOWN(encl->base));
710 xas_lock(&xas);
711 xas_for_each(&xas, entry, max_page_index) {
712 if (entry->epc_page) {
714 * The page and its radix tree entry cannot be freed
715 * if the page is being held by the reclaimer.
717 if (sgx_unmark_page_reclaimable(entry->epc_page))
718 continue;
720 sgx_encl_free_epc_page(entry->epc_page);
721 encl->secs_child_cnt--;
722 entry->epc_page = NULL;
725 kfree(entry);
727 * Invoke scheduler on every XA_CHECK_SCHED iteration
728 * to prevent soft lockups.
730 if (!(++count % XA_CHECK_SCHED)) {
731 xas_pause(&xas);
732 xas_unlock(&xas);
734 cond_resched();
736 xas_lock(&xas);
739 xas_unlock(&xas);
741 xa_destroy(&encl->page_array);
743 if (!encl->secs_child_cnt && encl->secs.epc_page) {
744 sgx_encl_free_epc_page(encl->secs.epc_page);
745 encl->secs.epc_page = NULL;
748 while (!list_empty(&encl->va_pages)) {
749 va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
750 list);
751 list_del(&va_page->list);
752 sgx_encl_free_epc_page(va_page->epc_page);
753 kfree(va_page);
756 if (encl->backing)
757 fput(encl->backing);
759 cleanup_srcu_struct(&encl->srcu);
761 WARN_ON_ONCE(!list_empty(&encl->mm_list));
763 /* Detect EPC page leak's. */
764 WARN_ON_ONCE(encl->secs_child_cnt);
765 WARN_ON_ONCE(encl->secs.epc_page);
767 kfree(encl);
771 * 'mm' is exiting and no longer needs mmu notifications.
773 static void sgx_mmu_notifier_release(struct mmu_notifier *mn,
774 struct mm_struct *mm)
776 struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
777 struct sgx_encl_mm *tmp = NULL;
778 bool found = false;
781 * The enclave itself can remove encl_mm. Note, objects can't be moved
782 * off an RCU protected list, but deletion is ok.
784 spin_lock(&encl_mm->encl->mm_lock);
785 list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) {
786 if (tmp == encl_mm) {
787 list_del_rcu(&encl_mm->list);
788 found = true;
789 break;
792 spin_unlock(&encl_mm->encl->mm_lock);
794 if (found) {
795 synchronize_srcu(&encl_mm->encl->srcu);
796 mmu_notifier_put(mn);
800 static void sgx_mmu_notifier_free(struct mmu_notifier *mn)
802 struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
804 /* 'encl_mm' is going away, put encl_mm->encl reference: */
805 kref_put(&encl_mm->encl->refcount, sgx_encl_release);
807 kfree(encl_mm);
810 static const struct mmu_notifier_ops sgx_mmu_notifier_ops = {
811 .release = sgx_mmu_notifier_release,
812 .free_notifier = sgx_mmu_notifier_free,
815 static struct sgx_encl_mm *sgx_encl_find_mm(struct sgx_encl *encl,
816 struct mm_struct *mm)
818 struct sgx_encl_mm *encl_mm = NULL;
819 struct sgx_encl_mm *tmp;
820 int idx;
822 idx = srcu_read_lock(&encl->srcu);
824 list_for_each_entry_rcu(tmp, &encl->mm_list, list) {
825 if (tmp->mm == mm) {
826 encl_mm = tmp;
827 break;
831 srcu_read_unlock(&encl->srcu, idx);
833 return encl_mm;
836 int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
838 struct sgx_encl_mm *encl_mm;
839 int ret;
842 * Even though a single enclave may be mapped into an mm more than once,
843 * each 'mm' only appears once on encl->mm_list. This is guaranteed by
844 * holding the mm's mmap lock for write before an mm can be added or
845 * remove to an encl->mm_list.
847 mmap_assert_write_locked(mm);
850 * It's possible that an entry already exists in the mm_list, because it
851 * is removed only on VFS release or process exit.
853 if (sgx_encl_find_mm(encl, mm))
854 return 0;
856 encl_mm = kzalloc(sizeof(*encl_mm), GFP_KERNEL);
857 if (!encl_mm)
858 return -ENOMEM;
860 /* Grab a refcount for the encl_mm->encl reference: */
861 kref_get(&encl->refcount);
862 encl_mm->encl = encl;
863 encl_mm->mm = mm;
864 encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops;
866 ret = __mmu_notifier_register(&encl_mm->mmu_notifier, mm);
867 if (ret) {
868 kfree(encl_mm);
869 return ret;
872 spin_lock(&encl->mm_lock);
873 list_add_rcu(&encl_mm->list, &encl->mm_list);
874 /* Pairs with smp_rmb() in sgx_zap_enclave_ptes(). */
875 smp_wmb();
876 encl->mm_list_version++;
877 spin_unlock(&encl->mm_lock);
879 return 0;
883 * sgx_encl_cpumask() - Query which CPUs might be accessing the enclave
884 * @encl: the enclave
886 * Some SGX functions require that no cached linear-to-physical address
887 * mappings are present before they can succeed. For example, ENCLS[EWB]
888 * copies a page from the enclave page cache to regular main memory but
889 * it fails if it cannot ensure that there are no cached
890 * linear-to-physical address mappings referring to the page.
892 * SGX hardware flushes all cached linear-to-physical mappings on a CPU
893 * when an enclave is exited via ENCLU[EEXIT] or an Asynchronous Enclave
894 * Exit (AEX). Exiting an enclave will thus ensure cached linear-to-physical
895 * address mappings are cleared but coordination with the tracking done within
896 * the SGX hardware is needed to support the SGX functions that depend on this
897 * cache clearing.
899 * When the ENCLS[ETRACK] function is issued on an enclave the hardware
900 * tracks threads operating inside the enclave at that time. The SGX
901 * hardware tracking require that all the identified threads must have
902 * exited the enclave in order to flush the mappings before a function such
903 * as ENCLS[EWB] will be permitted
905 * The following flow is used to support SGX functions that require that
906 * no cached linear-to-physical address mappings are present:
907 * 1) Execute ENCLS[ETRACK] to initiate hardware tracking.
908 * 2) Use this function (sgx_encl_cpumask()) to query which CPUs might be
909 * accessing the enclave.
910 * 3) Send IPI to identified CPUs, kicking them out of the enclave and
911 * thus flushing all locally cached linear-to-physical address mappings.
912 * 4) Execute SGX function.
914 * Context: It is required to call this function after ENCLS[ETRACK].
915 * This will ensure that if any new mm appears (racing with
916 * sgx_encl_mm_add()) then the new mm will enter into the
917 * enclave with fresh linear-to-physical address mappings.
919 * It is required that all IPIs are completed before a new
920 * ENCLS[ETRACK] is issued so be sure to protect steps 1 to 3
921 * of the above flow with the enclave's mutex.
923 * Return: cpumask of CPUs that might be accessing @encl
925 const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl)
927 cpumask_t *cpumask = &encl->cpumask;
928 struct sgx_encl_mm *encl_mm;
929 int idx;
931 cpumask_clear(cpumask);
933 idx = srcu_read_lock(&encl->srcu);
935 list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
936 if (!mmget_not_zero(encl_mm->mm))
937 continue;
939 cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm));
941 mmput_async(encl_mm->mm);
944 srcu_read_unlock(&encl->srcu, idx);
946 return cpumask;
949 static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
950 pgoff_t index)
952 struct address_space *mapping = encl->backing->f_mapping;
953 gfp_t gfpmask = mapping_gfp_mask(mapping);
955 return shmem_read_mapping_page_gfp(mapping, index, gfpmask);
959 * __sgx_encl_get_backing() - Pin the backing storage
960 * @encl: an enclave pointer
961 * @page_index: enclave page index
962 * @backing: data for accessing backing storage for the page
964 * Pin the backing storage pages for storing the encrypted contents and Paging
965 * Crypto MetaData (PCMD) of an enclave page.
967 * Return:
968 * 0 on success,
969 * -errno otherwise.
971 static int __sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
972 struct sgx_backing *backing)
974 pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
975 struct page *contents;
976 struct page *pcmd;
978 contents = sgx_encl_get_backing_page(encl, page_index);
979 if (IS_ERR(contents))
980 return PTR_ERR(contents);
982 pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off));
983 if (IS_ERR(pcmd)) {
984 put_page(contents);
985 return PTR_ERR(pcmd);
988 backing->contents = contents;
989 backing->pcmd = pcmd;
990 backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1);
992 return 0;
996 * When called from ksgxd, returns the mem_cgroup of a struct mm stored
997 * in the enclave's mm_list. When not called from ksgxd, just returns
998 * the mem_cgroup of the current task.
1000 static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
1002 struct mem_cgroup *memcg = NULL;
1003 struct sgx_encl_mm *encl_mm;
1004 int idx;
1007 * If called from normal task context, return the mem_cgroup
1008 * of the current task's mm. The remainder of the handling is for
1009 * ksgxd.
1011 if (!current_is_ksgxd())
1012 return get_mem_cgroup_from_mm(current->mm);
1015 * Search the enclave's mm_list to find an mm associated with
1016 * this enclave to charge the allocation to.
1018 idx = srcu_read_lock(&encl->srcu);
1020 list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
1021 if (!mmget_not_zero(encl_mm->mm))
1022 continue;
1024 memcg = get_mem_cgroup_from_mm(encl_mm->mm);
1026 mmput_async(encl_mm->mm);
1028 break;
1031 srcu_read_unlock(&encl->srcu, idx);
1034 * In the rare case that there isn't an mm associated with
1035 * the enclave, set memcg to the current active mem_cgroup.
1036 * This will be the root mem_cgroup if there is no active
1037 * mem_cgroup.
1039 if (!memcg)
1040 return get_mem_cgroup_from_mm(NULL);
1042 return memcg;
/**
 * sgx_encl_alloc_backing() - create a new backing storage page
 * @encl:	an enclave pointer
 * @page_index:	enclave page index
 * @backing:	data for accessing backing storage for the page
 *
 * Create a backing page for loading data back into an EPC page with ELDU.
 * When running in ksgxd, the active memcg is first switched to that of one of
 * the mms on the enclave's mm_list, so that the shmem page allocations get
 * charged to the enclave. A reference is taken on the new backing page; it
 * must be dropped with a corresponding call to sgx_encl_put_backing().
 *
 * Return:
 *   0 on success,
 *   -errno otherwise.
 */
int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
			   struct sgx_backing *backing)
{
	struct mem_cgroup *charge_to = sgx_encl_get_mem_cgroup(encl);
	struct mem_cgroup *previous = set_active_memcg(charge_to);
	int err;

	err = __sgx_encl_get_backing(encl, page_index, backing);

	/* Restore the previously active memcg and drop our reference. */
	set_active_memcg(previous);
	mem_cgroup_put(charge_to);

	return err;
}
/**
 * sgx_encl_lookup_backing() - retrieve an existing backing storage page
 * @encl:	an enclave pointer
 * @page_index:	enclave page index
 * @backing:	data for accessing backing storage for the page
 *
 * Retrieve a backing page for loading data back into an EPC page with ELDU.
 * The caller is responsible for making sure that a lookup, rather than
 * sgx_encl_alloc_backing(), is appropriate: a misplaced lookup causes an
 * allocation which is not accounted for. A reference is taken on the existing
 * backing page; it must be dropped with a corresponding call to
 * sgx_encl_put_backing().
 *
 * Return:
 *   0 on success,
 *   -errno otherwise.
 */
static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
			   struct sgx_backing *backing)
{
	int err = __sgx_encl_get_backing(encl, page_index, backing);

	return err;
}
1101 * sgx_encl_put_backing() - Unpin the backing storage
1102 * @backing: data for accessing backing storage for the page
1104 void sgx_encl_put_backing(struct sgx_backing *backing)
1106 put_page(backing->pcmd);
1107 put_page(backing->contents);
1110 static int sgx_encl_test_and_clear_young_cb(pte_t *ptep, unsigned long addr,
1111 void *data)
1113 pte_t pte;
1114 int ret;
1116 ret = pte_young(*ptep);
1117 if (ret) {
1118 pte = pte_mkold(*ptep);
1119 set_pte_at((struct mm_struct *)data, addr, ptep, pte);
1122 return ret;
1126 * sgx_encl_test_and_clear_young() - Test and reset the accessed bit
1127 * @mm: mm_struct that is checked
1128 * @page: enclave page to be tested for recent access
1130 * Checks the Access (A) bit from the PTE corresponding to the enclave page and
1131 * clears it.
1133 * Return: 1 if the page has been recently accessed and 0 if not.
1135 int sgx_encl_test_and_clear_young(struct mm_struct *mm,
1136 struct sgx_encl_page *page)
1138 unsigned long addr = page->desc & PAGE_MASK;
1139 struct sgx_encl *encl = page->encl;
1140 struct vm_area_struct *vma;
1141 int ret;
1143 ret = sgx_encl_find(mm, addr, &vma);
1144 if (ret)
1145 return 0;
1147 if (encl != vma->vm_private_data)
1148 return 0;
1150 ret = apply_to_page_range(vma->vm_mm, addr, PAGE_SIZE,
1151 sgx_encl_test_and_clear_young_cb, vma->vm_mm);
1152 if (ret < 0)
1153 return 0;
1155 return ret;
1158 struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl,
1159 unsigned long offset,
1160 u64 secinfo_flags)
1162 struct sgx_encl_page *encl_page;
1163 unsigned long prot;
1165 encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL);
1166 if (!encl_page)
1167 return ERR_PTR(-ENOMEM);
1169 encl_page->desc = encl->base + offset;
1170 encl_page->encl = encl;
1172 prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) |
1173 _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) |
1174 _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC);
1177 * TCS pages must always RW set for CPU access while the SECINFO
1178 * permissions are *always* zero - the CPU ignores the user provided
1179 * values and silently overwrites them with zero permissions.
1181 if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS)
1182 prot |= PROT_READ | PROT_WRITE;
1184 /* Calculate maximum of the VM flags for the page. */
1185 encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0);
1187 return encl_page;
1191 * sgx_zap_enclave_ptes() - remove PTEs mapping the address from enclave
1192 * @encl: the enclave
1193 * @addr: page aligned pointer to single page for which PTEs will be removed
1195 * Multiple VMAs may have an enclave page mapped. Remove the PTE mapping
1196 * @addr from each VMA. Ensure that page fault handler is ready to handle
1197 * new mappings of @addr before calling this function.
1199 void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr)
1201 unsigned long mm_list_version;
1202 struct sgx_encl_mm *encl_mm;
1203 struct vm_area_struct *vma;
1204 int idx, ret;
1206 do {
1207 mm_list_version = encl->mm_list_version;
1209 /* Pairs with smp_wmb() in sgx_encl_mm_add(). */
1210 smp_rmb();
1212 idx = srcu_read_lock(&encl->srcu);
1214 list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
1215 if (!mmget_not_zero(encl_mm->mm))
1216 continue;
1218 mmap_read_lock(encl_mm->mm);
1220 ret = sgx_encl_find(encl_mm->mm, addr, &vma);
1221 if (!ret && encl == vma->vm_private_data)
1222 zap_vma_ptes(vma, addr, PAGE_SIZE);
1224 mmap_read_unlock(encl_mm->mm);
1226 mmput_async(encl_mm->mm);
1229 srcu_read_unlock(&encl->srcu, idx);
1230 } while (unlikely(encl->mm_list_version != mm_list_version));
1234 * sgx_alloc_va_page() - Allocate a Version Array (VA) page
1235 * @reclaim: Reclaim EPC pages directly if none available. Enclave
1236 * mutex should not be held if this is set.
1238 * Allocate a free EPC page and convert it to a Version Array (VA) page.
1240 * Return:
1241 * a VA page,
1242 * -errno otherwise
1244 struct sgx_epc_page *sgx_alloc_va_page(bool reclaim)
1246 struct sgx_epc_page *epc_page;
1247 int ret;
1249 epc_page = sgx_alloc_epc_page(NULL, reclaim);
1250 if (IS_ERR(epc_page))
1251 return ERR_CAST(epc_page);
1253 ret = __epa(sgx_get_epc_virt_addr(epc_page));
1254 if (ret) {
1255 WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret);
1256 sgx_encl_free_epc_page(epc_page);
1257 return ERR_PTR(-EFAULT);
1260 return epc_page;
1264 * sgx_alloc_va_slot - allocate a VA slot
1265 * @va_page: a &struct sgx_va_page instance
1267 * Allocates a slot from a &struct sgx_va_page instance.
1269 * Return: offset of the slot inside the VA page
1271 unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page)
1273 int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);
1275 if (slot < SGX_VA_SLOT_COUNT)
1276 set_bit(slot, va_page->slots);
1278 return slot << 3;
1282 * sgx_free_va_slot - free a VA slot
1283 * @va_page: a &struct sgx_va_page instance
1284 * @offset: offset of the slot inside the VA page
1286 * Frees a slot from a &struct sgx_va_page instance.
1288 void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset)
1290 clear_bit(offset >> 3, va_page->slots);
1294 * sgx_va_page_full - is the VA page full?
1295 * @va_page: a &struct sgx_va_page instance
1297 * Return: true if all slots have been taken
1299 bool sgx_va_page_full(struct sgx_va_page *va_page)
1301 int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);
1303 return slot == SGX_VA_SLOT_COUNT;
1307 * sgx_encl_free_epc_page - free an EPC page assigned to an enclave
1308 * @page: EPC page to be freed
1310 * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and
1311 * only upon success, it puts the page back to free page list. Otherwise, it
1312 * gives a WARNING to indicate page is leaked.
1314 void sgx_encl_free_epc_page(struct sgx_epc_page *page)
1316 int ret;
1318 WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
1320 ret = __eremove(sgx_get_epc_virt_addr(page));
1321 if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret))
1322 return;
1324 sgx_free_epc_page(page);