// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2016-20 Intel Corporation. */

#include <linux/lockdep.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/shmem_fs.h>
#include <linux/suspend.h>
#include <linux/sched/mm.h>
#include <asm/sgx.h>
#include "encl.h"
#include "encls.h"
#include "sgx.h"

static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
				   struct sgx_backing *backing);

#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd))
/*
 * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to
 * determine the page index associated with the first PCMD entry
 * within a PCMD page.
 */
#define PCMD_FIRST_MASK GENMASK(4, 0)

/**
 * reclaimer_writing_to_pcmd() - Query if any enclave page associated with
 *                               a PCMD page is in process of being reclaimed.
 * @encl:        Enclave to which PCMD page belongs
 * @start_addr:  Address of enclave page using first entry within the PCMD page
 *
 * When an enclave page is reclaimed some Paging Crypto MetaData (PCMD) is
 * stored. The PCMD data of a reclaimed enclave page contains enough
 * information for the processor to verify the page at the time
 * it is loaded back into the Enclave Page Cache (EPC).
 *
 * The backing storage to which enclave pages are reclaimed is laid out as
 * follows:
 * Encrypted enclave pages:SECS page:PCMD pages
 *
 * Each PCMD page contains the PCMD metadata of
 * PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages.
 *
 * A PCMD page can only be truncated if it is (a) empty, and (b) not in the
 * process of getting data (and thus soon being non-empty). (b) is tested with
 * a check if an enclave page sharing the PCMD page is in the process of being
 * reclaimed.
 *
 * The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it
 * intends to reclaim that enclave page - it means that the PCMD page
 * associated with that enclave page is about to get some data and thus
 * even if the PCMD page is empty, it should not be truncated.
 *
 * Context: Enclave mutex (&sgx_encl->lock) must be held.
 * Return: 1 if the reclaimer is about to write to the PCMD page
 *         0 if the reclaimer has no intention to write to the PCMD page
 */
static int reclaimer_writing_to_pcmd(struct sgx_encl *encl,
				     unsigned long start_addr)
{
	int reclaimed = 0;
	int i;

	/*
	 * PCMD_FIRST_MASK is based on number of PCMD entries within
	 * PCMD page being 32.
	 */
	BUILD_BUG_ON(PCMDS_PER_PAGE != 32);

	for (i = 0; i < PCMDS_PER_PAGE; i++) {
		struct sgx_encl_page *entry;
		unsigned long addr;

		addr = start_addr + i * PAGE_SIZE;

		/*
		 * Stop when reaching the SECS page - it does not
		 * have a page_array entry and its reclaim is
		 * started and completed with enclave mutex held so
		 * it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED
		 * flag.
		 */
		if (addr == encl->base + encl->size)
			break;

		entry = xa_load(&encl->page_array, PFN_DOWN(addr));
		if (!entry)
			continue;

		/*
		 * VA page slot ID uses same bit as the flag so it is important
		 * to ensure that the page is not already in backing store.
		 */
		if (entry->epc_page &&
		    (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) {
			reclaimed = 1;
			break;
		}
	}

	return reclaimed;
}

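/*
 * Illustrative sketch, not part of the driver: with 32 PCMD entries per
 * PCMD page, the enclave page owning the first entry of a given PCMD page
 * can be derived by clearing the low bits of the page index. The helper
 * below is hypothetical and only spells out the arithmetic that
 * __sgx_encl_eldu() uses via PCMD_FIRST_MASK before calling
 * reclaimer_writing_to_pcmd().
 */
#if 0
static unsigned long pcmd_first_page_example(struct sgx_encl *encl,
					     unsigned long page_index)
{
	/* Round page_index down to the first of the 32 pages sharing a PCMD page. */
	return PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base;
}
#endif
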
/*
 * Calculate byte offset of a PCMD struct associated with an enclave page. PCMD's
 * follow right after the EPC data in the backing storage. In addition to the
 * visible enclave pages, there's one extra page slot for SECS, before PCMD
 * structs.
 */
static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl,
							     unsigned long page_index)
{
	pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs);

	return epc_end_off + page_index * sizeof(struct sgx_pcmd);
}

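/*
 * Worked example (illustrative only, compiled out): assuming a 4K PAGE_SIZE
 * and the architectural 128-byte struct sgx_pcmd, the PCMD area starts right
 * after the single SECS slot and every enclave page owns one 128-byte PCMD
 * slot, so 32 enclave pages share each PCMD page.
 */
#if 0
static pgoff_t pcmd_offset_example(struct sgx_encl *encl)
{
	/* Page 0 -> encl->size + 4096, page 1 -> 128 bytes further in. */
	return sgx_encl_get_backing_page_pcmd_offset(encl, 1);
}
#endif
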
/*
 * Free a page from the backing storage in the given page index.
 */
static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index)
{
	struct inode *inode = file_inode(encl->backing);

	shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1);
}

/*
 * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC
 * Pages" in the SDM.
 */
static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
			   struct sgx_epc_page *epc_page,
			   struct sgx_epc_page *secs_page)
{
	unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
	struct sgx_encl *encl = encl_page->encl;
	pgoff_t page_index, page_pcmd_off;
	unsigned long pcmd_first_page;
	struct sgx_pageinfo pginfo;
	struct sgx_backing b;
	bool pcmd_page_empty;
	u8 *pcmd_page;
	int ret;

	if (secs_page)
		page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
	else
		page_index = PFN_DOWN(encl->size);

	/*
	 * Address of enclave page using the first entry within the PCMD page.
	 */
	pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base;

	page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);

	ret = sgx_encl_lookup_backing(encl, page_index, &b);
	if (ret)
		return ret;

	pginfo.addr = encl_page->desc & PAGE_MASK;
	pginfo.contents = (unsigned long)kmap_local_page(b.contents);
	pcmd_page = kmap_local_page(b.pcmd);
	pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;

	if (secs_page)
		pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page);
	else
		pginfo.secs = 0;

	ret = __eldu(&pginfo, sgx_get_epc_virt_addr(epc_page),
		     sgx_get_epc_virt_addr(encl_page->va_page->epc_page) + va_offset);
	if (ret) {
		if (encls_failed(ret))
			ENCLS_WARN(ret, "ELDU");

		ret = -EFAULT;
	}

	memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
	set_page_dirty(b.pcmd);

	/*
	 * The area for the PCMD in the page was zeroed above. Check if the
	 * whole page is now empty meaning that all PCMD's have been zeroed:
	 */
	pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);

	kunmap_local(pcmd_page);
	kunmap_local((void *)(unsigned long)pginfo.contents);

	get_page(b.pcmd);
	sgx_encl_put_backing(&b);

	sgx_encl_truncate_backing_page(encl, page_index);

	if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) {
		sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
		pcmd_page = kmap_local_page(b.pcmd);
		if (memchr_inv(pcmd_page, 0, PAGE_SIZE))
			pr_warn("PCMD page not empty after truncate.\n");
		kunmap_local(pcmd_page);
	}

	put_page(b.pcmd);

	return ret;
}

static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
					  struct sgx_epc_page *secs_page)
{
	unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
	struct sgx_encl *encl = encl_page->encl;
	struct sgx_epc_page *epc_page;
	int ret;

	epc_page = sgx_alloc_epc_page(encl_page, false);
	if (IS_ERR(epc_page))
		return epc_page;

	ret = __sgx_encl_eldu(encl_page, epc_page, secs_page);
	if (ret) {
		sgx_encl_free_epc_page(epc_page);
		return ERR_PTR(ret);
	}

	sgx_free_va_slot(encl_page->va_page, va_offset);
	list_move(&encl_page->va_page->list, &encl->va_pages);
	encl_page->desc &= ~SGX_ENCL_PAGE_VA_OFFSET_MASK;
	encl_page->epc_page = epc_page;

	return epc_page;
}

/*
 * Ensure the SECS page is not swapped out. Must be called with encl->lock
 * to protect the enclave states including SECS and ensure the SECS page is
 * not swapped out again while being used.
 */
static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl)
{
	struct sgx_epc_page *epc_page = encl->secs.epc_page;

	if (!epc_page)
		epc_page = sgx_encl_eldu(&encl->secs, NULL);

	return epc_page;
}

static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
						  struct sgx_encl_page *entry)
{
	struct sgx_epc_page *epc_page;

	/* Entry successfully located. */
	if (entry->epc_page) {
		if (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)
			return ERR_PTR(-EBUSY);

		return entry;
	}

	epc_page = sgx_encl_load_secs(encl);
	if (IS_ERR(epc_page))
		return ERR_CAST(epc_page);

	epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
	if (IS_ERR(epc_page))
		return ERR_CAST(epc_page);

	encl->secs_child_cnt++;
	sgx_mark_page_reclaimable(entry->epc_page);

	return entry;
}

static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl,
							unsigned long addr,
							unsigned long vm_flags)
{
	unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
	struct sgx_encl_page *entry;

	entry = xa_load(&encl->page_array, PFN_DOWN(addr));
	if (!entry)
		return ERR_PTR(-EFAULT);

	/*
	 * Verify that the page has equal or higher build time
	 * permissions than the VMA permissions (i.e. the subset of {VM_READ,
	 * VM_WRITE, VM_EXECUTE} in vma->vm_flags).
	 */
	if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits)
		return ERR_PTR(-EFAULT);

	return __sgx_encl_load_page(encl, entry);
}

struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
					 unsigned long addr)
{
	struct sgx_encl_page *entry;

	entry = xa_load(&encl->page_array, PFN_DOWN(addr));
	if (!entry)
		return ERR_PTR(-EFAULT);

	return __sgx_encl_load_page(encl, entry);
}

/**
 * sgx_encl_eaug_page() - Dynamically add page to initialized enclave
 * @vma:	VMA obtained from fault info from where page is accessed
 * @encl:	enclave accessing the page
 * @addr:	address that triggered the page fault
 *
 * When an initialized enclave accesses a page with no backing EPC page
 * on a SGX2 system then the EPC can be added dynamically via the SGX2
 * ENCLS[EAUG] instruction.
 *
 * Returns: Appropriate vm_fault_t: VM_FAULT_NOPAGE when PTE was installed
 * successfully, VM_FAULT_SIGBUS or VM_FAULT_OOM as error otherwise.
 */
static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
				     struct sgx_encl *encl, unsigned long addr)
{
	vm_fault_t vmret = VM_FAULT_SIGBUS;
	struct sgx_pageinfo pginfo = {0};
	struct sgx_encl_page *encl_page;
	struct sgx_epc_page *epc_page;
	struct sgx_va_page *va_page;
	unsigned long phys_addr;
	u64 secinfo_flags;
	int ret;

	if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
		return VM_FAULT_SIGBUS;

	/*
	 * Ignore internal permission checking for dynamically added pages.
	 * They matter only for data added during the pre-initialization
	 * phase. The enclave decides the permissions by the means of
	 * EACCEPT, EACCEPTCOPY and EMODPE.
	 */
	secinfo_flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X;
	encl_page = sgx_encl_page_alloc(encl, addr - encl->base, secinfo_flags);
	if (IS_ERR(encl_page))
		return VM_FAULT_OOM;

	mutex_lock(&encl->lock);

	epc_page = sgx_encl_load_secs(encl);
	if (IS_ERR(epc_page)) {
		if (PTR_ERR(epc_page) == -EBUSY)
			vmret = VM_FAULT_NOPAGE;
		goto err_out_unlock;
	}

	epc_page = sgx_alloc_epc_page(encl_page, false);
	if (IS_ERR(epc_page)) {
		if (PTR_ERR(epc_page) == -EBUSY)
			vmret = VM_FAULT_NOPAGE;
		goto err_out_unlock;
	}

	va_page = sgx_encl_grow(encl, false);
	if (IS_ERR(va_page)) {
		if (PTR_ERR(va_page) == -EBUSY)
			vmret = VM_FAULT_NOPAGE;
		goto err_out_epc;
	}

	if (va_page)
		list_add(&va_page->list, &encl->va_pages);

	ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc),
			encl_page, GFP_KERNEL);
	/*
	 * If ret == -EBUSY then page was created in another flow while
	 * running without encl->lock
	 */
	if (ret)
		goto err_out_shrink;

	pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page);
	pginfo.addr = encl_page->desc & PAGE_MASK;
	pginfo.metadata = 0;

	ret = __eaug(&pginfo, sgx_get_epc_virt_addr(epc_page));
	if (ret)
		goto err_out;

	encl_page->encl = encl;
	encl_page->epc_page = epc_page;
	encl_page->type = SGX_PAGE_TYPE_REG;
	encl->secs_child_cnt++;

	sgx_mark_page_reclaimable(encl_page->epc_page);

	phys_addr = sgx_get_epc_phys_addr(epc_page);
	/*
	 * Do not undo everything when creating PTE entry fails - next #PF
	 * would find page ready for a PTE.
	 */
	vmret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
	if (vmret != VM_FAULT_NOPAGE) {
		mutex_unlock(&encl->lock);
		return VM_FAULT_SIGBUS;
	}
	mutex_unlock(&encl->lock);
	return VM_FAULT_NOPAGE;

err_out:
	xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc));

err_out_shrink:
	sgx_encl_shrink(encl, va_page);
err_out_epc:
	sgx_encl_free_epc_page(epc_page);
err_out_unlock:
	mutex_unlock(&encl->lock);
	kfree(encl_page);

	return vmret;
}

static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
{
	unsigned long addr = (unsigned long)vmf->address;
	struct vm_area_struct *vma = vmf->vma;
	struct sgx_encl_page *entry;
	unsigned long phys_addr;
	struct sgx_encl *encl;
	vm_fault_t ret;

	encl = vma->vm_private_data;

	/*
	 * It's very unlikely but possible that allocating memory for the
	 * mm_list entry of a forked process failed in sgx_vma_open(). When
	 * this happens, vm_private_data is set to NULL.
	 */
	if (unlikely(!encl))
		return VM_FAULT_SIGBUS;

	/*
	 * The page_array keeps track of all enclave pages, whether they
	 * are swapped out or not. If there is no entry for this page and
	 * the system supports SGX2 then it is possible to dynamically add
	 * a new enclave page. This is only possible for an initialized
	 * enclave that will be checked for right away.
	 */
	if (cpu_feature_enabled(X86_FEATURE_SGX2) &&
	    (!xa_load(&encl->page_array, PFN_DOWN(addr))))
		return sgx_encl_eaug_page(vma, encl, addr);

	mutex_lock(&encl->lock);

	entry = sgx_encl_load_page_in_vma(encl, addr, vma->vm_flags);
	if (IS_ERR(entry)) {
		mutex_unlock(&encl->lock);

		if (PTR_ERR(entry) == -EBUSY)
			return VM_FAULT_NOPAGE;

		return VM_FAULT_SIGBUS;
	}

	phys_addr = sgx_get_epc_phys_addr(entry->epc_page);

	ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
	if (ret != VM_FAULT_NOPAGE) {
		mutex_unlock(&encl->lock);

		return VM_FAULT_SIGBUS;
	}

	sgx_encl_test_and_clear_young(vma->vm_mm, entry);
	mutex_unlock(&encl->lock);

	return VM_FAULT_NOPAGE;
}

static void sgx_vma_open(struct vm_area_struct *vma)
{
	struct sgx_encl *encl = vma->vm_private_data;

	/*
	 * It's possible but unlikely that vm_private_data is NULL. This can
	 * happen in a grandchild of a process, when sgx_encl_mm_add() had
	 * failed to allocate memory in this callback.
	 */
	if (unlikely(!encl))
		return;

	if (sgx_encl_mm_add(encl, vma->vm_mm))
		vma->vm_private_data = NULL;
}

/**
 * sgx_encl_may_map() - Check if a requested VMA mapping is allowed
 * @encl:		an enclave pointer
 * @start:		lower bound of the address range, inclusive
 * @end:		upper bound of the address range, exclusive
 * @vm_flags:		VMA flags
 *
 * Iterate through the enclave pages contained within [@start, @end) to verify
 * that the permissions requested by a subset of {VM_READ, VM_WRITE, VM_EXEC}
 * do not contain any permissions that are not contained in the build time
 * permissions of any of the enclave pages within the given address range.
 *
 * An enclave creator must declare the strongest permissions that will be
 * needed for each enclave page. This ensures that mappings have the identical
 * or weaker permissions than the earlier declared permissions.
 *
 * Return: 0 on success, -EACCES otherwise
 */
int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
		     unsigned long end, unsigned long vm_flags)
{
	unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
	struct sgx_encl_page *page;
	unsigned long count = 0;
	int ret = 0;

	XA_STATE(xas, &encl->page_array, PFN_DOWN(start));

	/* Disallow mapping outside enclave's address range. */
	if (test_bit(SGX_ENCL_INITIALIZED, &encl->flags) &&
	    (start < encl->base || end > encl->base + encl->size))
		return -EACCES;

	/*
	 * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might
	 * conflict with the enclave page permissions.
	 */
	if (current->personality & READ_IMPLIES_EXEC)
		return -EACCES;

	mutex_lock(&encl->lock);
	xas_lock(&xas);
	xas_for_each(&xas, page, PFN_DOWN(end - 1)) {
		if (~page->vm_max_prot_bits & vm_prot_bits) {
			ret = -EACCES;
			break;
		}

		/* Reschedule on every XA_CHECK_SCHED iteration. */
		if (!(++count % XA_CHECK_SCHED)) {
			xas_pause(&xas);
			xas_unlock(&xas);
			mutex_unlock(&encl->lock);

			cond_resched();

			mutex_lock(&encl->lock);
			xas_lock(&xas);
		}
	}
	xas_unlock(&xas);
	mutex_unlock(&encl->lock);

	return ret;
}

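/*
 * Illustrative sketch (hypothetical, compiled out): the check inside
 * sgx_encl_may_map() rejects a mapping that asks for any access bit missing
 * from a page's build-time maximum, e.g. a VM_WRITE mapping of a page that
 * was added read/exec-only.
 */
#if 0
static bool example_mapping_allowed(unsigned long vm_max_prot_bits,
				    unsigned long vm_prot_bits)
{
	/* A VM_READ|VM_EXEC page mapped with VM_WRITE returns false here. */
	return !(~vm_max_prot_bits & vm_prot_bits);
}
#endif
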
static int sgx_vma_mprotect(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end, unsigned long newflags)
{
	return sgx_encl_may_map(vma->vm_private_data, start, end, newflags);
}

static int sgx_encl_debug_read(struct sgx_encl *encl, struct sgx_encl_page *page,
			       unsigned long addr, void *data)
{
	unsigned long offset = addr & ~PAGE_MASK;
	int ret;

	ret = __edbgrd(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
	if (ret)
		return -EIO;

	return 0;
}

static int sgx_encl_debug_write(struct sgx_encl *encl, struct sgx_encl_page *page,
				unsigned long addr, void *data)
{
	unsigned long offset = addr & ~PAGE_MASK;
	int ret;

	ret = __edbgwr(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
	if (ret)
		return -EIO;

	return 0;
}

/*
 * Load an enclave page to EPC if required, and take encl->lock.
 */
static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl,
						   unsigned long addr,
						   unsigned long vm_flags)
{
	struct sgx_encl_page *entry;

	for ( ; ; ) {
		mutex_lock(&encl->lock);

		entry = sgx_encl_load_page_in_vma(encl, addr, vm_flags);
		if (PTR_ERR(entry) != -EBUSY)
			break;

		mutex_unlock(&encl->lock);
	}

	if (IS_ERR(entry))
		mutex_unlock(&encl->lock);

	return entry;
}

static int sgx_vma_access(struct vm_area_struct *vma, unsigned long addr,
			  void *buf, int len, int write)
{
	struct sgx_encl *encl = vma->vm_private_data;
	struct sgx_encl_page *entry = NULL;
	char data[sizeof(unsigned long)];
	unsigned long align;
	int offset;
	int cnt;
	int ret = 0;
	int i;

	/*
	 * If process was forked, VMA is still there but vm_private_data is set
	 * to NULL.
	 */
	if (!encl)
		return -EFAULT;

	if (!test_bit(SGX_ENCL_DEBUG, &encl->flags))
		return -EFAULT;

	for (i = 0; i < len; i += cnt) {
		entry = sgx_encl_reserve_page(encl, (addr + i) & PAGE_MASK,
					      vma->vm_flags);
		if (IS_ERR(entry)) {
			ret = PTR_ERR(entry);
			break;
		}

		align = ALIGN_DOWN(addr + i, sizeof(unsigned long));
		offset = (addr + i) & (sizeof(unsigned long) - 1);
		cnt = sizeof(unsigned long) - offset;
		cnt = min(cnt, len - i);

		ret = sgx_encl_debug_read(encl, entry, align, data);
		if (ret)
			goto out;

		if (write) {
			memcpy(data + offset, buf + i, cnt);
			ret = sgx_encl_debug_write(encl, entry, align, data);
			if (ret)
				goto out;
		} else {
			memcpy(buf + i, data + offset, cnt);
		}

out:
		mutex_unlock(&encl->lock);

		if (ret)
			break;
	}

	return ret < 0 ? ret : i;
}

const struct vm_operations_struct sgx_vm_ops = {
	.fault = sgx_vma_fault,
	.mprotect = sgx_vma_mprotect,
	.open = sgx_vma_open,
	.access = sgx_vma_access,
};

/**
 * sgx_encl_release - Destroy an enclave instance
 * @ref:	address of a kref inside &sgx_encl
 *
 * Used together with kref_put(). Frees all the resources associated with the
 * enclave and the instance itself.
 */
void sgx_encl_release(struct kref *ref)
{
	struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount);
	unsigned long max_page_index = PFN_DOWN(encl->base + encl->size - 1);
	struct sgx_va_page *va_page;
	struct sgx_encl_page *entry;
	unsigned long count = 0;

	XA_STATE(xas, &encl->page_array, PFN_DOWN(encl->base));

	xas_lock(&xas);
	xas_for_each(&xas, entry, max_page_index) {
		if (entry->epc_page) {
			/*
			 * The page and its radix tree entry cannot be freed
			 * if the page is being held by the reclaimer.
			 */
			if (sgx_unmark_page_reclaimable(entry->epc_page))
				continue;

			sgx_encl_free_epc_page(entry->epc_page);
			encl->secs_child_cnt--;
			entry->epc_page = NULL;
		}

		kfree(entry);
		/*
		 * Invoke scheduler on every XA_CHECK_SCHED iteration
		 * to prevent soft lockups.
		 */
		if (!(++count % XA_CHECK_SCHED)) {
			xas_pause(&xas);
			xas_unlock(&xas);

			cond_resched();

			xas_lock(&xas);
		}
	}
	xas_unlock(&xas);

	xa_destroy(&encl->page_array);

	if (!encl->secs_child_cnt && encl->secs.epc_page) {
		sgx_encl_free_epc_page(encl->secs.epc_page);
		encl->secs.epc_page = NULL;
	}

	while (!list_empty(&encl->va_pages)) {
		va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
					   list);
		list_del(&va_page->list);
		sgx_encl_free_epc_page(va_page->epc_page);
		kfree(va_page);
	}

	if (encl->backing)
		fput(encl->backing);

	cleanup_srcu_struct(&encl->srcu);

	WARN_ON_ONCE(!list_empty(&encl->mm_list));

	/* Detect EPC page leaks. */
	WARN_ON_ONCE(encl->secs_child_cnt);
	WARN_ON_ONCE(encl->secs.epc_page);

	kfree(encl);
}

/*
 * 'mm' is exiting and no longer needs mmu notifications.
 */
static void sgx_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
	struct sgx_encl_mm *tmp = NULL;

	/*
	 * The enclave itself can remove encl_mm. Note, objects can't be moved
	 * off an RCU protected list, but deletion is ok.
	 */
	spin_lock(&encl_mm->encl->mm_lock);
	list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) {
		if (tmp == encl_mm) {
			list_del_rcu(&encl_mm->list);
			break;
		}
	}
	spin_unlock(&encl_mm->encl->mm_lock);

	if (tmp == encl_mm) {
		synchronize_srcu(&encl_mm->encl->srcu);
		mmu_notifier_put(mn);
	}
}

static void sgx_mmu_notifier_free(struct mmu_notifier *mn)
{
	struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);

	/* 'encl_mm' is going away, put encl_mm->encl reference: */
	kref_put(&encl_mm->encl->refcount, sgx_encl_release);

	kfree(encl_mm);
}

static const struct mmu_notifier_ops sgx_mmu_notifier_ops = {
	.release		= sgx_mmu_notifier_release,
	.free_notifier		= sgx_mmu_notifier_free,
};

static struct sgx_encl_mm *sgx_encl_find_mm(struct sgx_encl *encl,
					    struct mm_struct *mm)
{
	struct sgx_encl_mm *encl_mm = NULL;
	struct sgx_encl_mm *tmp;
	int idx;

	idx = srcu_read_lock(&encl->srcu);

	list_for_each_entry_rcu(tmp, &encl->mm_list, list) {
		if (tmp->mm == mm) {
			encl_mm = tmp;
			break;
		}
	}

	srcu_read_unlock(&encl->srcu, idx);

	return encl_mm;
}

int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
{
	struct sgx_encl_mm *encl_mm;
	int ret;

	/*
	 * Even though a single enclave may be mapped into an mm more than once,
	 * each 'mm' only appears once on encl->mm_list. This is guaranteed by
	 * holding the mm's mmap lock for write before an mm can be added to or
	 * removed from an encl->mm_list.
	 */
	mmap_assert_write_locked(mm);

	/*
	 * It's possible that an entry already exists in the mm_list, because it
	 * is removed only on VFS release or process exit.
	 */
	if (sgx_encl_find_mm(encl, mm))
		return 0;

	encl_mm = kzalloc(sizeof(*encl_mm), GFP_KERNEL);
	if (!encl_mm)
		return -ENOMEM;

	/* Grab a refcount for the encl_mm->encl reference: */
	kref_get(&encl->refcount);
	encl_mm->encl = encl;
	encl_mm->mm = mm;
	encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops;

	ret = __mmu_notifier_register(&encl_mm->mmu_notifier, mm);
	if (ret) {
		kfree(encl_mm);
		return ret;
	}

	spin_lock(&encl->mm_lock);
	list_add_rcu(&encl_mm->list, &encl->mm_list);
	/* Pairs with smp_rmb() in sgx_zap_enclave_ptes(). */
	smp_wmb();
	encl->mm_list_version++;
	spin_unlock(&encl->mm_lock);

	return 0;
}

/**
 * sgx_encl_cpumask() - Query which CPUs might be accessing the enclave
 * @encl: the enclave
 *
 * Some SGX functions require that no cached linear-to-physical address
 * mappings are present before they can succeed. For example, ENCLS[EWB]
 * copies a page from the enclave page cache to regular main memory but
 * it fails if it cannot ensure that there are no cached
 * linear-to-physical address mappings referring to the page.
 *
 * SGX hardware flushes all cached linear-to-physical mappings on a CPU
 * when an enclave is exited via ENCLU[EEXIT] or an Asynchronous Enclave
 * Exit (AEX). Exiting an enclave will thus ensure cached linear-to-physical
 * address mappings are cleared but coordination with the tracking done within
 * the SGX hardware is needed to support the SGX functions that depend on this
 * cache flush.
 *
 * When the ENCLS[ETRACK] function is issued on an enclave the hardware
 * tracks threads operating inside the enclave at that time. The SGX
 * hardware tracking requires that all the identified threads must have
 * exited the enclave in order to flush the mappings before a function such
 * as ENCLS[EWB] will be permitted.
 *
 * The following flow is used to support SGX functions that require that
 * no cached linear-to-physical address mappings are present:
 * 1) Execute ENCLS[ETRACK] to initiate hardware tracking.
 * 2) Use this function (sgx_encl_cpumask()) to query which CPUs might be
 *    accessing the enclave.
 * 3) Send IPI to identified CPUs, kicking them out of the enclave and
 *    thus flushing all locally cached linear-to-physical address mappings.
 * 4) Execute SGX function.
 *
 * Context: It is required to call this function after ENCLS[ETRACK].
 *          This will ensure that if any new mm appears (racing with
 *          sgx_encl_mm_add()) then the new mm will enter into the
 *          enclave with fresh linear-to-physical address mappings.
 *
 *          It is required that all IPIs are completed before a new
 *          ENCLS[ETRACK] is issued so be sure to protect steps 1 to 3
 *          of the above flow with the enclave's mutex.
 *
 * Return: cpumask of CPUs that might be accessing @encl
 */
const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl)
{
	cpumask_t *cpumask = &encl->cpumask;
	struct sgx_encl_mm *encl_mm;
	int idx;

	cpumask_clear(cpumask);

	idx = srcu_read_lock(&encl->srcu);

	list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
		if (!mmget_not_zero(encl_mm->mm))
			continue;

		cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm));

		mmput_async(encl_mm->mm);
	}

	srcu_read_unlock(&encl->srcu, idx);

	return cpumask;
}

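/*
 * Hypothetical caller sketch (compiled out): the ETRACK -> cpumask -> IPI
 * sequence described above, roughly as the reclaimer performs it before
 * ENCLS[EWB]. The __etrack() wrapper and sgx_ipi_cb() callback are named
 * here only for illustration and are assumptions, not part of this file.
 */
#if 0
static void example_flush_enclave_tlbs(struct sgx_encl *encl, void *secs_va)
{
	__etrack(secs_va);				/* 1) start hardware tracking */
	on_each_cpu_mask(sgx_encl_cpumask(encl),	/* 2) query CPUs in the enclave */
			 sgx_ipi_cb, NULL, 1);		/* 3) IPI kicks them out, flushing TLBs */
	/* 4) the SGX function, e.g. ENCLS[EWB], may now be executed. */
}
#endif
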
static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
					      pgoff_t index)
{
	struct address_space *mapping = encl->backing->f_mapping;
	gfp_t gfpmask = mapping_gfp_mask(mapping);

	return shmem_read_mapping_page_gfp(mapping, index, gfpmask);
}

/**
 * __sgx_encl_get_backing() - Pin the backing storage
 * @encl:	an enclave pointer
 * @page_index:	enclave page index
 * @backing:	data for accessing backing storage for the page
 *
 * Pin the backing storage pages for storing the encrypted contents and Paging
 * Crypto MetaData (PCMD) of an enclave page.
 *
 * Return:
 *   0 on success,
 *   -errno otherwise.
 */
static int __sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
				  struct sgx_backing *backing)
{
	pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
	struct page *contents;
	struct page *pcmd;

	contents = sgx_encl_get_backing_page(encl, page_index);
	if (IS_ERR(contents))
		return PTR_ERR(contents);

	pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off));
	if (IS_ERR(pcmd)) {
		put_page(contents);
		return PTR_ERR(pcmd);
	}

	backing->contents = contents;
	backing->pcmd = pcmd;
	backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1);

	return 0;
}

/*
 * When called from ksgxd, returns the mem_cgroup of a struct mm stored
 * in the enclave's mm_list. When not called from ksgxd, just returns
 * the mem_cgroup of the current task.
 */
static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
{
	struct mem_cgroup *memcg = NULL;
	struct sgx_encl_mm *encl_mm;
	int idx;

	/*
	 * If called from normal task context, return the mem_cgroup
	 * of the current task's mm. The remainder of the handling is for
	 * ksgxd.
	 */
	if (!current_is_ksgxd())
		return get_mem_cgroup_from_mm(current->mm);

	/*
	 * Search the enclave's mm_list to find an mm associated with
	 * this enclave to charge the allocation to.
	 */
	idx = srcu_read_lock(&encl->srcu);

	list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
		if (!mmget_not_zero(encl_mm->mm))
			continue;

		memcg = get_mem_cgroup_from_mm(encl_mm->mm);

		mmput_async(encl_mm->mm);

		break;
	}

	srcu_read_unlock(&encl->srcu, idx);

	/*
	 * In the rare case that there isn't an mm associated with
	 * the enclave, set memcg to the current active mem_cgroup.
	 * This will be the root mem_cgroup if there is no active
	 * mem_cgroup.
	 */
	if (!memcg)
		return get_mem_cgroup_from_mm(NULL);

	return memcg;
}

/**
 * sgx_encl_alloc_backing() - create a new backing storage page
 * @encl:	an enclave pointer
 * @page_index:	enclave page index
 * @backing:	data for accessing backing storage for the page
 *
 * When called from ksgxd, sets the active memcg from one of the
 * mms in the enclave's mm_list prior to any backing page allocation,
 * in order to ensure that shmem page allocations are charged to the
 * enclave. Create a backing page for loading data back into an EPC page with
 * ELDU. This function takes a reference on a new backing page which
 * must be dropped with a corresponding call to sgx_encl_put_backing().
 *
 * Return:
 *   0 on success,
 *   -errno otherwise.
 */
int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
			   struct sgx_backing *backing)
{
	struct mem_cgroup *encl_memcg = sgx_encl_get_mem_cgroup(encl);
	struct mem_cgroup *memcg = set_active_memcg(encl_memcg);
	int ret;

	ret = __sgx_encl_get_backing(encl, page_index, backing);

	set_active_memcg(memcg);
	mem_cgroup_put(encl_memcg);

	return ret;
}

/**
 * sgx_encl_lookup_backing() - retrieve an existing backing storage page
 * @encl:	an enclave pointer
 * @page_index:	enclave page index
 * @backing:	data for accessing backing storage for the page
 *
 * Retrieve a backing page for loading data back into an EPC page with ELDU.
 * It is the caller's responsibility to ensure that it is appropriate to use
 * sgx_encl_lookup_backing() rather than sgx_encl_alloc_backing(). If lookup is
 * not used correctly, this will cause an allocation which is not accounted for.
 * This function takes a reference on an existing backing page which must be
 * dropped with a corresponding call to sgx_encl_put_backing().
 *
 * Return:
 *   0 on success,
 *   -errno otherwise.
 */
static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
				   struct sgx_backing *backing)
{
	return __sgx_encl_get_backing(encl, page_index, backing);
}

/**
 * sgx_encl_put_backing() - Unpin the backing storage
 * @backing:	data for accessing backing storage for the page
 */
void sgx_encl_put_backing(struct sgx_backing *backing)
{
	put_page(backing->pcmd);
	put_page(backing->contents);
}

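/*
 * Illustrative pairing (compiled out): every successful
 * sgx_encl_alloc_backing() or sgx_encl_lookup_backing() call is balanced by
 * sgx_encl_put_backing() once the pinned pages are no longer needed. The
 * helper below is hypothetical and only demonstrates the pattern.
 */
#if 0
static int example_backing_roundtrip(struct sgx_encl *encl, unsigned long page_index)
{
	struct sgx_backing b;
	int ret;

	ret = sgx_encl_alloc_backing(encl, page_index, &b);
	if (ret)
		return ret;

	/* ... access b.contents and b.pcmd + b.pcmd_offset here ... */

	sgx_encl_put_backing(&b);
	return 0;
}
#endif
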
static int sgx_encl_test_and_clear_young_cb(pte_t *ptep, unsigned long addr,
					    void *data)
{
	pte_t pte;
	int ret;

	ret = pte_young(*ptep);
	if (ret) {
		pte = pte_mkold(*ptep);
		set_pte_at((struct mm_struct *)data, addr, ptep, pte);
	}

	return ret;
}

/**
 * sgx_encl_test_and_clear_young() - Test and reset the accessed bit
 * @mm:		mm_struct that is checked
 * @page:	enclave page to be tested for recent access
 *
 * Checks the Access (A) bit from the PTE corresponding to the enclave page and
 * clears it.
 *
 * Return: 1 if the page has been recently accessed and 0 if not.
 */
int sgx_encl_test_and_clear_young(struct mm_struct *mm,
				  struct sgx_encl_page *page)
{
	unsigned long addr = page->desc & PAGE_MASK;
	struct sgx_encl *encl = page->encl;
	struct vm_area_struct *vma;
	int ret;

	ret = sgx_encl_find(mm, addr, &vma);
	if (ret)
		return 0;

	if (encl != vma->vm_private_data)
		return 0;

	ret = apply_to_page_range(vma->vm_mm, addr, PAGE_SIZE,
				  sgx_encl_test_and_clear_young_cb, vma->vm_mm);
	if (ret < 0)
		return 0;

	return ret;
}

struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl,
					  unsigned long offset,
					  u64 secinfo_flags)
{
	struct sgx_encl_page *encl_page;
	unsigned long prot;

	encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL);
	if (!encl_page)
		return ERR_PTR(-ENOMEM);

	encl_page->desc = encl->base + offset;
	encl_page->encl = encl;

	prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ)  |
	       _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) |
	       _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC);

	/*
	 * TCS pages must always have RW set for CPU access while the SECINFO
	 * permissions are *always* zero - the CPU ignores the user provided
	 * values and silently overwrites them with zero permissions.
	 */
	if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS)
		prot |= PROT_READ | PROT_WRITE;

	/* Calculate maximum of the VM flags for the page. */
	encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0);

	return encl_page;
}

/**
 * sgx_zap_enclave_ptes() - remove PTEs mapping the address from enclave
 * @encl: the enclave
 * @addr: page aligned pointer to single page for which PTEs will be removed
 *
 * Multiple VMAs may have an enclave page mapped. Remove the PTE mapping
 * of @addr from each VMA. Ensure that page fault handler is ready to handle
 * new mappings of @addr before calling this function.
 */
void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr)
{
	unsigned long mm_list_version;
	struct sgx_encl_mm *encl_mm;
	struct vm_area_struct *vma;
	int idx, ret;

	do {
		mm_list_version = encl->mm_list_version;

		/* Pairs with smp_wmb() in sgx_encl_mm_add(). */
		smp_rmb();

		idx = srcu_read_lock(&encl->srcu);

		list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
			if (!mmget_not_zero(encl_mm->mm))
				continue;

			mmap_read_lock(encl_mm->mm);

			ret = sgx_encl_find(encl_mm->mm, addr, &vma);
			if (!ret && encl == vma->vm_private_data)
				zap_vma_ptes(vma, addr, PAGE_SIZE);

			mmap_read_unlock(encl_mm->mm);

			mmput_async(encl_mm->mm);
		}

		srcu_read_unlock(&encl->srcu, idx);
	} while (unlikely(encl->mm_list_version != mm_list_version));
}

/**
 * sgx_alloc_va_page() - Allocate a Version Array (VA) page
 * @reclaim: Reclaim EPC pages directly if none available. Enclave
 *           mutex should not be held if this is set.
 *
 * Allocate a free EPC page and convert it to a Version Array (VA) page.
 *
 * Return:
 *   a VA page,
 *   -errno otherwise
 */
struct sgx_epc_page *sgx_alloc_va_page(bool reclaim)
{
	struct sgx_epc_page *epc_page;
	int ret;

	epc_page = sgx_alloc_epc_page(NULL, reclaim);
	if (IS_ERR(epc_page))
		return ERR_CAST(epc_page);

	ret = __epa(sgx_get_epc_virt_addr(epc_page));
	if (ret) {
		WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret);
		sgx_encl_free_epc_page(epc_page);
		return ERR_PTR(-EFAULT);
	}

	return epc_page;
}

/**
 * sgx_alloc_va_slot - allocate a VA slot
 * @va_page:	a &struct sgx_va_page instance
 *
 * Allocates a slot from a &struct sgx_va_page instance.
 *
 * Return: offset of the slot inside the VA page
 */
unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page)
{
	int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);

	if (slot < SGX_VA_SLOT_COUNT)
		set_bit(slot, va_page->slots);

	return slot << 3;
}

/**
 * sgx_free_va_slot - free a VA slot
 * @va_page:	a &struct sgx_va_page instance
 * @offset:	offset of the slot inside the VA page
 *
 * Frees a slot from a &struct sgx_va_page instance.
 */
void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset)
{
	clear_bit(offset >> 3, va_page->slots);
}

/**
 * sgx_va_page_full - is the VA page full?
 * @va_page:	a &struct sgx_va_page instance
 *
 * Return: true if all slots have been taken
 */
bool sgx_va_page_full(struct sgx_va_page *va_page)
{
	int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);

	return slot == SGX_VA_SLOT_COUNT;
}

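/*
 * Illustrative usage (compiled out): a VA slot is held while its enclave page
 * lives in the backing store and is released when the page is loaded back
 * with ELDU, as sgx_encl_eldu() above does via sgx_free_va_slot(). This
 * hypothetical helper only demonstrates the alloc/full/free trio.
 */
#if 0
static void example_va_slot_roundtrip(struct sgx_va_page *va_page)
{
	unsigned int offset = sgx_alloc_va_slot(va_page);

	if (sgx_va_page_full(va_page))
		pr_info("all %d slots of this VA page are taken\n", SGX_VA_SLOT_COUNT);

	sgx_free_va_slot(va_page, offset);
}
#endif
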
/**
 * sgx_encl_free_epc_page - free an EPC page assigned to an enclave
 * @page:	EPC page to be freed
 *
 * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and
 * only upon success, it puts the page back to free page list. Otherwise, it
 * gives a WARNING to indicate page is leaked.
 */
void sgx_encl_free_epc_page(struct sgx_epc_page *page)
{
	int ret;

	WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);

	ret = __eremove(sgx_get_epc_virt_addr(page));
	if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret))
		return;

	sgx_free_epc_page(page);
}