// SPDX-License-Identifier: GPL-2.0-only
/*
 * CPU-agnostic AMD IO page table allocator.
 *
 * Copyright (C) 2020 Advanced Micro Devices, Inc.
 * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
 */

#define pr_fmt(fmt)	"AMD-Vi: " fmt
#define dev_fmt(fmt)	pr_fmt(fmt)

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/io-pgtable.h>
#include <linux/kernel.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/dma-mapping.h>

#include <asm/barrier.h>

#include "amd_iommu_types.h"
#include "amd_iommu.h"
#include "../iommu-pages.h"
/*
 * Helper function to get the first pte of a large mapping
 */
static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
			 unsigned long *count)
{
	unsigned long pte_mask, pg_size, cnt;
	u64 *fpte;

	pg_size  = PTE_PAGE_SIZE(*pte);
	cnt      = PAGE_SIZE_PTE_COUNT(pg_size);
	pte_mask = ~((cnt << 3) - 1);
	fpte     = (u64 *)(((unsigned long)pte) & pte_mask);

	if (page_size)
		*page_size = pg_size;

	if (count)
		*count = cnt;

	return fpte;
}
/****************************************************************************
 *
 * The functions below are used to create the page table mappings for
 * unity mapped regions.
 *
 ****************************************************************************/
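
/*
 * Queue a single page-table page on the freelist; it is only handed back
 * to the page allocator later, via iommu_put_pages_list(), once the IOMMU
 * no longer references it.
 */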
static void free_pt_page(u64 *pt, struct list_head *freelist)
{
	struct page *p = virt_to_page(pt);

	list_add_tail(&p->lru, freelist);
}
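
/*
 * Recursively queue a sub-tree of page-table pages on the freelist,
 * including @pt itself.
 */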
static void free_pt_lvl(u64 *pt, struct list_head *freelist, int lvl)
{
	u64 *p;
	int i;

	for (i = 0; i < 512; ++i) {
		/* PTE present? */
		if (!IOMMU_PTE_PRESENT(pt[i]))
			continue;

		/* Large PTE? */
		if (PM_PTE_LEVEL(pt[i]) == 0 ||
		    PM_PTE_LEVEL(pt[i]) == 7)
			continue;

		/*
		 * Free the next level. No need to look at l1 tables here since
		 * they can only contain leaf PTEs; just free them directly.
		 */
		p = IOMMU_PTE_PAGE(pt[i]);
		if (lvl > 2)
			free_pt_lvl(p, freelist, lvl - 1);
		else
			free_pt_page(p, freelist);
	}

	free_pt_page(pt, freelist);
}
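
/*
 * Queue all pages of the sub-page-table rooted at @root, dispatching on
 * the page mode recorded in the parent PTE.
 */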
static void free_sub_pt(u64 *root, int mode, struct list_head *freelist)
{
	switch (mode) {
	case PAGE_MODE_NONE:
	case PAGE_MODE_7_LEVEL:
		break;
	case PAGE_MODE_1_LEVEL:
		free_pt_page(root, freelist);
		break;
	case PAGE_MODE_2_LEVEL:
	case PAGE_MODE_3_LEVEL:
	case PAGE_MODE_4_LEVEL:
	case PAGE_MODE_5_LEVEL:
	case PAGE_MODE_6_LEVEL:
		free_pt_lvl(root, freelist, mode);
		break;
	default:
		BUG();
	}
}
/*
 * This function is used to add another level to an IO page table. Adding
 * another level increases the size of the address space by 9 bits to a size up
 * to 64 bits.
 */
static bool increase_address_space(struct amd_io_pgtable *pgtable,
				   unsigned long address,
				   unsigned int page_size_level,
				   gfp_t gfp)
{
	struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
	struct protection_domain *domain =
		container_of(pgtable, struct protection_domain, iop);
	unsigned long flags;
	bool ret = true;
	u64 *pte;

	pte = iommu_alloc_page_node(cfg->amd.nid, gfp);
	if (!pte)
		return false;

	spin_lock_irqsave(&domain->lock, flags);

	if (address <= PM_LEVEL_SIZE(pgtable->mode) &&
	    pgtable->mode - 1 >= page_size_level)
		goto out;

	ret = false;
	if (WARN_ON_ONCE(pgtable->mode == PAGE_MODE_6_LEVEL))
		goto out;

	*pte = PM_LEVEL_PDE(pgtable->mode, iommu_virt_to_phys(pgtable->root));

	pgtable->root  = pte;
	pgtable->mode += 1;
	amd_iommu_update_and_flush_device_table(domain);

	pte = NULL;
	ret = true;

out:
	spin_unlock_irqrestore(&domain->lock, flags);
	iommu_free_page(pte);

	return ret;
}
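
/*
 * Walk the page table for @address down to the level implied by @page_size,
 * allocating intermediate levels as needed, and return a pointer to the
 * target PTE. The address space is grown first if the current mode cannot
 * cover @address or the requested page size.
 */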
static u64 *alloc_pte(struct amd_io_pgtable *pgtable,
		      unsigned long address,
		      unsigned long page_size,
		      u64 **pte_page,
		      gfp_t gfp,
		      bool *updated)
{
	unsigned long last_addr = address + (page_size - 1);
	struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
	int level, end_lvl;
	u64 *pte, *page;

	BUG_ON(!is_power_of_2(page_size));

	while (last_addr > PM_LEVEL_SIZE(pgtable->mode) ||
	       pgtable->mode - 1 < PAGE_SIZE_LEVEL(page_size)) {
		/*
		 * Return an error if there is no memory to update the
		 * page-table.
		 */
		if (!increase_address_space(pgtable, last_addr,
					    PAGE_SIZE_LEVEL(page_size), gfp))
			return NULL;
	}

	level   = pgtable->mode - 1;
	pte     = &pgtable->root[PM_LEVEL_INDEX(level, address)];
	address = PAGE_SIZE_ALIGN(address, page_size);
	end_lvl = PAGE_SIZE_LEVEL(page_size);

	while (level > end_lvl) {
		u64 __pte, __npte;
		int pte_level;

		__pte     = *pte;
		pte_level = PM_PTE_LEVEL(__pte);

		/*
		 * If we replace a series of large PTEs, we need
		 * to tear down all of them.
		 */
		if (IOMMU_PTE_PRESENT(__pte) &&
		    pte_level == PAGE_MODE_7_LEVEL) {
			unsigned long count, i;
			u64 *lpte;

			lpte = first_pte_l7(pte, NULL, &count);

			/*
			 * Unmap the replicated PTEs that still match the
			 * original large mapping
			 */
			for (i = 0; i < count; ++i)
				cmpxchg64(&lpte[i], __pte, 0ULL);

			*updated = true;
			continue;
		}

		if (!IOMMU_PTE_PRESENT(__pte) ||
		    pte_level == PAGE_MODE_NONE) {
			page = iommu_alloc_page_node(cfg->amd.nid, gfp);

			if (!page)
				return NULL;

			__npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));

			/* pte could have been changed somewhere. */
			if (!try_cmpxchg64(pte, &__pte, __npte))
				iommu_free_page(page);
			else if (IOMMU_PTE_PRESENT(__pte))
				*updated = true;

			continue;
		}

		/* No level skipping support yet */
		if (pte_level != level)
			return NULL;

		level -= 1;

		pte = IOMMU_PTE_PAGE(__pte);

		if (pte_page && level == end_lvl)
			*pte_page = pte;

		pte = &pte[PM_LEVEL_INDEX(level, address)];
	}

	return pte;
}
/*
 * This function checks if there is a PTE for a given DMA address. If
 * there is one, it returns the pointer to it.
 */
static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
		      unsigned long address,
		      unsigned long *page_size)
{
	int level;
	u64 *pte;

	*page_size = 0;

	if (address > PM_LEVEL_SIZE(pgtable->mode))
		return NULL;

	level      = pgtable->mode - 1;
	pte        = &pgtable->root[PM_LEVEL_INDEX(level, address)];
	*page_size = PTE_LEVEL_PAGE_SIZE(level);

	while (level > 0) {

		/* Not Present */
		if (!IOMMU_PTE_PRESENT(*pte))
			return NULL;

		/* Large PTE */
		if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL ||
		    PM_PTE_LEVEL(*pte) == PAGE_MODE_NONE)
			break;

		/* No level skipping support yet */
		if (PM_PTE_LEVEL(*pte) != level)
			return NULL;

		level -= 1;

		/* Walk to the next level */
		pte        = IOMMU_PTE_PAGE(*pte);
		pte        = &pte[PM_LEVEL_INDEX(level, address)];
		*page_size = PTE_LEVEL_PAGE_SIZE(level);
	}

	/*
	 * If we have a series of large PTEs, make
	 * sure to return a pointer to the first one.
	 */
	if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
		pte = first_pte_l7(pte, page_size, NULL);

	return pte;
}
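
/*
 * Atomically clear a PTE and, if it pointed to a lower-level table, queue
 * that sub-table on the freelist.
 */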
static void free_clear_pte(u64 *pte, u64 pteval, struct list_head *freelist)
{
	u64 *pt;
	int mode;

	while (!try_cmpxchg64(pte, &pteval, 0))
		pr_warn("AMD-Vi: IOMMU pte changed since we read it\n");

	if (!IOMMU_PTE_PRESENT(pteval))
		return;

	pt   = IOMMU_PTE_PAGE(pteval);
	mode = IOMMU_PTE_MODE(pteval);

	free_sub_pt(pt, mode, freelist);
}
/*
 * Generic mapping function. It maps a physical address into a DMA
 * address space. It allocates the page table pages if necessary.
 * In the future it can be extended to a generic mapping function
 * supporting all features of AMD IOMMU page tables like level skipping
 * and full 64 bit address spaces.
 */
static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
			      phys_addr_t paddr, size_t pgsize, size_t pgcount,
			      int prot, gfp_t gfp, size_t *mapped)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	LIST_HEAD(freelist);
	bool updated = false;
	u64 __pte, *pte;
	int ret, i, count;
	size_t size = pgcount << __ffs(pgsize);
	unsigned long o_iova = iova;

	BUG_ON(!IS_ALIGNED(iova, pgsize));
	BUG_ON(!IS_ALIGNED(paddr, pgsize));

	ret = -EINVAL;
	if (!(prot & IOMMU_PROT_MASK))
		goto out;

	while (pgcount > 0) {
		count = PAGE_SIZE_PTE_COUNT(pgsize);
		pte   = alloc_pte(pgtable, iova, pgsize, NULL, gfp, &updated);

		ret = -ENOMEM;
		if (!pte)
			goto out;

		for (i = 0; i < count; ++i)
			free_clear_pte(&pte[i], pte[i], &freelist);

		if (!list_empty(&freelist))
			updated = true;

		if (count > 1) {
			__pte = PAGE_SIZE_PTE(__sme_set(paddr), pgsize);
			__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC;
		} else
			__pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC;

		if (prot & IOMMU_PROT_IR)
			__pte |= IOMMU_PTE_IR;
		if (prot & IOMMU_PROT_IW)
			__pte |= IOMMU_PTE_IW;

		for (i = 0; i < count; ++i)
			pte[i] = __pte;

		iova  += pgsize;
		paddr += pgsize;
		pgcount--;
		if (mapped)
			*mapped += pgsize;
	}

	ret = 0;

out:
	if (updated) {
		struct protection_domain *dom = io_pgtable_ops_to_domain(ops);
		unsigned long flags;

		spin_lock_irqsave(&dom->lock, flags);
		/*
		 * Flush domain TLB(s) and wait for completion. Any Device-Table
		 * Updates and flushing already happened in
		 * increase_address_space().
		 */
		amd_iommu_domain_flush_pages(dom, o_iova, size);
		spin_unlock_irqrestore(&dom->lock, flags);
	}

	/* Everything flushed out, free pages now */
	iommu_put_pages_list(&freelist);

	return ret;
}
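
/*
 * Unmap a range by clearing the PTEs backing it. The walk stops at the
 * first hole it finds and returns the number of bytes actually unmapped.
 */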
static unsigned long iommu_v1_unmap_pages(struct io_pgtable_ops *ops,
					  unsigned long iova,
					  size_t pgsize, size_t pgcount,
					  struct iommu_iotlb_gather *gather)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long long unmapped;
	unsigned long unmap_size;
	u64 *pte;
	size_t size = pgcount << __ffs(pgsize);

	BUG_ON(!is_power_of_2(pgsize));

	unmapped = 0;

	while (unmapped < size) {
		pte = fetch_pte(pgtable, iova, &unmap_size);
		if (pte) {
			int i, count;

			count = PAGE_SIZE_PTE_COUNT(unmap_size);
			for (i = 0; i < count; i++)
				pte[i] = 0ULL;
		} else {
			return unmapped;
		}

		iova = (iova & ~(unmap_size - 1)) + unmap_size;
		unmapped += unmap_size;
	}

	return unmapped;
}
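
/*
 * Translate an IOVA to a physical address by looking up its PTE and
 * combining the PTE's address bits with the offset inside the page.
 */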
static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long offset_mask, pte_pgsize;
	u64 *pte, __pte;

	pte = fetch_pte(pgtable, iova, &pte_pgsize);

	if (!pte || !IOMMU_PTE_PRESENT(*pte))
		return 0;

	offset_mask = pte_pgsize - 1;
	__pte       = __sme_clr(*pte & PM_ADDR_MASK);

	return (__pte & ~offset_mask) | (iova & offset_mask);
}
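
/*
 * Test, and unless IOMMU_DIRTY_NO_CLEAR is set also clear, the Host Dirty
 * bit across all replicated PTEs of a mapping; see the spec quote below
 * for why every replica has to be consulted.
 */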
static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
				     unsigned long flags)
{
	bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
	bool dirty = false;
	int i, count;

	/*
	 * 2.2.3.2 Host Dirty Support
	 * When a non-default page size is used, software must OR the
	 * Dirty bits in all of the replicated host PTEs used to map
	 * the page. The IOMMU does not guarantee the Dirty bits are
	 * set in all of the replicated PTEs. Any portion of the page
	 * may have been written even if the Dirty bit is set in only
	 * one of the replicated PTEs.
	 */
	count = PAGE_SIZE_PTE_COUNT(size);
	for (i = 0; i < count && test_only; i++) {
		if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) {
			dirty = true;
			break;
		}
	}

	for (i = 0; i < count && !test_only; i++) {
		if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
				       (unsigned long *)&ptep[i])) {
			dirty = true;
		}
	}

	return dirty;
}
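
/*
 * Walk the IOVA range, record dirty mappings in @dirty, and clear their
 * Host Dirty bits unless IOMMU_DIRTY_NO_CLEAR was passed in @flags.
 */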
static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
					 unsigned long iova, size_t size,
					 unsigned long flags,
					 struct iommu_dirty_bitmap *dirty)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long end = iova + size - 1;

	do {
		unsigned long pgsize = 0;
		u64 *ptep, pte;

		ptep = fetch_pte(pgtable, iova, &pgsize);
		if (ptep)
			pte = READ_ONCE(*ptep);
		if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
			pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
			iova += pgsize;
			continue;
		}

		/*
		 * Mark the whole IOVA range as dirty even if only one of
		 * the replicated PTEs was marked dirty.
		 */
		if (pte_test_and_clear_dirty(ptep, pgsize, flags))
			iommu_dirty_bitmap_record(dirty, iova, pgsize);

		iova += pgsize;
	} while (iova < end);

	return 0;
}
/*
 * ----------------------------------------------------
 */
static void v1_free_pgtable(struct io_pgtable *iop)
{
	struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl);
	LIST_HEAD(freelist);

	if (pgtable->mode == PAGE_MODE_NONE)
		return;

	/* Page-table is not visible to IOMMU anymore, so free it */
	BUG_ON(pgtable->mode < PAGE_MODE_NONE ||
	       pgtable->mode > PAGE_MODE_6_LEVEL);

	free_sub_pt(pgtable->root, pgtable->mode, &freelist);
	iommu_put_pages_list(&freelist);
}
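
/*
 * Allocate a v1 page table: a single root page in 3-level mode, with the
 * supported page sizes and address widths reported back through @cfg.
 */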
static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
{
	struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);

	pgtable->root = iommu_alloc_page_node(cfg->amd.nid, GFP_KERNEL);
	if (!pgtable->root)
		return NULL;
	pgtable->mode = PAGE_MODE_3_LEVEL;

	cfg->pgsize_bitmap = amd_iommu_pgsize_bitmap;
	cfg->ias           = IOMMU_IN_ADDR_BIT_SIZE;
	cfg->oas           = IOMMU_OUT_ADDR_BIT_SIZE;

	pgtable->pgtbl.ops.map_pages            = iommu_v1_map_pages;
	pgtable->pgtbl.ops.unmap_pages          = iommu_v1_unmap_pages;
	pgtable->pgtbl.ops.iova_to_phys         = iommu_v1_iova_to_phys;
	pgtable->pgtbl.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;

	return &pgtable->pgtbl;
}

struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = {
	.alloc	= v1_alloc_pgtable,
	.free	= v1_free_pgtable,
};