// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
 *
 * Rewrite, cleanup, new allocation schemes, virtual merging:
 * Copyright (C) 2004 Olof Johansson, IBM Corporation
 *               and  Ben. Herrenschmidt, IBM Corporation
 *
 * Dynamic DMA mapping support, bus-independent parts.
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/dma-mapping.h>
#include <linux/bitmap.h>
#include <linux/iommu-helper.h>
#include <linux/crash_dump.h>
#include <linux/hash.h>
#include <linux/fault-inject.h>
#include <linux/pci.h>
#include <linux/iommu.h>
#include <linux/sched.h>
#include <asm/iommu.h>
#include <asm/pci-bridge.h>
#include <asm/machdep.h>
#include <asm/kdump.h>
#include <asm/fadump.h>
#include <asm/vio.h>
#include <asm/tce.h>
#include <asm/mmu_context.h>
#define DBG(...)

static int novmerge;

static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int);

static int __init setup_iommu(char *str)
{
	if (!strcmp(str, "novmerge"))
		novmerge = 1;
	else if (!strcmp(str, "vmerge"))
		novmerge = 0;
	return 1;
}

__setup("iommu=", setup_iommu);
static DEFINE_PER_CPU(unsigned int, iommu_pool_hash);

/*
 * We precalculate the hash to avoid doing it on every allocation.
 *
 * The hash is important to spread CPUs across all the pools. For example,
 * on a POWER7 with 4-way SMT, interrupts land on the primary threads, and
 * with 4 pools every primary thread would otherwise map to the same pool.
 */
static int __init setup_iommu_pool_hash(void)
{
	unsigned int i;

	for_each_possible_cpu(i)
		per_cpu(iommu_pool_hash, i) = hash_32(i, IOMMU_POOL_HASHBITS);

	return 0;
}
subsys_initcall(setup_iommu_pool_hash);
#ifdef CONFIG_FAIL_IOMMU

static DECLARE_FAULT_ATTR(fail_iommu);

static int __init setup_fail_iommu(char *str)
{
	return setup_fault_attr(&fail_iommu, str);
}

__setup("fail_iommu=", setup_fail_iommu);
static bool should_fail_iommu(struct device *dev)
{
	return dev->archdata.fail_iommu && should_fail(&fail_iommu, 1);
}
static int __init fail_iommu_debugfs(void)
{
	struct dentry *dir = fault_create_debugfs_attr("fail_iommu",
						       NULL, &fail_iommu);

	return PTR_ERR_OR_ZERO(dir);
}
late_initcall(fail_iommu_debugfs);
static ssize_t fail_iommu_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	return sprintf(buf, "%d\n", dev->archdata.fail_iommu);
}
static ssize_t fail_iommu_store(struct device *dev,
				struct device_attribute *attr, const char *buf,
				size_t count)
{
	int i;

	if (count > 0 && sscanf(buf, "%d", &i) > 0)
		dev->archdata.fail_iommu = (i == 0) ? 0 : 1;

	return count;
}

static DEVICE_ATTR_RW(fail_iommu);
static int fail_iommu_bus_notify(struct notifier_block *nb,
				 unsigned long action, void *data)
{
	struct device *dev = data;

	if (action == BUS_NOTIFY_ADD_DEVICE) {
		if (device_create_file(dev, &dev_attr_fail_iommu))
			pr_warn("Unable to create IOMMU fault injection sysfs entries\n");
	} else if (action == BUS_NOTIFY_DEL_DEVICE) {
		device_remove_file(dev, &dev_attr_fail_iommu);
	}

	return 0;
}

static struct notifier_block fail_iommu_bus_notifier = {
	.notifier_call = fail_iommu_bus_notify
};
static int __init fail_iommu_setup(void)
{
#ifdef CONFIG_PCI
	bus_register_notifier(&pci_bus_type, &fail_iommu_bus_notifier);
#endif
#ifdef CONFIG_IBMVIO
	bus_register_notifier(&vio_bus_type, &fail_iommu_bus_notifier);
#endif

	return 0;
}
/*
 * Must execute after the PCI and VIO subsystems have initialised but before
 * devices are probed.
 */
arch_initcall(fail_iommu_setup);
#else
static inline bool should_fail_iommu(struct device *dev)
{
	return false;
}
#endif
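/*
 * Allocate a contiguous run of 'npages' TCE slots from the table bitmap.
 * The pool is picked via the per-CPU hash (or the large pool for
 * allocations of more than 15 pages); if the chosen pool is full, the
 * remaining pools are scanned before giving up. The search honours the
 * caller's DMA mask, the device's segment boundary and the requested
 * alignment. Returns the starting bitmap index on success, or
 * DMA_MAPPING_ERROR on failure.
 */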
static unsigned long iommu_range_alloc(struct device *dev,
				       struct iommu_table *tbl,
				       unsigned long npages,
				       unsigned long *handle,
				       unsigned long mask,
				       unsigned int align_order)
{
	unsigned long n, end, start;
	unsigned long limit;
	int largealloc = npages > 15;
	int pass = 0;
	unsigned long align_mask;
	unsigned long boundary_size;
	unsigned long flags;
	unsigned int pool_nr;
	struct iommu_pool *pool;

	align_mask = (1ull << align_order) - 1;

	/* This allocator was derived from x86_64's bit string search */

	/* Sanity check */
	if (unlikely(npages == 0)) {
		if (printk_ratelimit())
			WARN_ON(1);
		return DMA_MAPPING_ERROR;
	}

	if (should_fail_iommu(dev))
		return DMA_MAPPING_ERROR;

	/*
	 * We don't need to disable preemption here because any CPU can
	 * safely use any IOMMU pool.
	 */
	pool_nr = raw_cpu_read(iommu_pool_hash) & (tbl->nr_pools - 1);

	if (largealloc)
		pool = &(tbl->large_pool);
	else
		pool = &(tbl->pools[pool_nr]);

	spin_lock_irqsave(&(pool->lock), flags);

again:
	if ((pass == 0) && handle && *handle &&
	    (*handle >= pool->start) && (*handle < pool->end))
		start = *handle;
	else
		start = pool->hint;

	limit = pool->end;

	/* The case below can happen if we have a small segment appended
	 * to a large one, or when the previous alloc was at the very end
	 * of the available space. If so, go back to the initial start.
	 */
	if (start >= limit)
		start = pool->start;

	if (limit + tbl->it_offset > mask) {
		limit = mask - tbl->it_offset + 1;
		/* If we're constrained on address range, first try
		 * at the masked hint to avoid O(n) search complexity,
		 * but on second pass, start at 0 in pool 0.
		 */
		if ((start & mask) >= limit || pass > 0) {
			spin_unlock(&(pool->lock));
			pool = &(tbl->pools[0]);
			spin_lock(&(pool->lock));
			start = pool->start;
		} else {
			start &= mask;
		}
	}

	if (dev)
		boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
				      1 << tbl->it_page_shift);
	else
		boundary_size = ALIGN(1UL << 32, 1 << tbl->it_page_shift);
	/* 4GB boundary for iseries_hv_alloc and iseries_hv_map */

	n = iommu_area_alloc(tbl->it_map, limit, start, npages, tbl->it_offset,
			     boundary_size >> tbl->it_page_shift, align_mask);
	if (n == -1) {
		if (likely(pass == 0)) {
			/* First try the pool from the start */
			pool->hint = pool->start;
			pass++;
			goto again;

		} else if (pass <= tbl->nr_pools) {
			/* Now try scanning all the other pools */
			spin_unlock(&(pool->lock));
			pool_nr = (pool_nr + 1) & (tbl->nr_pools - 1);
			pool = &tbl->pools[pool_nr];
			spin_lock(&(pool->lock));
			pool->hint = pool->start;
			pass++;
			goto again;

		} else {
			/* Give up */
			spin_unlock_irqrestore(&(pool->lock), flags);
			return DMA_MAPPING_ERROR;
		}
	}

	end = n + npages;

	/* Bump the hint to a new block for small allocs. */
	if (largealloc) {
		/* Don't bump to new block to avoid fragmentation */
		pool->hint = end;
	} else {
		/* Overflow will be taken care of at the next allocation */
		pool->hint = (end + tbl->it_blocksize - 1) &
			     ~(tbl->it_blocksize - 1);
	}

	/* Update handle for SG allocations */
	if (handle)
		*handle = end;

	spin_unlock_irqrestore(&(pool->lock), flags);

	return n;
}
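/*
 * Allocate DMA space for one virtually contiguous buffer and program the
 * corresponding TCEs via tbl->it_ops->set(). On a transient set() failure
 * the bitmap allocation is rolled back and DMA_MAPPING_ERROR is returned.
 */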
static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
			      void *page, unsigned int npages,
			      enum dma_data_direction direction,
			      unsigned long mask, unsigned int align_order,
			      unsigned long attrs)
{
	unsigned long entry;
	dma_addr_t ret = DMA_MAPPING_ERROR;
	int build_fail;

	entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order);

	if (unlikely(entry == DMA_MAPPING_ERROR))
		return DMA_MAPPING_ERROR;

	entry += tbl->it_offset;	/* Offset into real TCE table */
	ret = entry << tbl->it_page_shift;	/* Set the return dma address */

	/* Put the TCEs in the HW table */
	build_fail = tbl->it_ops->set(tbl, entry, npages,
				      (unsigned long)page &
				      IOMMU_PAGE_MASK(tbl), direction, attrs);

	/* tbl->it_ops->set() only returns non-zero for transient errors.
	 * Clean up the table bitmap in this case and return
	 * DMA_MAPPING_ERROR. For all other errors the functionality is
	 * not altered.
	 */
	if (unlikely(build_fail)) {
		__iommu_free(tbl, ret, npages);
		return DMA_MAPPING_ERROR;
	}

	/* Flush/invalidate TLB caches if necessary */
	if (tbl->it_ops->flush)
		tbl->it_ops->flush(tbl);

	/* Make sure updates are seen by hardware */
	mb();

	return ret;
}
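/*
 * Sanity-check a DMA address and page count against the table bounds
 * before freeing. Returns false (with a rate-limited warning) if the
 * range does not belong to this table.
 */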
static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr,
			     unsigned int npages)
{
	unsigned long entry, free_entry;

	entry = dma_addr >> tbl->it_page_shift;
	free_entry = entry - tbl->it_offset;

	if (((free_entry + npages) > tbl->it_size) ||
	    (entry < tbl->it_offset)) {
		if (printk_ratelimit()) {
			printk(KERN_INFO "iommu_free: invalid entry\n");
			printk(KERN_INFO "\tentry    = 0x%lx\n", entry);
			printk(KERN_INFO "\tdma_addr = 0x%llx\n", (u64)dma_addr);
			printk(KERN_INFO "\tTable    = 0x%llx\n", (u64)tbl);
			printk(KERN_INFO "\tbus#     = 0x%llx\n", (u64)tbl->it_busno);
			printk(KERN_INFO "\tsize     = 0x%llx\n", (u64)tbl->it_size);
			printk(KERN_INFO "\tstartOff = 0x%llx\n", (u64)tbl->it_offset);
			printk(KERN_INFO "\tindex    = 0x%llx\n", (u64)tbl->it_index);
			WARN_ON(1);
		}

		return false;
	}

	return true;
}
static struct iommu_pool *get_pool(struct iommu_table *tbl,
				   unsigned long entry)
{
	struct iommu_pool *p;
	unsigned long largepool_start = tbl->large_pool.start;

	/* The large pool is the last pool at the top of the table */
	if (entry >= largepool_start) {
		p = &tbl->large_pool;
	} else {
		unsigned int pool_nr = entry / tbl->poolsize;

		BUG_ON(pool_nr > tbl->nr_pools);
		p = &tbl->pools[pool_nr];
	}

	return p;
}
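/*
 * Clear the TCEs and the bitmap bits of a previously allocated range.
 * The TCEs are cleared outside the pool lock; only the bitmap update is
 * done under it.
 */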
static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
			 unsigned int npages)
{
	unsigned long entry, free_entry;
	unsigned long flags;
	struct iommu_pool *pool;

	entry = dma_addr >> tbl->it_page_shift;
	free_entry = entry - tbl->it_offset;

	pool = get_pool(tbl, free_entry);

	if (!iommu_free_check(tbl, dma_addr, npages))
		return;

	tbl->it_ops->clear(tbl, entry, npages);

	spin_lock_irqsave(&(pool->lock), flags);
	bitmap_clear(tbl->it_map, free_entry, npages);
	spin_unlock_irqrestore(&(pool->lock), flags);
}
static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
		       unsigned int npages)
{
	__iommu_free(tbl, dma_addr, npages);

	/* Make sure TLB cache is flushed if the HW needs it. We do
	 * not do an mb() here on purpose, it is not needed on any of
	 * the current platforms.
	 */
	if (tbl->it_ops->flush)
		tbl->it_ops->flush(tbl);
}
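/*
 * Map a scatter/gather list. Entries whose resulting DMA addresses turn
 * out to be contiguous are merged into a single segment, unless
 * "iommu=novmerge" was given or the merged length would exceed the
 * device's maximum segment size. Returns the number of DMA segments
 * produced, or 0 on failure.
 */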
int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
		     struct scatterlist *sglist, int nelems,
		     unsigned long mask, enum dma_data_direction direction,
		     unsigned long attrs)
{
	dma_addr_t dma_next = 0, dma_addr;
	struct scatterlist *s, *outs, *segstart;
	int outcount, incount, i, build_fail = 0;
	unsigned int align;
	unsigned long handle;
	unsigned int max_seg_size;

	BUG_ON(direction == DMA_NONE);

	if ((nelems == 0) || !tbl)
		return 0;

	outs = s = segstart = &sglist[0];
	outcount = 1;
	incount = nelems;
	handle = 0;

	/* Init first segment length for backout at failure */
	outs->dma_length = 0;

	DBG("sg mapping %d elements:\n", nelems);

	max_seg_size = dma_get_max_seg_size(dev);
	for_each_sg(sglist, s, nelems, i) {
		unsigned long vaddr, npages, entry, slen;

		slen = s->length;
		/* Sanity check */
		if (slen == 0) {
			dma_next = 0;
			continue;
		}
		/* Allocate iommu entries for that segment */
		vaddr = (unsigned long) sg_virt(s);
		npages = iommu_num_pages(vaddr, slen, IOMMU_PAGE_SIZE(tbl));
		align = 0;
		if (tbl->it_page_shift < PAGE_SHIFT && slen >= PAGE_SIZE &&
		    (vaddr & ~PAGE_MASK) == 0)
			align = PAGE_SHIFT - tbl->it_page_shift;
		entry = iommu_range_alloc(dev, tbl, npages, &handle,
					  mask >> tbl->it_page_shift, align);

		DBG("  - vaddr: %lx, size: %lx\n", vaddr, slen);

		/* Handle failure */
		if (unlikely(entry == DMA_MAPPING_ERROR)) {
			if (!(attrs & DMA_ATTR_NO_WARN) &&
			    printk_ratelimit())
				dev_info(dev, "iommu_alloc failed, tbl %p "
					 "vaddr %lx npages %lu\n", tbl, vaddr,
					 npages);
			goto failure;
		}

		/* Convert entry to a dma_addr_t */
		entry += tbl->it_offset;
		dma_addr = entry << tbl->it_page_shift;
		dma_addr |= (s->offset & ~IOMMU_PAGE_MASK(tbl));

		DBG("  - %lu pages, entry: %lx, dma_addr: %lx\n",
		    npages, entry, dma_addr);

		/* Insert into HW table */
		build_fail = tbl->it_ops->set(tbl, entry, npages,
					      vaddr & IOMMU_PAGE_MASK(tbl),
					      direction, attrs);
		if (unlikely(build_fail))
			goto failure;

		/* If we are in an open segment, try merging */
		if (segstart != s) {
			DBG("  - trying merge...\n");
			/* We cannot merge if:
			 * - the allocated dma_addr isn't contiguous to the
			 *   previous allocation
			 */
			if (novmerge || (dma_addr != dma_next) ||
			    (outs->dma_length + s->length > max_seg_size)) {
				/* Can't merge: create a new segment */
				segstart = s;
				outcount++;
				outs = sg_next(outs);
				DBG("    can't merge, new segment.\n");
			} else {
				outs->dma_length += s->length;
				DBG("    merged, new len: %ux\n", outs->dma_length);
			}
		}

		if (segstart == s) {
			/* This is a new segment, fill entries */
			DBG("  - filling new segment.\n");
			outs->dma_address = dma_addr;
			outs->dma_length = slen;
		}

		/* Calculate next page pointer for contiguous check */
		dma_next = dma_addr + slen;

		DBG("  - dma next is: %lx\n", dma_next);
	}

	/* Flush/invalidate TLB caches if necessary */
	if (tbl->it_ops->flush)
		tbl->it_ops->flush(tbl);

	DBG("mapped %d elements:\n", outcount);

	/* For the sake of ppc_iommu_unmap_sg, we clear out the length in the
	 * next entry of the sglist if we didn't fill the list completely
	 */
	if (outcount < incount) {
		outs = sg_next(outs);
		outs->dma_address = DMA_MAPPING_ERROR;
		outs->dma_length = 0;
	}

	/* Make sure updates are seen by hardware */
	mb();

	return outcount;

 failure:
	for_each_sg(sglist, s, nelems, i) {
		if (s->dma_length != 0) {
			unsigned long vaddr, npages;

			vaddr = s->dma_address & IOMMU_PAGE_MASK(tbl);
			npages = iommu_num_pages(s->dma_address, s->dma_length,
						 IOMMU_PAGE_SIZE(tbl));
			__iommu_free(tbl, vaddr, npages);
			s->dma_address = DMA_MAPPING_ERROR;
			s->dma_length = 0;
		}
		if (s == outs)
			break;
	}
	return 0;
}
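/*
 * Undo ppc_iommu_map_sg(): walk the mapped segments and release their TCE
 * ranges, stopping at the first segment with a zero dma_length.
 */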
void ppc_iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
			int nelems, enum dma_data_direction direction,
			unsigned long attrs)
{
	struct scatterlist *sg;

	BUG_ON(direction == DMA_NONE);

	if (!tbl)
		return;

	sg = sglist;
	while (nelems--) {
		unsigned int npages;
		dma_addr_t dma_handle = sg->dma_address;

		if (sg->dma_length == 0)
			break;
		npages = iommu_num_pages(dma_handle, sg->dma_length,
					 IOMMU_PAGE_SIZE(tbl));
		__iommu_free(tbl, dma_handle, npages);
		sg = sg_next(sg);
	}

	/* Flush/invalidate TLBs if necessary. As for iommu_free(), we
	 * do not do an mb() here, the affected platforms do not need it
	 * when freeing.
	 */
	if (tbl->it_ops->flush)
		tbl->it_ops->flush(tbl);
}
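/*
 * Prepare the table contents at init time. On a regular boot (or when
 * firmware-assisted dump is active) any entries left behind by firmware
 * are simply cleared. In a kdump kernel the first kernel's mappings are
 * preserved by marking them used in the bitmap, while keeping at least
 * KDUMP_MIN_TCE_ENTRIES available for the kdump boot.
 */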
static void iommu_table_clear(struct iommu_table *tbl)
{
	/*
	 * In the case of firmware-assisted dump, the system goes through a
	 * clean reboot at the time of the crash, so it is safe to clear the
	 * TCE entries when firmware-assisted dump is active.
	 */
	if (!is_kdump_kernel() || is_fadump_active()) {
		/* Clear the table in case firmware left allocations in it */
		tbl->it_ops->clear(tbl, tbl->it_offset, tbl->it_size);
		return;
	}

#ifdef CONFIG_CRASH_DUMP
	if (tbl->it_ops->get) {
		unsigned long index, tceval, tcecount = 0;

		/* Reserve the existing mappings left by the first kernel. */
		for (index = 0; index < tbl->it_size; index++) {
			tceval = tbl->it_ops->get(tbl, index + tbl->it_offset);
			/*
			 * Freed TCE entry contains 0x7fffffffffffffff on JS20
			 */
			if (tceval && (tceval != 0x7fffffffffffffffUL)) {
				__set_bit(index, tbl->it_map);
				tcecount++;
			}
		}

		if ((tbl->it_size - tcecount) < KDUMP_MIN_TCE_ENTRIES) {
			printk(KERN_WARNING "TCE table is full; freeing ");
			printk(KERN_WARNING "%d entries for the kdump boot\n",
			       KDUMP_MIN_TCE_ENTRIES);
			for (index = tbl->it_size - KDUMP_MIN_TCE_ENTRIES;
			     index < tbl->it_size; index++)
				__clear_bit(index, tbl->it_map);
		}
	}
#endif
}
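/*
 * Mark the caller-supplied reserved window, and page 0 when the table
 * starts at offset 0, as used in the bitmap so that the allocator never
 * hands those pages out.
 */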
static void iommu_table_reserve_pages(struct iommu_table *tbl,
		unsigned long res_start, unsigned long res_end)
{
	int i;

	WARN_ON_ONCE(res_end < res_start);
	/*
	 * Reserve page 0 so it will not be used for any mappings.
	 * This protects us from buggy drivers that consider page 0 to be
	 * invalid and would otherwise crash the machine or even lose data.
	 */
	if (tbl->it_offset == 0)
		set_bit(0, tbl->it_map);

	tbl->it_reserved_start = res_start;
	tbl->it_reserved_end = res_end;

	/* If the non-empty reserved range lies entirely outside the table,
	 * there is nothing to mark.
	 */
	if (res_start && res_end &&
			(tbl->it_offset + tbl->it_size < res_start ||
			 res_end < tbl->it_offset))
		return;

	for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
		set_bit(i - tbl->it_offset, tbl->it_map);
}
static void iommu_table_release_pages(struct iommu_table *tbl)
{
	int i;

	/*
	 * If we reserved the first bit at init time, clear it here as well
	 * so that the "table not empty" checks in the callers do not trip
	 * over it.
	 */
	if (tbl->it_offset == 0)
		clear_bit(0, tbl->it_map);

	for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
		clear_bit(i - tbl->it_offset, tbl->it_map);
}
/*
 * Build an iommu_table structure.  This contains a bit map which
 * is used to manage allocation of the tce space.
 */
struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
		unsigned long res_start, unsigned long res_end)
{
	unsigned long sz;
	static int welcomed = 0;
	struct page *page;
	unsigned int i;
	struct iommu_pool *p;

	BUG_ON(!tbl->it_ops);

	/* number of bytes needed for the bitmap */
	sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);

	page = alloc_pages_node(nid, GFP_KERNEL, get_order(sz));
	if (!page)
		panic("iommu_init_table: Can't allocate %ld bytes\n", sz);
	tbl->it_map = page_address(page);
	memset(tbl->it_map, 0, sz);

	iommu_table_reserve_pages(tbl, res_start, res_end);

	/* We only split the IOMMU table if we have 1GB or more of space */
	if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024))
		tbl->nr_pools = IOMMU_NR_POOLS;
	else
		tbl->nr_pools = 1;

	/* We reserve the top 1/4 of the table for large allocations */
	tbl->poolsize = (tbl->it_size * 3 / 4) / tbl->nr_pools;

	for (i = 0; i < tbl->nr_pools; i++) {
		p = &tbl->pools[i];
		spin_lock_init(&(p->lock));
		p->start = tbl->poolsize * i;
		p->hint = p->start;
		p->end = p->start + tbl->poolsize;
	}

	p = &tbl->large_pool;
	spin_lock_init(&(p->lock));
	p->start = tbl->poolsize * i;
	p->hint = p->start;
	p->end = tbl->it_size;

	iommu_table_clear(tbl);

	if (!welcomed) {
		printk(KERN_INFO "IOMMU table initialized, virtual merging %s\n",
		       novmerge ? "disabled" : "enabled");
		welcomed = 1;
	}

	return tbl;
}
static void iommu_table_free(struct kref *kref)
{
	unsigned long bitmap_sz;
	unsigned int order;
	struct iommu_table *tbl;

	tbl = container_of(kref, struct iommu_table, it_kref);

	if (tbl->it_ops->free)
		tbl->it_ops->free(tbl);

	if (!tbl->it_map) {
		kfree(tbl);
		return;
	}

	iommu_table_release_pages(tbl);

	/* verify that table contains no entries */
	if (!bitmap_empty(tbl->it_map, tbl->it_size))
		pr_warn("%s: Unexpected TCEs\n", __func__);

	/* calculate bitmap size in bytes */
	bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);

	/* free bitmap */
	order = get_order(bitmap_sz);
	free_pages((unsigned long) tbl->it_map, order);

	kfree(tbl);
}
struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl)
{
	if (kref_get_unless_zero(&tbl->it_kref))
		return tbl;

	return NULL;
}
EXPORT_SYMBOL_GPL(iommu_tce_table_get);

int iommu_tce_table_put(struct iommu_table *tbl)
{
	if (WARN_ON(!tbl))
		return 0;

	return kref_put(&tbl->it_kref, iommu_table_free);
}
EXPORT_SYMBOL_GPL(iommu_tce_table_put);
/* Creates TCEs for a user provided buffer.  The user buffer must be
 * contiguous real kernel storage (not vmalloc).  The address passed here
 * comprises a page address and offset into that page. The dma_addr_t
 * returned will point to the same byte within the page as was passed in.
 */
dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl,
			  struct page *page, unsigned long offset, size_t size,
			  unsigned long mask, enum dma_data_direction direction,
			  unsigned long attrs)
{
	dma_addr_t dma_handle = DMA_MAPPING_ERROR;
	void *vaddr;
	unsigned long uaddr;
	unsigned int npages, align;

	BUG_ON(direction == DMA_NONE);

	vaddr = page_address(page) + offset;
	uaddr = (unsigned long)vaddr;

	if (tbl) {
		npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE(tbl));
		align = 0;
		if (tbl->it_page_shift < PAGE_SHIFT && size >= PAGE_SIZE &&
		    ((unsigned long)vaddr & ~PAGE_MASK) == 0)
			align = PAGE_SHIFT - tbl->it_page_shift;

		dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction,
					 mask >> tbl->it_page_shift, align,
					 attrs);
		if (dma_handle == DMA_MAPPING_ERROR) {
			if (!(attrs & DMA_ATTR_NO_WARN) &&
			    printk_ratelimit()) {
				dev_info(dev, "iommu_alloc failed, tbl %p "
					 "vaddr %p npages %d\n", tbl, vaddr,
					 npages);
			}
		} else
			dma_handle |= (uaddr & ~IOMMU_PAGE_MASK(tbl));
	}

	return dma_handle;
}
void iommu_unmap_page(struct iommu_table *tbl, dma_addr_t dma_handle,
		      size_t size, enum dma_data_direction direction,
		      unsigned long attrs)
{
	unsigned int npages;

	BUG_ON(direction == DMA_NONE);

	if (tbl) {
		npages = iommu_num_pages(dma_handle, size,
					 IOMMU_PAGE_SIZE(tbl));
		iommu_free(tbl, dma_handle, npages);
	}
}
/* Allocates a contiguous real buffer and creates mappings over it.
 * Returns the virtual address of the buffer and sets dma_handle
 * to the dma address (mapping) of the first page.
 */
void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
			   size_t size, dma_addr_t *dma_handle,
			   unsigned long mask, gfp_t flag, int node)
{
	void *ret = NULL;
	dma_addr_t mapping;
	unsigned int order;
	unsigned int nio_pages, io_order;
	struct page *page;

	size = PAGE_ALIGN(size);
	order = get_order(size);

	/*
	 * Client asked for way too much space.  This is checked later
	 * anyway.  It is easier to debug here for the drivers than in
	 * the tce tables.
	 */
	if (order >= IOMAP_MAX_ORDER) {
		dev_info(dev, "iommu_alloc_consistent size too large: 0x%lx\n",
			 size);
		return NULL;
	}

	if (!tbl)
		return NULL;

	/* Alloc enough pages (and possibly more) */
	page = alloc_pages_node(node, flag, order);
	if (!page)
		return NULL;
	ret = page_address(page);
	memset(ret, 0, size);

	/* Set up tces to cover the allocated range */
	nio_pages = size >> tbl->it_page_shift;
	io_order = get_iommu_order(size, tbl);
	mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
			      mask >> tbl->it_page_shift, io_order, 0);
	if (mapping == DMA_MAPPING_ERROR) {
		free_pages((unsigned long)ret, order);
		return NULL;
	}
	*dma_handle = mapping;
	return ret;
}
void iommu_free_coherent(struct iommu_table *tbl, size_t size,
			 void *vaddr, dma_addr_t dma_handle)
{
	if (tbl) {
		unsigned int nio_pages;

		size = PAGE_ALIGN(size);
		nio_pages = size >> tbl->it_page_shift;
		iommu_free(tbl, dma_handle, nio_pages);
		free_pages((unsigned long)vaddr, get_order(size));
	}
}
unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir)
{
	switch (dir) {
	case DMA_BIDIRECTIONAL:
		return TCE_PCI_READ | TCE_PCI_WRITE;
	case DMA_FROM_DEVICE:
		return TCE_PCI_WRITE;
	case DMA_TO_DEVICE:
		return TCE_PCI_READ;
	default:
		return 0;
	}
}
EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm);
#ifdef CONFIG_IOMMU_API
/*
 * SPAPR TCE API
 */
static void group_release(void *iommu_data)
{
	struct iommu_table_group *table_group = iommu_data;

	table_group->group = NULL;
}

void iommu_register_group(struct iommu_table_group *table_group,
		int pci_domain_number, unsigned long pe_num)
{
	struct iommu_group *grp;
	char *name;

	grp = iommu_group_alloc();
	if (IS_ERR(grp)) {
		pr_warn("powerpc iommu api: cannot create new group, err=%ld\n",
				PTR_ERR(grp));
		return;
	}
	table_group->group = grp;
	iommu_group_set_iommudata(grp, table_group, group_release);
	name = kasprintf(GFP_KERNEL, "domain%d-pe%lx",
			pci_domain_number, pe_num);
	if (!name)
		return;
	iommu_group_set_name(grp, name);
	kfree(name);
}
enum dma_data_direction iommu_tce_direction(unsigned long tce)
{
	if ((tce & TCE_PCI_READ) && (tce & TCE_PCI_WRITE))
		return DMA_BIDIRECTIONAL;
	else if (tce & TCE_PCI_READ)
		return DMA_TO_DEVICE;
	else if (tce & TCE_PCI_WRITE)
		return DMA_FROM_DEVICE;
	else
		return DMA_NONE;
}
EXPORT_SYMBOL_GPL(iommu_tce_direction);
void iommu_flush_tce(struct iommu_table *tbl)
{
	/* Flush/invalidate TLB caches if necessary */
	if (tbl->it_ops->flush)
		tbl->it_ops->flush(tbl);

	/* Make sure updates are seen by hardware */
	mb();
}
EXPORT_SYMBOL_GPL(iommu_flush_tce);
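/*
 * Validation helpers for the SPAPR TCE (VFIO/KVM) paths:
 * iommu_tce_check_ioba() verifies that a guest-supplied I/O bus address is
 * page aligned and falls within the table window, while
 * iommu_tce_check_gpa() only verifies page alignment of a guest physical
 * address.
 */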
int iommu_tce_check_ioba(unsigned long page_shift,
		unsigned long offset, unsigned long size,
		unsigned long ioba, unsigned long npages)
{
	unsigned long mask = (1UL << page_shift) - 1;

	if (ioba & mask)
		return -EINVAL;

	ioba >>= page_shift;
	if (ioba < offset)
		return -EINVAL;

	if ((ioba + 1) > (offset + size))
		return -EINVAL;

	return 0;
}
EXPORT_SYMBOL_GPL(iommu_tce_check_ioba);

int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa)
{
	unsigned long mask = (1UL << page_shift) - 1;

	if (gpa & mask)
		return -EINVAL;

	return 0;
}
EXPORT_SYMBOL_GPL(iommu_tce_check_gpa);
long iommu_tce_xchg_no_kill(struct mm_struct *mm,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long *hpa,
		enum dma_data_direction *direction)
{
	long ret;
	unsigned long size = 0;

	ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction, false);
	if (!ret && ((*direction == DMA_FROM_DEVICE) ||
			(*direction == DMA_BIDIRECTIONAL)) &&
			!mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift,
					&size))
		SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT));

	return ret;
}
EXPORT_SYMBOL_GPL(iommu_tce_xchg_no_kill);
void iommu_tce_kill(struct iommu_table *tbl,
		unsigned long entry, unsigned long pages)
{
	if (tbl->it_ops->tce_kill)
		tbl->it_ops->tce_kill(tbl, entry, pages, false);
}
EXPORT_SYMBOL_GPL(iommu_tce_kill);
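/*
 * Hand exclusive control of the table to an external user such as VFIO.
 * The bitmap is marked completely used so the kernel DMA API cannot
 * allocate from the table while it is owned; iommu_release_ownership()
 * reverses this.
 */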
int iommu_take_ownership(struct iommu_table *tbl)
{
	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
	int ret = 0;

	/*
	 * VFIO does not control TCE entries allocation and the guest
	 * can write new TCEs on top of existing ones, so iommu_tce_build()
	 * must be able to release old pages. This functionality
	 * requires an exchange() callback; if it is not implemented,
	 * we disallow taking ownership over the table.
	 */
	if (!tbl->it_ops->xchg_no_kill)
		return -EINVAL;

	spin_lock_irqsave(&tbl->large_pool.lock, flags);
	for (i = 0; i < tbl->nr_pools; i++)
		spin_lock(&tbl->pools[i].lock);

	iommu_table_release_pages(tbl);

	if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
		pr_err("iommu_tce: it_map is not empty");
		ret = -EBUSY;
		/* Undo iommu_table_release_pages, i.e. restore bit#0, etc */
		iommu_table_reserve_pages(tbl, tbl->it_reserved_start,
				tbl->it_reserved_end);
	} else {
		memset(tbl->it_map, 0xff, sz);
	}

	for (i = 0; i < tbl->nr_pools; i++)
		spin_unlock(&tbl->pools[i].lock);
	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);

	return ret;
}
EXPORT_SYMBOL_GPL(iommu_take_ownership);
void iommu_release_ownership(struct iommu_table *tbl)
{
	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;

	spin_lock_irqsave(&tbl->large_pool.lock, flags);
	for (i = 0; i < tbl->nr_pools; i++)
		spin_lock(&tbl->pools[i].lock);

	memset(tbl->it_map, 0, sz);

	iommu_table_reserve_pages(tbl, tbl->it_reserved_start,
			tbl->it_reserved_end);

	for (i = 0; i < tbl->nr_pools; i++)
		spin_unlock(&tbl->pools[i].lock);
	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
}
EXPORT_SYMBOL_GPL(iommu_release_ownership);
int iommu_add_device(struct iommu_table_group *table_group, struct device *dev)
{
	/*
	 * The sysfs entries should be populated before
	 * binding the IOMMU group. If the sysfs entries aren't
	 * ready, we simply bail.
	 */
	if (!device_is_registered(dev))
		return -ENOENT;

	if (device_iommu_mapped(dev)) {
		pr_debug("%s: Skipping device %s with iommu group %d\n",
			 __func__, dev_name(dev),
			 iommu_group_id(dev->iommu_group));
		return -EBUSY;
	}

	pr_debug("%s: Adding %s to iommu group %d\n",
		 __func__, dev_name(dev), iommu_group_id(table_group->group));

	return iommu_group_add_device(table_group->group, dev);
}
EXPORT_SYMBOL_GPL(iommu_add_device);
void iommu_del_device(struct device *dev)
{
	/*
	 * Some devices might not have an IOMMU table and group
	 * and we needn't detach them from the associated
	 * IOMMU groups.
	 */
	if (!device_iommu_mapped(dev)) {
		pr_debug("iommu_tce: skipping device %s with no tbl\n",
			 dev_name(dev));
		return;
	}

	iommu_group_remove_device(dev);
}
EXPORT_SYMBOL_GPL(iommu_del_device);
#endif /* CONFIG_IOMMU_API */