Revert "tty: hvc: Fix data abort due to race in hvc_open"
[linux/fpc-iii.git] / drivers / iommu / intel-iommu.c
blob34b2ed91cf4d9fdcacff8915772fe5a8049fdacc
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright © 2006-2014 Intel Corporation.
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
53 #define ROOT_SIZE VTD_PAGE_SIZE
54 #define CONTEXT_SIZE VTD_PAGE_SIZE
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61 #define IOAPIC_RANGE_START (0xfee00000)
62 #define IOAPIC_RANGE_END (0xfeefffff)
63 #define IOVA_START_ADDR (0x1000)
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
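/*
 * Worked example (illustrative, not part of the original source): with
 * gaw = 48 and VTD_PAGE_SHIFT = 12, __DOMAIN_MAX_PFN(48) = 2^36 - 1 and
 * DOMAIN_MAX_ADDR(48) = (2^36 - 1) << 12 = 2^48 - 4096, i.e. the last
 * 4KiB page of a 48-bit address space.
 */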
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN (1)
82 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
84 /* page table handling */
85 #define LEVEL_STRIDE (9)
86 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
 89  * This bitmap is used to advertise the page sizes our hardware supports
90 * to the IOMMU core, which will then use this information to split
91 * physically contiguous memory regions it is mapping into page sizes
92 * that we support.
94 * Traditionally the IOMMU core just handed us the mappings directly,
95 * after making sure the size is an order of a 4KiB page and that the
96 * mapping has natural alignment.
98 * To retain this behavior, we currently advertise that we support
99 * all page sizes that are an order of 4KiB.
101 * If at some point we'd like to utilize the IOMMU core's new behavior,
102 * we could change this to advertise the real page sizes we support.
104 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
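/*
 * Illustrative note (not in the original source): in the pgsize bitmap a
 * set bit k advertises support for page size 2^k bytes.  ~0xFFFUL clears
 * bits 0-11 and sets every higher bit, so it advertises 4KiB, 8KiB, 16KiB,
 * and every larger power of two an unsigned long can express, matching the
 * "every order of 4KiB" behaviour described above.
 */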
106 static inline int agaw_to_level(int agaw)
108 return agaw + 2;
111 static inline int agaw_to_width(int agaw)
113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
116 static inline int width_to_agaw(int width)
118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
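/*
 * Worked example (illustrative): a 48-bit address width gives
 * width_to_agaw(48) = DIV_ROUND_UP(48 - 30, 9) = 2; agaw_to_width(2)
 * maps back to 30 + 2 * 9 = 48 bits, and agaw_to_level(2) = 4 page-table
 * levels, each level resolving LEVEL_STRIDE = 9 bits of the address.
 */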
121 static inline unsigned int level_to_offset_bits(int level)
123 return (level - 1) * LEVEL_STRIDE;
126 static inline int pfn_level_offset(unsigned long pfn, int level)
128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
131 static inline unsigned long level_mask(int level)
133 return -1UL << level_to_offset_bits(level);
136 static inline unsigned long level_size(int level)
138 return 1UL << level_to_offset_bits(level);
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
143 return (pfn + level_size(level) - 1) & level_mask(level);
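/*
 * Worked example (illustrative): at level 2, level_to_offset_bits() is 9,
 * so level_size(2) = 512 pfns (2MiB with 4KiB pages) and level_mask(2)
 * clears the low 9 bits; align_to_level(1000, 2) therefore rounds pfn 1000
 * up to 1024.
 */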
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
148 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152 are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
164 return mm_to_dma_pfn(page_to_pfn(pg));
166 static inline unsigned long virt_to_dma_pfn(void *p)
168 return page_to_dma_pfn(virt_to_page(p));
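/*
 * Illustrative note (not in the original source): on x86 with 4KiB MM pages
 * PAGE_SHIFT equals VTD_PAGE_SHIFT, so dma_to_mm_pfn() and mm_to_dma_pfn()
 * are identity conversions; they only shift when the MM page size is larger
 * than the 4KiB VT-d page size.
 */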
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
178  * set to 1 to panic the kernel if we can't successfully enable VT-d
179 * (used when kernel is launched w/ TXT)
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189 * if marked present.
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
193 if (!(re->lo & 1))
194 return 0;
196 return re->lo & VTD_PAGE_MASK;
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201 * if marked present.
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
205 if (!(re->hi & 1))
206 return 0;
208 return re->hi & VTD_PAGE_MASK;
211 static inline void context_clear_pasid_enable(struct context_entry *context)
213 context->lo &= ~(1ULL << 11);
216 static inline bool context_pasid_enabled(struct context_entry *context)
218 return !!(context->lo & (1ULL << 11));
221 static inline void context_set_copied(struct context_entry *context)
223 context->hi |= (1ull << 3);
226 static inline bool context_copied(struct context_entry *context)
228 return !!(context->hi & (1ULL << 3));
231 static inline bool __context_present(struct context_entry *context)
233 return (context->lo & 1);
236 bool context_present(struct context_entry *context)
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
243 static inline void context_set_present(struct context_entry *context)
245 context->lo |= 1;
248 static inline void context_set_fault_enable(struct context_entry *context)
250 context->lo &= (((u64)-1) << 2) | 1;
253 static inline void context_set_translation_type(struct context_entry *context,
254 unsigned long value)
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
260 static inline void context_set_address_root(struct context_entry *context,
261 unsigned long value)
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
267 static inline void context_set_address_width(struct context_entry *context,
268 unsigned long value)
270 context->hi |= value & 7;
273 static inline void context_set_domain_id(struct context_entry *context,
274 unsigned long value)
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
279 static inline int context_domain_id(struct context_entry *c)
281 return((c->hi >> 8) & 0xffff);
284 static inline void context_clear_entry(struct context_entry *context)
286 context->lo = 0;
287 context->hi = 0;
291 * This domain is a statically identity mapping domain.
292  * 1. This domain creates a static 1:1 mapping to all usable memory.
293  * 2. It maps to each iommu if successful.
294  * 3. Each iommu maps to this domain if successful.
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
299 /* si_domain contains multiple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY BIT(0)
303 * This is a DMA domain allocated through the iommu domain allocation
304 * interface. But one or more devices belonging to this domain have
305  * been chosen to use a private domain. We should avoid using the
306 * map/unmap/iova_to_phys APIs on it.
308 #define DOMAIN_FLAG_LOSE_CHILDREN BIT(1)
311 * When VT-d works in the scalable mode, it allows DMA translation to
312 * happen through either first level or second level page table. This
313 * bit marks that the DMA translation for the domain goes through the
314 * first level page table, otherwise, it goes through the second level.
316 #define DOMAIN_FLAG_USE_FIRST_LEVEL BIT(2)
319 * Domain represents a virtual machine which demands iommu nested
320 * translation mode support.
322 #define DOMAIN_FLAG_NESTING_MODE BIT(3)
324 #define for_each_domain_iommu(idx, domain) \
325 for (idx = 0; idx < g_num_of_iommus; idx++) \
326 if (domain->iommu_refcnt[idx])
328 struct dmar_rmrr_unit {
329 struct list_head list; /* list of rmrr units */
330 struct acpi_dmar_header *hdr; /* ACPI header */
331 u64 base_address; /* reserved base address*/
332 u64 end_address; /* reserved end address */
333 struct dmar_dev_scope *devices; /* target devices */
334 int devices_cnt; /* target device count */
337 struct dmar_atsr_unit {
338 struct list_head list; /* list of ATSR units */
339 struct acpi_dmar_header *hdr; /* ACPI header */
340 struct dmar_dev_scope *devices; /* target devices */
341 int devices_cnt; /* target device count */
342 u8 include_all:1; /* include all ports */
345 static LIST_HEAD(dmar_atsr_units);
346 static LIST_HEAD(dmar_rmrr_units);
348 #define for_each_rmrr_units(rmrr) \
349 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
351 /* bitmap for indexing intel_iommus */
352 static int g_num_of_iommus;
354 static void domain_exit(struct dmar_domain *domain);
355 static void domain_remove_dev_info(struct dmar_domain *domain);
356 static void dmar_remove_one_dev_info(struct device *dev);
357 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
358 static void domain_context_clear(struct intel_iommu *iommu,
359 struct device *dev);
360 static int domain_detach_iommu(struct dmar_domain *domain,
361 struct intel_iommu *iommu);
362 static bool device_is_rmrr_locked(struct device *dev);
363 static int intel_iommu_attach_device(struct iommu_domain *domain,
364 struct device *dev);
365 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
366 dma_addr_t iova);
368 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
369 int dmar_disabled = 0;
370 #else
371 int dmar_disabled = 1;
372 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
374 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
375 int intel_iommu_sm = 1;
376 #else
377 int intel_iommu_sm;
378 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
380 int intel_iommu_enabled = 0;
381 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
383 static int dmar_map_gfx = 1;
384 static int dmar_forcedac;
385 static int intel_iommu_strict;
386 static int intel_iommu_superpage = 1;
387 static int iommu_identity_mapping;
388 static int intel_no_bounce;
390 #define IDENTMAP_GFX 2
391 #define IDENTMAP_AZALIA 4
393 int intel_iommu_gfx_mapped;
394 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
396 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
397 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
398 DEFINE_SPINLOCK(device_domain_lock);
399 static LIST_HEAD(device_domain_list);
401 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
402 to_pci_dev(d)->untrusted)
405 * Iterate over elements in device_domain_list and call the specified
406 * callback @fn against each element.
408 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
409 void *data), void *data)
411 int ret = 0;
412 unsigned long flags;
413 struct device_domain_info *info;
415 spin_lock_irqsave(&device_domain_lock, flags);
416 list_for_each_entry(info, &device_domain_list, global) {
417 ret = fn(info, data);
418 if (ret) {
419 spin_unlock_irqrestore(&device_domain_lock, flags);
420 return ret;
423 spin_unlock_irqrestore(&device_domain_lock, flags);
425 return 0;
428 const struct iommu_ops intel_iommu_ops;
430 static bool translation_pre_enabled(struct intel_iommu *iommu)
432 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
435 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
437 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
440 static void init_translation_status(struct intel_iommu *iommu)
442 u32 gsts;
444 gsts = readl(iommu->reg + DMAR_GSTS_REG);
445 if (gsts & DMA_GSTS_TES)
446 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
449 /* Convert a generic 'struct iommu_domain' to a private 'struct dmar_domain' */
450 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
452 return container_of(dom, struct dmar_domain, domain);
455 static int __init intel_iommu_setup(char *str)
457 if (!str)
458 return -EINVAL;
459 while (*str) {
460 if (!strncmp(str, "on", 2)) {
461 dmar_disabled = 0;
462 pr_info("IOMMU enabled\n");
463 } else if (!strncmp(str, "off", 3)) {
464 dmar_disabled = 1;
465 no_platform_optin = 1;
466 pr_info("IOMMU disabled\n");
467 } else if (!strncmp(str, "igfx_off", 8)) {
468 dmar_map_gfx = 0;
469 pr_info("Disable GFX device mapping\n");
470 } else if (!strncmp(str, "forcedac", 8)) {
471 pr_info("Forcing DAC for PCI devices\n");
472 dmar_forcedac = 1;
473 } else if (!strncmp(str, "strict", 6)) {
474 pr_info("Disable batched IOTLB flush\n");
475 intel_iommu_strict = 1;
476 } else if (!strncmp(str, "sp_off", 6)) {
477 pr_info("Disable supported super page\n");
478 intel_iommu_superpage = 0;
479 } else if (!strncmp(str, "sm_on", 5)) {
480 pr_info("Intel-IOMMU: scalable mode supported\n");
481 intel_iommu_sm = 1;
482 } else if (!strncmp(str, "tboot_noforce", 13)) {
483 printk(KERN_INFO
484 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
485 intel_iommu_tboot_noforce = 1;
486 } else if (!strncmp(str, "nobounce", 8)) {
487 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
488 intel_no_bounce = 1;
491 str += strcspn(str, ",");
492 while (*str == ',')
493 str++;
495 return 0;
497 __setup("intel_iommu=", intel_iommu_setup);
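/*
 * Usage example (illustrative): the parser above takes a comma-separated
 * list of options on the kernel command line, e.g.
 *
 *     intel_iommu=on,sm_on,strict
 *
 * which enables the IOMMU, turns on scalable mode and disables batched
 * IOTLB flushing.
 */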
499 static struct kmem_cache *iommu_domain_cache;
500 static struct kmem_cache *iommu_devinfo_cache;
502 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
504 struct dmar_domain **domains;
505 int idx = did >> 8;
507 domains = iommu->domains[idx];
508 if (!domains)
509 return NULL;
511 return domains[did & 0xff];
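/*
 * Illustrative note (not in the original source): iommu->domains is a
 * two-level table indexed by domain id.  For did = 0x1234 the lookup above
 * uses top-level slot 0x12 and, within that 256-entry chunk, slot 0x34;
 * set_iommu_domain() below allocates the second-level chunk on demand.
 */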
514 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
515 struct dmar_domain *domain)
517 struct dmar_domain **domains;
518 int idx = did >> 8;
520 if (!iommu->domains[idx]) {
521 size_t size = 256 * sizeof(struct dmar_domain *);
522 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
525 domains = iommu->domains[idx];
526 if (WARN_ON(!domains))
527 return;
528 else
529 domains[did & 0xff] = domain;
532 void *alloc_pgtable_page(int node)
534 struct page *page;
535 void *vaddr = NULL;
537 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
538 if (page)
539 vaddr = page_address(page);
540 return vaddr;
543 void free_pgtable_page(void *vaddr)
545 free_page((unsigned long)vaddr);
548 static inline void *alloc_domain_mem(void)
550 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
553 static void free_domain_mem(void *vaddr)
555 kmem_cache_free(iommu_domain_cache, vaddr);
558 static inline void * alloc_devinfo_mem(void)
560 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
563 static inline void free_devinfo_mem(void *vaddr)
565 kmem_cache_free(iommu_devinfo_cache, vaddr);
568 static inline int domain_type_is_si(struct dmar_domain *domain)
570 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
573 static inline bool domain_use_first_level(struct dmar_domain *domain)
575 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
578 static inline int domain_pfn_supported(struct dmar_domain *domain,
579 unsigned long pfn)
581 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
583 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
586 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
588 unsigned long sagaw;
589 int agaw = -1;
591 sagaw = cap_sagaw(iommu->cap);
592 for (agaw = width_to_agaw(max_gaw);
593 agaw >= 0; agaw--) {
594 if (test_bit(agaw, &sagaw))
595 break;
598 return agaw;
602 * Calculate max SAGAW for each iommu.
604 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
606 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
610 * calculate agaw for each iommu.
611 * "SAGAW" may be different across iommus, use a default agaw, and
612 * get a supported less agaw for iommus that don't support the default agaw.
614 int iommu_calculate_agaw(struct intel_iommu *iommu)
616 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
619 /* This function only returns a single iommu in a domain */
620 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
622 int iommu_id;
624 /* si_domain and vm domain should not get here. */
625 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
626 return NULL;
628 for_each_domain_iommu(iommu_id, domain)
629 break;
631 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
632 return NULL;
634 return g_iommus[iommu_id];
637 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
639 return sm_supported(iommu) ?
640 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
643 static void domain_update_iommu_coherency(struct dmar_domain *domain)
645 struct dmar_drhd_unit *drhd;
646 struct intel_iommu *iommu;
647 bool found = false;
648 int i;
650 domain->iommu_coherency = 1;
652 for_each_domain_iommu(i, domain) {
653 found = true;
654 if (!iommu_paging_structure_coherency(g_iommus[i])) {
655 domain->iommu_coherency = 0;
656 break;
659 if (found)
660 return;
662 /* No hardware attached; use lowest common denominator */
663 rcu_read_lock();
664 for_each_active_iommu(iommu, drhd) {
665 if (!iommu_paging_structure_coherency(iommu)) {
666 domain->iommu_coherency = 0;
667 break;
670 rcu_read_unlock();
673 static int domain_update_iommu_snooping(struct intel_iommu *skip)
675 struct dmar_drhd_unit *drhd;
676 struct intel_iommu *iommu;
677 int ret = 1;
679 rcu_read_lock();
680 for_each_active_iommu(iommu, drhd) {
681 if (iommu != skip) {
682 if (!ecap_sc_support(iommu->ecap)) {
683 ret = 0;
684 break;
688 rcu_read_unlock();
690 return ret;
693 static int domain_update_iommu_superpage(struct dmar_domain *domain,
694 struct intel_iommu *skip)
696 struct dmar_drhd_unit *drhd;
697 struct intel_iommu *iommu;
698 int mask = 0x3;
700 if (!intel_iommu_superpage) {
701 return 0;
704 /* set iommu_superpage to the smallest common denominator */
705 rcu_read_lock();
706 for_each_active_iommu(iommu, drhd) {
707 if (iommu != skip) {
708 if (domain && domain_use_first_level(domain)) {
709 if (!cap_fl1gp_support(iommu->cap))
710 mask = 0x1;
711 } else {
712 mask &= cap_super_page_val(iommu->cap);
715 if (!mask)
716 break;
719 rcu_read_unlock();
721 return fls(mask);
724 /* Some capabilities may be different across iommus */
725 static void domain_update_iommu_cap(struct dmar_domain *domain)
727 domain_update_iommu_coherency(domain);
728 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
729 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
732 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
733 u8 devfn, int alloc)
735 struct root_entry *root = &iommu->root_entry[bus];
736 struct context_entry *context;
737 u64 *entry;
739 entry = &root->lo;
740 if (sm_supported(iommu)) {
741 if (devfn >= 0x80) {
742 devfn -= 0x80;
743 entry = &root->hi;
745 devfn *= 2;
747 if (*entry & 1)
748 context = phys_to_virt(*entry & VTD_PAGE_MASK);
749 else {
750 unsigned long phy_addr;
751 if (!alloc)
752 return NULL;
754 context = alloc_pgtable_page(iommu->node);
755 if (!context)
756 return NULL;
758 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
759 phy_addr = virt_to_phys((void *)context);
760 *entry = phy_addr | 1;
761 __iommu_flush_cache(iommu, entry, sizeof(*entry));
763 return &context[devfn];
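/*
 * Worked example (illustrative): in scalable mode each root entry is split
 * into a lower (devfn 0x00-0x7f) and an upper (devfn 0x80-0xff) context
 * table pointer, and a scalable-mode context entry is twice the size of a
 * legacy one, hence the devfn *= 2 above.  E.g. devfn 0x85 uses root->hi
 * and lands at index (0x85 - 0x80) * 2 = 10 of that context table.
 */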
766 static int iommu_dummy(struct device *dev)
768 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
771 static bool attach_deferred(struct device *dev)
773 return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
777 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
778 * sub-hierarchy of a candidate PCI-PCI bridge
779 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
780 * @bridge: the candidate PCI-PCI bridge
782 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
784 static bool
785 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
787 struct pci_dev *pdev, *pbridge;
789 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
790 return false;
792 pdev = to_pci_dev(dev);
793 pbridge = to_pci_dev(bridge);
795 if (pbridge->subordinate &&
796 pbridge->subordinate->number <= pdev->bus->number &&
797 pbridge->subordinate->busn_res.end >= pdev->bus->number)
798 return true;
800 return false;
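/*
 * Worked example (illustrative): for a PCI-PCI bridge whose secondary bus
 * is 3 and whose downstream bus range extends to 7 (subordinate->number = 3,
 * busn_res.end = 7), a device sitting on bus 5 satisfies the check above
 * and is reported as downstream of that bridge.
 */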
803 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
805 struct dmar_drhd_unit *drhd = NULL;
806 struct intel_iommu *iommu;
807 struct device *tmp;
808 struct pci_dev *pdev = NULL;
809 u16 segment = 0;
810 int i;
812 if (iommu_dummy(dev))
813 return NULL;
815 if (dev_is_pci(dev)) {
816 struct pci_dev *pf_pdev;
818 pdev = pci_real_dma_dev(to_pci_dev(dev));
820 /* VFs aren't listed in scope tables; we need to look up
821 * the PF instead to find the IOMMU. */
822 pf_pdev = pci_physfn(pdev);
823 dev = &pf_pdev->dev;
824 segment = pci_domain_nr(pdev->bus);
825 } else if (has_acpi_companion(dev))
826 dev = &ACPI_COMPANION(dev)->dev;
828 rcu_read_lock();
829 for_each_active_iommu(iommu, drhd) {
830 if (pdev && segment != drhd->segment)
831 continue;
833 for_each_active_dev_scope(drhd->devices,
834 drhd->devices_cnt, i, tmp) {
835 if (tmp == dev) {
836 /* For a VF use its original BDF# not that of the PF
837 * which we used for the IOMMU lookup. Strictly speaking
838 * we could do this for all PCI devices; we only need to
839 * get the BDF# from the scope table for ACPI matches. */
840 if (pdev && pdev->is_virtfn)
841 goto got_pdev;
843 *bus = drhd->devices[i].bus;
844 *devfn = drhd->devices[i].devfn;
845 goto out;
848 if (is_downstream_to_pci_bridge(dev, tmp))
849 goto got_pdev;
852 if (pdev && drhd->include_all) {
853 got_pdev:
854 *bus = pdev->bus->number;
855 *devfn = pdev->devfn;
856 goto out;
859 iommu = NULL;
860 out:
861 rcu_read_unlock();
863 return iommu;
866 static void domain_flush_cache(struct dmar_domain *domain,
867 void *addr, int size)
869 if (!domain->iommu_coherency)
870 clflush_cache_range(addr, size);
873 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
875 struct context_entry *context;
876 int ret = 0;
877 unsigned long flags;
879 spin_lock_irqsave(&iommu->lock, flags);
880 context = iommu_context_addr(iommu, bus, devfn, 0);
881 if (context)
882 ret = context_present(context);
883 spin_unlock_irqrestore(&iommu->lock, flags);
884 return ret;
887 static void free_context_table(struct intel_iommu *iommu)
889 int i;
890 unsigned long flags;
891 struct context_entry *context;
893 spin_lock_irqsave(&iommu->lock, flags);
894 if (!iommu->root_entry) {
895 goto out;
897 for (i = 0; i < ROOT_ENTRY_NR; i++) {
898 context = iommu_context_addr(iommu, i, 0, 0);
899 if (context)
900 free_pgtable_page(context);
902 if (!sm_supported(iommu))
903 continue;
905 context = iommu_context_addr(iommu, i, 0x80, 0);
906 if (context)
907 free_pgtable_page(context);
910 free_pgtable_page(iommu->root_entry);
911 iommu->root_entry = NULL;
912 out:
913 spin_unlock_irqrestore(&iommu->lock, flags);
916 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
917 unsigned long pfn, int *target_level)
919 struct dma_pte *parent, *pte;
920 int level = agaw_to_level(domain->agaw);
921 int offset;
923 BUG_ON(!domain->pgd);
925 if (!domain_pfn_supported(domain, pfn))
926 /* Address beyond IOMMU's addressing capabilities. */
927 return NULL;
929 parent = domain->pgd;
931 while (1) {
932 void *tmp_page;
934 offset = pfn_level_offset(pfn, level);
935 pte = &parent[offset];
936 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
937 break;
938 if (level == *target_level)
939 break;
941 if (!dma_pte_present(pte)) {
942 uint64_t pteval;
944 tmp_page = alloc_pgtable_page(domain->nid);
946 if (!tmp_page)
947 return NULL;
949 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
950 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
951 if (domain_use_first_level(domain))
952 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
953 if (cmpxchg64(&pte->val, 0ULL, pteval))
954 /* Someone else set it while we were thinking; use theirs. */
955 free_pgtable_page(tmp_page);
956 else
957 domain_flush_cache(domain, pte, sizeof(*pte));
959 if (level == 1)
960 break;
962 parent = phys_to_virt(dma_pte_addr(pte));
963 level--;
966 if (!*target_level)
967 *target_level = level;
969 return pte;
972 /* return address's pte at specific level */
973 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
974 unsigned long pfn,
975 int level, int *large_page)
977 struct dma_pte *parent, *pte;
978 int total = agaw_to_level(domain->agaw);
979 int offset;
981 parent = domain->pgd;
982 while (level <= total) {
983 offset = pfn_level_offset(pfn, total);
984 pte = &parent[offset];
985 if (level == total)
986 return pte;
988 if (!dma_pte_present(pte)) {
989 *large_page = total;
990 break;
993 if (dma_pte_superpage(pte)) {
994 *large_page = total;
995 return pte;
998 parent = phys_to_virt(dma_pte_addr(pte));
999 total--;
1001 return NULL;
1004 /* clear last level pte; a tlb flush should follow */
1005 static void dma_pte_clear_range(struct dmar_domain *domain,
1006 unsigned long start_pfn,
1007 unsigned long last_pfn)
1009 unsigned int large_page;
1010 struct dma_pte *first_pte, *pte;
1012 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1013 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1014 BUG_ON(start_pfn > last_pfn);
1016 /* we don't need lock here; nobody else touches the iova range */
1017 do {
1018 large_page = 1;
1019 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1020 if (!pte) {
1021 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1022 continue;
1024 do {
1025 dma_clear_pte(pte);
1026 start_pfn += lvl_to_nr_pages(large_page);
1027 pte++;
1028 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1030 domain_flush_cache(domain, first_pte,
1031 (void *)pte - (void *)first_pte);
1033 } while (start_pfn && start_pfn <= last_pfn);
1036 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1037 int retain_level, struct dma_pte *pte,
1038 unsigned long pfn, unsigned long start_pfn,
1039 unsigned long last_pfn)
1041 pfn = max(start_pfn, pfn);
1042 pte = &pte[pfn_level_offset(pfn, level)];
1044 do {
1045 unsigned long level_pfn;
1046 struct dma_pte *level_pte;
1048 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1049 goto next;
1051 level_pfn = pfn & level_mask(level);
1052 level_pte = phys_to_virt(dma_pte_addr(pte));
1054 if (level > 2) {
1055 dma_pte_free_level(domain, level - 1, retain_level,
1056 level_pte, level_pfn, start_pfn,
1057 last_pfn);
1061 * Free the page table if we're below the level we want to
1062 * retain and the range covers the entire table.
1064 if (level < retain_level && !(start_pfn > level_pfn ||
1065 last_pfn < level_pfn + level_size(level) - 1)) {
1066 dma_clear_pte(pte);
1067 domain_flush_cache(domain, pte, sizeof(*pte));
1068 free_pgtable_page(level_pte);
1070 next:
1071 pfn += level_size(level);
1072 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1076 * clear last level (leaf) ptes and free page table pages below the
1077 * level we wish to keep intact.
1079 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1080 unsigned long start_pfn,
1081 unsigned long last_pfn,
1082 int retain_level)
1084 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1085 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1086 BUG_ON(start_pfn > last_pfn);
1088 dma_pte_clear_range(domain, start_pfn, last_pfn);
1090 /* We don't need lock here; nobody else touches the iova range */
1091 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1092 domain->pgd, 0, start_pfn, last_pfn);
1094 /* free pgd */
1095 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1096 free_pgtable_page(domain->pgd);
1097 domain->pgd = NULL;
1101 /* When a page at a given level is being unlinked from its parent, we don't
1102 need to *modify* it at all. All we need to do is make a list of all the
1103 pages which can be freed just as soon as we've flushed the IOTLB and we
1104 know the hardware page-walk will no longer touch them.
1105 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1106 be freed. */
1107 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1108 int level, struct dma_pte *pte,
1109 struct page *freelist)
1111 struct page *pg;
1113 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1114 pg->freelist = freelist;
1115 freelist = pg;
1117 if (level == 1)
1118 return freelist;
1120 pte = page_address(pg);
1121 do {
1122 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1123 freelist = dma_pte_list_pagetables(domain, level - 1,
1124 pte, freelist);
1125 pte++;
1126 } while (!first_pte_in_page(pte));
1128 return freelist;
1131 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1132 struct dma_pte *pte, unsigned long pfn,
1133 unsigned long start_pfn,
1134 unsigned long last_pfn,
1135 struct page *freelist)
1137 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1139 pfn = max(start_pfn, pfn);
1140 pte = &pte[pfn_level_offset(pfn, level)];
1142 do {
1143 unsigned long level_pfn;
1145 if (!dma_pte_present(pte))
1146 goto next;
1148 level_pfn = pfn & level_mask(level);
1150 /* If range covers entire pagetable, free it */
1151 if (start_pfn <= level_pfn &&
1152 last_pfn >= level_pfn + level_size(level) - 1) {
1153 /* These subordinate page tables are going away entirely. Don't
1154 bother to clear them; we're just going to *free* them. */
1155 if (level > 1 && !dma_pte_superpage(pte))
1156 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1158 dma_clear_pte(pte);
1159 if (!first_pte)
1160 first_pte = pte;
1161 last_pte = pte;
1162 } else if (level > 1) {
1163 /* Recurse down into a level that isn't *entirely* obsolete */
1164 freelist = dma_pte_clear_level(domain, level - 1,
1165 phys_to_virt(dma_pte_addr(pte)),
1166 level_pfn, start_pfn, last_pfn,
1167 freelist);
1169 next:
1170 pfn += level_size(level);
1171 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1173 if (first_pte)
1174 domain_flush_cache(domain, first_pte,
1175 (void *)++last_pte - (void *)first_pte);
1177 return freelist;
1180 /* We can't just free the pages because the IOMMU may still be walking
1181 the page tables, and may have cached the intermediate levels. The
1182 pages can only be freed after the IOTLB flush has been done. */
1183 static struct page *domain_unmap(struct dmar_domain *domain,
1184 unsigned long start_pfn,
1185 unsigned long last_pfn)
1187 struct page *freelist;
1189 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1190 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1191 BUG_ON(start_pfn > last_pfn);
1193 /* we don't need lock here; nobody else touches the iova range */
1194 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1195 domain->pgd, 0, start_pfn, last_pfn, NULL);
1197 /* free pgd */
1198 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1199 struct page *pgd_page = virt_to_page(domain->pgd);
1200 pgd_page->freelist = freelist;
1201 freelist = pgd_page;
1203 domain->pgd = NULL;
1206 return freelist;
1209 static void dma_free_pagelist(struct page *freelist)
1211 struct page *pg;
1213 while ((pg = freelist)) {
1214 freelist = pg->freelist;
1215 free_pgtable_page(page_address(pg));
1219 static void iova_entry_free(unsigned long data)
1221 struct page *freelist = (struct page *)data;
1223 dma_free_pagelist(freelist);
1226 /* iommu handling */
1227 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1229 struct root_entry *root;
1230 unsigned long flags;
1232 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1233 if (!root) {
1234 pr_err("Allocating root entry for %s failed\n",
1235 iommu->name);
1236 return -ENOMEM;
1239 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1241 spin_lock_irqsave(&iommu->lock, flags);
1242 iommu->root_entry = root;
1243 spin_unlock_irqrestore(&iommu->lock, flags);
1245 return 0;
1248 static void iommu_set_root_entry(struct intel_iommu *iommu)
1250 u64 addr;
1251 u32 sts;
1252 unsigned long flag;
1254 addr = virt_to_phys(iommu->root_entry);
1255 if (sm_supported(iommu))
1256 addr |= DMA_RTADDR_SMT;
1258 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1259 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1261 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1263 /* Make sure hardware complete it */
1264 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1265 readl, (sts & DMA_GSTS_RTPS), sts);
1267 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1270 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1272 u32 val;
1273 unsigned long flag;
1275 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1276 return;
1278 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1279 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1281 /* Make sure hardware complete it */
1282 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1283 readl, (!(val & DMA_GSTS_WBFS)), val);
1285 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1288 /* return value determines whether we need a write buffer flush */
1289 static void __iommu_flush_context(struct intel_iommu *iommu,
1290 u16 did, u16 source_id, u8 function_mask,
1291 u64 type)
1293 u64 val = 0;
1294 unsigned long flag;
1296 switch (type) {
1297 case DMA_CCMD_GLOBAL_INVL:
1298 val = DMA_CCMD_GLOBAL_INVL;
1299 break;
1300 case DMA_CCMD_DOMAIN_INVL:
1301 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1302 break;
1303 case DMA_CCMD_DEVICE_INVL:
1304 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1305 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1306 break;
1307 default:
1308 BUG();
1310 val |= DMA_CCMD_ICC;
1312 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1313 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1315 /* Make sure hardware complete it */
1316 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1317 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1319 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1322 /* return value determines whether we need a write buffer flush */
1323 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1324 u64 addr, unsigned int size_order, u64 type)
1326 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1327 u64 val = 0, val_iva = 0;
1328 unsigned long flag;
1330 switch (type) {
1331 case DMA_TLB_GLOBAL_FLUSH:
1332 /* global flush doesn't need to set IVA_REG */
1333 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1334 break;
1335 case DMA_TLB_DSI_FLUSH:
1336 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1337 break;
1338 case DMA_TLB_PSI_FLUSH:
1339 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1340 /* IH bit is passed in as part of address */
1341 val_iva = size_order | addr;
1342 break;
1343 default:
1344 BUG();
1346 /* Note: set drain read/write */
1347 #if 0
1349  * This is probably just to be extra safe. Looks like we can
1350 * ignore it without any impact.
1352 if (cap_read_drain(iommu->cap))
1353 val |= DMA_TLB_READ_DRAIN;
1354 #endif
1355 if (cap_write_drain(iommu->cap))
1356 val |= DMA_TLB_WRITE_DRAIN;
1358 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1359 /* Note: Only uses first TLB reg currently */
1360 if (val_iva)
1361 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1362 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1364 /* Make sure hardware complete it */
1365 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1366 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1368 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1370 /* check IOTLB invalidation granularity */
1371 if (DMA_TLB_IAIG(val) == 0)
1372 pr_err("Flush IOTLB failed\n");
1373 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1374 pr_debug("TLB flush request %Lx, actual %Lx\n",
1375 (unsigned long long)DMA_TLB_IIRG(type),
1376 (unsigned long long)DMA_TLB_IAIG(val));
1379 static struct device_domain_info *
1380 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1381 u8 bus, u8 devfn)
1383 struct device_domain_info *info;
1385 assert_spin_locked(&device_domain_lock);
1387 if (!iommu->qi)
1388 return NULL;
1390 list_for_each_entry(info, &domain->devices, link)
1391 if (info->iommu == iommu && info->bus == bus &&
1392 info->devfn == devfn) {
1393 if (info->ats_supported && info->dev)
1394 return info;
1395 break;
1398 return NULL;
1401 static void domain_update_iotlb(struct dmar_domain *domain)
1403 struct device_domain_info *info;
1404 bool has_iotlb_device = false;
1406 assert_spin_locked(&device_domain_lock);
1408 list_for_each_entry(info, &domain->devices, link) {
1409 struct pci_dev *pdev;
1411 if (!info->dev || !dev_is_pci(info->dev))
1412 continue;
1414 pdev = to_pci_dev(info->dev);
1415 if (pdev->ats_enabled) {
1416 has_iotlb_device = true;
1417 break;
1421 domain->has_iotlb_device = has_iotlb_device;
1424 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1426 struct pci_dev *pdev;
1428 assert_spin_locked(&device_domain_lock);
1430 if (!info || !dev_is_pci(info->dev))
1431 return;
1433 pdev = to_pci_dev(info->dev);
1434 /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1435 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1436 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1437 * reserved, which should be set to 0.
1439 if (!ecap_dit(info->iommu->ecap))
1440 info->pfsid = 0;
1441 else {
1442 struct pci_dev *pf_pdev;
1444 /* pdev will be returned if device is not a vf */
1445 pf_pdev = pci_physfn(pdev);
1446 info->pfsid = pci_dev_id(pf_pdev);
1449 #ifdef CONFIG_INTEL_IOMMU_SVM
1450 /* The PCIe spec, in its wisdom, declares that the behaviour of
1451 the device if you enable PASID support after ATS support is
1452 undefined. So always enable PASID support on devices which
1453 have it, even if we can't yet know if we're ever going to
1454 use it. */
1455 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1456 info->pasid_enabled = 1;
1458 if (info->pri_supported &&
1459 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1460 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1461 info->pri_enabled = 1;
1462 #endif
1463 if (!pdev->untrusted && info->ats_supported &&
1464 pci_ats_page_aligned(pdev) &&
1465 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1466 info->ats_enabled = 1;
1467 domain_update_iotlb(info->domain);
1468 info->ats_qdep = pci_ats_queue_depth(pdev);
1472 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1474 struct pci_dev *pdev;
1476 assert_spin_locked(&device_domain_lock);
1478 if (!dev_is_pci(info->dev))
1479 return;
1481 pdev = to_pci_dev(info->dev);
1483 if (info->ats_enabled) {
1484 pci_disable_ats(pdev);
1485 info->ats_enabled = 0;
1486 domain_update_iotlb(info->domain);
1488 #ifdef CONFIG_INTEL_IOMMU_SVM
1489 if (info->pri_enabled) {
1490 pci_disable_pri(pdev);
1491 info->pri_enabled = 0;
1493 if (info->pasid_enabled) {
1494 pci_disable_pasid(pdev);
1495 info->pasid_enabled = 0;
1497 #endif
1500 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1501 u64 addr, unsigned mask)
1503 u16 sid, qdep;
1504 unsigned long flags;
1505 struct device_domain_info *info;
1507 if (!domain->has_iotlb_device)
1508 return;
1510 spin_lock_irqsave(&device_domain_lock, flags);
1511 list_for_each_entry(info, &domain->devices, link) {
1512 if (!info->ats_enabled)
1513 continue;
1515 sid = info->bus << 8 | info->devfn;
1516 qdep = info->ats_qdep;
1517 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1518 qdep, addr, mask);
1520 spin_unlock_irqrestore(&device_domain_lock, flags);
1523 static void domain_flush_piotlb(struct intel_iommu *iommu,
1524 struct dmar_domain *domain,
1525 u64 addr, unsigned long npages, bool ih)
1527 u16 did = domain->iommu_did[iommu->seq_id];
1529 if (domain->default_pasid)
1530 qi_flush_piotlb(iommu, did, domain->default_pasid,
1531 addr, npages, ih);
1533 if (!list_empty(&domain->devices))
1534 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1537 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1538 struct dmar_domain *domain,
1539 unsigned long pfn, unsigned int pages,
1540 int ih, int map)
1542 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1543 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1544 u16 did = domain->iommu_did[iommu->seq_id];
1546 BUG_ON(pages == 0);
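/*
 * Worked example (illustrative): for pages = 5, __roundup_pow_of_two(5) = 8,
 * so mask = 3 and the page-selective invalidation below covers 8 pages;
 * addr is expected to be naturally aligned to that size.
 */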
1548 if (ih)
1549 ih = 1 << 6;
1551 if (domain_use_first_level(domain)) {
1552 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1553 } else {
1555 * Fallback to domain selective flush if no PSI support or
1556 * the size is too big. PSI requires page size to be 2 ^ x,
1557 * and the base address is naturally aligned to the size.
1559 if (!cap_pgsel_inv(iommu->cap) ||
1560 mask > cap_max_amask_val(iommu->cap))
1561 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1562 DMA_TLB_DSI_FLUSH);
1563 else
1564 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1565 DMA_TLB_PSI_FLUSH);
1569 * In caching mode, changes of pages from non-present to present require
1570 * flush. However, device IOTLB doesn't need to be flushed in this case.
1572 if (!cap_caching_mode(iommu->cap) || !map)
1573 iommu_flush_dev_iotlb(domain, addr, mask);
1576 /* Notification for newly created mappings */
1577 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1578 struct dmar_domain *domain,
1579 unsigned long pfn, unsigned int pages)
1582  * It's a non-present to present mapping. Only flush if caching mode is
1583  * enabled and the domain uses second-level translation.
1585 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1586 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1587 else
1588 iommu_flush_write_buffer(iommu);
1591 static void iommu_flush_iova(struct iova_domain *iovad)
1593 struct dmar_domain *domain;
1594 int idx;
1596 domain = container_of(iovad, struct dmar_domain, iovad);
1598 for_each_domain_iommu(idx, domain) {
1599 struct intel_iommu *iommu = g_iommus[idx];
1600 u16 did = domain->iommu_did[iommu->seq_id];
1602 if (domain_use_first_level(domain))
1603 domain_flush_piotlb(iommu, domain, 0, -1, 0);
1604 else
1605 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1606 DMA_TLB_DSI_FLUSH);
1608 if (!cap_caching_mode(iommu->cap))
1609 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1610 0, MAX_AGAW_PFN_WIDTH);
1614 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1616 u32 pmen;
1617 unsigned long flags;
1619 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1620 return;
1622 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1623 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1624 pmen &= ~DMA_PMEN_EPM;
1625 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1627 /* wait for the protected region status bit to clear */
1628 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1629 readl, !(pmen & DMA_PMEN_PRS), pmen);
1631 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1634 static void iommu_enable_translation(struct intel_iommu *iommu)
1636 u32 sts;
1637 unsigned long flags;
1639 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1640 iommu->gcmd |= DMA_GCMD_TE;
1641 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1643 /* Make sure hardware complete it */
1644 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1645 readl, (sts & DMA_GSTS_TES), sts);
1647 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1650 static void iommu_disable_translation(struct intel_iommu *iommu)
1652 u32 sts;
1653 unsigned long flag;
1655 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1656 iommu->gcmd &= ~DMA_GCMD_TE;
1657 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1659 /* Make sure hardware complete it */
1660 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1661 readl, (!(sts & DMA_GSTS_TES)), sts);
1663 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1666 static int iommu_init_domains(struct intel_iommu *iommu)
1668 u32 ndomains, nlongs;
1669 size_t size;
1671 ndomains = cap_ndoms(iommu->cap);
1672 pr_debug("%s: Number of Domains supported <%d>\n",
1673 iommu->name, ndomains);
1674 nlongs = BITS_TO_LONGS(ndomains);
1676 spin_lock_init(&iommu->lock);
1678 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1679 if (!iommu->domain_ids) {
1680 pr_err("%s: Allocating domain id array failed\n",
1681 iommu->name);
1682 return -ENOMEM;
1685 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1686 iommu->domains = kzalloc(size, GFP_KERNEL);
1688 if (iommu->domains) {
1689 size = 256 * sizeof(struct dmar_domain *);
1690 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1693 if (!iommu->domains || !iommu->domains[0]) {
1694 pr_err("%s: Allocating domain array failed\n",
1695 iommu->name);
1696 kfree(iommu->domain_ids);
1697 kfree(iommu->domains);
1698 iommu->domain_ids = NULL;
1699 iommu->domains = NULL;
1700 return -ENOMEM;
1704 * If Caching mode is set, then invalid translations are tagged
1705 * with domain-id 0, hence we need to pre-allocate it. We also
1706 * use domain-id 0 as a marker for non-allocated domain-id, so
1707 * make sure it is not used for a real domain.
1709 set_bit(0, iommu->domain_ids);
1712 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1713 * entry for first-level or pass-through translation modes should
1714 * be programmed with a domain id different from those used for
1715 * second-level or nested translation. We reserve a domain id for
1716 * this purpose.
1718 if (sm_supported(iommu))
1719 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1721 return 0;
1724 static void disable_dmar_iommu(struct intel_iommu *iommu)
1726 struct device_domain_info *info, *tmp;
1727 unsigned long flags;
1729 if (!iommu->domains || !iommu->domain_ids)
1730 return;
1732 spin_lock_irqsave(&device_domain_lock, flags);
1733 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1734 if (info->iommu != iommu)
1735 continue;
1737 if (!info->dev || !info->domain)
1738 continue;
1740 __dmar_remove_one_dev_info(info);
1742 spin_unlock_irqrestore(&device_domain_lock, flags);
1744 if (iommu->gcmd & DMA_GCMD_TE)
1745 iommu_disable_translation(iommu);
1748 static void free_dmar_iommu(struct intel_iommu *iommu)
1750 if ((iommu->domains) && (iommu->domain_ids)) {
1751 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1752 int i;
1754 for (i = 0; i < elems; i++)
1755 kfree(iommu->domains[i]);
1756 kfree(iommu->domains);
1757 kfree(iommu->domain_ids);
1758 iommu->domains = NULL;
1759 iommu->domain_ids = NULL;
1762 g_iommus[iommu->seq_id] = NULL;
1764 /* free context mapping */
1765 free_context_table(iommu);
1767 #ifdef CONFIG_INTEL_IOMMU_SVM
1768 if (pasid_supported(iommu)) {
1769 if (ecap_prs(iommu->ecap))
1770 intel_svm_finish_prq(iommu);
1772 #endif
1776 * Check and return whether first level is used by default for
1777 * DMA translation.
1779 static bool first_level_by_default(void)
1781 struct dmar_drhd_unit *drhd;
1782 struct intel_iommu *iommu;
1783 static int first_level_support = -1;
1785 if (likely(first_level_support != -1))
1786 return first_level_support;
1788 first_level_support = 1;
1790 rcu_read_lock();
1791 for_each_active_iommu(iommu, drhd) {
1792 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1793 first_level_support = 0;
1794 break;
1797 rcu_read_unlock();
1799 return first_level_support;
1802 static struct dmar_domain *alloc_domain(int flags)
1804 struct dmar_domain *domain;
1806 domain = alloc_domain_mem();
1807 if (!domain)
1808 return NULL;
1810 memset(domain, 0, sizeof(*domain));
1811 domain->nid = NUMA_NO_NODE;
1812 domain->flags = flags;
1813 if (first_level_by_default())
1814 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1815 domain->has_iotlb_device = false;
1816 INIT_LIST_HEAD(&domain->devices);
1818 return domain;
1821 /* Must be called with iommu->lock */
1822 static int domain_attach_iommu(struct dmar_domain *domain,
1823 struct intel_iommu *iommu)
1825 unsigned long ndomains;
1826 int num;
1828 assert_spin_locked(&device_domain_lock);
1829 assert_spin_locked(&iommu->lock);
1831 domain->iommu_refcnt[iommu->seq_id] += 1;
1832 domain->iommu_count += 1;
1833 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1834 ndomains = cap_ndoms(iommu->cap);
1835 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1837 if (num >= ndomains) {
1838 pr_err("%s: No free domain ids\n", iommu->name);
1839 domain->iommu_refcnt[iommu->seq_id] -= 1;
1840 domain->iommu_count -= 1;
1841 return -ENOSPC;
1844 set_bit(num, iommu->domain_ids);
1845 set_iommu_domain(iommu, num, domain);
1847 domain->iommu_did[iommu->seq_id] = num;
1848 domain->nid = iommu->node;
1850 domain_update_iommu_cap(domain);
1853 return 0;
1856 static int domain_detach_iommu(struct dmar_domain *domain,
1857 struct intel_iommu *iommu)
1859 int num, count;
1861 assert_spin_locked(&device_domain_lock);
1862 assert_spin_locked(&iommu->lock);
1864 domain->iommu_refcnt[iommu->seq_id] -= 1;
1865 count = --domain->iommu_count;
1866 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1867 num = domain->iommu_did[iommu->seq_id];
1868 clear_bit(num, iommu->domain_ids);
1869 set_iommu_domain(iommu, num, NULL);
1871 domain_update_iommu_cap(domain);
1872 domain->iommu_did[iommu->seq_id] = 0;
1875 return count;
1878 static struct iova_domain reserved_iova_list;
1879 static struct lock_class_key reserved_rbtree_key;
1881 static int dmar_init_reserved_ranges(void)
1883 struct pci_dev *pdev = NULL;
1884 struct iova *iova;
1885 int i;
1887 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1889 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1890 &reserved_rbtree_key);
1892 /* IOAPIC ranges shouldn't be accessed by DMA */
1893 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1894 IOVA_PFN(IOAPIC_RANGE_END));
1895 if (!iova) {
1896 pr_err("Reserve IOAPIC range failed\n");
1897 return -ENODEV;
1900 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1901 for_each_pci_dev(pdev) {
1902 struct resource *r;
1904 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1905 r = &pdev->resource[i];
1906 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1907 continue;
1908 iova = reserve_iova(&reserved_iova_list,
1909 IOVA_PFN(r->start),
1910 IOVA_PFN(r->end));
1911 if (!iova) {
1912 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1913 return -ENODEV;
1917 return 0;
1920 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1922 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1925 static inline int guestwidth_to_adjustwidth(int gaw)
1927 int agaw;
1928 int r = (gaw - 12) % 9;
1930 if (r == 0)
1931 agaw = gaw;
1932 else
1933 agaw = gaw + 9 - r;
1934 if (agaw > 64)
1935 agaw = 64;
1936 return agaw;
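/*
 * Worked example (illustrative): the function rounds the guest width up so
 * that (gaw - 12) is a multiple of the 9-bit level stride.  gaw = 48 gives
 * r = 0 and agaw = 48; gaw = 36 gives r = (36 - 12) % 9 = 6 and
 * agaw = 36 + 9 - 6 = 39.
 */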
1939 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1940 int guest_width)
1942 int adjust_width, agaw;
1943 unsigned long sagaw;
1944 int ret;
1946 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1948 if (!intel_iommu_strict) {
1949 ret = init_iova_flush_queue(&domain->iovad,
1950 iommu_flush_iova, iova_entry_free);
1951 if (ret)
1952 pr_info("iova flush queue initialization failed\n");
1955 domain_reserve_special_ranges(domain);
1957 /* calculate AGAW */
1958 if (guest_width > cap_mgaw(iommu->cap))
1959 guest_width = cap_mgaw(iommu->cap);
1960 domain->gaw = guest_width;
1961 adjust_width = guestwidth_to_adjustwidth(guest_width);
1962 agaw = width_to_agaw(adjust_width);
1963 sagaw = cap_sagaw(iommu->cap);
1964 if (!test_bit(agaw, &sagaw)) {
1965 /* hardware doesn't support it, choose a bigger one */
1966 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1967 agaw = find_next_bit(&sagaw, 5, agaw);
1968 if (agaw >= 5)
1969 return -ENODEV;
1971 domain->agaw = agaw;
1973 if (ecap_coherent(iommu->ecap))
1974 domain->iommu_coherency = 1;
1975 else
1976 domain->iommu_coherency = 0;
1978 if (ecap_sc_support(iommu->ecap))
1979 domain->iommu_snooping = 1;
1980 else
1981 domain->iommu_snooping = 0;
1983 if (intel_iommu_superpage)
1984 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1985 else
1986 domain->iommu_superpage = 0;
1988 domain->nid = iommu->node;
1990 /* always allocate the top pgd */
1991 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1992 if (!domain->pgd)
1993 return -ENOMEM;
1994 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1995 return 0;
1998 static void domain_exit(struct dmar_domain *domain)
2001 /* Remove associated devices and clear attached or cached domains */
2002 domain_remove_dev_info(domain);
2004 /* destroy iovas */
2005 put_iova_domain(&domain->iovad);
2007 if (domain->pgd) {
2008 struct page *freelist;
2010 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2011 dma_free_pagelist(freelist);
2014 free_domain_mem(domain);
2018 * Get the PASID directory size for scalable mode context entry.
2019 * Value of X in the PDTS field of a scalable mode context entry
2020  * indicates a PASID directory with 2^(X + 7) entries.
2022 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2024 int pds, max_pde;
2026 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2027 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2028 if (pds < 7)
2029 return 0;
2031 return pds - 7;
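/*
 * Worked example (illustrative, assuming PASID_PDE_SHIFT is 6): for
 * table->max_pasid = 1 << 20, max_pde = 1 << 14, find_first_bit() returns
 * 14 and the function returns 14 - 7 = 7.  The context_pdts() macro below
 * then encodes a PASID directory with 2^(7 + 7) = 16384 entries, each
 * directory entry covering 64 PASID table entries, i.e. the full 2^20
 * PASID space.
 */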
2035 * Set the RID_PASID field of a scalable mode context entry. The
2036 * IOMMU hardware will use the PASID value set in this field for
2037 * DMA translations of DMA requests without PASID.
2039 static inline void
2040 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2042 context->hi |= pasid & ((1 << 20) - 1);
2046 * Set the DTE(Device-TLB Enable) field of a scalable mode context
2047 * entry.
2049 static inline void context_set_sm_dte(struct context_entry *context)
2051 context->lo |= (1 << 2);
2055 * Set the PRE(Page Request Enable) field of a scalable mode context
2056 * entry.
2058 static inline void context_set_sm_pre(struct context_entry *context)
2060 context->lo |= (1 << 4);
2063 /* Convert value to context PASID directory size field coding. */
2064 #define context_pdts(pds) (((pds) & 0x7) << 9)
2066 static int domain_context_mapping_one(struct dmar_domain *domain,
2067 struct intel_iommu *iommu,
2068 struct pasid_table *table,
2069 u8 bus, u8 devfn)
2071 u16 did = domain->iommu_did[iommu->seq_id];
2072 int translation = CONTEXT_TT_MULTI_LEVEL;
2073 struct device_domain_info *info = NULL;
2074 struct context_entry *context;
2075 unsigned long flags;
2076 int ret;
2078 WARN_ON(did == 0);
2080 if (hw_pass_through && domain_type_is_si(domain))
2081 translation = CONTEXT_TT_PASS_THROUGH;
2083 pr_debug("Set context mapping for %02x:%02x.%d\n",
2084 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2086 BUG_ON(!domain->pgd);
2088 spin_lock_irqsave(&device_domain_lock, flags);
2089 spin_lock(&iommu->lock);
2091 ret = -ENOMEM;
2092 context = iommu_context_addr(iommu, bus, devfn, 1);
2093 if (!context)
2094 goto out_unlock;
2096 ret = 0;
2097 if (context_present(context))
2098 goto out_unlock;
2101 * For kdump cases, old valid entries may be cached due to the
2102 * in-flight DMA and copied pgtable, but there is no unmapping
2103 * behaviour for them, thus we need an explicit cache flush for
2104 * the newly-mapped device. For kdump, at this point, the device
2105 * is supposed to finish reset at its driver probe stage, so no
2106  * in-flight DMA will exist, and we don't need to worry about it
2107  * hereafter.
2109 if (context_copied(context)) {
2110 u16 did_old = context_domain_id(context);
2112 if (did_old < cap_ndoms(iommu->cap)) {
2113 iommu->flush.flush_context(iommu, did_old,
2114 (((u16)bus) << 8) | devfn,
2115 DMA_CCMD_MASK_NOBIT,
2116 DMA_CCMD_DEVICE_INVL);
2117 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2118 DMA_TLB_DSI_FLUSH);
2122 context_clear_entry(context);
2124 if (sm_supported(iommu)) {
2125 unsigned long pds;
2127 WARN_ON(!table);
2129 /* Setup the PASID DIR pointer: */
2130 pds = context_get_sm_pds(table);
2131 context->lo = (u64)virt_to_phys(table->table) |
2132 context_pdts(pds);
2134 /* Setup the RID_PASID field: */
2135 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2138 * Setup the Device-TLB enable bit and Page request
2139 * Enable bit:
2141 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2142 if (info && info->ats_supported)
2143 context_set_sm_dte(context);
2144 if (info && info->pri_supported)
2145 context_set_sm_pre(context);
2146 } else {
2147 struct dma_pte *pgd = domain->pgd;
2148 int agaw;
2150 context_set_domain_id(context, did);
2152 if (translation != CONTEXT_TT_PASS_THROUGH) {
2154  * Skip top levels of page tables for an IOMMU that has
2155  * a smaller AGAW than the domain's default. Unnecessary for PT mode.
2157 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2158 ret = -ENOMEM;
2159 pgd = phys_to_virt(dma_pte_addr(pgd));
2160 if (!dma_pte_present(pgd))
2161 goto out_unlock;
2164 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2165 if (info && info->ats_supported)
2166 translation = CONTEXT_TT_DEV_IOTLB;
2167 else
2168 translation = CONTEXT_TT_MULTI_LEVEL;
2170 context_set_address_root(context, virt_to_phys(pgd));
2171 context_set_address_width(context, agaw);
2172 } else {
2174 * In pass through mode, AW must be programmed to
2175 * indicate the largest AGAW value supported by
2176 * hardware. And ASR is ignored by hardware.
2178 context_set_address_width(context, iommu->msagaw);
2181 context_set_translation_type(context, translation);
2184 context_set_fault_enable(context);
2185 context_set_present(context);
2186 if (!ecap_coherent(iommu->ecap))
2187 clflush_cache_range(context, sizeof(*context));
2190  * It's a non-present to present mapping. If the hardware doesn't
2191  * cache non-present entries we only need to flush the write-buffer.
2192  * If it _does_ cache non-present entries, then it does so in the special
2193 * domain #0, which we have to flush:
2195 if (cap_caching_mode(iommu->cap)) {
2196 iommu->flush.flush_context(iommu, 0,
2197 (((u16)bus) << 8) | devfn,
2198 DMA_CCMD_MASK_NOBIT,
2199 DMA_CCMD_DEVICE_INVL);
2200 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2201 } else {
2202 iommu_flush_write_buffer(iommu);
2204 iommu_enable_dev_iotlb(info);
2206 ret = 0;
2208 out_unlock:
2209 spin_unlock(&iommu->lock);
2210 spin_unlock_irqrestore(&device_domain_lock, flags);
2212 return ret;
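/*
 * Notes on domain_context_mapping_one() above: device_domain_lock is
 * taken before iommu->lock and both are released in reverse order.
 * When the IOMMU reports caching mode, even a non-present-to-present
 * transition must be followed by explicit context and IOTLB
 * invalidation; otherwise flushing the write buffer is sufficient.
 */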
2215 struct domain_context_mapping_data {
2216 struct dmar_domain *domain;
2217 struct intel_iommu *iommu;
2218 struct pasid_table *table;
2221 static int domain_context_mapping_cb(struct pci_dev *pdev,
2222 u16 alias, void *opaque)
2224 struct domain_context_mapping_data *data = opaque;
2226 return domain_context_mapping_one(data->domain, data->iommu,
2227 data->table, PCI_BUS_NUM(alias),
2228 alias & 0xff);
2231 static int
2232 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2234 struct domain_context_mapping_data data;
2235 struct pasid_table *table;
2236 struct intel_iommu *iommu;
2237 u8 bus, devfn;
2239 iommu = device_to_iommu(dev, &bus, &devfn);
2240 if (!iommu)
2241 return -ENODEV;
2243 table = intel_pasid_get_table(dev);
2245 if (!dev_is_pci(dev))
2246 return domain_context_mapping_one(domain, iommu, table,
2247 bus, devfn);
2249 data.domain = domain;
2250 data.iommu = iommu;
2251 data.table = table;
2253 return pci_for_each_dma_alias(to_pci_dev(dev),
2254 &domain_context_mapping_cb, &data);
2257 static int domain_context_mapped_cb(struct pci_dev *pdev,
2258 u16 alias, void *opaque)
2260 struct intel_iommu *iommu = opaque;
2262 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2265 static int domain_context_mapped(struct device *dev)
2267 struct intel_iommu *iommu;
2268 u8 bus, devfn;
2270 iommu = device_to_iommu(dev, &bus, &devfn);
2271 if (!iommu)
2272 return -ENODEV;
2274 if (!dev_is_pci(dev))
2275 return device_context_mapped(iommu, bus, devfn);
2277 return !pci_for_each_dma_alias(to_pci_dev(dev),
2278 domain_context_mapped_cb, iommu);
2281 /* Returns a number of VTD pages, but aligned to MM page size */
2282 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2283 size_t size)
2285 host_addr &= ~PAGE_MASK;
2286 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
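/*
 * Worked example (assuming 4KiB MM pages and 4KiB VT-d pages): for
 * host_addr = 0x1234 and size = 0x2000, the in-page offset is 0x234,
 * PAGE_ALIGN(0x234 + 0x2000) = 0x3000, so three VT-d pages are needed.
 */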
2289 /* Return largest possible superpage level for a given mapping */
2290 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2291 unsigned long iov_pfn,
2292 unsigned long phy_pfn,
2293 unsigned long pages)
2295 int support, level = 1;
2296 unsigned long pfnmerge;
2298 support = domain->iommu_superpage;
2300 /* To use a large page, the virtual *and* physical addresses
2301 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2302 of them will mean we have to use smaller pages. So just
2303 merge them and check both at once. */
2304 pfnmerge = iov_pfn | phy_pfn;
2306 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2307 pages >>= VTD_STRIDE_SHIFT;
2308 if (!pages)
2309 break;
2310 pfnmerge >>= VTD_STRIDE_SHIFT;
2311 level++;
2312 support--;
2314 return level;
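/*
 * Illustrative example for the loop above: with a 9-bit stride and
 * 4KiB base pages, if both iov_pfn and phy_pfn are multiples of 512
 * and at least 512 pages remain, the merged pfn has no low stride bits
 * set, so level 2 (a 2MiB superpage) becomes usable, provided the
 * domain advertises at least one level of superpage support.
 */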
2317 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2318 struct scatterlist *sg, unsigned long phys_pfn,
2319 unsigned long nr_pages, int prot)
2321 struct dma_pte *first_pte = NULL, *pte = NULL;
2322 phys_addr_t uninitialized_var(pteval);
2323 unsigned long sg_res = 0;
2324 unsigned int largepage_lvl = 0;
2325 unsigned long lvl_pages = 0;
2326 u64 attr;
2328 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2330 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2331 return -EINVAL;
2333 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2334 if (domain_use_first_level(domain))
2335 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2337 if (!sg) {
2338 sg_res = nr_pages;
2339 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2342 while (nr_pages > 0) {
2343 uint64_t tmp;
2345 if (!sg_res) {
2346 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2348 sg_res = aligned_nrpages(sg->offset, sg->length);
2349 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2350 sg->dma_length = sg->length;
2351 pteval = (sg_phys(sg) - pgoff) | attr;
2352 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2355 if (!pte) {
2356 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2358 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2359 if (!pte)
2360 return -ENOMEM;
2361 /* It is a large page */
2362 if (largepage_lvl > 1) {
2363 unsigned long nr_superpages, end_pfn;
2365 pteval |= DMA_PTE_LARGE_PAGE;
2366 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2368 nr_superpages = sg_res / lvl_pages;
2369 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2372 * Ensure that old small page tables are
2373 * removed to make room for superpage(s).
2374 * We're adding new large pages, so make sure
2375 * we don't remove their parent tables.
2377 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2378 largepage_lvl + 1);
2379 } else {
2380 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2384 /* We don't need a lock here; nobody else
2385  * touches the iova range.
2387 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2388 if (tmp) {
2389 static int dumps = 5;
2390 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2391 iov_pfn, tmp, (unsigned long long)pteval);
2392 if (dumps) {
2393 dumps--;
2394 debug_dma_dump_mappings(NULL);
2396 WARN_ON(1);
2399 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2401 BUG_ON(nr_pages < lvl_pages);
2402 BUG_ON(sg_res < lvl_pages);
2404 nr_pages -= lvl_pages;
2405 iov_pfn += lvl_pages;
2406 phys_pfn += lvl_pages;
2407 pteval += lvl_pages * VTD_PAGE_SIZE;
2408 sg_res -= lvl_pages;
2410 /* If the next PTE would be the first in a new page, then we
2411 need to flush the cache on the entries we've just written.
2412 And then we'll need to recalculate 'pte', so clear it and
2413 let it get set again in the if (!pte) block above.
2415 If we're done (!nr_pages) we need to flush the cache too.
2417 Also if we've been setting superpages, we may need to
2418 recalculate 'pte' and switch back to smaller pages for the
2419 end of the mapping, if the trailing size is not enough to
2420 use another superpage (i.e. sg_res < lvl_pages). */
2421 pte++;
2422 if (!nr_pages || first_pte_in_page(pte) ||
2423 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2424 domain_flush_cache(domain, first_pte,
2425 (void *)pte - (void *)first_pte);
2426 pte = NULL;
2429 if (!sg_res && nr_pages)
2430 sg = sg_next(sg);
2432 return 0;
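/*
 * Cache-flush rule used by __domain_mapping() above: accumulated PTEs
 * are flushed whenever the walk crosses into a new page of PTEs, when
 * the mapping is complete, or when a superpage run ends and the code
 * must recalculate 'pte' for smaller pages, so no written PTE page is
 * left unflushed on non-coherent hardware.
 */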
2435 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2436 struct scatterlist *sg, unsigned long phys_pfn,
2437 unsigned long nr_pages, int prot)
2439 int iommu_id, ret;
2440 struct intel_iommu *iommu;
2442 /* Do the real mapping first */
2443 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2444 if (ret)
2445 return ret;
2447 for_each_domain_iommu(iommu_id, domain) {
2448 iommu = g_iommus[iommu_id];
2449 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2452 return 0;
2455 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2456 struct scatterlist *sg, unsigned long nr_pages,
2457 int prot)
2459 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2462 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2463 unsigned long phys_pfn, unsigned long nr_pages,
2464 int prot)
2466 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2469 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2471 unsigned long flags;
2472 struct context_entry *context;
2473 u16 did_old;
2475 if (!iommu)
2476 return;
2478 spin_lock_irqsave(&iommu->lock, flags);
2479 context = iommu_context_addr(iommu, bus, devfn, 0);
2480 if (!context) {
2481 spin_unlock_irqrestore(&iommu->lock, flags);
2482 return;
2484 did_old = context_domain_id(context);
2485 context_clear_entry(context);
2486 __iommu_flush_cache(iommu, context, sizeof(*context));
2487 spin_unlock_irqrestore(&iommu->lock, flags);
2488 iommu->flush.flush_context(iommu,
2489 did_old,
2490 (((u16)bus) << 8) | devfn,
2491 DMA_CCMD_MASK_NOBIT,
2492 DMA_CCMD_DEVICE_INVL);
2493 iommu->flush.flush_iotlb(iommu,
2494 did_old,
2497 DMA_TLB_DSI_FLUSH);
2500 static inline void unlink_domain_info(struct device_domain_info *info)
2502 assert_spin_locked(&device_domain_lock);
2503 list_del(&info->link);
2504 list_del(&info->global);
2505 if (info->dev)
2506 info->dev->archdata.iommu = NULL;
2509 static void domain_remove_dev_info(struct dmar_domain *domain)
2511 struct device_domain_info *info, *tmp;
2512 unsigned long flags;
2514 spin_lock_irqsave(&device_domain_lock, flags);
2515 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2516 __dmar_remove_one_dev_info(info);
2517 spin_unlock_irqrestore(&device_domain_lock, flags);
2520 struct dmar_domain *find_domain(struct device *dev)
2522 struct device_domain_info *info;
2524 if (unlikely(attach_deferred(dev) || iommu_dummy(dev)))
2525 return NULL;
2527 /* No lock here, assumes no domain exit in normal case */
2528 info = dev->archdata.iommu;
2529 if (likely(info))
2530 return info->domain;
2532 return NULL;
2535 static void do_deferred_attach(struct device *dev)
2537 struct iommu_domain *domain;
2539 dev->archdata.iommu = NULL;
2540 domain = iommu_get_domain_for_dev(dev);
2541 if (domain)
2542 intel_iommu_attach_device(domain, dev);
2545 static inline struct device_domain_info *
2546 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2548 struct device_domain_info *info;
2550 list_for_each_entry(info, &device_domain_list, global)
2551 if (info->segment == segment && info->bus == bus &&
2552 info->devfn == devfn)
2553 return info;
2555 return NULL;
2558 static int domain_setup_first_level(struct intel_iommu *iommu,
2559 struct dmar_domain *domain,
2560 struct device *dev,
2561 int pasid)
2563 int flags = PASID_FLAG_SUPERVISOR_MODE;
2564 struct dma_pte *pgd = domain->pgd;
2565 int agaw, level;
2568  * Skip top levels of page tables for an IOMMU that has
2569  * a smaller AGAW than the domain's default. Unnecessary for PT mode.
2571 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2572 pgd = phys_to_virt(dma_pte_addr(pgd));
2573 if (!dma_pte_present(pgd))
2574 return -ENOMEM;
2577 level = agaw_to_level(agaw);
2578 if (level != 4 && level != 5)
2579 return -EINVAL;
2581 flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2583 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2584 domain->iommu_did[iommu->seq_id],
2585 flags);
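/*
 * Note on the level check above: first-level translation is only
 * defined for 4-level and 5-level paging structures (48-bit and 57-bit
 * input addresses respectively); PASID_FLAG_FL5LP asks the PASID entry
 * setup to use the 5-level format.
 */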
2588 static bool dev_is_real_dma_subdevice(struct device *dev)
2590 return dev && dev_is_pci(dev) &&
2591 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2594 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2595 int bus, int devfn,
2596 struct device *dev,
2597 struct dmar_domain *domain)
2599 struct dmar_domain *found = NULL;
2600 struct device_domain_info *info;
2601 unsigned long flags;
2602 int ret;
2604 info = alloc_devinfo_mem();
2605 if (!info)
2606 return NULL;
2608 if (!dev_is_real_dma_subdevice(dev)) {
2609 info->bus = bus;
2610 info->devfn = devfn;
2611 info->segment = iommu->segment;
2612 } else {
2613 struct pci_dev *pdev = to_pci_dev(dev);
2615 info->bus = pdev->bus->number;
2616 info->devfn = pdev->devfn;
2617 info->segment = pci_domain_nr(pdev->bus);
2620 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2621 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2622 info->ats_qdep = 0;
2623 info->dev = dev;
2624 info->domain = domain;
2625 info->iommu = iommu;
2626 info->pasid_table = NULL;
2627 info->auxd_enabled = 0;
2628 INIT_LIST_HEAD(&info->auxiliary_domains);
2630 if (dev && dev_is_pci(dev)) {
2631 struct pci_dev *pdev = to_pci_dev(info->dev);
2633 if (!pdev->untrusted &&
2634 !pci_ats_disabled() &&
2635 ecap_dev_iotlb_support(iommu->ecap) &&
2636 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2637 dmar_find_matched_atsr_unit(pdev))
2638 info->ats_supported = 1;
2640 if (sm_supported(iommu)) {
2641 if (pasid_supported(iommu)) {
2642 int features = pci_pasid_features(pdev);
2643 if (features >= 0)
2644 info->pasid_supported = features | 1;
2647 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2648 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2649 info->pri_supported = 1;
2653 spin_lock_irqsave(&device_domain_lock, flags);
2654 if (dev)
2655 found = find_domain(dev);
2657 if (!found) {
2658 struct device_domain_info *info2;
2659 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2660 info->devfn);
2661 if (info2) {
2662 found = info2->domain;
2663 info2->dev = dev;
2667 if (found) {
2668 spin_unlock_irqrestore(&device_domain_lock, flags);
2669 free_devinfo_mem(info);
2670 /* Caller must free the original domain */
2671 return found;
2674 spin_lock(&iommu->lock);
2675 ret = domain_attach_iommu(domain, iommu);
2676 spin_unlock(&iommu->lock);
2678 if (ret) {
2679 spin_unlock_irqrestore(&device_domain_lock, flags);
2680 free_devinfo_mem(info);
2681 return NULL;
2684 list_add(&info->link, &domain->devices);
2685 list_add(&info->global, &device_domain_list);
2686 if (dev)
2687 dev->archdata.iommu = info;
2688 spin_unlock_irqrestore(&device_domain_lock, flags);
2690 /* PASID table is mandatory for a PCI device in scalable mode. */
2691 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2692 ret = intel_pasid_alloc_table(dev);
2693 if (ret) {
2694 dev_err(dev, "PASID table allocation failed\n");
2695 dmar_remove_one_dev_info(dev);
2696 return NULL;
2699 /* Setup the PASID entry for requests without PASID: */
2700 spin_lock(&iommu->lock);
2701 if (hw_pass_through && domain_type_is_si(domain))
2702 ret = intel_pasid_setup_pass_through(iommu, domain,
2703 dev, PASID_RID2PASID);
2704 else if (domain_use_first_level(domain))
2705 ret = domain_setup_first_level(iommu, domain, dev,
2706 PASID_RID2PASID);
2707 else
2708 ret = intel_pasid_setup_second_level(iommu, domain,
2709 dev, PASID_RID2PASID);
2710 spin_unlock(&iommu->lock);
2711 if (ret) {
2712 dev_err(dev, "Setup RID2PASID failed\n");
2713 dmar_remove_one_dev_info(dev);
2714 return NULL;
2718 if (dev && domain_context_mapping(domain, dev)) {
2719 dev_err(dev, "Domain context map failed\n");
2720 dmar_remove_one_dev_info(dev);
2721 return NULL;
2724 return domain;
2727 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2729 *(u16 *)opaque = alias;
2730 return 0;
2733 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2735 struct device_domain_info *info;
2736 struct dmar_domain *domain = NULL;
2737 struct intel_iommu *iommu;
2738 u16 dma_alias;
2739 unsigned long flags;
2740 u8 bus, devfn;
2742 iommu = device_to_iommu(dev, &bus, &devfn);
2743 if (!iommu)
2744 return NULL;
2746 if (dev_is_pci(dev)) {
2747 struct pci_dev *pdev = to_pci_dev(dev);
2749 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2751 spin_lock_irqsave(&device_domain_lock, flags);
2752 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2753 PCI_BUS_NUM(dma_alias),
2754 dma_alias & 0xff);
2755 if (info) {
2756 iommu = info->iommu;
2757 domain = info->domain;
2759 spin_unlock_irqrestore(&device_domain_lock, flags);
2761 /* DMA alias already has a domain, use it */
2762 if (info)
2763 goto out;
2766 /* Allocate and initialize new domain for the device */
2767 domain = alloc_domain(0);
2768 if (!domain)
2769 return NULL;
2770 if (domain_init(domain, iommu, gaw)) {
2771 domain_exit(domain);
2772 return NULL;
2775 out:
2776 return domain;
2779 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2780 struct dmar_domain *domain)
2782 struct intel_iommu *iommu;
2783 struct dmar_domain *tmp;
2784 u16 req_id, dma_alias;
2785 u8 bus, devfn;
2787 iommu = device_to_iommu(dev, &bus, &devfn);
2788 if (!iommu)
2789 return NULL;
2791 req_id = ((u16)bus << 8) | devfn;
2793 if (dev_is_pci(dev)) {
2794 struct pci_dev *pdev = to_pci_dev(dev);
2796 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2798 /* register PCI DMA alias device */
2799 if (req_id != dma_alias) {
2800 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2801 dma_alias & 0xff, NULL, domain);
2803 if (!tmp || tmp != domain)
2804 return tmp;
2808 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2809 if (!tmp || tmp != domain)
2810 return tmp;
2812 return domain;
2815 static int iommu_domain_identity_map(struct dmar_domain *domain,
2816 unsigned long long start,
2817 unsigned long long end)
2819 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2820 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2822 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2823 dma_to_mm_pfn(last_vpfn))) {
2824 pr_err("Reserving iova failed\n");
2825 return -ENOMEM;
2828 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2830  * The RMRR range might overlap with the physical memory range,
2831  * so clear it first.
2833 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2835 return __domain_mapping(domain, first_vpfn, NULL,
2836 first_vpfn, last_vpfn - first_vpfn + 1,
2837 DMA_PTE_READ|DMA_PTE_WRITE);
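/*
 * Illustrative example for the helper above (assuming 4KiB VT-d
 * pages): an RMRR covering 0xe0000-0xeffff gives first_vpfn = 0xe0 and
 * last_vpfn = 0xef, so sixteen pages are reserved in the iova
 * allocator and mapped 1:1 with read/write permission.
 */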
2840 static int domain_prepare_identity_map(struct device *dev,
2841 struct dmar_domain *domain,
2842 unsigned long long start,
2843 unsigned long long end)
2845 /* For _hardware_ passthrough, don't bother. But for software
2846 passthrough, we do it anyway -- it may indicate a memory
2847 range which is reserved in E820 and therefore didn't get set
2848 up to start with in si_domain */
2849 if (domain == si_domain && hw_pass_through) {
2850 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2851 start, end);
2852 return 0;
2855 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2857 if (end < start) {
2858 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2859 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2860 dmi_get_system_info(DMI_BIOS_VENDOR),
2861 dmi_get_system_info(DMI_BIOS_VERSION),
2862 dmi_get_system_info(DMI_PRODUCT_VERSION));
2863 return -EIO;
2866 if (end >> agaw_to_width(domain->agaw)) {
2867 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2868 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2869 agaw_to_width(domain->agaw),
2870 dmi_get_system_info(DMI_BIOS_VENDOR),
2871 dmi_get_system_info(DMI_BIOS_VERSION),
2872 dmi_get_system_info(DMI_PRODUCT_VERSION));
2873 return -EIO;
2876 return iommu_domain_identity_map(domain, start, end);
2879 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2881 static int __init si_domain_init(int hw)
2883 struct dmar_rmrr_unit *rmrr;
2884 struct device *dev;
2885 int i, nid, ret;
2887 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2888 if (!si_domain)
2889 return -EFAULT;
2891 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2892 domain_exit(si_domain);
2893 return -EFAULT;
2896 if (hw)
2897 return 0;
2899 for_each_online_node(nid) {
2900 unsigned long start_pfn, end_pfn;
2901 int i;
2903 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2904 ret = iommu_domain_identity_map(si_domain,
2905 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2906 if (ret)
2907 return ret;
2912 * Identity map the RMRRs so that devices with RMRRs could also use
2913 * the si_domain.
2915 for_each_rmrr_units(rmrr) {
2916 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2917 i, dev) {
2918 unsigned long long start = rmrr->base_address;
2919 unsigned long long end = rmrr->end_address;
2921 if (WARN_ON(end < start ||
2922 end >> agaw_to_width(si_domain->agaw)))
2923 continue;
2925 ret = iommu_domain_identity_map(si_domain, start, end);
2926 if (ret)
2927 return ret;
2931 return 0;
2934 static int identity_mapping(struct device *dev)
2936 struct device_domain_info *info;
2938 info = dev->archdata.iommu;
2939 if (info)
2940 return (info->domain == si_domain);
2942 return 0;
2945 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2947 struct dmar_domain *ndomain;
2948 struct intel_iommu *iommu;
2949 u8 bus, devfn;
2951 iommu = device_to_iommu(dev, &bus, &devfn);
2952 if (!iommu)
2953 return -ENODEV;
2955 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2956 if (ndomain != domain)
2957 return -EBUSY;
2959 return 0;
2962 static bool device_has_rmrr(struct device *dev)
2964 struct dmar_rmrr_unit *rmrr;
2965 struct device *tmp;
2966 int i;
2968 rcu_read_lock();
2969 for_each_rmrr_units(rmrr) {
2971 * Return TRUE if this RMRR contains the device that
2972 * is passed in.
2974 for_each_active_dev_scope(rmrr->devices,
2975 rmrr->devices_cnt, i, tmp)
2976 if (tmp == dev ||
2977 is_downstream_to_pci_bridge(dev, tmp)) {
2978 rcu_read_unlock();
2979 return true;
2982 rcu_read_unlock();
2983 return false;
2987 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2988  * is relaxable (i.e. is allowed to be not enforced under some conditions)
2989 * @dev: device handle
2991 * We assume that PCI USB devices with RMRRs have them largely
2992 * for historical reasons and that the RMRR space is not actively used post
2993 * boot. This exclusion may change if vendors begin to abuse it.
2995 * The same exception is made for graphics devices, with the requirement that
2996 * any use of the RMRR regions will be torn down before assigning the device
2997 * to a guest.
2999 * Return: true if the RMRR is relaxable, false otherwise
3001 static bool device_rmrr_is_relaxable(struct device *dev)
3003 struct pci_dev *pdev;
3005 if (!dev_is_pci(dev))
3006 return false;
3008 pdev = to_pci_dev(dev);
3009 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
3010 return true;
3011 else
3012 return false;
3016  * There are a couple of cases where we need to restrict the functionality of
3017 * devices associated with RMRRs. The first is when evaluating a device for
3018 * identity mapping because problems exist when devices are moved in and out
3019 * of domains and their respective RMRR information is lost. This means that
3020 * a device with associated RMRRs will never be in a "passthrough" domain.
3021 * The second is use of the device through the IOMMU API. This interface
3022 * expects to have full control of the IOVA space for the device. We cannot
3023 * satisfy both the requirement that RMRR access is maintained and have an
3024 * unencumbered IOVA space. We also have no ability to quiesce the device's
3025 * use of the RMRR space or even inform the IOMMU API user of the restriction.
3026 * We therefore prevent devices associated with an RMRR from participating in
3027 * the IOMMU API, which eliminates them from device assignment.
3029 * In both cases, devices which have relaxable RMRRs are not concerned by this
3030 * restriction. See device_rmrr_is_relaxable comment.
3032 static bool device_is_rmrr_locked(struct device *dev)
3034 if (!device_has_rmrr(dev))
3035 return false;
3037 if (device_rmrr_is_relaxable(dev))
3038 return false;
3040 return true;
3044 * Return the required default domain type for a specific device.
3046  * @dev: the device in question
3049  * Returns:
3050  * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
3051  * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
3052 * - 0: both identity and dynamic domains work for this device
3054 static int device_def_domain_type(struct device *dev)
3056 if (dev_is_pci(dev)) {
3057 struct pci_dev *pdev = to_pci_dev(dev);
3060 * Prevent any device marked as untrusted from getting
3061  * placed into the static identity mapping domain.
3063 if (pdev->untrusted)
3064 return IOMMU_DOMAIN_DMA;
3066 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
3067 return IOMMU_DOMAIN_IDENTITY;
3069 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
3070 return IOMMU_DOMAIN_IDENTITY;
3073 * We want to start off with all devices in the 1:1 domain, and
3074 * take them out later if we find they can't access all of memory.
3076 * However, we can't do this for PCI devices behind bridges,
3077 * because all PCI devices behind the same bridge will end up
3078 * with the same source-id on their transactions.
3080 * Practically speaking, we can't change things around for these
3081 * devices at run-time, because we can't be sure there'll be no
3082 * DMA transactions in flight for any of their siblings.
3084 * So PCI devices (unless they're on the root bus) as well as
3085 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
3086 * the 1:1 domain, just in _case_ one of their siblings turns out
3087 * not to be able to map all of memory.
3089 if (!pci_is_pcie(pdev)) {
3090 if (!pci_is_root_bus(pdev->bus))
3091 return IOMMU_DOMAIN_DMA;
3092 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
3093 return IOMMU_DOMAIN_DMA;
3094 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
3095 return IOMMU_DOMAIN_DMA;
3098 return 0;
3101 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3104  * Start from a sane iommu hardware state.
3105  * If queued invalidation was already initialized by us
3106  * (for example, while enabling interrupt remapping) then
3107  * things are already rolling from a sane state.
3109 if (!iommu->qi) {
3111 * Clear any previous faults.
3113 dmar_fault(-1, iommu);
3115 * Disable queued invalidation if supported and already enabled
3116 * before OS handover.
3118 dmar_disable_qi(iommu);
3121 if (dmar_enable_qi(iommu)) {
3123  * Queued invalidation is not enabled; fall back to register-based invalidation
3125 iommu->flush.flush_context = __iommu_flush_context;
3126 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3127 pr_info("%s: Using Register based invalidation\n",
3128 iommu->name);
3129 } else {
3130 iommu->flush.flush_context = qi_flush_context;
3131 iommu->flush.flush_iotlb = qi_flush_iotlb;
3132 pr_info("%s: Using Queued invalidation\n", iommu->name);
3136 static int copy_context_table(struct intel_iommu *iommu,
3137 struct root_entry *old_re,
3138 struct context_entry **tbl,
3139 int bus, bool ext)
3141 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3142 struct context_entry *new_ce = NULL, ce;
3143 struct context_entry *old_ce = NULL;
3144 struct root_entry re;
3145 phys_addr_t old_ce_phys;
3147 tbl_idx = ext ? bus * 2 : bus;
3148 memcpy(&re, old_re, sizeof(re));
3150 for (devfn = 0; devfn < 256; devfn++) {
3151 /* First calculate the correct index */
3152 idx = (ext ? devfn * 2 : devfn) % 256;
3154 if (idx == 0) {
3155 /* First save what we may have and clean up */
3156 if (new_ce) {
3157 tbl[tbl_idx] = new_ce;
3158 __iommu_flush_cache(iommu, new_ce,
3159 VTD_PAGE_SIZE);
3160 pos = 1;
3163 if (old_ce)
3164 memunmap(old_ce);
3166 ret = 0;
3167 if (devfn < 0x80)
3168 old_ce_phys = root_entry_lctp(&re);
3169 else
3170 old_ce_phys = root_entry_uctp(&re);
3172 if (!old_ce_phys) {
3173 if (ext && devfn == 0) {
3174 /* No LCTP, try UCTP */
3175 devfn = 0x7f;
3176 continue;
3177 } else {
3178 goto out;
3182 ret = -ENOMEM;
3183 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3184 MEMREMAP_WB);
3185 if (!old_ce)
3186 goto out;
3188 new_ce = alloc_pgtable_page(iommu->node);
3189 if (!new_ce)
3190 goto out_unmap;
3192 ret = 0;
3195 /* Now copy the context entry */
3196 memcpy(&ce, old_ce + idx, sizeof(ce));
3198 if (!__context_present(&ce))
3199 continue;
3201 did = context_domain_id(&ce);
3202 if (did >= 0 && did < cap_ndoms(iommu->cap))
3203 set_bit(did, iommu->domain_ids);
3206 * We need a marker for copied context entries. This
3207 * marker needs to work for the old format as well as
3208 * for extended context entries.
3210 * Bit 67 of the context entry is used. In the old
3211 * format this bit is available to software, in the
3212 * extended format it is the PGE bit, but PGE is ignored
3213 * by HW if PASIDs are disabled (and thus still
3214 * available).
3216 * So disable PASIDs first and then mark the entry
3217 * copied. This means that we don't copy PASID
3218 * translations from the old kernel, but this is fine as
3219 * faults there are not fatal.
3221 context_clear_pasid_enable(&ce);
3222 context_set_copied(&ce);
3224 new_ce[idx] = ce;
3227 tbl[tbl_idx + pos] = new_ce;
3229 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3231 out_unmap:
3232 memunmap(old_ce);
3234 out:
3235 return ret;
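/*
 * Note on the indexing in copy_context_table() above: in extended
 * (ECS) mode each context entry is twice the legacy size and each bus
 * has both a lower and an upper context table, hence tbl_idx = bus * 2
 * and idx = (devfn * 2) % 256, with devfns of 0x80 and above reached
 * via the upper context-table pointer (UCTP).
 */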
3238 static int copy_translation_tables(struct intel_iommu *iommu)
3240 struct context_entry **ctxt_tbls;
3241 struct root_entry *old_rt;
3242 phys_addr_t old_rt_phys;
3243 int ctxt_table_entries;
3244 unsigned long flags;
3245 u64 rtaddr_reg;
3246 int bus, ret;
3247 bool new_ext, ext;
3249 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3250 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3251 new_ext = !!ecap_ecs(iommu->ecap);
3254 * The RTT bit can only be changed when translation is disabled,
3255  * but disabling translation would open a window for data
3256 * corruption. So bail out and don't copy anything if we would
3257 * have to change the bit.
3259 if (new_ext != ext)
3260 return -EINVAL;
3262 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3263 if (!old_rt_phys)
3264 return -EINVAL;
3266 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3267 if (!old_rt)
3268 return -ENOMEM;
3270 /* This is too big for the stack - allocate it from slab */
3271 ctxt_table_entries = ext ? 512 : 256;
3272 ret = -ENOMEM;
3273 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3274 if (!ctxt_tbls)
3275 goto out_unmap;
3277 for (bus = 0; bus < 256; bus++) {
3278 ret = copy_context_table(iommu, &old_rt[bus],
3279 ctxt_tbls, bus, ext);
3280 if (ret) {
3281 pr_err("%s: Failed to copy context table for bus %d\n",
3282 iommu->name, bus);
3283 continue;
3287 spin_lock_irqsave(&iommu->lock, flags);
3289 /* Context tables are copied, now write them to the root_entry table */
3290 for (bus = 0; bus < 256; bus++) {
3291 int idx = ext ? bus * 2 : bus;
3292 u64 val;
3294 if (ctxt_tbls[idx]) {
3295 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3296 iommu->root_entry[bus].lo = val;
3299 if (!ext || !ctxt_tbls[idx + 1])
3300 continue;
3302 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3303 iommu->root_entry[bus].hi = val;
3306 spin_unlock_irqrestore(&iommu->lock, flags);
3308 kfree(ctxt_tbls);
3310 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3312 ret = 0;
3314 out_unmap:
3315 memunmap(old_rt);
3317 return ret;
3320 static int __init init_dmars(void)
3322 struct dmar_drhd_unit *drhd;
3323 struct intel_iommu *iommu;
3324 int ret;
3327 * for each drhd
3328 * allocate root
3329 * initialize and program root entry to not present
3330 * endfor
3332 for_each_drhd_unit(drhd) {
3334  * No lock is needed, as this is only incremented in the
3335  * single-threaded kernel __init code path; all other accesses
3336  * are read-only.
3338 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3339 g_num_of_iommus++;
3340 continue;
3342 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3345 /* Preallocate enough resources for IOMMU hot-addition */
3346 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3347 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3349 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3350 GFP_KERNEL);
3351 if (!g_iommus) {
3352 pr_err("Allocating global iommu array failed\n");
3353 ret = -ENOMEM;
3354 goto error;
3357 for_each_iommu(iommu, drhd) {
3358 if (drhd->ignored) {
3359 iommu_disable_translation(iommu);
3360 continue;
3364  * Find the max pasid size of all IOMMUs in the system.
3365  * We need to ensure the system pasid table is no bigger
3366  * than the smallest supported size.
3368 if (pasid_supported(iommu)) {
3369 u32 temp = 2 << ecap_pss(iommu->ecap);
3371 intel_pasid_max_id = min_t(u32, temp,
3372 intel_pasid_max_id);
3375 g_iommus[iommu->seq_id] = iommu;
3377 intel_iommu_init_qi(iommu);
3379 ret = iommu_init_domains(iommu);
3380 if (ret)
3381 goto free_iommu;
3383 init_translation_status(iommu);
3385 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3386 iommu_disable_translation(iommu);
3387 clear_translation_pre_enabled(iommu);
3388 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3389 iommu->name);
3393 * TBD:
3394 * we could share the same root & context tables
3395  * among all IOMMUs. Need to split it later.
3397 ret = iommu_alloc_root_entry(iommu);
3398 if (ret)
3399 goto free_iommu;
3401 if (translation_pre_enabled(iommu)) {
3402 pr_info("Translation already enabled - trying to copy translation structures\n");
3404 ret = copy_translation_tables(iommu);
3405 if (ret) {
3407 * We found the IOMMU with translation
3408 * enabled - but failed to copy over the
3409 * old root-entry table. Try to proceed
3410 * by disabling translation now and
3411 * allocating a clean root-entry table.
3412 * This might cause DMAR faults, but
3413 * probably the dump will still succeed.
3415 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3416 iommu->name);
3417 iommu_disable_translation(iommu);
3418 clear_translation_pre_enabled(iommu);
3419 } else {
3420 pr_info("Copied translation tables from previous kernel for %s\n",
3421 iommu->name);
3425 if (!ecap_pass_through(iommu->ecap))
3426 hw_pass_through = 0;
3427 intel_svm_check(iommu);
3431 * Now that qi is enabled on all iommus, set the root entry and flush
3432 * caches. This is required on some Intel X58 chipsets, otherwise the
3433 * flush_context function will loop forever and the boot hangs.
3435 for_each_active_iommu(iommu, drhd) {
3436 iommu_flush_write_buffer(iommu);
3437 iommu_set_root_entry(iommu);
3438 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3439 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3442 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3443 dmar_map_gfx = 0;
3444 #endif
3446 if (!dmar_map_gfx)
3447 iommu_identity_mapping |= IDENTMAP_GFX;
3449 check_tylersburg_isoch();
3451 ret = si_domain_init(hw_pass_through);
3452 if (ret)
3453 goto free_iommu;
3456 * for each drhd
3457 * enable fault log
3458 * global invalidate context cache
3459 * global invalidate iotlb
3460 * enable translation
3462 for_each_iommu(iommu, drhd) {
3463 if (drhd->ignored) {
3465 * we always have to disable PMRs or DMA may fail on
3466 * this device
3468 if (force_on)
3469 iommu_disable_protect_mem_regions(iommu);
3470 continue;
3473 iommu_flush_write_buffer(iommu);
3475 #ifdef CONFIG_INTEL_IOMMU_SVM
3476 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3478  * Calling dmar_alloc_hwirq() with dmar_global_lock held
3479  * could cause a lock race, so drop the lock around it.
3481 up_write(&dmar_global_lock);
3482 ret = intel_svm_enable_prq(iommu);
3483 down_write(&dmar_global_lock);
3484 if (ret)
3485 goto free_iommu;
3487 #endif
3488 ret = dmar_set_interrupt(iommu);
3489 if (ret)
3490 goto free_iommu;
3493 return 0;
3495 free_iommu:
3496 for_each_active_iommu(iommu, drhd) {
3497 disable_dmar_iommu(iommu);
3498 free_dmar_iommu(iommu);
3501 kfree(g_iommus);
3503 error:
3504 return ret;
3507 /* This takes a number of _MM_ pages, not VTD pages */
3508 static unsigned long intel_alloc_iova(struct device *dev,
3509 struct dmar_domain *domain,
3510 unsigned long nrpages, uint64_t dma_mask)
3512 unsigned long iova_pfn;
3515 * Restrict dma_mask to the width that the iommu can handle.
3516 * First-level translation restricts the input-address to a
3517 * canonical address (i.e., address bits 63:N have the same
3518 * value as address bit [N-1], where N is 48-bits with 4-level
3519 * paging and 57-bits with 5-level paging). Hence, skip bit
3520 * [N-1].
3522 if (domain_use_first_level(domain))
3523 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3524 dma_mask);
3525 else
3526 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3527 dma_mask);
3529 /* Ensure we reserve the whole size-aligned region */
3530 nrpages = __roundup_pow_of_two(nrpages);
3532 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3534 * First try to allocate an io virtual address in
3535 * DMA_BIT_MASK(32) and if that fails then try allocating
3536  * from the higher range.
3538 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3539 IOVA_PFN(DMA_BIT_MASK(32)), false);
3540 if (iova_pfn)
3541 return iova_pfn;
3543 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3544 IOVA_PFN(dma_mask), true);
3545 if (unlikely(!iova_pfn)) {
3546 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3547 nrpages);
3548 return 0;
3551 return iova_pfn;
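/*
 * Example of the size-alignment rule above: a request for five MM
 * pages is rounded up to eight before calling alloc_iova_fast(), so
 * the reservation always covers a whole power-of-two-sized region.
 */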
3554 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3556 struct dmar_domain *domain, *tmp;
3557 struct dmar_rmrr_unit *rmrr;
3558 struct device *i_dev;
3559 int i, ret;
3561 /* Device shouldn't be attached to any domain. */
3562 domain = find_domain(dev);
3563 if (domain)
3564 return NULL;
3566 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3567 if (!domain)
3568 goto out;
3570 /* We have a new domain - setup possible RMRRs for the device */
3571 rcu_read_lock();
3572 for_each_rmrr_units(rmrr) {
3573 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3574 i, i_dev) {
3575 if (i_dev != dev)
3576 continue;
3578 ret = domain_prepare_identity_map(dev, domain,
3579 rmrr->base_address,
3580 rmrr->end_address);
3581 if (ret)
3582 dev_err(dev, "Mapping reserved region failed\n");
3585 rcu_read_unlock();
3587 tmp = set_domain_for_dev(dev, domain);
3588 if (!tmp || domain != tmp) {
3589 domain_exit(domain);
3590 domain = tmp;
3593 out:
3594 if (!domain)
3595 dev_err(dev, "Allocating domain failed\n");
3596 else
3597 domain->domain.type = IOMMU_DOMAIN_DMA;
3599 return domain;
3602 /* Check if the dev needs to go through the non-identity map and unmap process. */
3603 static bool iommu_need_mapping(struct device *dev)
3605 int ret;
3607 if (iommu_dummy(dev))
3608 return false;
3610 if (unlikely(attach_deferred(dev)))
3611 do_deferred_attach(dev);
3613 ret = identity_mapping(dev);
3614 if (ret) {
3615 u64 dma_mask = *dev->dma_mask;
3617 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3618 dma_mask = dev->coherent_dma_mask;
3620 if (dma_mask >= dma_direct_get_required_mask(dev))
3621 return false;
3624  * The device is removed from si_domain and falls back to
3625  * non-identity mapping for 32-bit DMA.
3627 dmar_remove_one_dev_info(dev);
3628 ret = iommu_request_dma_domain_for_dev(dev);
3629 if (ret) {
3630 struct iommu_domain *domain;
3631 struct dmar_domain *dmar_domain;
3633 domain = iommu_get_domain_for_dev(dev);
3634 if (domain) {
3635 dmar_domain = to_dmar_domain(domain);
3636 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3638 dmar_remove_one_dev_info(dev);
3639 get_private_domain_for_dev(dev);
3642 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3645 return true;
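/*
 * Summary of the decision above: an identity-mapped device keeps using
 * the direct path only while its DMA mask covers all of memory; once a
 * narrower mask is seen, the device is moved out of si_domain into a
 * dynamic DMA domain and all further DMA is translated.
 */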
3648 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3649 size_t size, int dir, u64 dma_mask)
3651 struct dmar_domain *domain;
3652 phys_addr_t start_paddr;
3653 unsigned long iova_pfn;
3654 int prot = 0;
3655 int ret;
3656 struct intel_iommu *iommu;
3657 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3659 BUG_ON(dir == DMA_NONE);
3661 domain = find_domain(dev);
3662 if (!domain)
3663 return DMA_MAPPING_ERROR;
3665 iommu = domain_get_iommu(domain);
3666 size = aligned_nrpages(paddr, size);
3668 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3669 if (!iova_pfn)
3670 goto error;
3673  * Check if DMAR supports zero-length reads on write-only
3674  * mappings.
3676 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3677 !cap_zlr(iommu->cap))
3678 prot |= DMA_PTE_READ;
3679 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3680 prot |= DMA_PTE_WRITE;
3682  * paddr - (paddr + size) might be a partial page, so we map the whole
3683  * page. Note: if two parts of one page are separately mapped, we
3684  * might have two guest addresses mapping to the same host paddr, but
3685  * this is not a big problem.
3687 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3688 mm_to_dma_pfn(paddr_pfn), size, prot);
3689 if (ret)
3690 goto error;
3692 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3693 start_paddr += paddr & ~PAGE_MASK;
3695 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3697 return start_paddr;
3699 error:
3700 if (iova_pfn)
3701 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3702 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3703 size, (unsigned long long)paddr, dir);
3704 return DMA_MAPPING_ERROR;
3707 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3708 unsigned long offset, size_t size,
3709 enum dma_data_direction dir,
3710 unsigned long attrs)
3712 if (iommu_need_mapping(dev))
3713 return __intel_map_single(dev, page_to_phys(page) + offset,
3714 size, dir, *dev->dma_mask);
3715 return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3718 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3719 size_t size, enum dma_data_direction dir,
3720 unsigned long attrs)
3722 if (iommu_need_mapping(dev))
3723 return __intel_map_single(dev, phys_addr, size, dir,
3724 *dev->dma_mask);
3725 return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3728 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3730 struct dmar_domain *domain;
3731 unsigned long start_pfn, last_pfn;
3732 unsigned long nrpages;
3733 unsigned long iova_pfn;
3734 struct intel_iommu *iommu;
3735 struct page *freelist;
3736 struct pci_dev *pdev = NULL;
3738 domain = find_domain(dev);
3739 BUG_ON(!domain);
3741 iommu = domain_get_iommu(domain);
3743 iova_pfn = IOVA_PFN(dev_addr);
3745 nrpages = aligned_nrpages(dev_addr, size);
3746 start_pfn = mm_to_dma_pfn(iova_pfn);
3747 last_pfn = start_pfn + nrpages - 1;
3749 if (dev_is_pci(dev))
3750 pdev = to_pci_dev(dev);
3752 freelist = domain_unmap(domain, start_pfn, last_pfn);
3753 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3754 !has_iova_flush_queue(&domain->iovad)) {
3755 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3756 nrpages, !freelist, 0);
3757 /* free iova */
3758 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3759 dma_free_pagelist(freelist);
3760 } else {
3761 queue_iova(&domain->iovad, iova_pfn, nrpages,
3762 (unsigned long)freelist);
3764  * queue up the release of the unmap to save the 1/6th of the
3765  * CPU time otherwise used up by the iotlb flush operation...
3769 trace_unmap_single(dev, dev_addr, size);
3772 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3773 size_t size, enum dma_data_direction dir,
3774 unsigned long attrs)
3776 if (iommu_need_mapping(dev))
3777 intel_unmap(dev, dev_addr, size);
3778 else
3779 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3782 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3783 size_t size, enum dma_data_direction dir, unsigned long attrs)
3785 if (iommu_need_mapping(dev))
3786 intel_unmap(dev, dev_addr, size);
3789 static void *intel_alloc_coherent(struct device *dev, size_t size,
3790 dma_addr_t *dma_handle, gfp_t flags,
3791 unsigned long attrs)
3793 struct page *page = NULL;
3794 int order;
3796 if (!iommu_need_mapping(dev))
3797 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3799 size = PAGE_ALIGN(size);
3800 order = get_order(size);
3802 if (gfpflags_allow_blocking(flags)) {
3803 unsigned int count = size >> PAGE_SHIFT;
3805 page = dma_alloc_from_contiguous(dev, count, order,
3806 flags & __GFP_NOWARN);
3809 if (!page)
3810 page = alloc_pages(flags, order);
3811 if (!page)
3812 return NULL;
3813 memset(page_address(page), 0, size);
3815 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3816 DMA_BIDIRECTIONAL,
3817 dev->coherent_dma_mask);
3818 if (*dma_handle != DMA_MAPPING_ERROR)
3819 return page_address(page);
3820 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3821 __free_pages(page, order);
3823 return NULL;
3826 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3827 dma_addr_t dma_handle, unsigned long attrs)
3829 int order;
3830 struct page *page = virt_to_page(vaddr);
3832 if (!iommu_need_mapping(dev))
3833 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3835 size = PAGE_ALIGN(size);
3836 order = get_order(size);
3838 intel_unmap(dev, dma_handle, size);
3839 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3840 __free_pages(page, order);
3843 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3844 int nelems, enum dma_data_direction dir,
3845 unsigned long attrs)
3847 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3848 unsigned long nrpages = 0;
3849 struct scatterlist *sg;
3850 int i;
3852 if (!iommu_need_mapping(dev))
3853 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3855 for_each_sg(sglist, sg, nelems, i) {
3856 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3859 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3861 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3864 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3865 enum dma_data_direction dir, unsigned long attrs)
3867 int i;
3868 struct dmar_domain *domain;
3869 size_t size = 0;
3870 int prot = 0;
3871 unsigned long iova_pfn;
3872 int ret;
3873 struct scatterlist *sg;
3874 unsigned long start_vpfn;
3875 struct intel_iommu *iommu;
3877 BUG_ON(dir == DMA_NONE);
3878 if (!iommu_need_mapping(dev))
3879 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3881 domain = find_domain(dev);
3882 if (!domain)
3883 return 0;
3885 iommu = domain_get_iommu(domain);
3887 for_each_sg(sglist, sg, nelems, i)
3888 size += aligned_nrpages(sg->offset, sg->length);
3890 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3891 *dev->dma_mask);
3892 if (!iova_pfn) {
3893 sglist->dma_length = 0;
3894 return 0;
3898  * Check if DMAR supports zero-length reads on write-only
3899  * mappings.
3901 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3902 !cap_zlr(iommu->cap))
3903 prot |= DMA_PTE_READ;
3904 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3905 prot |= DMA_PTE_WRITE;
3907 start_vpfn = mm_to_dma_pfn(iova_pfn);
3909 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3910 if (unlikely(ret)) {
3911 dma_pte_free_pagetable(domain, start_vpfn,
3912 start_vpfn + size - 1,
3913 agaw_to_level(domain->agaw) + 1);
3914 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3915 return 0;
3918 for_each_sg(sglist, sg, nelems, i)
3919 trace_map_sg(dev, i + 1, nelems, sg);
3921 return nelems;
3924 static u64 intel_get_required_mask(struct device *dev)
3926 if (!iommu_need_mapping(dev))
3927 return dma_direct_get_required_mask(dev);
3928 return DMA_BIT_MASK(32);
3931 static const struct dma_map_ops intel_dma_ops = {
3932 .alloc = intel_alloc_coherent,
3933 .free = intel_free_coherent,
3934 .map_sg = intel_map_sg,
3935 .unmap_sg = intel_unmap_sg,
3936 .map_page = intel_map_page,
3937 .unmap_page = intel_unmap_page,
3938 .map_resource = intel_map_resource,
3939 .unmap_resource = intel_unmap_resource,
3940 .dma_supported = dma_direct_supported,
3941 .mmap = dma_common_mmap,
3942 .get_sgtable = dma_common_get_sgtable,
3943 .get_required_mask = intel_get_required_mask,
3946 static void
3947 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3948 enum dma_data_direction dir, enum dma_sync_target target)
3950 struct dmar_domain *domain;
3951 phys_addr_t tlb_addr;
3953 domain = find_domain(dev);
3954 if (WARN_ON(!domain))
3955 return;
3957 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3958 if (is_swiotlb_buffer(tlb_addr))
3959 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3962 static dma_addr_t
3963 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3964 enum dma_data_direction dir, unsigned long attrs,
3965 u64 dma_mask)
3967 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3968 struct dmar_domain *domain;
3969 struct intel_iommu *iommu;
3970 unsigned long iova_pfn;
3971 unsigned long nrpages;
3972 phys_addr_t tlb_addr;
3973 int prot = 0;
3974 int ret;
3976 if (unlikely(attach_deferred(dev)))
3977 do_deferred_attach(dev);
3979 domain = find_domain(dev);
3981 if (WARN_ON(dir == DMA_NONE || !domain))
3982 return DMA_MAPPING_ERROR;
3984 iommu = domain_get_iommu(domain);
3985 if (WARN_ON(!iommu))
3986 return DMA_MAPPING_ERROR;
3988 nrpages = aligned_nrpages(0, size);
3989 iova_pfn = intel_alloc_iova(dev, domain,
3990 dma_to_mm_pfn(nrpages), dma_mask);
3991 if (!iova_pfn)
3992 return DMA_MAPPING_ERROR;
3995  * Check if DMAR supports zero-length reads on write-only
3996  * mappings.
3998 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3999 !cap_zlr(iommu->cap))
4000 prot |= DMA_PTE_READ;
4001 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
4002 prot |= DMA_PTE_WRITE;
4005 * If both the physical buffer start address and size are
4006 * page aligned, we don't need to use a bounce page.
4008 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
4009 tlb_addr = swiotlb_tbl_map_single(dev,
4010 __phys_to_dma(dev, io_tlb_start),
4011 paddr, size, aligned_size, dir, attrs);
4012 if (tlb_addr == DMA_MAPPING_ERROR) {
4013 goto swiotlb_error;
4014 } else {
4015 /* Cleanup the padding area. */
4016 void *padding_start = phys_to_virt(tlb_addr);
4017 size_t padding_size = aligned_size;
4019 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
4020 (dir == DMA_TO_DEVICE ||
4021 dir == DMA_BIDIRECTIONAL)) {
4022 padding_start += size;
4023 padding_size -= size;
4026 memset(padding_start, 0, padding_size);
4028 } else {
4029 tlb_addr = paddr;
4032 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
4033 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
4034 if (ret)
4035 goto mapping_error;
4037 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
4039 return (phys_addr_t)iova_pfn << PAGE_SHIFT;
4041 mapping_error:
4042 if (is_swiotlb_buffer(tlb_addr))
4043 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
4044 aligned_size, dir, attrs);
4045 swiotlb_error:
4046 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
4047 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
4048 size, (unsigned long long)paddr, dir);
4050 return DMA_MAPPING_ERROR;
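/*
 * Rationale for the bounce path above: only buffers whose start or
 * length is not VTD_PAGE_SIZE aligned go through swiotlb; fully
 * page-aligned buffers are mapped in place, so the bounce cost is paid
 * only where other data could share the mapped page with the device.
 */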
4053 static void
4054 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
4055 enum dma_data_direction dir, unsigned long attrs)
4057 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
4058 struct dmar_domain *domain;
4059 phys_addr_t tlb_addr;
4061 domain = find_domain(dev);
4062 if (WARN_ON(!domain))
4063 return;
4065 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
4066 if (WARN_ON(!tlb_addr))
4067 return;
4069 intel_unmap(dev, dev_addr, size);
4070 if (is_swiotlb_buffer(tlb_addr))
4071 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
4072 aligned_size, dir, attrs);
4074 trace_bounce_unmap_single(dev, dev_addr, size);
4077 static dma_addr_t
4078 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
4079 size_t size, enum dma_data_direction dir, unsigned long attrs)
4081 return bounce_map_single(dev, page_to_phys(page) + offset,
4082 size, dir, attrs, *dev->dma_mask);
4085 static dma_addr_t
4086 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
4087 enum dma_data_direction dir, unsigned long attrs)
4089 return bounce_map_single(dev, phys_addr, size,
4090 dir, attrs, *dev->dma_mask);
4093 static void
4094 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
4095 enum dma_data_direction dir, unsigned long attrs)
4097 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4100 static void
4101 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
4102 enum dma_data_direction dir, unsigned long attrs)
4104 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4107 static void
4108 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4109 enum dma_data_direction dir, unsigned long attrs)
4111 struct scatterlist *sg;
4112 int i;
4114 for_each_sg(sglist, sg, nelems, i)
4115 bounce_unmap_page(dev, sg->dma_address,
4116 sg_dma_len(sg), dir, attrs);
4119 static int
4120 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4121 enum dma_data_direction dir, unsigned long attrs)
4123 int i;
4124 struct scatterlist *sg;
4126 for_each_sg(sglist, sg, nelems, i) {
4127 sg->dma_address = bounce_map_page(dev, sg_page(sg),
4128 sg->offset, sg->length,
4129 dir, attrs);
4130 if (sg->dma_address == DMA_MAPPING_ERROR)
4131 goto out_unmap;
4132 sg_dma_len(sg) = sg->length;
4135 for_each_sg(sglist, sg, nelems, i)
4136 trace_bounce_map_sg(dev, i + 1, nelems, sg);
4138 return nelems;
4140 out_unmap:
4141 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
4142 return 0;
4145 static void
4146 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
4147 size_t size, enum dma_data_direction dir)
4149 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
4152 static void
4153 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4154 size_t size, enum dma_data_direction dir)
4156 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4159 static void
4160 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4161 int nelems, enum dma_data_direction dir)
4163 struct scatterlist *sg;
4164 int i;
4166 for_each_sg(sglist, sg, nelems, i)
4167 bounce_sync_single(dev, sg_dma_address(sg),
4168 sg_dma_len(sg), dir, SYNC_FOR_CPU);
4171 static void
4172 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4173 int nelems, enum dma_data_direction dir)
4175 struct scatterlist *sg;
4176 int i;
4178 for_each_sg(sglist, sg, nelems, i)
4179 bounce_sync_single(dev, sg_dma_address(sg),
4180 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4183 static const struct dma_map_ops bounce_dma_ops = {
4184 .alloc = intel_alloc_coherent,
4185 .free = intel_free_coherent,
4186 .map_sg = bounce_map_sg,
4187 .unmap_sg = bounce_unmap_sg,
4188 .map_page = bounce_map_page,
4189 .unmap_page = bounce_unmap_page,
4190 .sync_single_for_cpu = bounce_sync_single_for_cpu,
4191 .sync_single_for_device = bounce_sync_single_for_device,
4192 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
4193 .sync_sg_for_device = bounce_sync_sg_for_device,
4194 .map_resource = bounce_map_resource,
4195 .unmap_resource = bounce_unmap_resource,
4196 .dma_supported = dma_direct_supported,
4199 static inline int iommu_domain_cache_init(void)
4201 int ret = 0;
4203 iommu_domain_cache = kmem_cache_create("iommu_domain",
4204 sizeof(struct dmar_domain),
4206 SLAB_HWCACHE_ALIGN,
4208 NULL);
4209 if (!iommu_domain_cache) {
4210 pr_err("Couldn't create iommu_domain cache\n");
4211 ret = -ENOMEM;
4214 return ret;
4217 static inline int iommu_devinfo_cache_init(void)
4219 int ret = 0;
4221 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4222 sizeof(struct device_domain_info),
4224 SLAB_HWCACHE_ALIGN,
4225 NULL);
4226 if (!iommu_devinfo_cache) {
4227 pr_err("Couldn't create devinfo cache\n");
4228 ret = -ENOMEM;
4231 return ret;
4234 static int __init iommu_init_mempool(void)
4236 int ret;
4237 ret = iova_cache_get();
4238 if (ret)
4239 return ret;
4241 ret = iommu_domain_cache_init();
4242 if (ret)
4243 goto domain_error;
4245 ret = iommu_devinfo_cache_init();
4246 if (!ret)
4247 return ret;
4249 kmem_cache_destroy(iommu_domain_cache);
4250 domain_error:
4251 iova_cache_put();
4253 return -ENOMEM;
4256 static void __init iommu_exit_mempool(void)
4258 kmem_cache_destroy(iommu_devinfo_cache);
4259 kmem_cache_destroy(iommu_domain_cache);
4260 iova_cache_put();
4263 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4265 struct dmar_drhd_unit *drhd;
4266 u32 vtbar;
4267 int rc;
4269 /* We know that this device on this chipset has its own IOMMU.
4270 * If we find it under a different IOMMU, then the BIOS is lying
4271 * to us. Hope that the IOMMU for this device is actually
4272 * disabled, and it needs no translation...
4274 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4275 if (rc) {
4276 /* "can't" happen */
4277 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4278 return;
4280 vtbar &= 0xffff0000;
4282 /* we know that this IOMMU should be at offset 0xa000 from vtbar */
4283 drhd = dmar_find_matched_drhd_unit(pdev);
4284 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
4285 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
4286 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4287 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4290 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4292 static void __init init_no_remapping_devices(void)
4294 struct dmar_drhd_unit *drhd;
4295 struct device *dev;
4296 int i;
4298 for_each_drhd_unit(drhd) {
4299 if (!drhd->include_all) {
4300 for_each_active_dev_scope(drhd->devices,
4301 drhd->devices_cnt, i, dev)
4302 break;
4303 /* ignore DMAR unit if no devices exist */
4304 if (i == drhd->devices_cnt)
4305 drhd->ignored = 1;
4309 for_each_active_drhd_unit(drhd) {
4310 if (drhd->include_all)
4311 continue;
4313 for_each_active_dev_scope(drhd->devices,
4314 drhd->devices_cnt, i, dev)
4315 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4316 break;
4317 if (i < drhd->devices_cnt)
4318 continue;
4320 /* This IOMMU has *only* gfx devices. Either bypass it or
4321 set the gfx_mapped flag, as appropriate */
4322 if (!dmar_map_gfx) {
4323 drhd->ignored = 1;
4324 for_each_active_dev_scope(drhd->devices,
4325 drhd->devices_cnt, i, dev)
4326 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4331 #ifdef CONFIG_SUSPEND
4332 static int init_iommu_hw(void)
4334 struct dmar_drhd_unit *drhd;
4335 struct intel_iommu *iommu = NULL;
4337 for_each_active_iommu(iommu, drhd)
4338 if (iommu->qi)
4339 dmar_reenable_qi(iommu);
4341 for_each_iommu(iommu, drhd) {
4342 if (drhd->ignored) {
4344 * we always have to disable PMRs or DMA may fail on
4345 * this device
4347 if (force_on)
4348 iommu_disable_protect_mem_regions(iommu);
4349 continue;
4352 iommu_flush_write_buffer(iommu);
4354 iommu_set_root_entry(iommu);
4356 iommu->flush.flush_context(iommu, 0, 0, 0,
4357 DMA_CCMD_GLOBAL_INVL);
4358 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4359 iommu_enable_translation(iommu);
4360 iommu_disable_protect_mem_regions(iommu);
4363 return 0;
4366 static void iommu_flush_all(void)
4368 struct dmar_drhd_unit *drhd;
4369 struct intel_iommu *iommu;
4371 for_each_active_iommu(iommu, drhd) {
4372 iommu->flush.flush_context(iommu, 0, 0, 0,
4373 DMA_CCMD_GLOBAL_INVL);
4374 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4375 DMA_TLB_GLOBAL_FLUSH);
4379 static int iommu_suspend(void)
4381 struct dmar_drhd_unit *drhd;
4382 struct intel_iommu *iommu = NULL;
4383 unsigned long flag;
4385 for_each_active_iommu(iommu, drhd) {
4386 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4387 GFP_ATOMIC);
4388 if (!iommu->iommu_state)
4389 goto nomem;
4392 iommu_flush_all();
4394 for_each_active_iommu(iommu, drhd) {
4395 iommu_disable_translation(iommu);
4397 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4399 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4400 readl(iommu->reg + DMAR_FECTL_REG);
4401 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4402 readl(iommu->reg + DMAR_FEDATA_REG);
4403 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4404 readl(iommu->reg + DMAR_FEADDR_REG);
4405 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4406 readl(iommu->reg + DMAR_FEUADDR_REG);
4408 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4410 return 0;
4412 nomem:
4413 for_each_active_iommu(iommu, drhd)
4414 kfree(iommu->iommu_state);
4416 return -ENOMEM;
4419 static void iommu_resume(void)
4421 struct dmar_drhd_unit *drhd;
4422 struct intel_iommu *iommu = NULL;
4423 unsigned long flag;
4425 if (init_iommu_hw()) {
4426 if (force_on)
4427 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4428 else
4429 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4430 return;
4433 for_each_active_iommu(iommu, drhd) {
4435 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4437 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4438 iommu->reg + DMAR_FECTL_REG);
4439 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4440 iommu->reg + DMAR_FEDATA_REG);
4441 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4442 iommu->reg + DMAR_FEADDR_REG);
4443 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4444 iommu->reg + DMAR_FEUADDR_REG);
4446 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4449 for_each_active_iommu(iommu, drhd)
4450 kfree(iommu->iommu_state);
4453 static struct syscore_ops iommu_syscore_ops = {
4454 .resume = iommu_resume,
4455 .suspend = iommu_suspend,
4458 static void __init init_iommu_pm_ops(void)
4460 register_syscore_ops(&iommu_syscore_ops);
4463 #else
4464 static inline void init_iommu_pm_ops(void) {}
4465 #endif /* CONFIG_SUSPEND */
4467 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4469 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4470 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4471 rmrr->end_address <= rmrr->base_address ||
4472 arch_rmrr_sanity_check(rmrr))
4473 return -EINVAL;
4475 return 0;
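/*
 * Illustrative sketch only (not part of the driver): a hypothetical RMRR
 * descriptor that passes the alignment and ordering checks above -- the
 * base and (end + 1) are both 4KiB aligned and the range is non-empty.
 * On x86, arch_rmrr_sanity_check() additionally requires the range to be
 * covered by the platform's reserved (E820) memory map.
 */
static const struct acpi_dmar_reserved_memory example_rmrr __maybe_unused = {
	.base_address	= 0x7a000000ULL,	/* hypothetical 2MiB region */
	.end_address	= 0x7a1fffffULL,	/* inclusive end address */
};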
4478 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4480 struct acpi_dmar_reserved_memory *rmrr;
4481 struct dmar_rmrr_unit *rmrru;
4483 rmrr = (struct acpi_dmar_reserved_memory *)header;
4484 if (rmrr_sanity_check(rmrr)) {
4485 pr_warn(FW_BUG
4486 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4487 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4488 rmrr->base_address, rmrr->end_address,
4489 dmi_get_system_info(DMI_BIOS_VENDOR),
4490 dmi_get_system_info(DMI_BIOS_VERSION),
4491 dmi_get_system_info(DMI_PRODUCT_VERSION));
4492 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4495 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4496 if (!rmrru)
4497 goto out;
4499 rmrru->hdr = header;
4501 rmrru->base_address = rmrr->base_address;
4502 rmrru->end_address = rmrr->end_address;
4504 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4505 ((void *)rmrr) + rmrr->header.length,
4506 &rmrru->devices_cnt);
4507 if (rmrru->devices_cnt && rmrru->devices == NULL)
4508 goto free_rmrru;
4510 list_add(&rmrru->list, &dmar_rmrr_units);
4512 return 0;
4513 free_rmrru:
4514 kfree(rmrru);
4515 out:
4516 return -ENOMEM;
4519 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4521 struct dmar_atsr_unit *atsru;
4522 struct acpi_dmar_atsr *tmp;
4524 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4525 dmar_rcu_check()) {
4526 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4527 if (atsr->segment != tmp->segment)
4528 continue;
4529 if (atsr->header.length != tmp->header.length)
4530 continue;
4531 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4532 return atsru;
4535 return NULL;
4538 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4540 struct acpi_dmar_atsr *atsr;
4541 struct dmar_atsr_unit *atsru;
4543 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4544 return 0;
4546 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4547 atsru = dmar_find_atsr(atsr);
4548 if (atsru)
4549 return 0;
4551 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4552 if (!atsru)
4553 return -ENOMEM;
4556 * If memory is allocated from slab by ACPI _DSM method, we need to
4557 * copy the memory content because the memory buffer will be freed
4558 * on return.
4560 atsru->hdr = (void *)(atsru + 1);
4561 memcpy(atsru->hdr, hdr, hdr->length);
4562 atsru->include_all = atsr->flags & 0x1;
4563 if (!atsru->include_all) {
4564 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4565 (void *)atsr + atsr->header.length,
4566 &atsru->devices_cnt);
4567 if (atsru->devices_cnt && atsru->devices == NULL) {
4568 kfree(atsru);
4569 return -ENOMEM;
4573 list_add_rcu(&atsru->list, &dmar_atsr_units);
4575 return 0;
4578 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4580 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4581 kfree(atsru);
4584 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4586 struct acpi_dmar_atsr *atsr;
4587 struct dmar_atsr_unit *atsru;
4589 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4590 atsru = dmar_find_atsr(atsr);
4591 if (atsru) {
4592 list_del_rcu(&atsru->list);
4593 synchronize_rcu();
4594 intel_iommu_free_atsr(atsru);
4597 return 0;
4600 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4602 int i;
4603 struct device *dev;
4604 struct acpi_dmar_atsr *atsr;
4605 struct dmar_atsr_unit *atsru;
4607 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4608 atsru = dmar_find_atsr(atsr);
4609 if (!atsru)
4610 return 0;
4612 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4613 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4614 i, dev)
4615 return -EBUSY;
4618 return 0;
4621 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4623 int sp, ret;
4624 struct intel_iommu *iommu = dmaru->iommu;
4626 if (g_iommus[iommu->seq_id])
4627 return 0;
4629 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4630 pr_warn("%s: Doesn't support hardware pass through.\n",
4631 iommu->name);
4632 return -ENXIO;
4634 if (!ecap_sc_support(iommu->ecap) &&
4635 domain_update_iommu_snooping(iommu)) {
4636 pr_warn("%s: Doesn't support snooping.\n",
4637 iommu->name);
4638 return -ENXIO;
4640 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4641 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4642 pr_warn("%s: Doesn't support large page.\n",
4643 iommu->name);
4644 return -ENXIO;
4648 * Disable translation if already enabled prior to OS handover.
4650 if (iommu->gcmd & DMA_GCMD_TE)
4651 iommu_disable_translation(iommu);
4653 g_iommus[iommu->seq_id] = iommu;
4654 ret = iommu_init_domains(iommu);
4655 if (ret == 0)
4656 ret = iommu_alloc_root_entry(iommu);
4657 if (ret)
4658 goto out;
4660 intel_svm_check(iommu);
4662 if (dmaru->ignored) {
4664 * we always have to disable PMRs or DMA may fail on this device
4666 if (force_on)
4667 iommu_disable_protect_mem_regions(iommu);
4668 return 0;
4671 intel_iommu_init_qi(iommu);
4672 iommu_flush_write_buffer(iommu);
4674 #ifdef CONFIG_INTEL_IOMMU_SVM
4675 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4676 ret = intel_svm_enable_prq(iommu);
4677 if (ret)
4678 goto disable_iommu;
4680 #endif
4681 ret = dmar_set_interrupt(iommu);
4682 if (ret)
4683 goto disable_iommu;
4685 iommu_set_root_entry(iommu);
4686 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4687 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4688 iommu_enable_translation(iommu);
4690 iommu_disable_protect_mem_regions(iommu);
4691 return 0;
4693 disable_iommu:
4694 disable_dmar_iommu(iommu);
4695 out:
4696 free_dmar_iommu(iommu);
4697 return ret;
4700 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4702 int ret = 0;
4703 struct intel_iommu *iommu = dmaru->iommu;
4705 if (!intel_iommu_enabled)
4706 return 0;
4707 if (iommu == NULL)
4708 return -EINVAL;
4710 if (insert) {
4711 ret = intel_iommu_add(dmaru);
4712 } else {
4713 disable_dmar_iommu(iommu);
4714 free_dmar_iommu(iommu);
4717 return ret;
4720 static void intel_iommu_free_dmars(void)
4722 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4723 struct dmar_atsr_unit *atsru, *atsr_n;
4725 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4726 list_del(&rmrru->list);
4727 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4728 kfree(rmrru);
4731 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4732 list_del(&atsru->list);
4733 intel_iommu_free_atsr(atsru);
4737 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4739 int i, ret = 1;
4740 struct pci_bus *bus;
4741 struct pci_dev *bridge = NULL;
4742 struct device *tmp;
4743 struct acpi_dmar_atsr *atsr;
4744 struct dmar_atsr_unit *atsru;
4746 dev = pci_physfn(dev);
4747 for (bus = dev->bus; bus; bus = bus->parent) {
4748 bridge = bus->self;
4749 /* If it's an integrated device, allow ATS */
4750 if (!bridge)
4751 return 1;
4752 /* Connected via non-PCIe: no ATS */
4753 if (!pci_is_pcie(bridge) ||
4754 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4755 return 0;
4756 /* If we found the root port, look it up in the ATSR */
4757 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4758 break;
4761 rcu_read_lock();
4762 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4763 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4764 if (atsr->segment != pci_domain_nr(dev->bus))
4765 continue;
4767 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4768 if (tmp == &bridge->dev)
4769 goto out;
4771 if (atsru->include_all)
4772 goto out;
4774 ret = 0;
4775 out:
4776 rcu_read_unlock();
4778 return ret;
4781 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4783 int ret;
4784 struct dmar_rmrr_unit *rmrru;
4785 struct dmar_atsr_unit *atsru;
4786 struct acpi_dmar_atsr *atsr;
4787 struct acpi_dmar_reserved_memory *rmrr;
4789 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4790 return 0;
4792 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4793 rmrr = container_of(rmrru->hdr,
4794 struct acpi_dmar_reserved_memory, header);
4795 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4796 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4797 ((void *)rmrr) + rmrr->header.length,
4798 rmrr->segment, rmrru->devices,
4799 rmrru->devices_cnt);
4800 if (ret < 0)
4801 return ret;
4802 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4803 dmar_remove_dev_scope(info, rmrr->segment,
4804 rmrru->devices, rmrru->devices_cnt);
4808 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4809 if (atsru->include_all)
4810 continue;
4812 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4813 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4814 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4815 (void *)atsr + atsr->header.length,
4816 atsr->segment, atsru->devices,
4817 atsru->devices_cnt);
4818 if (ret > 0)
4819 break;
4820 else if (ret < 0)
4821 return ret;
4822 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4823 if (dmar_remove_dev_scope(info, atsr->segment,
4824 atsru->devices, atsru->devices_cnt))
4825 break;
4829 return 0;
4832 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4833 unsigned long val, void *v)
4835 struct memory_notify *mhp = v;
4836 unsigned long long start, end;
4837 unsigned long start_vpfn, last_vpfn;
4839 switch (val) {
4840 case MEM_GOING_ONLINE:
4841 start = mhp->start_pfn << PAGE_SHIFT;
4842 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4843 if (iommu_domain_identity_map(si_domain, start, end)) {
4844 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4845 start, end);
4846 return NOTIFY_BAD;
4848 break;
4850 case MEM_OFFLINE:
4851 case MEM_CANCEL_ONLINE:
4852 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4853 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4854 while (start_vpfn <= last_vpfn) {
4855 struct iova *iova;
4856 struct dmar_drhd_unit *drhd;
4857 struct intel_iommu *iommu;
4858 struct page *freelist;
4860 iova = find_iova(&si_domain->iovad, start_vpfn);
4861 if (iova == NULL) {
4862 pr_debug("Failed get IOVA for PFN %lx\n",
4863 start_vpfn);
4864 break;
4867 iova = split_and_remove_iova(&si_domain->iovad, iova,
4868 start_vpfn, last_vpfn);
4869 if (iova == NULL) {
4870 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4871 start_vpfn, last_vpfn);
4872 return NOTIFY_BAD;
4875 freelist = domain_unmap(si_domain, iova->pfn_lo,
4876 iova->pfn_hi);
4878 rcu_read_lock();
4879 for_each_active_iommu(iommu, drhd)
4880 iommu_flush_iotlb_psi(iommu, si_domain,
4881 iova->pfn_lo, iova_size(iova),
4882 !freelist, 0);
4883 rcu_read_unlock();
4884 dma_free_pagelist(freelist);
4886 start_vpfn = iova->pfn_hi + 1;
4887 free_iova_mem(iova);
4889 break;
4892 return NOTIFY_OK;
4895 static struct notifier_block intel_iommu_memory_nb = {
4896 .notifier_call = intel_iommu_memory_notifier,
4897 .priority = 0
4900 static void free_all_cpu_cached_iovas(unsigned int cpu)
4902 int i;
4904 for (i = 0; i < g_num_of_iommus; i++) {
4905 struct intel_iommu *iommu = g_iommus[i];
4906 struct dmar_domain *domain;
4907 int did;
4909 if (!iommu)
4910 continue;
4912 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4913 domain = get_iommu_domain(iommu, (u16)did);
4915 if (!domain)
4916 continue;
4917 free_cpu_cached_iovas(cpu, &domain->iovad);
4922 static int intel_iommu_cpu_dead(unsigned int cpu)
4924 free_all_cpu_cached_iovas(cpu);
4925 return 0;
4928 static void intel_disable_iommus(void)
4930 struct intel_iommu *iommu = NULL;
4931 struct dmar_drhd_unit *drhd;
4933 for_each_iommu(iommu, drhd)
4934 iommu_disable_translation(iommu);
4937 void intel_iommu_shutdown(void)
4939 struct dmar_drhd_unit *drhd;
4940 struct intel_iommu *iommu = NULL;
4942 if (no_iommu || dmar_disabled)
4943 return;
4945 down_write(&dmar_global_lock);
4947 /* Disable PMRs explicitly here. */
4948 for_each_iommu(iommu, drhd)
4949 iommu_disable_protect_mem_regions(iommu);
4951 /* Make sure the IOMMUs are switched off */
4952 intel_disable_iommus();
4954 up_write(&dmar_global_lock);
4957 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4959 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4961 return container_of(iommu_dev, struct intel_iommu, iommu);
4964 static ssize_t intel_iommu_show_version(struct device *dev,
4965 struct device_attribute *attr,
4966 char *buf)
4968 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4969 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4970 return sprintf(buf, "%d:%d\n",
4971 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4973 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4975 static ssize_t intel_iommu_show_address(struct device *dev,
4976 struct device_attribute *attr,
4977 char *buf)
4979 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4980 return sprintf(buf, "%llx\n", iommu->reg_phys);
4982 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4984 static ssize_t intel_iommu_show_cap(struct device *dev,
4985 struct device_attribute *attr,
4986 char *buf)
4988 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4989 return sprintf(buf, "%llx\n", iommu->cap);
4991 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4993 static ssize_t intel_iommu_show_ecap(struct device *dev,
4994 struct device_attribute *attr,
4995 char *buf)
4997 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4998 return sprintf(buf, "%llx\n", iommu->ecap);
5000 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
5002 static ssize_t intel_iommu_show_ndoms(struct device *dev,
5003 struct device_attribute *attr,
5004 char *buf)
5006 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
5007 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
5009 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
5011 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
5012 struct device_attribute *attr,
5013 char *buf)
5015 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
5016 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
5017 cap_ndoms(iommu->cap)));
5019 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
5021 static struct attribute *intel_iommu_attrs[] = {
5022 &dev_attr_version.attr,
5023 &dev_attr_address.attr,
5024 &dev_attr_cap.attr,
5025 &dev_attr_ecap.attr,
5026 &dev_attr_domains_supported.attr,
5027 &dev_attr_domains_used.attr,
5028 NULL,
5031 static struct attribute_group intel_iommu_group = {
5032 .name = "intel-iommu",
5033 .attrs = intel_iommu_attrs,
5036 const struct attribute_group *intel_iommu_groups[] = {
5037 &intel_iommu_group,
5038 NULL,
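/*
 * Note: these read-only attributes are registered per remapping unit via
 * iommu_device_sysfs_add() in intel_iommu_init() below, so they are
 * expected to show up as
 * /sys/class/iommu/dmar<N>/intel-iommu/{version,address,cap,ecap,
 * domains_supported,domains_used}.
 */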
5041 static inline bool has_untrusted_dev(void)
5043 struct pci_dev *pdev = NULL;
5045 for_each_pci_dev(pdev)
5046 if (pdev->untrusted)
5047 return true;
5049 return false;
5052 static int __init platform_optin_force_iommu(void)
5054 if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
5055 return 0;
5057 if (no_iommu || dmar_disabled)
5058 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
5061 * If Intel-IOMMU is disabled by default, we will apply identity
5062 * map for all devices except those marked as being untrusted.
5064 if (dmar_disabled)
5065 iommu_set_default_passthrough(false);
5067 dmar_disabled = 0;
5068 no_iommu = 0;
5070 return 1;
5073 static int __init probe_acpi_namespace_devices(void)
5075 struct dmar_drhd_unit *drhd;
5076 /* To avoid a -Wunused-but-set-variable warning. */
5077 struct intel_iommu *iommu __maybe_unused;
5078 struct device *dev;
5079 int i, ret = 0;
5081 for_each_active_iommu(iommu, drhd) {
5082 for_each_active_dev_scope(drhd->devices,
5083 drhd->devices_cnt, i, dev) {
5084 struct acpi_device_physical_node *pn;
5085 struct iommu_group *group;
5086 struct acpi_device *adev;
5088 if (dev->bus != &acpi_bus_type)
5089 continue;
5091 adev = to_acpi_device(dev);
5092 mutex_lock(&adev->physical_node_lock);
5093 list_for_each_entry(pn,
5094 &adev->physical_node_list, node) {
5095 group = iommu_group_get(pn->dev);
5096 if (group) {
5097 iommu_group_put(group);
5098 continue;
5101 pn->dev->bus->iommu_ops = &intel_iommu_ops;
5102 ret = iommu_probe_device(pn->dev);
5103 if (ret)
5104 break;
5106 mutex_unlock(&adev->physical_node_lock);
5108 if (ret)
5109 return ret;
5113 return 0;
5116 int __init intel_iommu_init(void)
5118 int ret = -ENODEV;
5119 struct dmar_drhd_unit *drhd;
5120 struct intel_iommu *iommu;
5123 * Intel IOMMU is required for a TXT/tboot launch or platform
5124 * opt in, so enforce that.
5126 force_on = tboot_force_iommu() || platform_optin_force_iommu();
5128 if (iommu_init_mempool()) {
5129 if (force_on)
5130 panic("tboot: Failed to initialize iommu memory\n");
5131 return -ENOMEM;
5134 down_write(&dmar_global_lock);
5135 if (dmar_table_init()) {
5136 if (force_on)
5137 panic("tboot: Failed to initialize DMAR table\n");
5138 goto out_free_dmar;
5141 if (dmar_dev_scope_init() < 0) {
5142 if (force_on)
5143 panic("tboot: Failed to initialize DMAR device scope\n");
5144 goto out_free_dmar;
5147 up_write(&dmar_global_lock);
5150 * The bus notifier takes the dmar_global_lock, so lockdep will
5151 * complain later when we register it under the lock.
5153 dmar_register_bus_notifier();
5155 down_write(&dmar_global_lock);
5157 if (!no_iommu)
5158 intel_iommu_debugfs_init();
5160 if (no_iommu || dmar_disabled) {
5162 * We exit the function here to ensure IOMMU's remapping and
5163 * mempool aren't set up, which means that the IOMMU's PMRs
5164 * won't be disabled via the call to init_dmars(). So disable
5165 * them explicitly here. The PMRs were set up by tboot prior to
5166 * calling SENTER, but the kernel is expected to reset/tear
5167 * down the PMRs.
5169 if (intel_iommu_tboot_noforce) {
5170 for_each_iommu(iommu, drhd)
5171 iommu_disable_protect_mem_regions(iommu);
5175 * Make sure the IOMMUs are switched off, even when we
5176 * boot into a kexec kernel and the previous kernel left
5177 * them enabled
5179 intel_disable_iommus();
5180 goto out_free_dmar;
5183 if (list_empty(&dmar_rmrr_units))
5184 pr_info("No RMRR found\n");
5186 if (list_empty(&dmar_atsr_units))
5187 pr_info("No ATSR found\n");
5189 if (dmar_init_reserved_ranges()) {
5190 if (force_on)
5191 panic("tboot: Failed to reserve iommu ranges\n");
5192 goto out_free_reserved_range;
5195 if (dmar_map_gfx)
5196 intel_iommu_gfx_mapped = 1;
5198 init_no_remapping_devices();
5200 ret = init_dmars();
5201 if (ret) {
5202 if (force_on)
5203 panic("tboot: Failed to initialize DMARs\n");
5204 pr_err("Initialization failed\n");
5205 goto out_free_reserved_range;
5207 up_write(&dmar_global_lock);
5209 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
5211 * If the system has no untrusted device or the user has decided
5212 * to disable the bounce page mechanisms, we don't need swiotlb.
5213 * Mark this and the pre-allocated bounce pages will be released
5214 * later.
5216 if (!has_untrusted_dev() || intel_no_bounce)
5217 swiotlb = 0;
5218 #endif
5219 dma_ops = &intel_dma_ops;
5221 init_iommu_pm_ops();
5223 down_read(&dmar_global_lock);
5224 for_each_active_iommu(iommu, drhd) {
5225 iommu_device_sysfs_add(&iommu->iommu, NULL,
5226 intel_iommu_groups,
5227 "%s", iommu->name);
5228 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5229 iommu_device_register(&iommu->iommu);
5231 up_read(&dmar_global_lock);
5233 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5234 if (si_domain && !hw_pass_through)
5235 register_memory_notifier(&intel_iommu_memory_nb);
5236 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5237 intel_iommu_cpu_dead);
5239 down_read(&dmar_global_lock);
5240 if (probe_acpi_namespace_devices())
5241 pr_warn("ACPI name space devices didn't probe correctly\n");
5243 /* Finally, we enable the DMA remapping hardware. */
5244 for_each_iommu(iommu, drhd) {
5245 if (!drhd->ignored && !translation_pre_enabled(iommu))
5246 iommu_enable_translation(iommu);
5248 iommu_disable_protect_mem_regions(iommu);
5250 up_read(&dmar_global_lock);
5252 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5254 intel_iommu_enabled = 1;
5256 return 0;
5258 out_free_reserved_range:
5259 put_iova_domain(&reserved_iova_list);
5260 out_free_dmar:
5261 intel_iommu_free_dmars();
5262 up_write(&dmar_global_lock);
5263 iommu_exit_mempool();
5264 return ret;
5267 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5269 struct intel_iommu *iommu = opaque;
5271 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5272 return 0;
5276 * NB - intel-iommu lacks any sort of reference counting for the users of
5277 * dependent devices. If multiple endpoints have intersecting dependent
5278 * devices, unbinding the driver from any one of them will possibly leave
5279 * the others unable to operate.
5281 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5283 if (!iommu || !dev || !dev_is_pci(dev))
5284 return;
5286 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5289 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5291 struct dmar_domain *domain;
5292 struct intel_iommu *iommu;
5293 unsigned long flags;
5295 assert_spin_locked(&device_domain_lock);
5297 if (WARN_ON(!info))
5298 return;
5300 iommu = info->iommu;
5301 domain = info->domain;
5303 if (info->dev) {
5304 if (dev_is_pci(info->dev) && sm_supported(iommu))
5305 intel_pasid_tear_down_entry(iommu, info->dev,
5306 PASID_RID2PASID);
5308 iommu_disable_dev_iotlb(info);
5309 if (!dev_is_real_dma_subdevice(info->dev))
5310 domain_context_clear(iommu, info->dev);
5311 intel_pasid_free_table(info->dev);
5314 unlink_domain_info(info);
5316 spin_lock_irqsave(&iommu->lock, flags);
5317 domain_detach_iommu(domain, iommu);
5318 spin_unlock_irqrestore(&iommu->lock, flags);
5320 /* free the private domain */
5321 if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
5322 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
5323 list_empty(&domain->devices))
5324 domain_exit(info->domain);
5326 free_devinfo_mem(info);
5329 static void dmar_remove_one_dev_info(struct device *dev)
5331 struct device_domain_info *info;
5332 unsigned long flags;
5334 spin_lock_irqsave(&device_domain_lock, flags);
5335 info = dev->archdata.iommu;
5336 if (info && info != DEFER_DEVICE_DOMAIN_INFO
5337 && info != DUMMY_DEVICE_DOMAIN_INFO)
5338 __dmar_remove_one_dev_info(info);
5339 spin_unlock_irqrestore(&device_domain_lock, flags);
5342 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5344 int adjust_width;
5346 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5347 domain_reserve_special_ranges(domain);
5349 /* calculate AGAW */
5350 domain->gaw = guest_width;
5351 adjust_width = guestwidth_to_adjustwidth(guest_width);
5352 domain->agaw = width_to_agaw(adjust_width);
5354 domain->iommu_coherency = 0;
5355 domain->iommu_snooping = 0;
5356 domain->iommu_superpage = 0;
5357 domain->max_addr = 0;
5359 /* always allocate the top pgd */
5360 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5361 if (!domain->pgd)
5362 return -ENOMEM;
5363 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5364 return 0;
5367 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5369 struct dmar_domain *dmar_domain;
5370 struct iommu_domain *domain;
5371 int ret;
5373 switch (type) {
5374 case IOMMU_DOMAIN_DMA:
5375 /* fallthrough */
5376 case IOMMU_DOMAIN_UNMANAGED:
5377 dmar_domain = alloc_domain(0);
5378 if (!dmar_domain) {
5379 pr_err("Can't allocate dmar_domain\n");
5380 return NULL;
5382 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5383 pr_err("Domain initialization failed\n");
5384 domain_exit(dmar_domain);
5385 return NULL;
5388 if (!intel_iommu_strict && type == IOMMU_DOMAIN_DMA) {
5389 ret = init_iova_flush_queue(&dmar_domain->iovad,
5390 iommu_flush_iova,
5391 iova_entry_free);
5392 if (ret)
5393 pr_info("iova flush queue initialization failed\n");
5396 domain_update_iommu_cap(dmar_domain);
5398 domain = &dmar_domain->domain;
5399 domain->geometry.aperture_start = 0;
5400 domain->geometry.aperture_end =
5401 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5402 domain->geometry.force_aperture = true;
5404 return domain;
5405 case IOMMU_DOMAIN_IDENTITY:
5406 return &si_domain->domain;
5407 default:
5408 return NULL;
5411 return NULL;
5414 static void intel_iommu_domain_free(struct iommu_domain *domain)
5416 if (domain != &si_domain->domain)
5417 domain_exit(to_dmar_domain(domain));
5421 * Check whether a @domain could be attached to the @dev through the
5422 * aux-domain attach/detach APIs.
5424 static inline bool
5425 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5427 struct device_domain_info *info = dev->archdata.iommu;
5429 return info && info->auxd_enabled &&
5430 domain->type == IOMMU_DOMAIN_UNMANAGED;
5433 static void auxiliary_link_device(struct dmar_domain *domain,
5434 struct device *dev)
5436 struct device_domain_info *info = dev->archdata.iommu;
5438 assert_spin_locked(&device_domain_lock);
5439 if (WARN_ON(!info))
5440 return;
5442 domain->auxd_refcnt++;
5443 list_add(&domain->auxd, &info->auxiliary_domains);
5446 static void auxiliary_unlink_device(struct dmar_domain *domain,
5447 struct device *dev)
5449 struct device_domain_info *info = dev->archdata.iommu;
5451 assert_spin_locked(&device_domain_lock);
5452 if (WARN_ON(!info))
5453 return;
5455 list_del(&domain->auxd);
5456 domain->auxd_refcnt--;
5458 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5459 ioasid_free(domain->default_pasid);
5462 static int aux_domain_add_dev(struct dmar_domain *domain,
5463 struct device *dev)
5465 int ret;
5466 u8 bus, devfn;
5467 unsigned long flags;
5468 struct intel_iommu *iommu;
5470 iommu = device_to_iommu(dev, &bus, &devfn);
5471 if (!iommu)
5472 return -ENODEV;
5474 if (domain->default_pasid <= 0) {
5475 int pasid;
5477 /* No private data needed for the default pasid */
5478 pasid = ioasid_alloc(NULL, PASID_MIN,
5479 pci_max_pasids(to_pci_dev(dev)) - 1,
5480 NULL);
5481 if (pasid == INVALID_IOASID) {
5482 pr_err("Can't allocate default pasid\n");
5483 return -ENODEV;
5485 domain->default_pasid = pasid;
5488 spin_lock_irqsave(&device_domain_lock, flags);
5490 * iommu->lock must be held to attach domain to iommu and setup the
5491 * pasid entry for second level translation.
5493 spin_lock(&iommu->lock);
5494 ret = domain_attach_iommu(domain, iommu);
5495 if (ret)
5496 goto attach_failed;
5498 /* Setup the PASID entry for mediated devices: */
5499 if (domain_use_first_level(domain))
5500 ret = domain_setup_first_level(iommu, domain, dev,
5501 domain->default_pasid);
5502 else
5503 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5504 domain->default_pasid);
5505 if (ret)
5506 goto table_failed;
5507 spin_unlock(&iommu->lock);
5509 auxiliary_link_device(domain, dev);
5511 spin_unlock_irqrestore(&device_domain_lock, flags);
5513 return 0;
5515 table_failed:
5516 domain_detach_iommu(domain, iommu);
5517 attach_failed:
5518 spin_unlock(&iommu->lock);
5519 spin_unlock_irqrestore(&device_domain_lock, flags);
5520 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5521 ioasid_free(domain->default_pasid);
5523 return ret;
5526 static void aux_domain_remove_dev(struct dmar_domain *domain,
5527 struct device *dev)
5529 struct device_domain_info *info;
5530 struct intel_iommu *iommu;
5531 unsigned long flags;
5533 if (!is_aux_domain(dev, &domain->domain))
5534 return;
5536 spin_lock_irqsave(&device_domain_lock, flags);
5537 info = dev->archdata.iommu;
5538 iommu = info->iommu;
5540 auxiliary_unlink_device(domain, dev);
5542 spin_lock(&iommu->lock);
5543 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5544 domain_detach_iommu(domain, iommu);
5545 spin_unlock(&iommu->lock);
5547 spin_unlock_irqrestore(&device_domain_lock, flags);
5550 static int prepare_domain_attach_device(struct iommu_domain *domain,
5551 struct device *dev)
5553 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5554 struct intel_iommu *iommu;
5555 int addr_width;
5556 u8 bus, devfn;
5558 iommu = device_to_iommu(dev, &bus, &devfn);
5559 if (!iommu)
5560 return -ENODEV;
5562 /* check if this iommu agaw is sufficient for max mapped address */
5563 addr_width = agaw_to_width(iommu->agaw);
5564 if (addr_width > cap_mgaw(iommu->cap))
5565 addr_width = cap_mgaw(iommu->cap);
5567 if (dmar_domain->max_addr > (1LL << addr_width)) {
5568 dev_err(dev, "%s: iommu width (%d) is not "
5569 "sufficient for the mapped address (%llx)\n",
5570 __func__, addr_width, dmar_domain->max_addr);
5571 return -EFAULT;
5573 dmar_domain->gaw = addr_width;
5576 * Knock out extra levels of page tables if necessary
5578 while (iommu->agaw < dmar_domain->agaw) {
5579 struct dma_pte *pte;
5581 pte = dmar_domain->pgd;
5582 if (dma_pte_present(pte)) {
5583 dmar_domain->pgd = (struct dma_pte *)
5584 phys_to_virt(dma_pte_addr(pte));
5585 free_pgtable_page(pte);
5587 dmar_domain->agaw--;
5590 return 0;
5593 static int intel_iommu_attach_device(struct iommu_domain *domain,
5594 struct device *dev)
5596 int ret;
5598 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5599 device_is_rmrr_locked(dev)) {
5600 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5601 return -EPERM;
5604 if (is_aux_domain(dev, domain))
5605 return -EPERM;
5607 /* normally dev is not mapped */
5608 if (unlikely(domain_context_mapped(dev))) {
5609 struct dmar_domain *old_domain;
5611 old_domain = find_domain(dev);
5612 if (old_domain)
5613 dmar_remove_one_dev_info(dev);
5616 ret = prepare_domain_attach_device(domain, dev);
5617 if (ret)
5618 return ret;
5620 return domain_add_dev_info(to_dmar_domain(domain), dev);
5623 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5624 struct device *dev)
5626 int ret;
5628 if (!is_aux_domain(dev, domain))
5629 return -EPERM;
5631 ret = prepare_domain_attach_device(domain, dev);
5632 if (ret)
5633 return ret;
5635 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5638 static void intel_iommu_detach_device(struct iommu_domain *domain,
5639 struct device *dev)
5641 dmar_remove_one_dev_info(dev);
5644 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5645 struct device *dev)
5647 aux_domain_remove_dev(to_dmar_domain(domain), dev);
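/*
 * Illustrative sketch only (not part of the driver): how a hypothetical
 * mediated-device backend would exercise the aux-domain path above through
 * the generic IOMMU API.  'dev' is the parent PCI device; names and error
 * handling are simplified for illustration.
 */
static int example_aux_attach(struct device *dev)
{
	struct iommu_domain *domain;
	int pasid, ret;

	ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX);
	if (ret)
		return ret;

	domain = iommu_domain_alloc(dev->bus);
	if (!domain)
		return -ENOMEM;

	ret = iommu_aux_attach_device(domain, dev);
	if (ret) {
		iommu_domain_free(domain);
		return ret;
	}

	/* The default PASID tags this domain's DMA on the wire. */
	pasid = iommu_aux_get_pasid(domain, dev);

	/* ... use iommu_map()/iommu_unmap() on 'domain', then tear down: */
	iommu_aux_detach_device(domain, dev);
	iommu_domain_free(domain);
	return pasid < 0 ? pasid : 0;
}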
5650 static int intel_iommu_map(struct iommu_domain *domain,
5651 unsigned long iova, phys_addr_t hpa,
5652 size_t size, int iommu_prot, gfp_t gfp)
5654 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5655 u64 max_addr;
5656 int prot = 0;
5657 int ret;
5659 if (iommu_prot & IOMMU_READ)
5660 prot |= DMA_PTE_READ;
5661 if (iommu_prot & IOMMU_WRITE)
5662 prot |= DMA_PTE_WRITE;
5663 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5664 prot |= DMA_PTE_SNP;
5666 max_addr = iova + size;
5667 if (dmar_domain->max_addr < max_addr) {
5668 u64 end;
5670 /* check if minimum agaw is sufficient for mapped address */
5671 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5672 if (end < max_addr) {
5673 pr_err("%s: iommu width (%d) is not "
5674 "sufficient for the mapped address (%llx)\n",
5675 __func__, dmar_domain->gaw, max_addr);
5676 return -EFAULT;
5678 dmar_domain->max_addr = max_addr;
5680 /* Round up size to next multiple of PAGE_SIZE, if it and
5681 the low bits of hpa would take us onto the next page */
5682 size = aligned_nrpages(hpa, size);
5683 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5684 hpa >> VTD_PAGE_SHIFT, size, prot);
5685 return ret;
5688 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5689 unsigned long iova, size_t size,
5690 struct iommu_iotlb_gather *gather)
5692 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5693 struct page *freelist = NULL;
5694 unsigned long start_pfn, last_pfn;
5695 unsigned int npages;
5696 int iommu_id, level = 0;
5698 /* Cope with horrid API which requires us to unmap more than the
5699 size argument if it happens to be a large-page mapping. */
5700 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5702 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5703 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5705 start_pfn = iova >> VTD_PAGE_SHIFT;
5706 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5708 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5710 npages = last_pfn - start_pfn + 1;
5712 for_each_domain_iommu(iommu_id, dmar_domain)
5713 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5714 start_pfn, npages, !freelist, 0);
5716 dma_free_pagelist(freelist);
5718 if (dmar_domain->max_addr == iova + size)
5719 dmar_domain->max_addr = iova;
5721 return size;
5724 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5725 dma_addr_t iova)
5727 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5728 struct dma_pte *pte;
5729 int level = 0;
5730 u64 phys = 0;
5732 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5733 if (pte && dma_pte_present(pte))
5734 phys = dma_pte_addr(pte) +
5735 (iova & (BIT_MASK(level_to_offset_bits(level) +
5736 VTD_PAGE_SHIFT) - 1));
5738 return phys;
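/*
 * Illustrative sketch only (not part of the driver): a hypothetical caller
 * exercising the map/unmap/iova_to_phys callbacks above through the generic
 * IOMMU API on an unmanaged domain.  The IOVA and physical address are made
 * up for the example; both must be VTD_PAGE_SIZE aligned.
 */
static int example_map_one_page(struct device *dev, phys_addr_t paddr)
{
	struct iommu_domain *domain;
	const unsigned long iova = 0x100000;	/* arbitrary, page aligned */
	int ret;

	domain = iommu_domain_alloc(dev->bus);
	if (!domain)
		return -ENOMEM;

	ret = iommu_attach_device(domain, dev);
	if (ret)
		goto out_free;

	ret = iommu_map(domain, iova, paddr, VTD_PAGE_SIZE,
			IOMMU_READ | IOMMU_WRITE);
	if (!ret) {
		/* A present mapping should translate back to 'paddr'. */
		WARN_ON(iommu_iova_to_phys(domain, iova) != paddr);
		iommu_unmap(domain, iova, VTD_PAGE_SIZE);
	}

	iommu_detach_device(domain, dev);
out_free:
	iommu_domain_free(domain);
	return ret;
}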
5741 static inline bool scalable_mode_support(void)
5743 struct dmar_drhd_unit *drhd;
5744 struct intel_iommu *iommu;
5745 bool ret = true;
5747 rcu_read_lock();
5748 for_each_active_iommu(iommu, drhd) {
5749 if (!sm_supported(iommu)) {
5750 ret = false;
5751 break;
5754 rcu_read_unlock();
5756 return ret;
5759 static inline bool iommu_pasid_support(void)
5761 struct dmar_drhd_unit *drhd;
5762 struct intel_iommu *iommu;
5763 bool ret = true;
5765 rcu_read_lock();
5766 for_each_active_iommu(iommu, drhd) {
5767 if (!pasid_supported(iommu)) {
5768 ret = false;
5769 break;
5772 rcu_read_unlock();
5774 return ret;
5777 static inline bool nested_mode_support(void)
5779 struct dmar_drhd_unit *drhd;
5780 struct intel_iommu *iommu;
5781 bool ret = true;
5783 rcu_read_lock();
5784 for_each_active_iommu(iommu, drhd) {
5785 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5786 ret = false;
5787 break;
5790 rcu_read_unlock();
5792 return ret;
5795 static bool intel_iommu_capable(enum iommu_cap cap)
5797 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5798 return domain_update_iommu_snooping(NULL) == 1;
5799 if (cap == IOMMU_CAP_INTR_REMAP)
5800 return irq_remapping_enabled == 1;
5802 return false;
5805 static int intel_iommu_add_device(struct device *dev)
5807 struct dmar_domain *dmar_domain;
5808 struct iommu_domain *domain;
5809 struct intel_iommu *iommu;
5810 struct iommu_group *group;
5811 u8 bus, devfn;
5812 int ret;
5814 iommu = device_to_iommu(dev, &bus, &devfn);
5815 if (!iommu)
5816 return -ENODEV;
5818 iommu_device_link(&iommu->iommu, dev);
5820 if (translation_pre_enabled(iommu))
5821 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5823 group = iommu_group_get_for_dev(dev);
5825 if (IS_ERR(group)) {
5826 ret = PTR_ERR(group);
5827 goto unlink;
5830 iommu_group_put(group);
5832 domain = iommu_get_domain_for_dev(dev);
5833 dmar_domain = to_dmar_domain(domain);
5834 if (domain->type == IOMMU_DOMAIN_DMA) {
5835 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5836 ret = iommu_request_dm_for_dev(dev);
5837 if (ret) {
5838 dmar_remove_one_dev_info(dev);
5839 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5840 domain_add_dev_info(si_domain, dev);
5841 dev_info(dev,
5842 "Device uses a private identity domain.\n");
5845 } else {
5846 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5847 ret = iommu_request_dma_domain_for_dev(dev);
5848 if (ret) {
5849 dmar_remove_one_dev_info(dev);
5850 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5851 if (!get_private_domain_for_dev(dev)) {
5852 dev_warn(dev,
5853 "Failed to get a private domain.\n");
5854 ret = -ENOMEM;
5855 goto unlink;
5858 dev_info(dev,
5859 "Device uses a private dma domain.\n");
5864 if (device_needs_bounce(dev)) {
5865 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5866 set_dma_ops(dev, &bounce_dma_ops);
5869 return 0;
5871 unlink:
5872 iommu_device_unlink(&iommu->iommu, dev);
5873 return ret;
5876 static void intel_iommu_remove_device(struct device *dev)
5878 struct intel_iommu *iommu;
5879 u8 bus, devfn;
5881 iommu = device_to_iommu(dev, &bus, &devfn);
5882 if (!iommu)
5883 return;
5885 dmar_remove_one_dev_info(dev);
5887 iommu_group_remove_device(dev);
5889 iommu_device_unlink(&iommu->iommu, dev);
5891 if (device_needs_bounce(dev))
5892 set_dma_ops(dev, NULL);
5895 static void intel_iommu_get_resv_regions(struct device *device,
5896 struct list_head *head)
5898 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5899 struct iommu_resv_region *reg;
5900 struct dmar_rmrr_unit *rmrr;
5901 struct device *i_dev;
5902 int i;
5904 down_read(&dmar_global_lock);
5905 for_each_rmrr_units(rmrr) {
5906 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5907 i, i_dev) {
5908 struct iommu_resv_region *resv;
5909 enum iommu_resv_type type;
5910 size_t length;
5912 if (i_dev != device &&
5913 !is_downstream_to_pci_bridge(device, i_dev))
5914 continue;
5916 length = rmrr->end_address - rmrr->base_address + 1;
5918 type = device_rmrr_is_relaxable(device) ?
5919 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5921 resv = iommu_alloc_resv_region(rmrr->base_address,
5922 length, prot, type);
5923 if (!resv)
5924 break;
5926 list_add_tail(&resv->list, head);
5929 up_read(&dmar_global_lock);
5931 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5932 if (dev_is_pci(device)) {
5933 struct pci_dev *pdev = to_pci_dev(device);
5935 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5936 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5937 IOMMU_RESV_DIRECT_RELAXABLE);
5938 if (reg)
5939 list_add_tail(&reg->list, head);
5942 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5944 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5945 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5946 0, IOMMU_RESV_MSI);
5947 if (!reg)
5948 return;
5949 list_add_tail(&reg->list, head);
5952 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5954 struct device_domain_info *info;
5955 struct context_entry *context;
5956 struct dmar_domain *domain;
5957 unsigned long flags;
5958 u64 ctx_lo;
5959 int ret;
5961 domain = find_domain(dev);
5962 if (!domain)
5963 return -EINVAL;
5965 spin_lock_irqsave(&device_domain_lock, flags);
5966 spin_lock(&iommu->lock);
5968 ret = -EINVAL;
5969 info = dev->archdata.iommu;
5970 if (!info || !info->pasid_supported)
5971 goto out;
5973 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5974 if (WARN_ON(!context))
5975 goto out;
5977 ctx_lo = context[0].lo;
5979 if (!(ctx_lo & CONTEXT_PASIDE)) {
5980 ctx_lo |= CONTEXT_PASIDE;
5981 context[0].lo = ctx_lo;
5982 wmb();
5983 iommu->flush.flush_context(iommu,
5984 domain->iommu_did[iommu->seq_id],
5985 PCI_DEVID(info->bus, info->devfn),
5986 DMA_CCMD_MASK_NOBIT,
5987 DMA_CCMD_DEVICE_INVL);
5990 /* Enable PASID support in the device, if it wasn't already */
5991 if (!info->pasid_enabled)
5992 iommu_enable_dev_iotlb(info);
5994 ret = 0;
5996 out:
5997 spin_unlock(&iommu->lock);
5998 spin_unlock_irqrestore(&device_domain_lock, flags);
6000 return ret;
6003 static void intel_iommu_apply_resv_region(struct device *dev,
6004 struct iommu_domain *domain,
6005 struct iommu_resv_region *region)
6007 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6008 unsigned long start, end;
6010 start = IOVA_PFN(region->start);
6011 end = IOVA_PFN(region->start + region->length - 1);
6013 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
6016 static struct iommu_group *intel_iommu_device_group(struct device *dev)
6018 if (dev_is_pci(dev))
6019 return pci_device_group(dev);
6020 return generic_device_group(dev);
6023 #ifdef CONFIG_INTEL_IOMMU_SVM
6024 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
6026 struct intel_iommu *iommu;
6027 u8 bus, devfn;
6029 if (iommu_dummy(dev)) {
6030 dev_warn(dev,
6031 "No IOMMU translation for device; cannot enable SVM\n");
6032 return NULL;
6035 iommu = device_to_iommu(dev, &bus, &devfn);
6036 if (!iommu) {
6037 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
6038 return NULL;
6041 return iommu;
6043 #endif /* CONFIG_INTEL_IOMMU_SVM */
6045 static int intel_iommu_enable_auxd(struct device *dev)
6047 struct device_domain_info *info;
6048 struct intel_iommu *iommu;
6049 unsigned long flags;
6050 u8 bus, devfn;
6051 int ret;
6053 iommu = device_to_iommu(dev, &bus, &devfn);
6054 if (!iommu || dmar_disabled)
6055 return -EINVAL;
6057 if (!sm_supported(iommu) || !pasid_supported(iommu))
6058 return -EINVAL;
6060 ret = intel_iommu_enable_pasid(iommu, dev);
6061 if (ret)
6062 return -ENODEV;
6064 spin_lock_irqsave(&device_domain_lock, flags);
6065 info = dev->archdata.iommu;
6066 info->auxd_enabled = 1;
6067 spin_unlock_irqrestore(&device_domain_lock, flags);
6069 return 0;
6072 static int intel_iommu_disable_auxd(struct device *dev)
6074 struct device_domain_info *info;
6075 unsigned long flags;
6077 spin_lock_irqsave(&device_domain_lock, flags);
6078 info = dev->archdata.iommu;
6079 if (!WARN_ON(!info))
6080 info->auxd_enabled = 0;
6081 spin_unlock_irqrestore(&device_domain_lock, flags);
6083 return 0;
6087 * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
6088 * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
6089 * spec so that system software and tools can detect endpoint devices that
6090 * support Intel Scalable I/O Virtualization without a host driver dependency.
6092 * Returns the offset of the matching extended capability structure within
6093 * the device's PCI configuration space, or 0 if the device does not
6094 * support it.
6096 static int siov_find_pci_dvsec(struct pci_dev *pdev)
6098 int pos;
6099 u16 vendor, id;
6101 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
6102 while (pos) {
6103 pci_read_config_word(pdev, pos + 4, &vendor);
6104 pci_read_config_word(pdev, pos + 8, &id);
6105 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
6106 return pos;
6108 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
6111 return 0;
6114 static bool
6115 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
6117 if (feat == IOMMU_DEV_FEAT_AUX) {
6118 int ret;
6120 if (!dev_is_pci(dev) || dmar_disabled ||
6121 !scalable_mode_support() || !iommu_pasid_support())
6122 return false;
6124 ret = pci_pasid_features(to_pci_dev(dev));
6125 if (ret < 0)
6126 return false;
6128 return !!siov_find_pci_dvsec(to_pci_dev(dev));
6131 return false;
6134 static int
6135 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
6137 if (feat == IOMMU_DEV_FEAT_AUX)
6138 return intel_iommu_enable_auxd(dev);
6140 return -ENODEV;
6143 static int
6144 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
6146 if (feat == IOMMU_DEV_FEAT_AUX)
6147 return intel_iommu_disable_auxd(dev);
6149 return -ENODEV;
6152 static bool
6153 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
6155 struct device_domain_info *info = dev->archdata.iommu;
6157 if (feat == IOMMU_DEV_FEAT_AUX)
6158 return scalable_mode_support() && info && info->auxd_enabled;
6160 return false;
6163 static int
6164 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
6166 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6168 return dmar_domain->default_pasid > 0 ?
6169 dmar_domain->default_pasid : -EINVAL;
6172 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
6173 struct device *dev)
6175 return attach_deferred(dev);
6178 static int
6179 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6180 enum iommu_attr attr, void *data)
6182 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6183 unsigned long flags;
6184 int ret = 0;
6186 if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6187 return -EINVAL;
6189 switch (attr) {
6190 case DOMAIN_ATTR_NESTING:
6191 spin_lock_irqsave(&device_domain_lock, flags);
6192 if (nested_mode_support() &&
6193 list_empty(&dmar_domain->devices)) {
6194 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6195 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6196 } else {
6197 ret = -ENODEV;
6199 spin_unlock_irqrestore(&device_domain_lock, flags);
6200 break;
6201 default:
6202 ret = -EINVAL;
6203 break;
6206 return ret;
6209 const struct iommu_ops intel_iommu_ops = {
6210 .capable = intel_iommu_capable,
6211 .domain_alloc = intel_iommu_domain_alloc,
6212 .domain_free = intel_iommu_domain_free,
6213 .domain_set_attr = intel_iommu_domain_set_attr,
6214 .attach_dev = intel_iommu_attach_device,
6215 .detach_dev = intel_iommu_detach_device,
6216 .aux_attach_dev = intel_iommu_aux_attach_device,
6217 .aux_detach_dev = intel_iommu_aux_detach_device,
6218 .aux_get_pasid = intel_iommu_aux_get_pasid,
6219 .map = intel_iommu_map,
6220 .unmap = intel_iommu_unmap,
6221 .iova_to_phys = intel_iommu_iova_to_phys,
6222 .add_device = intel_iommu_add_device,
6223 .remove_device = intel_iommu_remove_device,
6224 .get_resv_regions = intel_iommu_get_resv_regions,
6225 .put_resv_regions = generic_iommu_put_resv_regions,
6226 .apply_resv_region = intel_iommu_apply_resv_region,
6227 .device_group = intel_iommu_device_group,
6228 .dev_has_feat = intel_iommu_dev_has_feat,
6229 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
6230 .dev_enable_feat = intel_iommu_dev_enable_feat,
6231 .dev_disable_feat = intel_iommu_dev_disable_feat,
6232 .is_attach_deferred = intel_iommu_is_attach_deferred,
6233 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
6236 static void quirk_iommu_igfx(struct pci_dev *dev)
6238 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6239 dmar_map_gfx = 0;
6242 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6243 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6244 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6245 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6246 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6247 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6248 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6249 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6251 /* Broadwell igfx malfunctions with dmar */
6252 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6253 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6254 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6255 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6256 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6257 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6258 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6259 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6260 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6261 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6262 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6263 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6264 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6265 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6266 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6267 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6268 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6269 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6270 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6271 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6272 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6273 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6274 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6275 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6277 static void quirk_iommu_rwbf(struct pci_dev *dev)
6280 * Mobile 4 Series Chipset neglects to set RWBF capability,
6281 * but needs it. Same seems to hold for the desktop versions.
6283 pci_info(dev, "Forcing write-buffer flush capability\n");
6284 rwbf_quirk = 1;
6287 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6288 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6289 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6290 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6291 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6292 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6293 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6295 #define GGC 0x52
6296 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
6297 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
6298 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
6299 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
6300 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
6301 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
6302 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
6303 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
6305 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6307 unsigned short ggc;
6309 if (pci_read_config_word(dev, GGC, &ggc))
6310 return;
6312 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6313 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6314 dmar_map_gfx = 0;
6315 } else if (dmar_map_gfx) {
6316 /* we have to ensure the gfx device is idle before we flush */
6317 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6318 intel_iommu_strict = 1;
6321 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6322 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6323 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6324 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6326 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6327 ISOCH DMAR unit for the Azalia sound device, but not give it any
6328 TLB entries, which causes it to deadlock. Check for that. We do
6329 this in a function called from init_dmars(), instead of in a PCI
6330 quirk, because we don't want to print the obnoxious "BIOS broken"
6331 message if VT-d is actually disabled.
6333 static void __init check_tylersburg_isoch(void)
6335 struct pci_dev *pdev;
6336 uint32_t vtisochctrl;
6338 /* If there's no Azalia in the system anyway, forget it. */
6339 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6340 if (!pdev)
6341 return;
6342 pci_dev_put(pdev);
6344 /* System Management Registers. Might be hidden, in which case
6345 we can't do the sanity check. But that's OK, because the
6346 known-broken BIOSes _don't_ actually hide it, so far. */
6347 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6348 if (!pdev)
6349 return;
6351 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6352 pci_dev_put(pdev);
6353 return;
6356 pci_dev_put(pdev);
6358 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6359 if (vtisochctrl & 1)
6360 return;
6362 /* Drop all bits other than the number of TLB entries */
6363 vtisochctrl &= 0x1c;
6365 /* If we have the recommended number of TLB entries (16), fine. */
6366 if (vtisochctrl == 0x10)
6367 return;
6369 /* Zero TLB entries? That is clearly broken. */
6370 if (!vtisochctrl) {
6371 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6372 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6373 dmi_get_system_info(DMI_BIOS_VENDOR),
6374 dmi_get_system_info(DMI_BIOS_VERSION),
6375 dmi_get_system_info(DMI_PRODUCT_VERSION));
6376 iommu_identity_mapping |= IDENTMAP_AZALIA;
6377 return;
6380 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6381 vtisochctrl);