drivers/iommu/intel-iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright © 2006-2014 Intel Corporation.
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
53 #define ROOT_SIZE VTD_PAGE_SIZE
54 #define CONTEXT_SIZE VTD_PAGE_SIZE
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61 #define IOAPIC_RANGE_START (0xfee00000)
62 #define IOAPIC_RANGE_END (0xfeefffff)
63 #define IOVA_START_ADDR (0x1000)
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN (1)
82 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
84 /* page table handling */
85 #define LEVEL_STRIDE (9)
86 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
89 * This bitmap is used to advertise the page sizes our hardware supports
90 * to the IOMMU core, which will then use this information to split
91 * physically contiguous memory regions it is mapping into page sizes
92 * that we support.
94 * Traditionally the IOMMU core just handed us the mappings directly,
95 * after making sure the size is an order of a 4KiB page and that the
96 * mapping has natural alignment.
98 * To retain this behavior, we currently advertise that we support
99 * all page sizes that are an order of 4KiB.
101 * If at some point we'd like to utilize the IOMMU core's new behavior,
102 * we could change this to advertise the real page sizes we support.
104 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
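/*
 * Illustrative note (editor's addition, not part of the upstream file):
 * ~0xFFFUL is a page-size bitmap in which bit N set means "2^N bytes is a
 * supported page size". Clearing only the low 12 bits therefore advertises
 * every power-of-two size from 4KiB upward, e.g. bit 12 (4KiB), bit 21
 * (2MiB) and bit 30 (1GiB) are all set, matching the comment above.
 */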
106 static inline int agaw_to_level(int agaw)
108 return agaw + 2;
111 static inline int agaw_to_width(int agaw)
113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
116 static inline int width_to_agaw(int width)
118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
121 static inline unsigned int level_to_offset_bits(int level)
123 return (level - 1) * LEVEL_STRIDE;
126 static inline int pfn_level_offset(u64 pfn, int level)
128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
131 static inline u64 level_mask(int level)
133 return -1ULL << level_to_offset_bits(level);
136 static inline u64 level_size(int level)
138 return 1ULL << level_to_offset_bits(level);
141 static inline u64 align_to_level(u64 pfn, int level)
143 return (pfn + level_size(level) - 1) & level_mask(level);
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
148 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
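/*
 * Worked example (editor's addition, not part of the upstream file), using
 * only the helpers above: for a 48-bit address width,
 *	width_to_agaw(48)	= DIV_ROUND_UP(48 - 30, 9) = 2
 *	agaw_to_level(2)	= 4	(4-level page table)
 *	agaw_to_width(2)	= 48
 *	level_to_offset_bits(2)	= 9, so pfn_level_offset(pfn, 2) picks
 *				  bits 9..17 of the page frame number
 *	level_size(2)		= 512 pages = 2MiB of IOVA space
 *	lvl_to_nr_pages(2)	= 512
 */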
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152 are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
164 return mm_to_dma_pfn(page_to_pfn(pg));
166 static inline unsigned long virt_to_dma_pfn(void *p)
168 return page_to_dma_pfn(virt_to_page(p));
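/*
 * Editor's note (not part of the upstream file): with the usual 4KiB CPU
 * pages PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so dma_to_mm_pfn() and
 * mm_to_dma_pfn() shift by zero and are identity conversions; they only
 * matter when the CPU page size is larger than the 4KiB VT-d page size.
 */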
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
178 * set to 1 to panic the kernel if we can't successfully enable VT-d
179 * (used when kernel is launched w/ TXT)
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189 * if marked present.
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
193 if (!(re->lo & 1))
194 return 0;
196 return re->lo & VTD_PAGE_MASK;
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201 * if marked present.
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
205 if (!(re->hi & 1))
206 return 0;
208 return re->hi & VTD_PAGE_MASK;
211 static inline void context_clear_pasid_enable(struct context_entry *context)
213 context->lo &= ~(1ULL << 11);
216 static inline bool context_pasid_enabled(struct context_entry *context)
218 return !!(context->lo & (1ULL << 11));
221 static inline void context_set_copied(struct context_entry *context)
223 context->hi |= (1ull << 3);
226 static inline bool context_copied(struct context_entry *context)
228 return !!(context->hi & (1ULL << 3));
231 static inline bool __context_present(struct context_entry *context)
233 return (context->lo & 1);
236 bool context_present(struct context_entry *context)
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
243 static inline void context_set_present(struct context_entry *context)
245 context->lo |= 1;
248 static inline void context_set_fault_enable(struct context_entry *context)
250 context->lo &= (((u64)-1) << 2) | 1;
253 static inline void context_set_translation_type(struct context_entry *context,
254 unsigned long value)
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
260 static inline void context_set_address_root(struct context_entry *context,
261 unsigned long value)
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
267 static inline void context_set_address_width(struct context_entry *context,
268 unsigned long value)
270 context->hi |= value & 7;
273 static inline void context_set_domain_id(struct context_entry *context,
274 unsigned long value)
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
279 static inline int context_domain_id(struct context_entry *c)
281 return((c->hi >> 8) & 0xffff);
284 static inline void context_clear_entry(struct context_entry *context)
286 context->lo = 0;
287 context->hi = 0;
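/*
 * Editor's summary (not part of the upstream file) of the legacy context
 * entry layout as used by the helpers above:
 *	lo bit  0	present
 *	lo bit  1	fault-processing disable (cleared by
 *			context_set_fault_enable())
 *	lo bits 2-3	translation type
 *	lo bit 11	PASID enable
 *	lo bits 12-63	second-level page-table pointer (ASR)
 *	hi bits 0-2	address width (AGAW)
 *	hi bit  3	software "copied" flag used for kdump
 *	hi bits 8-23	domain id
 */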
291 * This domain is a statically identity mapping domain.
292 * 1. This domain creates a static 1:1 mapping to all usable memory.
293 * 2. It maps to each iommu if successful.
294 * 3. Each iommu maps to this domain if successful.
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
299 /* si_domain contains multiple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY BIT(0)
303 * This is a DMA domain allocated through the iommu domain allocation
304 * interface. But one or more devices belonging to this domain have
305 * been chosen to use a private domain. We should avoid using the
306 * map/unmap/iova_to_phys APIs on it.
308 #define DOMAIN_FLAG_LOSE_CHILDREN BIT(1)
310 #define for_each_domain_iommu(idx, domain) \
311 for (idx = 0; idx < g_num_of_iommus; idx++) \
312 if (domain->iommu_refcnt[idx])
314 struct dmar_rmrr_unit {
315 struct list_head list; /* list of rmrr units */
316 struct acpi_dmar_header *hdr; /* ACPI header */
317 u64 base_address; /* reserved base address*/
318 u64 end_address; /* reserved end address */
319 struct dmar_dev_scope *devices; /* target devices */
320 int devices_cnt; /* target device count */
323 struct dmar_atsr_unit {
324 struct list_head list; /* list of ATSR units */
325 struct acpi_dmar_header *hdr; /* ACPI header */
326 struct dmar_dev_scope *devices; /* target devices */
327 int devices_cnt; /* target device count */
328 u8 include_all:1; /* include all ports */
331 static LIST_HEAD(dmar_atsr_units);
332 static LIST_HEAD(dmar_rmrr_units);
334 #define for_each_rmrr_units(rmrr) \
335 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
337 /* bitmap for indexing intel_iommus */
338 static int g_num_of_iommus;
340 static void domain_exit(struct dmar_domain *domain);
341 static void domain_remove_dev_info(struct dmar_domain *domain);
342 static void dmar_remove_one_dev_info(struct device *dev);
343 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
344 static void domain_context_clear(struct intel_iommu *iommu,
345 struct device *dev);
346 static int domain_detach_iommu(struct dmar_domain *domain,
347 struct intel_iommu *iommu);
348 static bool device_is_rmrr_locked(struct device *dev);
349 static int intel_iommu_attach_device(struct iommu_domain *domain,
350 struct device *dev);
351 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
352 dma_addr_t iova);
354 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
355 int dmar_disabled = 0;
356 #else
357 int dmar_disabled = 1;
358 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
360 int intel_iommu_sm;
361 int intel_iommu_enabled = 0;
362 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
364 static int dmar_map_gfx = 1;
365 static int dmar_forcedac;
366 static int intel_iommu_strict;
367 static int intel_iommu_superpage = 1;
368 static int iommu_identity_mapping;
369 static int intel_no_bounce;
371 #define IDENTMAP_ALL 1
372 #define IDENTMAP_GFX 2
373 #define IDENTMAP_AZALIA 4
375 int intel_iommu_gfx_mapped;
376 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
378 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
379 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
380 static DEFINE_SPINLOCK(device_domain_lock);
381 static LIST_HEAD(device_domain_list);
383 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
384 to_pci_dev(d)->untrusted)
387 * Iterate over elements in device_domain_list and call the specified
388 * callback @fn against each element.
390 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
391 void *data), void *data)
393 int ret = 0;
394 unsigned long flags;
395 struct device_domain_info *info;
397 spin_lock_irqsave(&device_domain_lock, flags);
398 list_for_each_entry(info, &device_domain_list, global) {
399 ret = fn(info, data);
400 if (ret) {
401 spin_unlock_irqrestore(&device_domain_lock, flags);
402 return ret;
405 spin_unlock_irqrestore(&device_domain_lock, flags);
407 return 0;
410 const struct iommu_ops intel_iommu_ops;
412 static bool translation_pre_enabled(struct intel_iommu *iommu)
414 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
417 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
419 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
422 static void init_translation_status(struct intel_iommu *iommu)
424 u32 gsts;
426 gsts = readl(iommu->reg + DMAR_GSTS_REG);
427 if (gsts & DMA_GSTS_TES)
428 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
431 /* Convert generic 'struct iommu_domain' to private struct dmar_domain */
432 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
434 return container_of(dom, struct dmar_domain, domain);
437 static int __init intel_iommu_setup(char *str)
439 if (!str)
440 return -EINVAL;
441 while (*str) {
442 if (!strncmp(str, "on", 2)) {
443 dmar_disabled = 0;
444 pr_info("IOMMU enabled\n");
445 } else if (!strncmp(str, "off", 3)) {
446 dmar_disabled = 1;
447 no_platform_optin = 1;
448 pr_info("IOMMU disabled\n");
449 } else if (!strncmp(str, "igfx_off", 8)) {
450 dmar_map_gfx = 0;
451 pr_info("Disable GFX device mapping\n");
452 } else if (!strncmp(str, "forcedac", 8)) {
453 pr_info("Forcing DAC for PCI devices\n");
454 dmar_forcedac = 1;
455 } else if (!strncmp(str, "strict", 6)) {
456 pr_info("Disable batched IOTLB flush\n");
457 intel_iommu_strict = 1;
458 } else if (!strncmp(str, "sp_off", 6)) {
459 pr_info("Disable supported super page\n");
460 intel_iommu_superpage = 0;
461 } else if (!strncmp(str, "sm_on", 5)) {
462 pr_info("Intel-IOMMU: scalable mode supported\n");
463 intel_iommu_sm = 1;
464 } else if (!strncmp(str, "tboot_noforce", 13)) {
465 printk(KERN_INFO
466 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
467 intel_iommu_tboot_noforce = 1;
468 } else if (!strncmp(str, "nobounce", 8)) {
469 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
470 intel_no_bounce = 1;
473 str += strcspn(str, ",");
474 while (*str == ',')
475 str++;
477 return 0;
479 __setup("intel_iommu=", intel_iommu_setup);
481 static struct kmem_cache *iommu_domain_cache;
482 static struct kmem_cache *iommu_devinfo_cache;
484 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
486 struct dmar_domain **domains;
487 int idx = did >> 8;
489 domains = iommu->domains[idx];
490 if (!domains)
491 return NULL;
493 return domains[did & 0xff];
496 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
497 struct dmar_domain *domain)
499 struct dmar_domain **domains;
500 int idx = did >> 8;
502 if (!iommu->domains[idx]) {
503 size_t size = 256 * sizeof(struct dmar_domain *);
504 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
507 domains = iommu->domains[idx];
508 if (WARN_ON(!domains))
509 return;
510 else
511 domains[did & 0xff] = domain;
514 void *alloc_pgtable_page(int node)
516 struct page *page;
517 void *vaddr = NULL;
519 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
520 if (page)
521 vaddr = page_address(page);
522 return vaddr;
525 void free_pgtable_page(void *vaddr)
527 free_page((unsigned long)vaddr);
530 static inline void *alloc_domain_mem(void)
532 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
535 static void free_domain_mem(void *vaddr)
537 kmem_cache_free(iommu_domain_cache, vaddr);
540 static inline void * alloc_devinfo_mem(void)
542 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
545 static inline void free_devinfo_mem(void *vaddr)
547 kmem_cache_free(iommu_devinfo_cache, vaddr);
550 static inline int domain_type_is_si(struct dmar_domain *domain)
552 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
555 static inline int domain_pfn_supported(struct dmar_domain *domain,
556 unsigned long pfn)
558 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
560 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
563 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
565 unsigned long sagaw;
566 int agaw = -1;
568 sagaw = cap_sagaw(iommu->cap);
569 for (agaw = width_to_agaw(max_gaw);
570 agaw >= 0; agaw--) {
571 if (test_bit(agaw, &sagaw))
572 break;
575 return agaw;
579 * Calculate max SAGAW for each iommu.
581 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
583 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
587 * calculate agaw for each iommu.
588 * "SAGAW" may be different across iommus, use a default agaw, and
589 * fall back to a smaller supported agaw for iommus that don't support the default agaw.
591 int iommu_calculate_agaw(struct intel_iommu *iommu)
593 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
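/*
 * Worked example (editor's addition, not part of the upstream file):
 * cap_sagaw() yields a bitmap of supported AGAW values. Starting from
 * width_to_agaw(57) == 3 (5-level), __iommu_calculate_agaw() walks
 * downwards until it hits a bit that is set; on hardware that only sets
 * bit 2 in SAGAW it returns agaw 2, i.e. a 4-level, 48-bit table.
 */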
596 /* This function only returns a single iommu in a domain */
597 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
599 int iommu_id;
601 /* si_domain and vm domain should not get here. */
602 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
603 return NULL;
605 for_each_domain_iommu(iommu_id, domain)
606 break;
608 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
609 return NULL;
611 return g_iommus[iommu_id];
614 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
616 return sm_supported(iommu) ?
617 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
620 static void domain_update_iommu_coherency(struct dmar_domain *domain)
622 struct dmar_drhd_unit *drhd;
623 struct intel_iommu *iommu;
624 bool found = false;
625 int i;
627 domain->iommu_coherency = 1;
629 for_each_domain_iommu(i, domain) {
630 found = true;
631 if (!iommu_paging_structure_coherency(g_iommus[i])) {
632 domain->iommu_coherency = 0;
633 break;
636 if (found)
637 return;
639 /* No hardware attached; use lowest common denominator */
640 rcu_read_lock();
641 for_each_active_iommu(iommu, drhd) {
642 if (!iommu_paging_structure_coherency(iommu)) {
643 domain->iommu_coherency = 0;
644 break;
647 rcu_read_unlock();
650 static int domain_update_iommu_snooping(struct intel_iommu *skip)
652 struct dmar_drhd_unit *drhd;
653 struct intel_iommu *iommu;
654 int ret = 1;
656 rcu_read_lock();
657 for_each_active_iommu(iommu, drhd) {
658 if (iommu != skip) {
659 if (!ecap_sc_support(iommu->ecap)) {
660 ret = 0;
661 break;
665 rcu_read_unlock();
667 return ret;
670 static int domain_update_iommu_superpage(struct intel_iommu *skip)
672 struct dmar_drhd_unit *drhd;
673 struct intel_iommu *iommu;
674 int mask = 0xf;
676 if (!intel_iommu_superpage) {
677 return 0;
680 /* set iommu_superpage to the smallest common denominator */
681 rcu_read_lock();
682 for_each_active_iommu(iommu, drhd) {
683 if (iommu != skip) {
684 mask &= cap_super_page_val(iommu->cap);
685 if (!mask)
686 break;
689 rcu_read_unlock();
691 return fls(mask);
694 /* Some capabilities may be different across iommus */
695 static void domain_update_iommu_cap(struct dmar_domain *domain)
697 domain_update_iommu_coherency(domain);
698 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
699 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
702 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
703 u8 devfn, int alloc)
705 struct root_entry *root = &iommu->root_entry[bus];
706 struct context_entry *context;
707 u64 *entry;
709 entry = &root->lo;
710 if (sm_supported(iommu)) {
711 if (devfn >= 0x80) {
712 devfn -= 0x80;
713 entry = &root->hi;
715 devfn *= 2;
717 if (*entry & 1)
718 context = phys_to_virt(*entry & VTD_PAGE_MASK);
719 else {
720 unsigned long phy_addr;
721 if (!alloc)
722 return NULL;
724 context = alloc_pgtable_page(iommu->node);
725 if (!context)
726 return NULL;
728 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
729 phy_addr = virt_to_phys((void *)context);
730 *entry = phy_addr | 1;
731 __iommu_flush_cache(iommu, entry, sizeof(*entry));
733 return &context[devfn];
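/*
 * Editor's note (not part of the upstream file): in scalable mode a root
 * entry is split in two halves; root->lo points at the context table for
 * devfn 0x00-0x7f and root->hi at the table for devfn 0x80-0xff, which is
 * why the devfn is rebased above. Scalable-mode context entries are twice
 * as wide as legacy ones, hence the devfn *= 2 before indexing.
 */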
736 static int iommu_dummy(struct device *dev)
738 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
742 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
743 * sub-hierarchy of a candidate PCI-PCI bridge
744 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
745 * @bridge: the candidate PCI-PCI bridge
747 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
749 static bool
750 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
752 struct pci_dev *pdev, *pbridge;
754 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
755 return false;
757 pdev = to_pci_dev(dev);
758 pbridge = to_pci_dev(bridge);
760 if (pbridge->subordinate &&
761 pbridge->subordinate->number <= pdev->bus->number &&
762 pbridge->subordinate->busn_res.end >= pdev->bus->number)
763 return true;
765 return false;
768 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
770 struct dmar_drhd_unit *drhd = NULL;
771 struct intel_iommu *iommu;
772 struct device *tmp;
773 struct pci_dev *pdev = NULL;
774 u16 segment = 0;
775 int i;
777 if (iommu_dummy(dev))
778 return NULL;
780 if (dev_is_pci(dev)) {
781 struct pci_dev *pf_pdev;
783 pdev = to_pci_dev(dev);
785 #ifdef CONFIG_X86
786 /* VMD child devices currently cannot be handled individually */
787 if (is_vmd(pdev->bus))
788 return NULL;
789 #endif
791 /* VFs aren't listed in scope tables; we need to look up
792 * the PF instead to find the IOMMU. */
793 pf_pdev = pci_physfn(pdev);
794 dev = &pf_pdev->dev;
795 segment = pci_domain_nr(pdev->bus);
796 } else if (has_acpi_companion(dev))
797 dev = &ACPI_COMPANION(dev)->dev;
799 rcu_read_lock();
800 for_each_active_iommu(iommu, drhd) {
801 if (pdev && segment != drhd->segment)
802 continue;
804 for_each_active_dev_scope(drhd->devices,
805 drhd->devices_cnt, i, tmp) {
806 if (tmp == dev) {
807 /* For a VF use its original BDF# not that of the PF
808 * which we used for the IOMMU lookup. Strictly speaking
809 * we could do this for all PCI devices; we only need to
810 * get the BDF# from the scope table for ACPI matches. */
811 if (pdev && pdev->is_virtfn)
812 goto got_pdev;
814 *bus = drhd->devices[i].bus;
815 *devfn = drhd->devices[i].devfn;
816 goto out;
819 if (is_downstream_to_pci_bridge(dev, tmp))
820 goto got_pdev;
823 if (pdev && drhd->include_all) {
824 got_pdev:
825 *bus = pdev->bus->number;
826 *devfn = pdev->devfn;
827 goto out;
830 iommu = NULL;
831 out:
832 rcu_read_unlock();
834 return iommu;
837 static void domain_flush_cache(struct dmar_domain *domain,
838 void *addr, int size)
840 if (!domain->iommu_coherency)
841 clflush_cache_range(addr, size);
844 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
846 struct context_entry *context;
847 int ret = 0;
848 unsigned long flags;
850 spin_lock_irqsave(&iommu->lock, flags);
851 context = iommu_context_addr(iommu, bus, devfn, 0);
852 if (context)
853 ret = context_present(context);
854 spin_unlock_irqrestore(&iommu->lock, flags);
855 return ret;
858 static void free_context_table(struct intel_iommu *iommu)
860 int i;
861 unsigned long flags;
862 struct context_entry *context;
864 spin_lock_irqsave(&iommu->lock, flags);
865 if (!iommu->root_entry) {
866 goto out;
868 for (i = 0; i < ROOT_ENTRY_NR; i++) {
869 context = iommu_context_addr(iommu, i, 0, 0);
870 if (context)
871 free_pgtable_page(context);
873 if (!sm_supported(iommu))
874 continue;
876 context = iommu_context_addr(iommu, i, 0x80, 0);
877 if (context)
878 free_pgtable_page(context);
881 free_pgtable_page(iommu->root_entry);
882 iommu->root_entry = NULL;
883 out:
884 spin_unlock_irqrestore(&iommu->lock, flags);
887 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
888 unsigned long pfn, int *target_level)
890 struct dma_pte *parent, *pte;
891 int level = agaw_to_level(domain->agaw);
892 int offset;
894 BUG_ON(!domain->pgd);
896 if (!domain_pfn_supported(domain, pfn))
897 /* Address beyond IOMMU's addressing capabilities. */
898 return NULL;
900 parent = domain->pgd;
902 while (1) {
903 void *tmp_page;
905 offset = pfn_level_offset(pfn, level);
906 pte = &parent[offset];
907 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
908 break;
909 if (level == *target_level)
910 break;
912 if (!dma_pte_present(pte)) {
913 uint64_t pteval;
915 tmp_page = alloc_pgtable_page(domain->nid);
917 if (!tmp_page)
918 return NULL;
920 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
921 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
922 if (cmpxchg64(&pte->val, 0ULL, pteval))
923 /* Someone else set it while we were thinking; use theirs. */
924 free_pgtable_page(tmp_page);
925 else
926 domain_flush_cache(domain, pte, sizeof(*pte));
928 if (level == 1)
929 break;
931 parent = phys_to_virt(dma_pte_addr(pte));
932 level--;
935 if (!*target_level)
936 *target_level = level;
938 return pte;
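/*
 * Editor's note (not part of the upstream file): the walk above descends
 * from the domain's pgd, allocating any missing level on the way down.
 * cmpxchg64() installs the new table only if the slot is still empty, so a
 * racing walker's table wins and ours is freed. Passing *target_level == 0
 * means "stop at the first superpage or non-present entry"; on return it
 * holds the level that was actually reached.
 */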
941 /* return address's pte at specific level */
942 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
943 unsigned long pfn,
944 int level, int *large_page)
946 struct dma_pte *parent, *pte;
947 int total = agaw_to_level(domain->agaw);
948 int offset;
950 parent = domain->pgd;
951 while (level <= total) {
952 offset = pfn_level_offset(pfn, total);
953 pte = &parent[offset];
954 if (level == total)
955 return pte;
957 if (!dma_pte_present(pte)) {
958 *large_page = total;
959 break;
962 if (dma_pte_superpage(pte)) {
963 *large_page = total;
964 return pte;
967 parent = phys_to_virt(dma_pte_addr(pte));
968 total--;
970 return NULL;
973 /* clear last level pte, a tlb flush should follow */
974 static void dma_pte_clear_range(struct dmar_domain *domain,
975 unsigned long start_pfn,
976 unsigned long last_pfn)
978 unsigned int large_page;
979 struct dma_pte *first_pte, *pte;
981 BUG_ON(!domain_pfn_supported(domain, start_pfn));
982 BUG_ON(!domain_pfn_supported(domain, last_pfn));
983 BUG_ON(start_pfn > last_pfn);
985 /* we don't need lock here; nobody else touches the iova range */
986 do {
987 large_page = 1;
988 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
989 if (!pte) {
990 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
991 continue;
993 do {
994 dma_clear_pte(pte);
995 start_pfn += lvl_to_nr_pages(large_page);
996 pte++;
997 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
999 domain_flush_cache(domain, first_pte,
1000 (void *)pte - (void *)first_pte);
1002 } while (start_pfn && start_pfn <= last_pfn);
1005 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1006 int retain_level, struct dma_pte *pte,
1007 unsigned long pfn, unsigned long start_pfn,
1008 unsigned long last_pfn)
1010 pfn = max(start_pfn, pfn);
1011 pte = &pte[pfn_level_offset(pfn, level)];
1013 do {
1014 unsigned long level_pfn;
1015 struct dma_pte *level_pte;
1017 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1018 goto next;
1020 level_pfn = pfn & level_mask(level);
1021 level_pte = phys_to_virt(dma_pte_addr(pte));
1023 if (level > 2) {
1024 dma_pte_free_level(domain, level - 1, retain_level,
1025 level_pte, level_pfn, start_pfn,
1026 last_pfn);
1030 * Free the page table if we're below the level we want to
1031 * retain and the range covers the entire table.
1033 if (level < retain_level && !(start_pfn > level_pfn ||
1034 last_pfn < level_pfn + level_size(level) - 1)) {
1035 dma_clear_pte(pte);
1036 domain_flush_cache(domain, pte, sizeof(*pte));
1037 free_pgtable_page(level_pte);
1039 next:
1040 pfn += level_size(level);
1041 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1045 * clear last level (leaf) ptes and free page table pages below the
1046 * level we wish to keep intact.
1048 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1049 unsigned long start_pfn,
1050 unsigned long last_pfn,
1051 int retain_level)
1053 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1054 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1055 BUG_ON(start_pfn > last_pfn);
1057 dma_pte_clear_range(domain, start_pfn, last_pfn);
1059 /* We don't need lock here; nobody else touches the iova range */
1060 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1061 domain->pgd, 0, start_pfn, last_pfn);
1063 /* free pgd */
1064 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1065 free_pgtable_page(domain->pgd);
1066 domain->pgd = NULL;
1070 /* When a page at a given level is being unlinked from its parent, we don't
1071 need to *modify* it at all. All we need to do is make a list of all the
1072 pages which can be freed just as soon as we've flushed the IOTLB and we
1073 know the hardware page-walk will no longer touch them.
1074 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1075 be freed. */
1076 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1077 int level, struct dma_pte *pte,
1078 struct page *freelist)
1080 struct page *pg;
1082 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1083 pg->freelist = freelist;
1084 freelist = pg;
1086 if (level == 1)
1087 return freelist;
1089 pte = page_address(pg);
1090 do {
1091 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1092 freelist = dma_pte_list_pagetables(domain, level - 1,
1093 pte, freelist);
1094 pte++;
1095 } while (!first_pte_in_page(pte));
1097 return freelist;
1100 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1101 struct dma_pte *pte, unsigned long pfn,
1102 unsigned long start_pfn,
1103 unsigned long last_pfn,
1104 struct page *freelist)
1106 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1108 pfn = max(start_pfn, pfn);
1109 pte = &pte[pfn_level_offset(pfn, level)];
1111 do {
1112 unsigned long level_pfn;
1114 if (!dma_pte_present(pte))
1115 goto next;
1117 level_pfn = pfn & level_mask(level);
1119 /* If range covers entire pagetable, free it */
1120 if (start_pfn <= level_pfn &&
1121 last_pfn >= level_pfn + level_size(level) - 1) {
1122 /* These subordinate page tables are going away entirely. Don't
1123 bother to clear them; we're just going to *free* them. */
1124 if (level > 1 && !dma_pte_superpage(pte))
1125 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1127 dma_clear_pte(pte);
1128 if (!first_pte)
1129 first_pte = pte;
1130 last_pte = pte;
1131 } else if (level > 1) {
1132 /* Recurse down into a level that isn't *entirely* obsolete */
1133 freelist = dma_pte_clear_level(domain, level - 1,
1134 phys_to_virt(dma_pte_addr(pte)),
1135 level_pfn, start_pfn, last_pfn,
1136 freelist);
1138 next:
1139 pfn += level_size(level);
1140 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1142 if (first_pte)
1143 domain_flush_cache(domain, first_pte,
1144 (void *)++last_pte - (void *)first_pte);
1146 return freelist;
1149 /* We can't just free the pages because the IOMMU may still be walking
1150 the page tables, and may have cached the intermediate levels. The
1151 pages can only be freed after the IOTLB flush has been done. */
1152 static struct page *domain_unmap(struct dmar_domain *domain,
1153 unsigned long start_pfn,
1154 unsigned long last_pfn)
1156 struct page *freelist;
1158 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1159 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1160 BUG_ON(start_pfn > last_pfn);
1162 /* we don't need lock here; nobody else touches the iova range */
1163 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1164 domain->pgd, 0, start_pfn, last_pfn, NULL);
1166 /* free pgd */
1167 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1168 struct page *pgd_page = virt_to_page(domain->pgd);
1169 pgd_page->freelist = freelist;
1170 freelist = pgd_page;
1172 domain->pgd = NULL;
1175 return freelist;
1178 static void dma_free_pagelist(struct page *freelist)
1180 struct page *pg;
1182 while ((pg = freelist)) {
1183 freelist = pg->freelist;
1184 free_pgtable_page(page_address(pg));
1188 static void iova_entry_free(unsigned long data)
1190 struct page *freelist = (struct page *)data;
1192 dma_free_pagelist(freelist);
1195 /* iommu handling */
1196 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1198 struct root_entry *root;
1199 unsigned long flags;
1201 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1202 if (!root) {
1203 pr_err("Allocating root entry for %s failed\n",
1204 iommu->name);
1205 return -ENOMEM;
1208 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1210 spin_lock_irqsave(&iommu->lock, flags);
1211 iommu->root_entry = root;
1212 spin_unlock_irqrestore(&iommu->lock, flags);
1214 return 0;
1217 static void iommu_set_root_entry(struct intel_iommu *iommu)
1219 u64 addr;
1220 u32 sts;
1221 unsigned long flag;
1223 addr = virt_to_phys(iommu->root_entry);
1224 if (sm_supported(iommu))
1225 addr |= DMA_RTADDR_SMT;
1227 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1228 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1230 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1232 /* Make sure hardware complete it */
1233 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1234 readl, (sts & DMA_GSTS_RTPS), sts);
1236 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1239 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1241 u32 val;
1242 unsigned long flag;
1244 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1245 return;
1247 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1248 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1250 /* Make sure hardware complete it */
1251 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1252 readl, (!(val & DMA_GSTS_WBFS)), val);
1254 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1257 /* return value determines if we need a write buffer flush */
1258 static void __iommu_flush_context(struct intel_iommu *iommu,
1259 u16 did, u16 source_id, u8 function_mask,
1260 u64 type)
1262 u64 val = 0;
1263 unsigned long flag;
1265 switch (type) {
1266 case DMA_CCMD_GLOBAL_INVL:
1267 val = DMA_CCMD_GLOBAL_INVL;
1268 break;
1269 case DMA_CCMD_DOMAIN_INVL:
1270 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1271 break;
1272 case DMA_CCMD_DEVICE_INVL:
1273 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1274 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1275 break;
1276 default:
1277 BUG();
1279 val |= DMA_CCMD_ICC;
1281 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1282 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1284 /* Make sure hardware complete it */
1285 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1286 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1288 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1291 /* return value determines if we need a write buffer flush */
1292 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1293 u64 addr, unsigned int size_order, u64 type)
1295 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1296 u64 val = 0, val_iva = 0;
1297 unsigned long flag;
1299 switch (type) {
1300 case DMA_TLB_GLOBAL_FLUSH:
1301 /* global flush doesn't need set IVA_REG */
1302 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1303 break;
1304 case DMA_TLB_DSI_FLUSH:
1305 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1306 break;
1307 case DMA_TLB_PSI_FLUSH:
1308 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1309 /* IH bit is passed in as part of address */
1310 val_iva = size_order | addr;
1311 break;
1312 default:
1313 BUG();
1315 /* Note: set drain read/write */
1316 #if 0
1318 * This is probably to be super secure. Looks like we can
1319 * ignore it without any impact.
1321 if (cap_read_drain(iommu->cap))
1322 val |= DMA_TLB_READ_DRAIN;
1323 #endif
1324 if (cap_write_drain(iommu->cap))
1325 val |= DMA_TLB_WRITE_DRAIN;
1327 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1328 /* Note: Only uses first TLB reg currently */
1329 if (val_iva)
1330 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1331 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1333 /* Make sure hardware complete it */
1334 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1335 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1337 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1339 /* check IOTLB invalidation granularity */
1340 if (DMA_TLB_IAIG(val) == 0)
1341 pr_err("Flush IOTLB failed\n");
1342 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1343 pr_debug("TLB flush request %Lx, actual %Lx\n",
1344 (unsigned long long)DMA_TLB_IIRG(type),
1345 (unsigned long long)DMA_TLB_IAIG(val));
1348 static struct device_domain_info *
1349 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1350 u8 bus, u8 devfn)
1352 struct device_domain_info *info;
1354 assert_spin_locked(&device_domain_lock);
1356 if (!iommu->qi)
1357 return NULL;
1359 list_for_each_entry(info, &domain->devices, link)
1360 if (info->iommu == iommu && info->bus == bus &&
1361 info->devfn == devfn) {
1362 if (info->ats_supported && info->dev)
1363 return info;
1364 break;
1367 return NULL;
1370 static void domain_update_iotlb(struct dmar_domain *domain)
1372 struct device_domain_info *info;
1373 bool has_iotlb_device = false;
1375 assert_spin_locked(&device_domain_lock);
1377 list_for_each_entry(info, &domain->devices, link) {
1378 struct pci_dev *pdev;
1380 if (!info->dev || !dev_is_pci(info->dev))
1381 continue;
1383 pdev = to_pci_dev(info->dev);
1384 if (pdev->ats_enabled) {
1385 has_iotlb_device = true;
1386 break;
1390 domain->has_iotlb_device = has_iotlb_device;
1393 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1395 struct pci_dev *pdev;
1397 assert_spin_locked(&device_domain_lock);
1399 if (!info || !dev_is_pci(info->dev))
1400 return;
1402 pdev = to_pci_dev(info->dev);
1403 /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1404 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1405 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1406 * reserved, which should be set to 0.
1408 if (!ecap_dit(info->iommu->ecap))
1409 info->pfsid = 0;
1410 else {
1411 struct pci_dev *pf_pdev;
1413 /* pdev will be returned if device is not a vf */
1414 pf_pdev = pci_physfn(pdev);
1415 info->pfsid = pci_dev_id(pf_pdev);
1418 #ifdef CONFIG_INTEL_IOMMU_SVM
1419 /* The PCIe spec, in its wisdom, declares that the behaviour of
1420 the device if you enable PASID support after ATS support is
1421 undefined. So always enable PASID support on devices which
1422 have it, even if we can't yet know if we're ever going to
1423 use it. */
1424 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1425 info->pasid_enabled = 1;
1427 if (info->pri_supported &&
1428 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1429 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1430 info->pri_enabled = 1;
1431 #endif
1432 if (!pdev->untrusted && info->ats_supported &&
1433 pci_ats_page_aligned(pdev) &&
1434 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1435 info->ats_enabled = 1;
1436 domain_update_iotlb(info->domain);
1437 info->ats_qdep = pci_ats_queue_depth(pdev);
1441 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1443 struct pci_dev *pdev;
1445 assert_spin_locked(&device_domain_lock);
1447 if (!dev_is_pci(info->dev))
1448 return;
1450 pdev = to_pci_dev(info->dev);
1452 if (info->ats_enabled) {
1453 pci_disable_ats(pdev);
1454 info->ats_enabled = 0;
1455 domain_update_iotlb(info->domain);
1457 #ifdef CONFIG_INTEL_IOMMU_SVM
1458 if (info->pri_enabled) {
1459 pci_disable_pri(pdev);
1460 info->pri_enabled = 0;
1462 if (info->pasid_enabled) {
1463 pci_disable_pasid(pdev);
1464 info->pasid_enabled = 0;
1466 #endif
1469 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1470 u64 addr, unsigned mask)
1472 u16 sid, qdep;
1473 unsigned long flags;
1474 struct device_domain_info *info;
1476 if (!domain->has_iotlb_device)
1477 return;
1479 spin_lock_irqsave(&device_domain_lock, flags);
1480 list_for_each_entry(info, &domain->devices, link) {
1481 if (!info->ats_enabled)
1482 continue;
1484 sid = info->bus << 8 | info->devfn;
1485 qdep = info->ats_qdep;
1486 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1487 qdep, addr, mask);
1489 spin_unlock_irqrestore(&device_domain_lock, flags);
1492 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1493 struct dmar_domain *domain,
1494 unsigned long pfn, unsigned int pages,
1495 int ih, int map)
1497 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1498 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1499 u16 did = domain->iommu_did[iommu->seq_id];
1501 BUG_ON(pages == 0);
1503 if (ih)
1504 ih = 1 << 6;
1506 * Fallback to domain selective flush if no PSI support or the size is
1507 * too big.
1508 * PSI requires page size to be 2 ^ x, and the base address is naturally
1509 * aligned to the size
1511 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1512 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1513 DMA_TLB_DSI_FLUSH);
1514 else
1515 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1516 DMA_TLB_PSI_FLUSH);
1519 * In caching mode, changes of pages from non-present to present require
1520 * flush. However, device IOTLB doesn't need to be flushed in this case.
1522 if (!cap_caching_mode(iommu->cap) || !map)
1523 iommu_flush_dev_iotlb(domain, addr, mask);
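/*
 * Worked example (editor's addition, not part of the upstream file):
 * for pages == 9, __roundup_pow_of_two(9) == 16 and mask == ilog2(16) == 4,
 * so the PSI covers 16 VT-d pages (64KiB) naturally aligned to that size.
 * If mask exceeds cap_max_amask_val() the code above falls back to a
 * domain-selective flush instead.
 */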
1526 /* Notification for newly created mappings */
1527 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1528 struct dmar_domain *domain,
1529 unsigned long pfn, unsigned int pages)
1531 /* It's a non-present to present mapping. Only flush if caching mode */
1532 if (cap_caching_mode(iommu->cap))
1533 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1534 else
1535 iommu_flush_write_buffer(iommu);
1538 static void iommu_flush_iova(struct iova_domain *iovad)
1540 struct dmar_domain *domain;
1541 int idx;
1543 domain = container_of(iovad, struct dmar_domain, iovad);
1545 for_each_domain_iommu(idx, domain) {
1546 struct intel_iommu *iommu = g_iommus[idx];
1547 u16 did = domain->iommu_did[iommu->seq_id];
1549 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1551 if (!cap_caching_mode(iommu->cap))
1552 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1553 0, MAX_AGAW_PFN_WIDTH);
1557 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1559 u32 pmen;
1560 unsigned long flags;
1562 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1563 return;
1565 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1566 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1567 pmen &= ~DMA_PMEN_EPM;
1568 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1570 /* wait for the protected region status bit to clear */
1571 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1572 readl, !(pmen & DMA_PMEN_PRS), pmen);
1574 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1577 static void iommu_enable_translation(struct intel_iommu *iommu)
1579 u32 sts;
1580 unsigned long flags;
1582 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1583 iommu->gcmd |= DMA_GCMD_TE;
1584 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1586 /* Make sure hardware complete it */
1587 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1588 readl, (sts & DMA_GSTS_TES), sts);
1590 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1593 static void iommu_disable_translation(struct intel_iommu *iommu)
1595 u32 sts;
1596 unsigned long flag;
1598 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1599 iommu->gcmd &= ~DMA_GCMD_TE;
1600 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1602 /* Make sure hardware complete it */
1603 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1604 readl, (!(sts & DMA_GSTS_TES)), sts);
1606 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1609 static int iommu_init_domains(struct intel_iommu *iommu)
1611 u32 ndomains, nlongs;
1612 size_t size;
1614 ndomains = cap_ndoms(iommu->cap);
1615 pr_debug("%s: Number of Domains supported <%d>\n",
1616 iommu->name, ndomains);
1617 nlongs = BITS_TO_LONGS(ndomains);
1619 spin_lock_init(&iommu->lock);
1621 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1622 if (!iommu->domain_ids) {
1623 pr_err("%s: Allocating domain id array failed\n",
1624 iommu->name);
1625 return -ENOMEM;
1628 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1629 iommu->domains = kzalloc(size, GFP_KERNEL);
1631 if (iommu->domains) {
1632 size = 256 * sizeof(struct dmar_domain *);
1633 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1636 if (!iommu->domains || !iommu->domains[0]) {
1637 pr_err("%s: Allocating domain array failed\n",
1638 iommu->name);
1639 kfree(iommu->domain_ids);
1640 kfree(iommu->domains);
1641 iommu->domain_ids = NULL;
1642 iommu->domains = NULL;
1643 return -ENOMEM;
1647 * If Caching mode is set, then invalid translations are tagged
1648 * with domain-id 0, hence we need to pre-allocate it. We also
1649 * use domain-id 0 as a marker for non-allocated domain-id, so
1650 * make sure it is not used for a real domain.
1652 set_bit(0, iommu->domain_ids);
1655 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1656 * entry for first-level or pass-through translation modes should
1657 * be programmed with a domain id different from those used for
1658 * second-level or nested translation. We reserve a domain id for
1659 * this purpose.
1661 if (sm_supported(iommu))
1662 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1664 return 0;
1667 static void disable_dmar_iommu(struct intel_iommu *iommu)
1669 struct device_domain_info *info, *tmp;
1670 unsigned long flags;
1672 if (!iommu->domains || !iommu->domain_ids)
1673 return;
1675 spin_lock_irqsave(&device_domain_lock, flags);
1676 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1677 if (info->iommu != iommu)
1678 continue;
1680 if (!info->dev || !info->domain)
1681 continue;
1683 __dmar_remove_one_dev_info(info);
1685 spin_unlock_irqrestore(&device_domain_lock, flags);
1687 if (iommu->gcmd & DMA_GCMD_TE)
1688 iommu_disable_translation(iommu);
1691 static void free_dmar_iommu(struct intel_iommu *iommu)
1693 if ((iommu->domains) && (iommu->domain_ids)) {
1694 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1695 int i;
1697 for (i = 0; i < elems; i++)
1698 kfree(iommu->domains[i]);
1699 kfree(iommu->domains);
1700 kfree(iommu->domain_ids);
1701 iommu->domains = NULL;
1702 iommu->domain_ids = NULL;
1705 g_iommus[iommu->seq_id] = NULL;
1707 /* free context mapping */
1708 free_context_table(iommu);
1710 #ifdef CONFIG_INTEL_IOMMU_SVM
1711 if (pasid_supported(iommu)) {
1712 if (ecap_prs(iommu->ecap))
1713 intel_svm_finish_prq(iommu);
1715 #endif
1718 static struct dmar_domain *alloc_domain(int flags)
1720 struct dmar_domain *domain;
1722 domain = alloc_domain_mem();
1723 if (!domain)
1724 return NULL;
1726 memset(domain, 0, sizeof(*domain));
1727 domain->nid = NUMA_NO_NODE;
1728 domain->flags = flags;
1729 domain->has_iotlb_device = false;
1730 INIT_LIST_HEAD(&domain->devices);
1732 return domain;
1735 /* Must be called with iommu->lock */
1736 static int domain_attach_iommu(struct dmar_domain *domain,
1737 struct intel_iommu *iommu)
1739 unsigned long ndomains;
1740 int num;
1742 assert_spin_locked(&device_domain_lock);
1743 assert_spin_locked(&iommu->lock);
1745 domain->iommu_refcnt[iommu->seq_id] += 1;
1746 domain->iommu_count += 1;
1747 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1748 ndomains = cap_ndoms(iommu->cap);
1749 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1751 if (num >= ndomains) {
1752 pr_err("%s: No free domain ids\n", iommu->name);
1753 domain->iommu_refcnt[iommu->seq_id] -= 1;
1754 domain->iommu_count -= 1;
1755 return -ENOSPC;
1758 set_bit(num, iommu->domain_ids);
1759 set_iommu_domain(iommu, num, domain);
1761 domain->iommu_did[iommu->seq_id] = num;
1762 domain->nid = iommu->node;
1764 domain_update_iommu_cap(domain);
1767 return 0;
1770 static int domain_detach_iommu(struct dmar_domain *domain,
1771 struct intel_iommu *iommu)
1773 int num, count;
1775 assert_spin_locked(&device_domain_lock);
1776 assert_spin_locked(&iommu->lock);
1778 domain->iommu_refcnt[iommu->seq_id] -= 1;
1779 count = --domain->iommu_count;
1780 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1781 num = domain->iommu_did[iommu->seq_id];
1782 clear_bit(num, iommu->domain_ids);
1783 set_iommu_domain(iommu, num, NULL);
1785 domain_update_iommu_cap(domain);
1786 domain->iommu_did[iommu->seq_id] = 0;
1789 return count;
1792 static struct iova_domain reserved_iova_list;
1793 static struct lock_class_key reserved_rbtree_key;
1795 static int dmar_init_reserved_ranges(void)
1797 struct pci_dev *pdev = NULL;
1798 struct iova *iova;
1799 int i;
1801 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1803 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1804 &reserved_rbtree_key);
1806 /* IOAPIC ranges shouldn't be accessed by DMA */
1807 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1808 IOVA_PFN(IOAPIC_RANGE_END));
1809 if (!iova) {
1810 pr_err("Reserve IOAPIC range failed\n");
1811 return -ENODEV;
1814 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1815 for_each_pci_dev(pdev) {
1816 struct resource *r;
1818 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1819 r = &pdev->resource[i];
1820 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1821 continue;
1822 iova = reserve_iova(&reserved_iova_list,
1823 IOVA_PFN(r->start),
1824 IOVA_PFN(r->end));
1825 if (!iova) {
1826 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1827 return -ENODEV;
1831 return 0;
1834 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1836 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1839 static inline int guestwidth_to_adjustwidth(int gaw)
1841 int agaw;
1842 int r = (gaw - 12) % 9;
1844 if (r == 0)
1845 agaw = gaw;
1846 else
1847 agaw = gaw + 9 - r;
1848 if (agaw > 64)
1849 agaw = 64;
1850 return agaw;
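/*
 * Worked example (editor's addition, not part of the upstream file):
 * the adjusted width rounds gaw up so that (agaw - 12) is a multiple of
 * the 9-bit stride: gaw 48 gives r = (48 - 12) % 9 = 0, so agaw stays 48;
 * gaw 40 gives r = 1, so agaw = 40 + 9 - 1 = 48.
 */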
1853 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1854 int guest_width)
1856 int adjust_width, agaw;
1857 unsigned long sagaw;
1858 int err;
1860 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1862 err = init_iova_flush_queue(&domain->iovad,
1863 iommu_flush_iova, iova_entry_free);
1864 if (err)
1865 return err;
1867 domain_reserve_special_ranges(domain);
1869 /* calculate AGAW */
1870 if (guest_width > cap_mgaw(iommu->cap))
1871 guest_width = cap_mgaw(iommu->cap);
1872 domain->gaw = guest_width;
1873 adjust_width = guestwidth_to_adjustwidth(guest_width);
1874 agaw = width_to_agaw(adjust_width);
1875 sagaw = cap_sagaw(iommu->cap);
1876 if (!test_bit(agaw, &sagaw)) {
1877 /* hardware doesn't support it, choose a bigger one */
1878 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1879 agaw = find_next_bit(&sagaw, 5, agaw);
1880 if (agaw >= 5)
1881 return -ENODEV;
1883 domain->agaw = agaw;
1885 if (ecap_coherent(iommu->ecap))
1886 domain->iommu_coherency = 1;
1887 else
1888 domain->iommu_coherency = 0;
1890 if (ecap_sc_support(iommu->ecap))
1891 domain->iommu_snooping = 1;
1892 else
1893 domain->iommu_snooping = 0;
1895 if (intel_iommu_superpage)
1896 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1897 else
1898 domain->iommu_superpage = 0;
1900 domain->nid = iommu->node;
1902 /* always allocate the top pgd */
1903 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1904 if (!domain->pgd)
1905 return -ENOMEM;
1906 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1907 return 0;
1910 static void domain_exit(struct dmar_domain *domain)
1913 /* Remove associated devices and clear attached or cached domains */
1914 domain_remove_dev_info(domain);
1916 /* destroy iovas */
1917 put_iova_domain(&domain->iovad);
1919 if (domain->pgd) {
1920 struct page *freelist;
1922 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1923 dma_free_pagelist(freelist);
1926 free_domain_mem(domain);
1930 * Get the PASID directory size for scalable mode context entry.
1931 * Value of X in the PDTS field of a scalable mode context entry
1932 * indicates PASID directory with 2^(X + 7) entries.
1934 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1936 int pds, max_pde;
1938 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1939 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1940 if (pds < 7)
1941 return 0;
1943 return pds - 7;
1947 * Set the RID_PASID field of a scalable mode context entry. The
1948 * IOMMU hardware will use the PASID value set in this field for
1949 * DMA translations of DMA requests without PASID.
1951 static inline void
1952 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1954 context->hi |= pasid & ((1 << 20) - 1);
1955 context->hi |= (1 << 20);
1959 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1960 * entry.
1962 static inline void context_set_sm_dte(struct context_entry *context)
1964 context->lo |= (1 << 2);
1968 * Set the PRE(Page Request Enable) field of a scalable mode context
1969 * entry.
1971 static inline void context_set_sm_pre(struct context_entry *context)
1973 context->lo |= (1 << 4);
1976 /* Convert value to context PASID directory size field coding. */
1977 #define context_pdts(pds) (((pds) & 0x7) << 9)
1979 static int domain_context_mapping_one(struct dmar_domain *domain,
1980 struct intel_iommu *iommu,
1981 struct pasid_table *table,
1982 u8 bus, u8 devfn)
1984 u16 did = domain->iommu_did[iommu->seq_id];
1985 int translation = CONTEXT_TT_MULTI_LEVEL;
1986 struct device_domain_info *info = NULL;
1987 struct context_entry *context;
1988 unsigned long flags;
1989 int ret;
1991 WARN_ON(did == 0);
1993 if (hw_pass_through && domain_type_is_si(domain))
1994 translation = CONTEXT_TT_PASS_THROUGH;
1996 pr_debug("Set context mapping for %02x:%02x.%d\n",
1997 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1999 BUG_ON(!domain->pgd);
2001 spin_lock_irqsave(&device_domain_lock, flags);
2002 spin_lock(&iommu->lock);
2004 ret = -ENOMEM;
2005 context = iommu_context_addr(iommu, bus, devfn, 1);
2006 if (!context)
2007 goto out_unlock;
2009 ret = 0;
2010 if (context_present(context))
2011 goto out_unlock;
2014 * For kdump cases, old valid entries may be cached due to the
2015 * in-flight DMA and copied pgtable, but there is no unmapping
2016 * behaviour for them, thus we need an explicit cache flush for
2017 * the newly-mapped device. For kdump, at this point, the device
2018 * is supposed to finish reset at its driver probe stage, so no
2019 * in-flight DMA will exist, and we don't need to worry anymore
2020 * hereafter.
2022 if (context_copied(context)) {
2023 u16 did_old = context_domain_id(context);
2025 if (did_old < cap_ndoms(iommu->cap)) {
2026 iommu->flush.flush_context(iommu, did_old,
2027 (((u16)bus) << 8) | devfn,
2028 DMA_CCMD_MASK_NOBIT,
2029 DMA_CCMD_DEVICE_INVL);
2030 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2031 DMA_TLB_DSI_FLUSH);
2035 context_clear_entry(context);
2037 if (sm_supported(iommu)) {
2038 unsigned long pds;
2040 WARN_ON(!table);
2042 /* Setup the PASID DIR pointer: */
2043 pds = context_get_sm_pds(table);
2044 context->lo = (u64)virt_to_phys(table->table) |
2045 context_pdts(pds);
2047 /* Setup the RID_PASID field: */
2048 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2051 * Setup the Device-TLB enable bit and Page request
2052 * Enable bit:
2054 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2055 if (info && info->ats_supported)
2056 context_set_sm_dte(context);
2057 if (info && info->pri_supported)
2058 context_set_sm_pre(context);
2059 } else {
2060 struct dma_pte *pgd = domain->pgd;
2061 int agaw;
2063 context_set_domain_id(context, did);
2065 if (translation != CONTEXT_TT_PASS_THROUGH) {
2067 * Skip top levels of page tables for iommu which has
2068 * less agaw than default. Unnecessary for PT mode.
2070 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2071 ret = -ENOMEM;
2072 pgd = phys_to_virt(dma_pte_addr(pgd));
2073 if (!dma_pte_present(pgd))
2074 goto out_unlock;
2077 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2078 if (info && info->ats_supported)
2079 translation = CONTEXT_TT_DEV_IOTLB;
2080 else
2081 translation = CONTEXT_TT_MULTI_LEVEL;
2083 context_set_address_root(context, virt_to_phys(pgd));
2084 context_set_address_width(context, agaw);
2085 } else {
2087 * In pass through mode, AW must be programmed to
2088 * indicate the largest AGAW value supported by
2089 * hardware. And ASR is ignored by hardware.
2091 context_set_address_width(context, iommu->msagaw);
2094 context_set_translation_type(context, translation);
2097 context_set_fault_enable(context);
2098 context_set_present(context);
2099 if (!ecap_coherent(iommu->ecap))
2100 clflush_cache_range(context, sizeof(*context));
2103 * It's a non-present to present mapping. If hardware doesn't cache
2104 * non-present entry we only need to flush the write-buffer. If the hardware
2105 * _does_ cache non-present entries, then it does so in the special
2106 * domain #0, which we have to flush:
2108 if (cap_caching_mode(iommu->cap)) {
2109 iommu->flush.flush_context(iommu, 0,
2110 (((u16)bus) << 8) | devfn,
2111 DMA_CCMD_MASK_NOBIT,
2112 DMA_CCMD_DEVICE_INVL);
2113 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2114 } else {
2115 iommu_flush_write_buffer(iommu);
2117 iommu_enable_dev_iotlb(info);
2119 ret = 0;
2121 out_unlock:
2122 spin_unlock(&iommu->lock);
2123 spin_unlock_irqrestore(&device_domain_lock, flags);
2125 return ret;
2128 struct domain_context_mapping_data {
2129 struct dmar_domain *domain;
2130 struct intel_iommu *iommu;
2131 struct pasid_table *table;
2134 static int domain_context_mapping_cb(struct pci_dev *pdev,
2135 u16 alias, void *opaque)
2137 struct domain_context_mapping_data *data = opaque;
2139 return domain_context_mapping_one(data->domain, data->iommu,
2140 data->table, PCI_BUS_NUM(alias),
2141 alias & 0xff);
2144 static int
2145 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2147 struct domain_context_mapping_data data;
2148 struct pasid_table *table;
2149 struct intel_iommu *iommu;
2150 u8 bus, devfn;
2152 iommu = device_to_iommu(dev, &bus, &devfn);
2153 if (!iommu)
2154 return -ENODEV;
2156 table = intel_pasid_get_table(dev);
2158 if (!dev_is_pci(dev))
2159 return domain_context_mapping_one(domain, iommu, table,
2160 bus, devfn);
2162 data.domain = domain;
2163 data.iommu = iommu;
2164 data.table = table;
2166 return pci_for_each_dma_alias(to_pci_dev(dev),
2167 &domain_context_mapping_cb, &data);
2170 static int domain_context_mapped_cb(struct pci_dev *pdev,
2171 u16 alias, void *opaque)
2173 struct intel_iommu *iommu = opaque;
2175 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2178 static int domain_context_mapped(struct device *dev)
2180 struct intel_iommu *iommu;
2181 u8 bus, devfn;
2183 iommu = device_to_iommu(dev, &bus, &devfn);
2184 if (!iommu)
2185 return -ENODEV;
2187 if (!dev_is_pci(dev))
2188 return device_context_mapped(iommu, bus, devfn);
2190 return !pci_for_each_dma_alias(to_pci_dev(dev),
2191 domain_context_mapped_cb, iommu);
2194 /* Returns a number of VTD pages, but aligned to MM page size */
2195 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2196 size_t size)
2198 host_addr &= ~PAGE_MASK;
2199 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2202 /* Return largest possible superpage level for a given mapping */
2203 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2204 unsigned long iov_pfn,
2205 unsigned long phy_pfn,
2206 unsigned long pages)
2208 int support, level = 1;
2209 unsigned long pfnmerge;
2211 support = domain->iommu_superpage;
2213 /* To use a large page, the virtual *and* physical addresses
2214 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2215 of them will mean we have to use smaller pages. So just
2216 merge them and check both at once. */
2217 pfnmerge = iov_pfn | phy_pfn;
2219 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2220 pages >>= VTD_STRIDE_SHIFT;
2221 if (!pages)
2222 break;
2223 pfnmerge >>= VTD_STRIDE_SHIFT;
2224 level++;
2225 support--;
2227 return level;
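/*
 * __domain_mapping() - install PTEs for a run of IOVA page frames.
 * The physical side comes either from @sg or from @phys_pfn.  Superpages
 * are used whenever alignment and remaining length permit, and the CPU
 * cache is flushed for each page worth of PTEs that has been written.
 */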
2230 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2231 struct scatterlist *sg, unsigned long phys_pfn,
2232 unsigned long nr_pages, int prot)
2234 struct dma_pte *first_pte = NULL, *pte = NULL;
2235 phys_addr_t uninitialized_var(pteval);
2236 unsigned long sg_res = 0;
2237 unsigned int largepage_lvl = 0;
2238 unsigned long lvl_pages = 0;
2240 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2242 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2243 return -EINVAL;
2245 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2247 if (!sg) {
2248 sg_res = nr_pages;
2249 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2252 while (nr_pages > 0) {
2253 uint64_t tmp;
2255 if (!sg_res) {
2256 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2258 sg_res = aligned_nrpages(sg->offset, sg->length);
2259 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2260 sg->dma_length = sg->length;
2261 pteval = (sg_phys(sg) - pgoff) | prot;
2262 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2265 if (!pte) {
2266 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2268 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2269 if (!pte)
2270 return -ENOMEM;
2271 /* It is a large page */
2272 if (largepage_lvl > 1) {
2273 unsigned long nr_superpages, end_pfn;
2275 pteval |= DMA_PTE_LARGE_PAGE;
2276 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2278 nr_superpages = sg_res / lvl_pages;
2279 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2282 * Ensure that old small page tables are
2283 * removed to make room for superpage(s).
2284 * We're adding new large pages, so make sure
2285 * we don't remove their parent tables.
2287 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2288 largepage_lvl + 1);
2289 } else {
2290 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2294 /* We don't need a lock here; nobody else
2295 * touches the iova range
2297 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2298 if (tmp) {
2299 static int dumps = 5;
2300 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2301 iov_pfn, tmp, (unsigned long long)pteval);
2302 if (dumps) {
2303 dumps--;
2304 debug_dma_dump_mappings(NULL);
2306 WARN_ON(1);
2309 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2311 BUG_ON(nr_pages < lvl_pages);
2312 BUG_ON(sg_res < lvl_pages);
2314 nr_pages -= lvl_pages;
2315 iov_pfn += lvl_pages;
2316 phys_pfn += lvl_pages;
2317 pteval += lvl_pages * VTD_PAGE_SIZE;
2318 sg_res -= lvl_pages;
2320 /* If the next PTE would be the first in a new page, then we
2321 need to flush the cache on the entries we've just written.
2322 And then we'll need to recalculate 'pte', so clear it and
2323 let it get set again in the if (!pte) block above.
2325 If we're done (!nr_pages) we need to flush the cache too.
2327 Also if we've been setting superpages, we may need to
2328 recalculate 'pte' and switch back to smaller pages for the
2329 end of the mapping, if the trailing size is not enough to
2330 use another superpage (i.e. sg_res < lvl_pages). */
2331 pte++;
2332 if (!nr_pages || first_pte_in_page(pte) ||
2333 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2334 domain_flush_cache(domain, first_pte,
2335 (void *)pte - (void *)first_pte);
2336 pte = NULL;
2339 if (!sg_res && nr_pages)
2340 sg = sg_next(sg);
2342 return 0;
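/*
 * domain_mapping() - perform the real mapping via __domain_mapping() and
 * then notify every IOMMU that serves this domain about the newly mapped
 * range so the required cache/write-buffer flushing can be done.
 */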
2345 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2346 struct scatterlist *sg, unsigned long phys_pfn,
2347 unsigned long nr_pages, int prot)
2349 int iommu_id, ret;
2350 struct intel_iommu *iommu;
2352 /* Do the real mapping first */
2353 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2354 if (ret)
2355 return ret;
2357 for_each_domain_iommu(iommu_id, domain) {
2358 iommu = g_iommus[iommu_id];
2359 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2362 return 0;
2365 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2366 struct scatterlist *sg, unsigned long nr_pages,
2367 int prot)
2369 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2372 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2373 unsigned long phys_pfn, unsigned long nr_pages,
2374 int prot)
2376 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
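/*
 * domain_context_clear_one() - clear the context entry for one bus/devfn
 * under @iommu and invalidate the context-cache and IOTLB entries tagged
 * with the domain id the entry used to carry.
 */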
2379 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2381 unsigned long flags;
2382 struct context_entry *context;
2383 u16 did_old;
2385 if (!iommu)
2386 return;
2388 spin_lock_irqsave(&iommu->lock, flags);
2389 context = iommu_context_addr(iommu, bus, devfn, 0);
2390 if (!context) {
2391 spin_unlock_irqrestore(&iommu->lock, flags);
2392 return;
2394 did_old = context_domain_id(context);
2395 context_clear_entry(context);
2396 __iommu_flush_cache(iommu, context, sizeof(*context));
2397 spin_unlock_irqrestore(&iommu->lock, flags);
2398 iommu->flush.flush_context(iommu,
2399 did_old,
2400 (((u16)bus) << 8) | devfn,
2401 DMA_CCMD_MASK_NOBIT,
2402 DMA_CCMD_DEVICE_INVL);
2403 iommu->flush.flush_iotlb(iommu,
2404 did_old,
2407 DMA_TLB_DSI_FLUSH);
2410 static inline void unlink_domain_info(struct device_domain_info *info)
2412 assert_spin_locked(&device_domain_lock);
2413 list_del(&info->link);
2414 list_del(&info->global);
2415 if (info->dev)
2416 info->dev->archdata.iommu = NULL;
2419 static void domain_remove_dev_info(struct dmar_domain *domain)
2421 struct device_domain_info *info, *tmp;
2422 unsigned long flags;
2424 spin_lock_irqsave(&device_domain_lock, flags);
2425 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2426 __dmar_remove_one_dev_info(info);
2427 spin_unlock_irqrestore(&device_domain_lock, flags);
2431 * find_domain
2432 * Note: we use struct device->archdata.iommu to store the info
2434 static struct dmar_domain *find_domain(struct device *dev)
2436 struct device_domain_info *info;
2438 if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2439 struct iommu_domain *domain;
2441 dev->archdata.iommu = NULL;
2442 domain = iommu_get_domain_for_dev(dev);
2443 if (domain)
2444 intel_iommu_attach_device(domain, dev);
2447 /* No lock here, assumes no domain exit in normal case */
2448 info = dev->archdata.iommu;
2450 if (likely(info))
2451 return info->domain;
2452 return NULL;
2455 static inline struct device_domain_info *
2456 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2458 struct device_domain_info *info;
2460 list_for_each_entry(info, &device_domain_list, global)
2461 if (info->iommu->segment == segment && info->bus == bus &&
2462 info->devfn == devfn)
2463 return info;
2465 return NULL;
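/*
 * dmar_insert_one_dev_info() - allocate and link a device_domain_info for
 * @dev (or one of its DMA aliases), attach @domain to @iommu, set up the
 * PASID table and RID2PASID entry in scalable mode, and program the
 * context entry.  Returns the domain actually in use, which may be a
 * pre-existing one found for the device or its alias.
 */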
2468 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2469 int bus, int devfn,
2470 struct device *dev,
2471 struct dmar_domain *domain)
2473 struct dmar_domain *found = NULL;
2474 struct device_domain_info *info;
2475 unsigned long flags;
2476 int ret;
2478 info = alloc_devinfo_mem();
2479 if (!info)
2480 return NULL;
2482 info->bus = bus;
2483 info->devfn = devfn;
2484 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2485 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2486 info->ats_qdep = 0;
2487 info->dev = dev;
2488 info->domain = domain;
2489 info->iommu = iommu;
2490 info->pasid_table = NULL;
2491 info->auxd_enabled = 0;
2492 INIT_LIST_HEAD(&info->auxiliary_domains);
2494 if (dev && dev_is_pci(dev)) {
2495 struct pci_dev *pdev = to_pci_dev(info->dev);
2497 if (!pdev->untrusted &&
2498 !pci_ats_disabled() &&
2499 ecap_dev_iotlb_support(iommu->ecap) &&
2500 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2501 dmar_find_matched_atsr_unit(pdev))
2502 info->ats_supported = 1;
2504 if (sm_supported(iommu)) {
2505 if (pasid_supported(iommu)) {
2506 int features = pci_pasid_features(pdev);
2507 if (features >= 0)
2508 info->pasid_supported = features | 1;
2511 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2512 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2513 info->pri_supported = 1;
2517 spin_lock_irqsave(&device_domain_lock, flags);
2518 if (dev)
2519 found = find_domain(dev);
2521 if (!found) {
2522 struct device_domain_info *info2;
2523 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2524 if (info2) {
2525 found = info2->domain;
2526 info2->dev = dev;
2530 if (found) {
2531 spin_unlock_irqrestore(&device_domain_lock, flags);
2532 free_devinfo_mem(info);
2533 /* Caller must free the original domain */
2534 return found;
2537 spin_lock(&iommu->lock);
2538 ret = domain_attach_iommu(domain, iommu);
2539 spin_unlock(&iommu->lock);
2541 if (ret) {
2542 spin_unlock_irqrestore(&device_domain_lock, flags);
2543 free_devinfo_mem(info);
2544 return NULL;
2547 list_add(&info->link, &domain->devices);
2548 list_add(&info->global, &device_domain_list);
2549 if (dev)
2550 dev->archdata.iommu = info;
2551 spin_unlock_irqrestore(&device_domain_lock, flags);
2553 /* PASID table is mandatory for a PCI device in scalable mode. */
2554 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2555 ret = intel_pasid_alloc_table(dev);
2556 if (ret) {
2557 dev_err(dev, "PASID table allocation failed\n");
2558 dmar_remove_one_dev_info(dev);
2559 return NULL;
2562 /* Setup the PASID entry for requests without PASID: */
2563 spin_lock(&iommu->lock);
2564 if (hw_pass_through && domain_type_is_si(domain))
2565 ret = intel_pasid_setup_pass_through(iommu, domain,
2566 dev, PASID_RID2PASID);
2567 else
2568 ret = intel_pasid_setup_second_level(iommu, domain,
2569 dev, PASID_RID2PASID);
2570 spin_unlock(&iommu->lock);
2571 if (ret) {
2572 dev_err(dev, "Setup RID2PASID failed\n");
2573 dmar_remove_one_dev_info(dev);
2574 return NULL;
2578 if (dev && domain_context_mapping(domain, dev)) {
2579 dev_err(dev, "Domain context map failed\n");
2580 dmar_remove_one_dev_info(dev);
2581 return NULL;
2584 return domain;
2587 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2589 *(u16 *)opaque = alias;
2590 return 0;
2593 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2595 struct device_domain_info *info;
2596 struct dmar_domain *domain = NULL;
2597 struct intel_iommu *iommu;
2598 u16 dma_alias;
2599 unsigned long flags;
2600 u8 bus, devfn;
2602 iommu = device_to_iommu(dev, &bus, &devfn);
2603 if (!iommu)
2604 return NULL;
2606 if (dev_is_pci(dev)) {
2607 struct pci_dev *pdev = to_pci_dev(dev);
2609 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2611 spin_lock_irqsave(&device_domain_lock, flags);
2612 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2613 PCI_BUS_NUM(dma_alias),
2614 dma_alias & 0xff);
2615 if (info) {
2616 iommu = info->iommu;
2617 domain = info->domain;
2619 spin_unlock_irqrestore(&device_domain_lock, flags);
2621 /* DMA alias already has a domain, use it */
2622 if (info)
2623 goto out;
2626 /* Allocate and initialize new domain for the device */
2627 domain = alloc_domain(0);
2628 if (!domain)
2629 return NULL;
2630 if (domain_init(domain, iommu, gaw)) {
2631 domain_exit(domain);
2632 return NULL;
2635 out:
2636 return domain;
2639 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2640 struct dmar_domain *domain)
2642 struct intel_iommu *iommu;
2643 struct dmar_domain *tmp;
2644 u16 req_id, dma_alias;
2645 u8 bus, devfn;
2647 iommu = device_to_iommu(dev, &bus, &devfn);
2648 if (!iommu)
2649 return NULL;
2651 req_id = ((u16)bus << 8) | devfn;
2653 if (dev_is_pci(dev)) {
2654 struct pci_dev *pdev = to_pci_dev(dev);
2656 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2658 /* register PCI DMA alias device */
2659 if (req_id != dma_alias) {
2660 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2661 dma_alias & 0xff, NULL, domain);
2663 if (!tmp || tmp != domain)
2664 return tmp;
2668 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2669 if (!tmp || tmp != domain)
2670 return tmp;
2672 return domain;
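/*
 * iommu_domain_identity_map() - reserve the IOVA range [start, end] and
 * install a 1:1 mapping for it in @domain, clearing any PTEs that already
 * cover the range (an RMRR may overlap ordinary physical memory).
 */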
2675 static int iommu_domain_identity_map(struct dmar_domain *domain,
2676 unsigned long long start,
2677 unsigned long long end)
2679 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2680 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2682 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2683 dma_to_mm_pfn(last_vpfn))) {
2684 pr_err("Reserving iova failed\n");
2685 return -ENOMEM;
2688 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2690 * The RMRR range might overlap with the physical memory range,
2691 * so clear it first
2693 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2695 return __domain_mapping(domain, first_vpfn, NULL,
2696 first_vpfn, last_vpfn - first_vpfn + 1,
2697 DMA_PTE_READ|DMA_PTE_WRITE);
2700 static int domain_prepare_identity_map(struct device *dev,
2701 struct dmar_domain *domain,
2702 unsigned long long start,
2703 unsigned long long end)
2705 /* For _hardware_ passthrough, don't bother. But for software
2706 passthrough, we do it anyway -- it may indicate a memory
2707 range which is reserved in E820 and so didn't get set
2708 up to start with in si_domain */
2709 if (domain == si_domain && hw_pass_through) {
2710 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2711 start, end);
2712 return 0;
2715 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2717 if (end < start) {
2718 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2719 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2720 dmi_get_system_info(DMI_BIOS_VENDOR),
2721 dmi_get_system_info(DMI_BIOS_VERSION),
2722 dmi_get_system_info(DMI_PRODUCT_VERSION));
2723 return -EIO;
2726 if (end >> agaw_to_width(domain->agaw)) {
2727 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2728 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2729 agaw_to_width(domain->agaw),
2730 dmi_get_system_info(DMI_BIOS_VENDOR),
2731 dmi_get_system_info(DMI_BIOS_VERSION),
2732 dmi_get_system_info(DMI_PRODUCT_VERSION));
2733 return -EIO;
2736 return iommu_domain_identity_map(domain, start, end);
2739 static int md_domain_init(struct dmar_domain *domain, int guest_width);
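/*
 * si_domain_init() - allocate the static identity domain and, unless
 * hardware pass-through is in use, identity map all usable memory ranges
 * plus all RMRR regions into it.
 */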
2741 static int __init si_domain_init(int hw)
2743 struct dmar_rmrr_unit *rmrr;
2744 struct device *dev;
2745 int i, nid, ret;
2747 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2748 if (!si_domain)
2749 return -EFAULT;
2751 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2752 domain_exit(si_domain);
2753 return -EFAULT;
2756 if (hw)
2757 return 0;
2759 for_each_online_node(nid) {
2760 unsigned long start_pfn, end_pfn;
2761 int i;
2763 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2764 ret = iommu_domain_identity_map(si_domain,
2765 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2766 if (ret)
2767 return ret;
2772 * Identity map the RMRRs so that devices with RMRRs can also use
2773 * the si_domain.
2775 for_each_rmrr_units(rmrr) {
2776 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2777 i, dev) {
2778 unsigned long long start = rmrr->base_address;
2779 unsigned long long end = rmrr->end_address;
2781 if (WARN_ON(end < start ||
2782 end >> agaw_to_width(si_domain->agaw)))
2783 continue;
2785 ret = iommu_domain_identity_map(si_domain, start, end);
2786 if (ret)
2787 return ret;
2791 return 0;
2794 static int identity_mapping(struct device *dev)
2796 struct device_domain_info *info;
2798 info = dev->archdata.iommu;
2799 if (info && info != DUMMY_DEVICE_DOMAIN_INFO && info != DEFER_DEVICE_DOMAIN_INFO)
2800 return (info->domain == si_domain);
2802 return 0;
2805 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2807 struct dmar_domain *ndomain;
2808 struct intel_iommu *iommu;
2809 u8 bus, devfn;
2811 iommu = device_to_iommu(dev, &bus, &devfn);
2812 if (!iommu)
2813 return -ENODEV;
2815 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2816 if (ndomain != domain)
2817 return -EBUSY;
2819 return 0;
2822 static bool device_has_rmrr(struct device *dev)
2824 struct dmar_rmrr_unit *rmrr;
2825 struct device *tmp;
2826 int i;
2828 rcu_read_lock();
2829 for_each_rmrr_units(rmrr) {
2831 * Return TRUE if this RMRR contains the device that
2832 * is passed in.
2834 for_each_active_dev_scope(rmrr->devices,
2835 rmrr->devices_cnt, i, tmp)
2836 if (tmp == dev ||
2837 is_downstream_to_pci_bridge(dev, tmp)) {
2838 rcu_read_unlock();
2839 return true;
2842 rcu_read_unlock();
2843 return false;
2847 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2848 * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2849 * @dev: device handle
2851 * We assume that PCI USB devices with RMRRs have them largely
2852 * for historical reasons and that the RMRR space is not actively used post
2853 * boot. This exclusion may change if vendors begin to abuse it.
2855 * The same exception is made for graphics devices, with the requirement that
2856 * any use of the RMRR regions will be torn down before assigning the device
2857 * to a guest.
2859 * Return: true if the RMRR is relaxable, false otherwise
2861 static bool device_rmrr_is_relaxable(struct device *dev)
2863 struct pci_dev *pdev;
2865 if (!dev_is_pci(dev))
2866 return false;
2868 pdev = to_pci_dev(dev);
2869 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2870 return true;
2871 else
2872 return false;
2876 * There are a couple cases where we need to restrict the functionality of
2877 * devices associated with RMRRs. The first is when evaluating a device for
2878 * identity mapping because problems exist when devices are moved in and out
2879 * of domains and their respective RMRR information is lost. This means that
2880 * a device with associated RMRRs will never be in a "passthrough" domain.
2881 * The second is use of the device through the IOMMU API. This interface
2882 * expects to have full control of the IOVA space for the device. We cannot
2883 * satisfy both the requirement that RMRR access is maintained and have an
2884 * unencumbered IOVA space. We also have no ability to quiesce the device's
2885 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2886 * We therefore prevent devices associated with an RMRR from participating in
2887 * the IOMMU API, which eliminates them from device assignment.
2889 * In both cases, devices which have relaxable RMRRs are not concerned by this
2890 * restriction. See device_rmrr_is_relaxable comment.
2892 static bool device_is_rmrr_locked(struct device *dev)
2894 if (!device_has_rmrr(dev))
2895 return false;
2897 if (device_rmrr_is_relaxable(dev))
2898 return false;
2900 return true;
2904 * Return the required default domain type for a specific device.
2906 * @dev: the device in question
2909 * Returns:
2910 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2911 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2912 * - 0: both identity and dynamic domains work for this device
2914 static int device_def_domain_type(struct device *dev)
2916 if (dev_is_pci(dev)) {
2917 struct pci_dev *pdev = to_pci_dev(dev);
2920 * Prevent any device marked as untrusted from getting
2921 * placed into the statically identity mapping domain.
2923 if (pdev->untrusted)
2924 return IOMMU_DOMAIN_DMA;
2926 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2927 return IOMMU_DOMAIN_IDENTITY;
2929 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2930 return IOMMU_DOMAIN_IDENTITY;
2933 * We want to start off with all devices in the 1:1 domain, and
2934 * take them out later if we find they can't access all of memory.
2936 * However, we can't do this for PCI devices behind bridges,
2937 * because all PCI devices behind the same bridge will end up
2938 * with the same source-id on their transactions.
2940 * Practically speaking, we can't change things around for these
2941 * devices at run-time, because we can't be sure there'll be no
2942 * DMA transactions in flight for any of their siblings.
2944 * So PCI devices (unless they're on the root bus) as well as
2945 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2946 * the 1:1 domain, just in _case_ one of their siblings turns out
2947 * not to be able to map all of memory.
2949 if (!pci_is_pcie(pdev)) {
2950 if (!pci_is_root_bus(pdev->bus))
2951 return IOMMU_DOMAIN_DMA;
2952 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2953 return IOMMU_DOMAIN_DMA;
2954 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2955 return IOMMU_DOMAIN_DMA;
2958 return (iommu_identity_mapping & IDENTMAP_ALL) ?
2959 IOMMU_DOMAIN_IDENTITY : 0;
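/*
 * intel_iommu_init_qi() - try to enable queued invalidation for @iommu and
 * hook up the matching flush callbacks, falling back to register-based
 * invalidation if QI cannot be enabled.
 */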
2962 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2965 * Start from a sane iommu hardware state.
2966 * If queued invalidation was already initialized by us
2967 * (for example, while enabling interrupt-remapping) then
2968 * things are already rolling from a sane state.
2970 if (!iommu->qi) {
2972 * Clear any previous faults.
2974 dmar_fault(-1, iommu);
2976 * Disable queued invalidation if supported and already enabled
2977 * before OS handover.
2979 dmar_disable_qi(iommu);
2982 if (dmar_enable_qi(iommu)) {
2984 * Queued Invalidate not enabled, use Register Based Invalidate
2986 iommu->flush.flush_context = __iommu_flush_context;
2987 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2988 pr_info("%s: Using Register based invalidation\n",
2989 iommu->name);
2990 } else {
2991 iommu->flush.flush_context = qi_flush_context;
2992 iommu->flush.flush_iotlb = qi_flush_iotlb;
2993 pr_info("%s: Using Queued invalidation\n", iommu->name);
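/*
 * copy_context_table() - copy one bus worth of context entries from the
 * previous kernel's tables (kdump case) into freshly allocated pages,
 * marking every copied entry and reserving its domain id.
 */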
2997 static int copy_context_table(struct intel_iommu *iommu,
2998 struct root_entry *old_re,
2999 struct context_entry **tbl,
3000 int bus, bool ext)
3002 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3003 struct context_entry *new_ce = NULL, ce;
3004 struct context_entry *old_ce = NULL;
3005 struct root_entry re;
3006 phys_addr_t old_ce_phys;
3008 tbl_idx = ext ? bus * 2 : bus;
3009 memcpy(&re, old_re, sizeof(re));
3011 for (devfn = 0; devfn < 256; devfn++) {
3012 /* First calculate the correct index */
3013 idx = (ext ? devfn * 2 : devfn) % 256;
3015 if (idx == 0) {
3016 /* First save what we may have and clean up */
3017 if (new_ce) {
3018 tbl[tbl_idx] = new_ce;
3019 __iommu_flush_cache(iommu, new_ce,
3020 VTD_PAGE_SIZE);
3021 pos = 1;
3024 if (old_ce)
3025 memunmap(old_ce);
3027 ret = 0;
3028 if (devfn < 0x80)
3029 old_ce_phys = root_entry_lctp(&re);
3030 else
3031 old_ce_phys = root_entry_uctp(&re);
3033 if (!old_ce_phys) {
3034 if (ext && devfn == 0) {
3035 /* No LCTP, try UCTP */
3036 devfn = 0x7f;
3037 continue;
3038 } else {
3039 goto out;
3043 ret = -ENOMEM;
3044 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3045 MEMREMAP_WB);
3046 if (!old_ce)
3047 goto out;
3049 new_ce = alloc_pgtable_page(iommu->node);
3050 if (!new_ce)
3051 goto out_unmap;
3053 ret = 0;
3056 /* Now copy the context entry */
3057 memcpy(&ce, old_ce + idx, sizeof(ce));
3059 if (!__context_present(&ce))
3060 continue;
3062 did = context_domain_id(&ce);
3063 if (did >= 0 && did < cap_ndoms(iommu->cap))
3064 set_bit(did, iommu->domain_ids);
3067 * We need a marker for copied context entries. This
3068 * marker needs to work for the old format as well as
3069 * for extended context entries.
3071 * Bit 67 of the context entry is used. In the old
3072 * format this bit is available to software, in the
3073 * extended format it is the PGE bit, but PGE is ignored
3074 * by HW if PASIDs are disabled (and thus still
3075 * available).
3077 * So disable PASIDs first and then mark the entry
3078 * copied. This means that we don't copy PASID
3079 * translations from the old kernel, but this is fine as
3080 * faults there are not fatal.
3082 context_clear_pasid_enable(&ce);
3083 context_set_copied(&ce);
3085 new_ce[idx] = ce;
3088 tbl[tbl_idx + pos] = new_ce;
3090 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3092 out_unmap:
3093 memunmap(old_ce);
3095 out:
3096 return ret;
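/*
 * copy_translation_tables() - in a kdump kernel, take over the translation
 * structures left behind by the previous kernel: copy its context tables
 * and hook them into our newly allocated root entry table.
 */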
3099 static int copy_translation_tables(struct intel_iommu *iommu)
3101 struct context_entry **ctxt_tbls;
3102 struct root_entry *old_rt;
3103 phys_addr_t old_rt_phys;
3104 int ctxt_table_entries;
3105 unsigned long flags;
3106 u64 rtaddr_reg;
3107 int bus, ret;
3108 bool new_ext, ext;
3110 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3111 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3112 new_ext = !!ecap_ecs(iommu->ecap);
3115 * The RTT bit can only be changed when translation is disabled,
3116 * but disabling translation would open a window for data
3117 * corruption. So bail out and don't copy anything if we would
3118 * have to change the bit.
3120 if (new_ext != ext)
3121 return -EINVAL;
3123 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3124 if (!old_rt_phys)
3125 return -EINVAL;
3127 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3128 if (!old_rt)
3129 return -ENOMEM;
3131 /* This is too big for the stack - allocate it from slab */
3132 ctxt_table_entries = ext ? 512 : 256;
3133 ret = -ENOMEM;
3134 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3135 if (!ctxt_tbls)
3136 goto out_unmap;
3138 for (bus = 0; bus < 256; bus++) {
3139 ret = copy_context_table(iommu, &old_rt[bus],
3140 ctxt_tbls, bus, ext);
3141 if (ret) {
3142 pr_err("%s: Failed to copy context table for bus %d\n",
3143 iommu->name, bus);
3144 continue;
3148 spin_lock_irqsave(&iommu->lock, flags);
3150 /* Context tables are copied, now write them to the root_entry table */
3151 for (bus = 0; bus < 256; bus++) {
3152 int idx = ext ? bus * 2 : bus;
3153 u64 val;
3155 if (ctxt_tbls[idx]) {
3156 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3157 iommu->root_entry[bus].lo = val;
3160 if (!ext || !ctxt_tbls[idx + 1])
3161 continue;
3163 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3164 iommu->root_entry[bus].hi = val;
3167 spin_unlock_irqrestore(&iommu->lock, flags);
3169 kfree(ctxt_tbls);
3171 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3173 ret = 0;
3175 out_unmap:
3176 memunmap(old_rt);
3178 return ret;
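/*
 * init_dmars() - boot-time setup for all DMAR units: allocate the global
 * iommu array, initialize domain ids, root/context tables and invalidation
 * for every IOMMU, create the static identity domain, and finally enable
 * fault reporting (plus SVM page requests where supported).
 */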
3181 static int __init init_dmars(void)
3183 struct dmar_drhd_unit *drhd;
3184 struct intel_iommu *iommu;
3185 int ret;
3188 * for each drhd
3189 * allocate root
3190 * initialize and program root entry to not present
3191 * endfor
3193 for_each_drhd_unit(drhd) {
3195 * No lock needed, as this is only incremented in the
3196 * single-threaded kernel __init code path; all other accesses
3197 * are read only
3199 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3200 g_num_of_iommus++;
3201 continue;
3203 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3206 /* Preallocate enough resources for IOMMU hot-addition */
3207 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3208 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3210 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3211 GFP_KERNEL);
3212 if (!g_iommus) {
3213 pr_err("Allocating global iommu array failed\n");
3214 ret = -ENOMEM;
3215 goto error;
3218 for_each_iommu(iommu, drhd) {
3219 if (drhd->ignored) {
3220 iommu_disable_translation(iommu);
3221 continue;
3225 * Find the max pasid size of all IOMMUs in the system.
3226 * We need to ensure the system pasid table is no bigger
3227 * than the smallest supported.
3229 if (pasid_supported(iommu)) {
3230 u32 temp = 2 << ecap_pss(iommu->ecap);
3232 intel_pasid_max_id = min_t(u32, temp,
3233 intel_pasid_max_id);
3236 g_iommus[iommu->seq_id] = iommu;
3238 intel_iommu_init_qi(iommu);
3240 ret = iommu_init_domains(iommu);
3241 if (ret)
3242 goto free_iommu;
3244 init_translation_status(iommu);
3246 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3247 iommu_disable_translation(iommu);
3248 clear_translation_pre_enabled(iommu);
3249 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3250 iommu->name);
3254 * TBD:
3255 * we could share the same root & context tables
3256 * among all IOMMUs. Need to split it later.
3258 ret = iommu_alloc_root_entry(iommu);
3259 if (ret)
3260 goto free_iommu;
3262 if (translation_pre_enabled(iommu)) {
3263 pr_info("Translation already enabled - trying to copy translation structures\n");
3265 ret = copy_translation_tables(iommu);
3266 if (ret) {
3268 * We found the IOMMU with translation
3269 * enabled - but failed to copy over the
3270 * old root-entry table. Try to proceed
3271 * by disabling translation now and
3272 * allocating a clean root-entry table.
3273 * This might cause DMAR faults, but
3274 * probably the dump will still succeed.
3276 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3277 iommu->name);
3278 iommu_disable_translation(iommu);
3279 clear_translation_pre_enabled(iommu);
3280 } else {
3281 pr_info("Copied translation tables from previous kernel for %s\n",
3282 iommu->name);
3286 if (!ecap_pass_through(iommu->ecap))
3287 hw_pass_through = 0;
3288 #ifdef CONFIG_INTEL_IOMMU_SVM
3289 if (pasid_supported(iommu))
3290 intel_svm_init(iommu);
3291 #endif
3295 * Now that qi is enabled on all iommus, set the root entry and flush
3296 * caches. This is required on some Intel X58 chipsets, otherwise the
3297 * flush_context function will loop forever and the boot hangs.
3299 for_each_active_iommu(iommu, drhd) {
3300 iommu_flush_write_buffer(iommu);
3301 iommu_set_root_entry(iommu);
3302 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3303 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3306 if (iommu_default_passthrough())
3307 iommu_identity_mapping |= IDENTMAP_ALL;
3309 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3310 dmar_map_gfx = 0;
3311 #endif
3313 if (!dmar_map_gfx)
3314 iommu_identity_mapping |= IDENTMAP_GFX;
3316 check_tylersburg_isoch();
3318 ret = si_domain_init(hw_pass_through);
3319 if (ret)
3320 goto free_iommu;
3323 * for each drhd
3324 * enable fault log
3325 * global invalidate context cache
3326 * global invalidate iotlb
3327 * enable translation
3329 for_each_iommu(iommu, drhd) {
3330 if (drhd->ignored) {
3332 * we always have to disable PMRs or DMA may fail on
3333 * this device
3335 if (force_on)
3336 iommu_disable_protect_mem_regions(iommu);
3337 continue;
3340 iommu_flush_write_buffer(iommu);
3342 #ifdef CONFIG_INTEL_IOMMU_SVM
3343 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3345 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3346 * could cause a lock race condition.
3348 up_write(&dmar_global_lock);
3349 ret = intel_svm_enable_prq(iommu);
3350 down_write(&dmar_global_lock);
3351 if (ret)
3352 goto free_iommu;
3354 #endif
3355 ret = dmar_set_interrupt(iommu);
3356 if (ret)
3357 goto free_iommu;
3360 return 0;
3362 free_iommu:
3363 for_each_active_iommu(iommu, drhd) {
3364 disable_dmar_iommu(iommu);
3365 free_dmar_iommu(iommu);
3368 kfree(g_iommus);
3370 error:
3371 return ret;
3374 /* This takes a number of _MM_ pages, not VTD pages */
3375 static unsigned long intel_alloc_iova(struct device *dev,
3376 struct dmar_domain *domain,
3377 unsigned long nrpages, uint64_t dma_mask)
3379 unsigned long iova_pfn;
3381 /* Restrict dma_mask to the width that the iommu can handle */
3382 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3383 /* Ensure we reserve the whole size-aligned region */
3384 nrpages = __roundup_pow_of_two(nrpages);
3386 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3388 * First try to allocate an io virtual address in
3389 * DMA_BIT_MASK(32) and if that fails then try allocating
3390 * from higher range
3392 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3393 IOVA_PFN(DMA_BIT_MASK(32)), false);
3394 if (iova_pfn)
3395 return iova_pfn;
3397 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3398 IOVA_PFN(dma_mask), true);
3399 if (unlikely(!iova_pfn)) {
3400 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3401 nrpages);
3402 return 0;
3405 return iova_pfn;
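/*
 * get_private_domain_for_dev() - allocate a private DMA domain for a
 * device that is not yet attached to any domain, pre-mapping any RMRR
 * regions that reference the device.
 */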
3408 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3410 struct dmar_domain *domain, *tmp;
3411 struct dmar_rmrr_unit *rmrr;
3412 struct device *i_dev;
3413 int i, ret;
3415 /* The device shouldn't be attached to any domain yet. */
3416 domain = find_domain(dev);
3417 if (domain)
3418 return NULL;
3420 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3421 if (!domain)
3422 goto out;
3424 /* We have a new domain - setup possible RMRRs for the device */
3425 rcu_read_lock();
3426 for_each_rmrr_units(rmrr) {
3427 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3428 i, i_dev) {
3429 if (i_dev != dev)
3430 continue;
3432 ret = domain_prepare_identity_map(dev, domain,
3433 rmrr->base_address,
3434 rmrr->end_address);
3435 if (ret)
3436 dev_err(dev, "Mapping reserved region failed\n");
3439 rcu_read_unlock();
3441 tmp = set_domain_for_dev(dev, domain);
3442 if (!tmp || domain != tmp) {
3443 domain_exit(domain);
3444 domain = tmp;
3447 out:
3448 if (!domain)
3449 dev_err(dev, "Allocating domain failed\n");
3450 else
3451 domain->domain.type = IOMMU_DOMAIN_DMA;
3453 return domain;
3456 /* Check if the dev needs to go through the non-identity map and unmap process. */
3457 static bool iommu_need_mapping(struct device *dev)
3459 int ret;
3461 if (iommu_dummy(dev))
3462 return false;
3464 ret = identity_mapping(dev);
3465 if (ret) {
3466 u64 dma_mask = *dev->dma_mask;
3468 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3469 dma_mask = dev->coherent_dma_mask;
3471 if (dma_mask >= dma_direct_get_required_mask(dev))
3472 return false;
3475 * 32 bit DMA devices are removed from si_domain and fall back
3476 * to non-identity mapping.
3478 dmar_remove_one_dev_info(dev);
3479 ret = iommu_request_dma_domain_for_dev(dev);
3480 if (ret) {
3481 struct iommu_domain *domain;
3482 struct dmar_domain *dmar_domain;
3484 domain = iommu_get_domain_for_dev(dev);
3485 if (domain) {
3486 dmar_domain = to_dmar_domain(domain);
3487 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3489 dmar_remove_one_dev_info(dev);
3490 get_private_domain_for_dev(dev);
3493 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3496 return true;
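/*
 * __intel_map_single() - map @size bytes at @paddr for @dev: allocate an
 * IOVA range that honours @dma_mask, install the page-table entries and
 * return the resulting DMA address (including the sub-page offset).
 */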
3499 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3500 size_t size, int dir, u64 dma_mask)
3502 struct dmar_domain *domain;
3503 phys_addr_t start_paddr;
3504 unsigned long iova_pfn;
3505 int prot = 0;
3506 int ret;
3507 struct intel_iommu *iommu;
3508 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3510 BUG_ON(dir == DMA_NONE);
3512 domain = find_domain(dev);
3513 if (!domain)
3514 return DMA_MAPPING_ERROR;
3516 iommu = domain_get_iommu(domain);
3517 size = aligned_nrpages(paddr, size);
3519 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3520 if (!iova_pfn)
3521 goto error;
3524 * Check if DMAR supports zero-length reads on write-only
3525 * mappings.
3527 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3528 !cap_zlr(iommu->cap))
3529 prot |= DMA_PTE_READ;
3530 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3531 prot |= DMA_PTE_WRITE;
3533 * paddr - (paddr + size) might be a partial page; we should map the
3534 * whole page. Note: if two parts of one page are separately mapped, we
3535 * might have two guest_addr mappings to the same host paddr, but this
3536 * is not a big problem
3538 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3539 mm_to_dma_pfn(paddr_pfn), size, prot);
3540 if (ret)
3541 goto error;
3543 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3544 start_paddr += paddr & ~PAGE_MASK;
3546 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3548 return start_paddr;
3550 error:
3551 if (iova_pfn)
3552 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3553 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3554 size, (unsigned long long)paddr, dir);
3555 return DMA_MAPPING_ERROR;
3558 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3559 unsigned long offset, size_t size,
3560 enum dma_data_direction dir,
3561 unsigned long attrs)
3563 if (iommu_need_mapping(dev))
3564 return __intel_map_single(dev, page_to_phys(page) + offset,
3565 size, dir, *dev->dma_mask);
3566 return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3569 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3570 size_t size, enum dma_data_direction dir,
3571 unsigned long attrs)
3573 if (iommu_need_mapping(dev))
3574 return __intel_map_single(dev, phys_addr, size, dir,
3575 *dev->dma_mask);
3576 return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
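/*
 * intel_unmap() - tear down an existing DMA mapping.  The IOTLB is flushed
 * synchronously in strict mode, for untrusted devices, or when no IOVA
 * flush queue is available; otherwise the range is queued for deferred
 * release.
 */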
3579 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3581 struct dmar_domain *domain;
3582 unsigned long start_pfn, last_pfn;
3583 unsigned long nrpages;
3584 unsigned long iova_pfn;
3585 struct intel_iommu *iommu;
3586 struct page *freelist;
3587 struct pci_dev *pdev = NULL;
3589 domain = find_domain(dev);
3590 BUG_ON(!domain);
3592 iommu = domain_get_iommu(domain);
3594 iova_pfn = IOVA_PFN(dev_addr);
3596 nrpages = aligned_nrpages(dev_addr, size);
3597 start_pfn = mm_to_dma_pfn(iova_pfn);
3598 last_pfn = start_pfn + nrpages - 1;
3600 if (dev_is_pci(dev))
3601 pdev = to_pci_dev(dev);
3603 freelist = domain_unmap(domain, start_pfn, last_pfn);
3604 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3605 !has_iova_flush_queue(&domain->iovad)) {
3606 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3607 nrpages, !freelist, 0);
3608 /* free iova */
3609 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3610 dma_free_pagelist(freelist);
3611 } else {
3612 queue_iova(&domain->iovad, iova_pfn, nrpages,
3613 (unsigned long)freelist);
3615 * queue up the release of the unmap to save the ~1/6th of the
3616 * CPU time used up by the iotlb flush operation...
3620 trace_unmap_single(dev, dev_addr, size);
3623 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3624 size_t size, enum dma_data_direction dir,
3625 unsigned long attrs)
3627 if (iommu_need_mapping(dev))
3628 intel_unmap(dev, dev_addr, size);
3629 else
3630 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3633 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3634 size_t size, enum dma_data_direction dir, unsigned long attrs)
3636 if (iommu_need_mapping(dev))
3637 intel_unmap(dev, dev_addr, size);
3640 static void *intel_alloc_coherent(struct device *dev, size_t size,
3641 dma_addr_t *dma_handle, gfp_t flags,
3642 unsigned long attrs)
3644 struct page *page = NULL;
3645 int order;
3647 if (!iommu_need_mapping(dev))
3648 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3650 size = PAGE_ALIGN(size);
3651 order = get_order(size);
3653 if (gfpflags_allow_blocking(flags)) {
3654 unsigned int count = size >> PAGE_SHIFT;
3656 page = dma_alloc_from_contiguous(dev, count, order,
3657 flags & __GFP_NOWARN);
3660 if (!page)
3661 page = alloc_pages(flags, order);
3662 if (!page)
3663 return NULL;
3664 memset(page_address(page), 0, size);
3666 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3667 DMA_BIDIRECTIONAL,
3668 dev->coherent_dma_mask);
3669 if (*dma_handle != DMA_MAPPING_ERROR)
3670 return page_address(page);
3671 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3672 __free_pages(page, order);
3674 return NULL;
3677 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3678 dma_addr_t dma_handle, unsigned long attrs)
3680 int order;
3681 struct page *page = virt_to_page(vaddr);
3683 if (!iommu_need_mapping(dev))
3684 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3686 size = PAGE_ALIGN(size);
3687 order = get_order(size);
3689 intel_unmap(dev, dma_handle, size);
3690 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3691 __free_pages(page, order);
3694 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3695 int nelems, enum dma_data_direction dir,
3696 unsigned long attrs)
3698 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3699 unsigned long nrpages = 0;
3700 struct scatterlist *sg;
3701 int i;
3703 if (!iommu_need_mapping(dev))
3704 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3706 for_each_sg(sglist, sg, nelems, i) {
3707 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3710 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3712 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3715 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3716 enum dma_data_direction dir, unsigned long attrs)
3718 int i;
3719 struct dmar_domain *domain;
3720 size_t size = 0;
3721 int prot = 0;
3722 unsigned long iova_pfn;
3723 int ret;
3724 struct scatterlist *sg;
3725 unsigned long start_vpfn;
3726 struct intel_iommu *iommu;
3728 BUG_ON(dir == DMA_NONE);
3729 if (!iommu_need_mapping(dev))
3730 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3732 domain = find_domain(dev);
3733 if (!domain)
3734 return 0;
3736 iommu = domain_get_iommu(domain);
3738 for_each_sg(sglist, sg, nelems, i)
3739 size += aligned_nrpages(sg->offset, sg->length);
3741 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3742 *dev->dma_mask);
3743 if (!iova_pfn) {
3744 sglist->dma_length = 0;
3745 return 0;
3749 * Check if DMAR supports zero-length reads on write-only
3750 * mappings.
3752 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3753 !cap_zlr(iommu->cap))
3754 prot |= DMA_PTE_READ;
3755 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3756 prot |= DMA_PTE_WRITE;
3758 start_vpfn = mm_to_dma_pfn(iova_pfn);
3760 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3761 if (unlikely(ret)) {
3762 dma_pte_free_pagetable(domain, start_vpfn,
3763 start_vpfn + size - 1,
3764 agaw_to_level(domain->agaw) + 1);
3765 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3766 return 0;
3769 trace_map_sg(dev, iova_pfn << PAGE_SHIFT,
3770 sg_phys(sglist), size << VTD_PAGE_SHIFT);
3772 return nelems;
3775 static u64 intel_get_required_mask(struct device *dev)
3777 if (!iommu_need_mapping(dev))
3778 return dma_direct_get_required_mask(dev);
3779 return DMA_BIT_MASK(32);
3782 static const struct dma_map_ops intel_dma_ops = {
3783 .alloc = intel_alloc_coherent,
3784 .free = intel_free_coherent,
3785 .map_sg = intel_map_sg,
3786 .unmap_sg = intel_unmap_sg,
3787 .map_page = intel_map_page,
3788 .unmap_page = intel_unmap_page,
3789 .map_resource = intel_map_resource,
3790 .unmap_resource = intel_unmap_resource,
3791 .dma_supported = dma_direct_supported,
3792 .mmap = dma_common_mmap,
3793 .get_sgtable = dma_common_get_sgtable,
3794 .get_required_mask = intel_get_required_mask,
3797 static void
3798 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3799 enum dma_data_direction dir, enum dma_sync_target target)
3801 struct dmar_domain *domain;
3802 phys_addr_t tlb_addr;
3804 domain = find_domain(dev);
3805 if (WARN_ON(!domain))
3806 return;
3808 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3809 if (is_swiotlb_buffer(tlb_addr))
3810 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
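/*
 * bounce_map_single() - map a buffer via a swiotlb bounce page when its
 * start address or size is not VTD-page aligned, so that the (typically
 * untrusted) device cannot DMA to unrelated data sharing the page.
 */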
3813 static dma_addr_t
3814 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3815 enum dma_data_direction dir, unsigned long attrs,
3816 u64 dma_mask)
3818 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3819 struct dmar_domain *domain;
3820 struct intel_iommu *iommu;
3821 unsigned long iova_pfn;
3822 unsigned long nrpages;
3823 phys_addr_t tlb_addr;
3824 int prot = 0;
3825 int ret;
3827 domain = find_domain(dev);
3828 if (WARN_ON(dir == DMA_NONE || !domain))
3829 return DMA_MAPPING_ERROR;
3831 iommu = domain_get_iommu(domain);
3832 if (WARN_ON(!iommu))
3833 return DMA_MAPPING_ERROR;
3835 nrpages = aligned_nrpages(0, size);
3836 iova_pfn = intel_alloc_iova(dev, domain,
3837 dma_to_mm_pfn(nrpages), dma_mask);
3838 if (!iova_pfn)
3839 return DMA_MAPPING_ERROR;
3842 * Check if DMAR supports zero-length reads on write-only
3843 * mappings.
3845 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3846 !cap_zlr(iommu->cap))
3847 prot |= DMA_PTE_READ;
3848 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3849 prot |= DMA_PTE_WRITE;
3852 * If both the physical buffer start address and size are
3853 * page aligned, we don't need to use a bounce page.
3855 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3856 tlb_addr = swiotlb_tbl_map_single(dev,
3857 __phys_to_dma(dev, io_tlb_start),
3858 paddr, size, aligned_size, dir, attrs);
3859 if (tlb_addr == DMA_MAPPING_ERROR) {
3860 goto swiotlb_error;
3861 } else {
3862 /* Cleanup the padding area. */
3863 void *padding_start = phys_to_virt(tlb_addr);
3864 size_t padding_size = aligned_size;
3866 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3867 (dir == DMA_TO_DEVICE ||
3868 dir == DMA_BIDIRECTIONAL)) {
3869 padding_start += size;
3870 padding_size -= size;
3873 memset(padding_start, 0, padding_size);
3875 } else {
3876 tlb_addr = paddr;
3879 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3880 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3881 if (ret)
3882 goto mapping_error;
3884 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3886 return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3888 mapping_error:
3889 if (is_swiotlb_buffer(tlb_addr))
3890 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3891 aligned_size, dir, attrs);
3892 swiotlb_error:
3893 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3894 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3895 size, (unsigned long long)paddr, dir);
3897 return DMA_MAPPING_ERROR;
3900 static void
3901 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3902 enum dma_data_direction dir, unsigned long attrs)
3904 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3905 struct dmar_domain *domain;
3906 phys_addr_t tlb_addr;
3908 domain = find_domain(dev);
3909 if (WARN_ON(!domain))
3910 return;
3912 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3913 if (WARN_ON(!tlb_addr))
3914 return;
3916 intel_unmap(dev, dev_addr, size);
3917 if (is_swiotlb_buffer(tlb_addr))
3918 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3919 aligned_size, dir, attrs);
3921 trace_bounce_unmap_single(dev, dev_addr, size);
3924 static dma_addr_t
3925 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3926 size_t size, enum dma_data_direction dir, unsigned long attrs)
3928 return bounce_map_single(dev, page_to_phys(page) + offset,
3929 size, dir, attrs, *dev->dma_mask);
3932 static dma_addr_t
3933 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3934 enum dma_data_direction dir, unsigned long attrs)
3936 return bounce_map_single(dev, phys_addr, size,
3937 dir, attrs, *dev->dma_mask);
3940 static void
3941 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3942 enum dma_data_direction dir, unsigned long attrs)
3944 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3947 static void
3948 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3949 enum dma_data_direction dir, unsigned long attrs)
3951 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3954 static void
3955 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3956 enum dma_data_direction dir, unsigned long attrs)
3958 struct scatterlist *sg;
3959 int i;
3961 for_each_sg(sglist, sg, nelems, i)
3962 bounce_unmap_page(dev, sg->dma_address,
3963 sg_dma_len(sg), dir, attrs);
3966 static int
3967 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3968 enum dma_data_direction dir, unsigned long attrs)
3970 int i;
3971 struct scatterlist *sg;
3973 for_each_sg(sglist, sg, nelems, i) {
3974 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3975 sg->offset, sg->length,
3976 dir, attrs);
3977 if (sg->dma_address == DMA_MAPPING_ERROR)
3978 goto out_unmap;
3979 sg_dma_len(sg) = sg->length;
3982 return nelems;
3984 out_unmap:
3985 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3986 return 0;
3989 static void
3990 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3991 size_t size, enum dma_data_direction dir)
3993 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3996 static void
3997 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3998 size_t size, enum dma_data_direction dir)
4000 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4003 static void
4004 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4005 int nelems, enum dma_data_direction dir)
4007 struct scatterlist *sg;
4008 int i;
4010 for_each_sg(sglist, sg, nelems, i)
4011 bounce_sync_single(dev, sg_dma_address(sg),
4012 sg_dma_len(sg), dir, SYNC_FOR_CPU);
4015 static void
4016 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4017 int nelems, enum dma_data_direction dir)
4019 struct scatterlist *sg;
4020 int i;
4022 for_each_sg(sglist, sg, nelems, i)
4023 bounce_sync_single(dev, sg_dma_address(sg),
4024 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4027 static const struct dma_map_ops bounce_dma_ops = {
4028 .alloc = intel_alloc_coherent,
4029 .free = intel_free_coherent,
4030 .map_sg = bounce_map_sg,
4031 .unmap_sg = bounce_unmap_sg,
4032 .map_page = bounce_map_page,
4033 .unmap_page = bounce_unmap_page,
4034 .sync_single_for_cpu = bounce_sync_single_for_cpu,
4035 .sync_single_for_device = bounce_sync_single_for_device,
4036 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
4037 .sync_sg_for_device = bounce_sync_sg_for_device,
4038 .map_resource = bounce_map_resource,
4039 .unmap_resource = bounce_unmap_resource,
4040 .dma_supported = dma_direct_supported,
4043 static inline int iommu_domain_cache_init(void)
4045 int ret = 0;
4047 iommu_domain_cache = kmem_cache_create("iommu_domain",
4048 sizeof(struct dmar_domain),
4050 SLAB_HWCACHE_ALIGN,
4052 NULL);
4053 if (!iommu_domain_cache) {
4054 pr_err("Couldn't create iommu_domain cache\n");
4055 ret = -ENOMEM;
4058 return ret;
4061 static inline int iommu_devinfo_cache_init(void)
4063 int ret = 0;
4065 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4066 sizeof(struct device_domain_info),
4068 SLAB_HWCACHE_ALIGN,
4069 NULL);
4070 if (!iommu_devinfo_cache) {
4071 pr_err("Couldn't create devinfo cache\n");
4072 ret = -ENOMEM;
4075 return ret;
4078 static int __init iommu_init_mempool(void)
4080 int ret;
4081 ret = iova_cache_get();
4082 if (ret)
4083 return ret;
4085 ret = iommu_domain_cache_init();
4086 if (ret)
4087 goto domain_error;
4089 ret = iommu_devinfo_cache_init();
4090 if (!ret)
4091 return ret;
4093 kmem_cache_destroy(iommu_domain_cache);
4094 domain_error:
4095 iova_cache_put();
4097 return -ENOMEM;
4100 static void __init iommu_exit_mempool(void)
4102 kmem_cache_destroy(iommu_devinfo_cache);
4103 kmem_cache_destroy(iommu_domain_cache);
4104 iova_cache_put();
4107 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4109 struct dmar_drhd_unit *drhd;
4110 u32 vtbar;
4111 int rc;
4113 /* We know that this device on this chipset has its own IOMMU.
4114 * If we find it under a different IOMMU, then the BIOS is lying
4115 * to us. Hope that the IOMMU for this device is actually
4116 * disabled, and it needs no translation...
4118 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4119 if (rc) {
4120 /* "can't" happen */
4121 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4122 return;
4124 vtbar &= 0xffff0000;
4126 /* we know that this iommu should be at offset 0xa000 from vtbar */
4127 drhd = dmar_find_matched_drhd_unit(pdev);
4128 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
4129 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
4130 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4131 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4134 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4136 static void __init init_no_remapping_devices(void)
4138 struct dmar_drhd_unit *drhd;
4139 struct device *dev;
4140 int i;
4142 for_each_drhd_unit(drhd) {
4143 if (!drhd->include_all) {
4144 for_each_active_dev_scope(drhd->devices,
4145 drhd->devices_cnt, i, dev)
4146 break;
4147 /* ignore DMAR unit if no devices exist */
4148 if (i == drhd->devices_cnt)
4149 drhd->ignored = 1;
4153 for_each_active_drhd_unit(drhd) {
4154 if (drhd->include_all)
4155 continue;
4157 for_each_active_dev_scope(drhd->devices,
4158 drhd->devices_cnt, i, dev)
4159 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4160 break;
4161 if (i < drhd->devices_cnt)
4162 continue;
4164 /* This IOMMU has *only* gfx devices. Either bypass it or
4165 set the gfx_mapped flag, as appropriate */
4166 if (!dmar_map_gfx) {
4167 drhd->ignored = 1;
4168 for_each_active_dev_scope(drhd->devices,
4169 drhd->devices_cnt, i, dev)
4170 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4175 #ifdef CONFIG_SUSPEND
4176 static int init_iommu_hw(void)
4178 struct dmar_drhd_unit *drhd;
4179 struct intel_iommu *iommu = NULL;
4181 for_each_active_iommu(iommu, drhd)
4182 if (iommu->qi)
4183 dmar_reenable_qi(iommu);
4185 for_each_iommu(iommu, drhd) {
4186 if (drhd->ignored) {
4188 * we always have to disable PMRs or DMA may fail on
4189 * this device
4191 if (force_on)
4192 iommu_disable_protect_mem_regions(iommu);
4193 continue;
4196 iommu_flush_write_buffer(iommu);
4198 iommu_set_root_entry(iommu);
4200 iommu->flush.flush_context(iommu, 0, 0, 0,
4201 DMA_CCMD_GLOBAL_INVL);
4202 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4203 iommu_enable_translation(iommu);
4204 iommu_disable_protect_mem_regions(iommu);
4207 return 0;
4210 static void iommu_flush_all(void)
4212 struct dmar_drhd_unit *drhd;
4213 struct intel_iommu *iommu;
4215 for_each_active_iommu(iommu, drhd) {
4216 iommu->flush.flush_context(iommu, 0, 0, 0,
4217 DMA_CCMD_GLOBAL_INVL);
4218 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4219 DMA_TLB_GLOBAL_FLUSH);
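/*
 * Suspend/resume note: iommu_suspend() below disables translation and
 * snapshots the fault-event programming (DMAR_FECTL_REG, DMAR_FEDATA_REG,
 * DMAR_FEADDR_REG, DMAR_FEUADDR_REG) of every active unit into
 * iommu->iommu_state[]; iommu_resume() first re-initializes the hardware
 * through init_iommu_hw() (root entry, cache flushes, translation) and
 * only then writes the saved register values back and frees the snapshot.
 */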
4223 static int iommu_suspend(void)
4225 struct dmar_drhd_unit *drhd;
4226 struct intel_iommu *iommu = NULL;
4227 unsigned long flag;
4229 for_each_active_iommu(iommu, drhd) {
4230 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4231 GFP_ATOMIC);
4232 if (!iommu->iommu_state)
4233 goto nomem;
4236 iommu_flush_all();
4238 for_each_active_iommu(iommu, drhd) {
4239 iommu_disable_translation(iommu);
4241 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4243 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4244 readl(iommu->reg + DMAR_FECTL_REG);
4245 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4246 readl(iommu->reg + DMAR_FEDATA_REG);
4247 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4248 readl(iommu->reg + DMAR_FEADDR_REG);
4249 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4250 readl(iommu->reg + DMAR_FEUADDR_REG);
4252 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4254 return 0;
4256 nomem:
4257 for_each_active_iommu(iommu, drhd)
4258 kfree(iommu->iommu_state);
4260 return -ENOMEM;
4263 static void iommu_resume(void)
4265 struct dmar_drhd_unit *drhd;
4266 struct intel_iommu *iommu = NULL;
4267 unsigned long flag;
4269 if (init_iommu_hw()) {
4270 if (force_on)
4271 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4272 else
4273 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4274 return;
4277 for_each_active_iommu(iommu, drhd) {
4279 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4281 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4282 iommu->reg + DMAR_FECTL_REG);
4283 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4284 iommu->reg + DMAR_FEDATA_REG);
4285 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4286 iommu->reg + DMAR_FEADDR_REG);
4287 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4288 iommu->reg + DMAR_FEUADDR_REG);
4290 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4293 for_each_active_iommu(iommu, drhd)
4294 kfree(iommu->iommu_state);
4297 static struct syscore_ops iommu_syscore_ops = {
4298 .resume = iommu_resume,
4299 .suspend = iommu_suspend,
4302 static void __init init_iommu_pm_ops(void)
4304 register_syscore_ops(&iommu_syscore_ops);
4307 #else
4308 static inline void init_iommu_pm_ops(void) {}
4309 #endif /* CONFIG_SUSPEND */
4311 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4313 struct acpi_dmar_reserved_memory *rmrr;
4314 struct dmar_rmrr_unit *rmrru;
4316 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4317 if (!rmrru)
4318 goto out;
4320 rmrru->hdr = header;
4321 rmrr = (struct acpi_dmar_reserved_memory *)header;
4322 rmrru->base_address = rmrr->base_address;
4323 rmrru->end_address = rmrr->end_address;
4325 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4326 ((void *)rmrr) + rmrr->header.length,
4327 &rmrru->devices_cnt);
4328 if (rmrru->devices_cnt && rmrru->devices == NULL)
4329 goto free_rmrru;
4331 list_add(&rmrru->list, &dmar_rmrr_units);
4333 return 0;
4334 free_rmrru:
4335 kfree(rmrru);
4336 out:
4337 return -ENOMEM;
4340 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4342 struct dmar_atsr_unit *atsru;
4343 struct acpi_dmar_atsr *tmp;
4345 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4346 dmar_rcu_check()) {
4347 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4348 if (atsr->segment != tmp->segment)
4349 continue;
4350 if (atsr->header.length != tmp->header.length)
4351 continue;
4352 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4353 return atsru;
4356 return NULL;
4359 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4361 struct acpi_dmar_atsr *atsr;
4362 struct dmar_atsr_unit *atsru;
4364 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4365 return 0;
4367 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4368 atsru = dmar_find_atsr(atsr);
4369 if (atsru)
4370 return 0;
4372 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4373 if (!atsru)
4374 return -ENOMEM;
4377 * If memory is allocated from slab by ACPI _DSM method, we need to
4378 * copy the memory content because the memory buffer will be freed
4379 * on return.
4381 atsru->hdr = (void *)(atsru + 1);
4382 memcpy(atsru->hdr, hdr, hdr->length);
4383 atsru->include_all = atsr->flags & 0x1;
4384 if (!atsru->include_all) {
4385 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4386 (void *)atsr + atsr->header.length,
4387 &atsru->devices_cnt);
4388 if (atsru->devices_cnt && atsru->devices == NULL) {
4389 kfree(atsru);
4390 return -ENOMEM;
4394 list_add_rcu(&atsru->list, &dmar_atsr_units);
4396 return 0;
4399 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4401 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4402 kfree(atsru);
4405 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4407 struct acpi_dmar_atsr *atsr;
4408 struct dmar_atsr_unit *atsru;
4410 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4411 atsru = dmar_find_atsr(atsr);
4412 if (atsru) {
4413 list_del_rcu(&atsru->list);
4414 synchronize_rcu();
4415 intel_iommu_free_atsr(atsru);
4418 return 0;
4421 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4423 int i;
4424 struct device *dev;
4425 struct acpi_dmar_atsr *atsr;
4426 struct dmar_atsr_unit *atsru;
4428 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4429 atsru = dmar_find_atsr(atsr);
4430 if (!atsru)
4431 return 0;
4433 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4434 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4435 i, dev)
4436 return -EBUSY;
4439 return 0;
4442 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4444 int sp, ret;
4445 struct intel_iommu *iommu = dmaru->iommu;
4447 if (g_iommus[iommu->seq_id])
4448 return 0;
4450 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4451 pr_warn("%s: Doesn't support hardware pass through.\n",
4452 iommu->name);
4453 return -ENXIO;
4455 if (!ecap_sc_support(iommu->ecap) &&
4456 domain_update_iommu_snooping(iommu)) {
4457 pr_warn("%s: Doesn't support snooping.\n",
4458 iommu->name);
4459 return -ENXIO;
4461 sp = domain_update_iommu_superpage(iommu) - 1;
4462 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4463 pr_warn("%s: Doesn't support large page.\n",
4464 iommu->name);
4465 return -ENXIO;
4469 * Disable translation if already enabled prior to OS handover.
4471 if (iommu->gcmd & DMA_GCMD_TE)
4472 iommu_disable_translation(iommu);
4474 g_iommus[iommu->seq_id] = iommu;
4475 ret = iommu_init_domains(iommu);
4476 if (ret == 0)
4477 ret = iommu_alloc_root_entry(iommu);
4478 if (ret)
4479 goto out;
4481 #ifdef CONFIG_INTEL_IOMMU_SVM
4482 if (pasid_supported(iommu))
4483 intel_svm_init(iommu);
4484 #endif
4486 if (dmaru->ignored) {
4488 * we always have to disable PMRs or DMA may fail on this device
4490 if (force_on)
4491 iommu_disable_protect_mem_regions(iommu);
4492 return 0;
4495 intel_iommu_init_qi(iommu);
4496 iommu_flush_write_buffer(iommu);
4498 #ifdef CONFIG_INTEL_IOMMU_SVM
4499 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4500 ret = intel_svm_enable_prq(iommu);
4501 if (ret)
4502 goto disable_iommu;
4504 #endif
4505 ret = dmar_set_interrupt(iommu);
4506 if (ret)
4507 goto disable_iommu;
4509 iommu_set_root_entry(iommu);
4510 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4511 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4512 iommu_enable_translation(iommu);
4514 iommu_disable_protect_mem_regions(iommu);
4515 return 0;
4517 disable_iommu:
4518 disable_dmar_iommu(iommu);
4519 out:
4520 free_dmar_iommu(iommu);
4521 return ret;
4524 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4526 int ret = 0;
4527 struct intel_iommu *iommu = dmaru->iommu;
4529 if (!intel_iommu_enabled)
4530 return 0;
4531 if (iommu == NULL)
4532 return -EINVAL;
4534 if (insert) {
4535 ret = intel_iommu_add(dmaru);
4536 } else {
4537 disable_dmar_iommu(iommu);
4538 free_dmar_iommu(iommu);
4541 return ret;
4544 static void intel_iommu_free_dmars(void)
4546 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4547 struct dmar_atsr_unit *atsru, *atsr_n;
4549 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4550 list_del(&rmrru->list);
4551 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4552 kfree(rmrru);
4555 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4556 list_del(&atsru->list);
4557 intel_iommu_free_atsr(atsru);
4561 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4563 int i, ret = 1;
4564 struct pci_bus *bus;
4565 struct pci_dev *bridge = NULL;
4566 struct device *tmp;
4567 struct acpi_dmar_atsr *atsr;
4568 struct dmar_atsr_unit *atsru;
4570 dev = pci_physfn(dev);
4571 for (bus = dev->bus; bus; bus = bus->parent) {
4572 bridge = bus->self;
4573 /* If it's an integrated device, allow ATS */
4574 if (!bridge)
4575 return 1;
4576 /* Connected via non-PCIe: no ATS */
4577 if (!pci_is_pcie(bridge) ||
4578 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4579 return 0;
4580 /* If we found the root port, look it up in the ATSR */
4581 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4582 break;
4585 rcu_read_lock();
4586 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4587 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4588 if (atsr->segment != pci_domain_nr(dev->bus))
4589 continue;
4591 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4592 if (tmp == &bridge->dev)
4593 goto out;
4595 if (atsru->include_all)
4596 goto out;
4598 ret = 0;
4599 out:
4600 rcu_read_unlock();
4602 return ret;
4605 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4607 int ret;
4608 struct dmar_rmrr_unit *rmrru;
4609 struct dmar_atsr_unit *atsru;
4610 struct acpi_dmar_atsr *atsr;
4611 struct acpi_dmar_reserved_memory *rmrr;
4613 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4614 return 0;
4616 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4617 rmrr = container_of(rmrru->hdr,
4618 struct acpi_dmar_reserved_memory, header);
4619 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4620 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4621 ((void *)rmrr) + rmrr->header.length,
4622 rmrr->segment, rmrru->devices,
4623 rmrru->devices_cnt);
4624 if (ret < 0)
4625 return ret;
4626 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4627 dmar_remove_dev_scope(info, rmrr->segment,
4628 rmrru->devices, rmrru->devices_cnt);
4632 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4633 if (atsru->include_all)
4634 continue;
4636 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4637 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4638 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4639 (void *)atsr + atsr->header.length,
4640 atsr->segment, atsru->devices,
4641 atsru->devices_cnt);
4642 if (ret > 0)
4643 break;
4644 else if (ret < 0)
4645 return ret;
4646 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4647 if (dmar_remove_dev_scope(info, atsr->segment,
4648 atsru->devices, atsru->devices_cnt))
4649 break;
4653 return 0;
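/*
 * Memory hotplug handling for the static identity map: on MEM_GOING_ONLINE
 * the new physical range is added to si_domain's 1:1 mapping, while
 * MEM_OFFLINE/MEM_CANCEL_ONLINE walks the corresponding IOVA range,
 * unmaps it, flushes the IOTLB on every active unit and releases the
 * page-table freelist.
 */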
4656 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4657 unsigned long val, void *v)
4659 struct memory_notify *mhp = v;
4660 unsigned long long start, end;
4661 unsigned long start_vpfn, last_vpfn;
4663 switch (val) {
4664 case MEM_GOING_ONLINE:
4665 start = mhp->start_pfn << PAGE_SHIFT;
4666 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4667 if (iommu_domain_identity_map(si_domain, start, end)) {
4668 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4669 start, end);
4670 return NOTIFY_BAD;
4672 break;
4674 case MEM_OFFLINE:
4675 case MEM_CANCEL_ONLINE:
4676 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4677 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4678 while (start_vpfn <= last_vpfn) {
4679 struct iova *iova;
4680 struct dmar_drhd_unit *drhd;
4681 struct intel_iommu *iommu;
4682 struct page *freelist;
4684 iova = find_iova(&si_domain->iovad, start_vpfn);
4685 if (iova == NULL) {
4686 pr_debug("Failed to get IOVA for PFN %lx\n",
4687 start_vpfn);
4688 break;
4691 iova = split_and_remove_iova(&si_domain->iovad, iova,
4692 start_vpfn, last_vpfn);
4693 if (iova == NULL) {
4694 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4695 start_vpfn, last_vpfn);
4696 return NOTIFY_BAD;
4699 freelist = domain_unmap(si_domain, iova->pfn_lo,
4700 iova->pfn_hi);
4702 rcu_read_lock();
4703 for_each_active_iommu(iommu, drhd)
4704 iommu_flush_iotlb_psi(iommu, si_domain,
4705 iova->pfn_lo, iova_size(iova),
4706 !freelist, 0);
4707 rcu_read_unlock();
4708 dma_free_pagelist(freelist);
4710 start_vpfn = iova->pfn_hi + 1;
4711 free_iova_mem(iova);
4713 break;
4716 return NOTIFY_OK;
4719 static struct notifier_block intel_iommu_memory_nb = {
4720 .notifier_call = intel_iommu_memory_notifier,
4721 .priority = 0
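/*
 * CPU hotplug: intel_iommu_cpu_dead() is registered against
 * CPUHP_IOMMU_INTEL_DEAD in intel_iommu_init(); when a CPU goes away it
 * walks every known domain and drops that CPU's cached IOVA ranges via
 * free_cpu_cached_iovas() so they can be reused.
 */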
4724 static void free_all_cpu_cached_iovas(unsigned int cpu)
4726 int i;
4728 for (i = 0; i < g_num_of_iommus; i++) {
4729 struct intel_iommu *iommu = g_iommus[i];
4730 struct dmar_domain *domain;
4731 int did;
4733 if (!iommu)
4734 continue;
4736 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4737 domain = get_iommu_domain(iommu, (u16)did);
4739 if (!domain)
4740 continue;
4741 free_cpu_cached_iovas(cpu, &domain->iovad);
4746 static int intel_iommu_cpu_dead(unsigned int cpu)
4748 free_all_cpu_cached_iovas(cpu);
4749 return 0;
4752 static void intel_disable_iommus(void)
4754 struct intel_iommu *iommu = NULL;
4755 struct dmar_drhd_unit *drhd;
4757 for_each_iommu(iommu, drhd)
4758 iommu_disable_translation(iommu);
4761 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4763 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4765 return container_of(iommu_dev, struct intel_iommu, iommu);
4768 static ssize_t intel_iommu_show_version(struct device *dev,
4769 struct device_attribute *attr,
4770 char *buf)
4772 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4773 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4774 return sprintf(buf, "%d:%d\n",
4775 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4777 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4779 static ssize_t intel_iommu_show_address(struct device *dev,
4780 struct device_attribute *attr,
4781 char *buf)
4783 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4784 return sprintf(buf, "%llx\n", iommu->reg_phys);
4786 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4788 static ssize_t intel_iommu_show_cap(struct device *dev,
4789 struct device_attribute *attr,
4790 char *buf)
4792 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4793 return sprintf(buf, "%llx\n", iommu->cap);
4795 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4797 static ssize_t intel_iommu_show_ecap(struct device *dev,
4798 struct device_attribute *attr,
4799 char *buf)
4801 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4802 return sprintf(buf, "%llx\n", iommu->ecap);
4804 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4806 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4807 struct device_attribute *attr,
4808 char *buf)
4810 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4811 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4813 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4815 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4816 struct device_attribute *attr,
4817 char *buf)
4819 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4820 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4821 cap_ndoms(iommu->cap)));
4823 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4825 static struct attribute *intel_iommu_attrs[] = {
4826 &dev_attr_version.attr,
4827 &dev_attr_address.attr,
4828 &dev_attr_cap.attr,
4829 &dev_attr_ecap.attr,
4830 &dev_attr_domains_supported.attr,
4831 &dev_attr_domains_used.attr,
4832 NULL,
4835 static struct attribute_group intel_iommu_group = {
4836 .name = "intel-iommu",
4837 .attrs = intel_iommu_attrs,
4840 const struct attribute_group *intel_iommu_groups[] = {
4841 &intel_iommu_group,
4842 NULL,
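/*
 * The attributes above are registered per unit through
 * iommu_device_sysfs_add() in intel_iommu_init(); with the group named
 * "intel-iommu" they typically show up as
 * /sys/class/iommu/dmar<N>/intel-iommu/{version,address,cap,ecap,
 * domains_supported,domains_used}.
 */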
4845 static inline bool has_untrusted_dev(void)
4847 struct pci_dev *pdev = NULL;
4849 for_each_pci_dev(pdev)
4850 if (pdev->untrusted)
4851 return true;
4853 return false;
4856 static int __init platform_optin_force_iommu(void)
4858 if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
4859 return 0;
4861 if (no_iommu || dmar_disabled)
4862 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4865 * If Intel-IOMMU is disabled by default, we will apply identity
4866 * map for all devices except those marked as being untrusted.
4868 if (dmar_disabled)
4869 iommu_identity_mapping |= IDENTMAP_ALL;
4871 dmar_disabled = 0;
4872 no_iommu = 0;
4874 return 1;
4877 static int __init probe_acpi_namespace_devices(void)
4879 struct dmar_drhd_unit *drhd;
4880 /* To avoid a -Wunused-but-set-variable warning. */
4881 struct intel_iommu *iommu __maybe_unused;
4882 struct device *dev;
4883 int i, ret = 0;
4885 for_each_active_iommu(iommu, drhd) {
4886 for_each_active_dev_scope(drhd->devices,
4887 drhd->devices_cnt, i, dev) {
4888 struct acpi_device_physical_node *pn;
4889 struct iommu_group *group;
4890 struct acpi_device *adev;
4892 if (dev->bus != &acpi_bus_type)
4893 continue;
4895 adev = to_acpi_device(dev);
4896 mutex_lock(&adev->physical_node_lock);
4897 list_for_each_entry(pn,
4898 &adev->physical_node_list, node) {
4899 group = iommu_group_get(pn->dev);
4900 if (group) {
4901 iommu_group_put(group);
4902 continue;
4905 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4906 ret = iommu_probe_device(pn->dev);
4907 if (ret)
4908 break;
4910 mutex_unlock(&adev->physical_node_lock);
4912 if (ret)
4913 return ret;
4917 return 0;
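/*
 * intel_iommu_init() below is the boot-time entry point: memory pools and
 * IOVA caches first, then DMAR table and device-scope parsing under
 * dmar_global_lock, reserved-range setup, init_dmars(), sysfs and IOMMU
 * core registration, the memory and CPU hotplug notifiers, ACPI namespace
 * device probing, and finally translation is enabled on every unit that
 * is neither ignored nor already enabled by firmware.
 */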
4920 int __init intel_iommu_init(void)
4922 int ret = -ENODEV;
4923 struct dmar_drhd_unit *drhd;
4924 struct intel_iommu *iommu;
4927 * Intel IOMMU is required for a TXT/tboot launch or platform
4928 * opt in, so enforce that.
4930 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4932 if (iommu_init_mempool()) {
4933 if (force_on)
4934 panic("tboot: Failed to initialize iommu memory\n");
4935 return -ENOMEM;
4938 down_write(&dmar_global_lock);
4939 if (dmar_table_init()) {
4940 if (force_on)
4941 panic("tboot: Failed to initialize DMAR table\n");
4942 goto out_free_dmar;
4945 if (dmar_dev_scope_init() < 0) {
4946 if (force_on)
4947 panic("tboot: Failed to initialize DMAR device scope\n");
4948 goto out_free_dmar;
4951 up_write(&dmar_global_lock);
4954 * The bus notifier takes the dmar_global_lock, so lockdep will
4955 * complain later when we register it under the lock.
4957 dmar_register_bus_notifier();
4959 down_write(&dmar_global_lock);
4961 if (!no_iommu)
4962 intel_iommu_debugfs_init();
4964 if (no_iommu || dmar_disabled) {
4966 * We exit the function here to ensure IOMMU's remapping and
4967 * mempool aren't setup, which means that the IOMMU's PMRs
4968 * won't be disabled via the call to init_dmars(). So disable
4969 * it explicitly here. The PMRs were setup by tboot prior to
4970 * calling SENTER, but the kernel is expected to reset/tear
4971 * down the PMRs.
4973 if (intel_iommu_tboot_noforce) {
4974 for_each_iommu(iommu, drhd)
4975 iommu_disable_protect_mem_regions(iommu);
4979 * Make sure the IOMMUs are switched off, even when we
4980 * boot into a kexec kernel and the previous kernel left
4981 * them enabled
4983 intel_disable_iommus();
4984 goto out_free_dmar;
4987 if (list_empty(&dmar_rmrr_units))
4988 pr_info("No RMRR found\n");
4990 if (list_empty(&dmar_atsr_units))
4991 pr_info("No ATSR found\n");
4993 if (dmar_init_reserved_ranges()) {
4994 if (force_on)
4995 panic("tboot: Failed to reserve iommu ranges\n");
4996 goto out_free_reserved_range;
4999 if (dmar_map_gfx)
5000 intel_iommu_gfx_mapped = 1;
5002 init_no_remapping_devices();
5004 ret = init_dmars();
5005 if (ret) {
5006 if (force_on)
5007 panic("tboot: Failed to initialize DMARs\n");
5008 pr_err("Initialization failed\n");
5009 goto out_free_reserved_range;
5011 up_write(&dmar_global_lock);
5013 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
5015 * If the system has no untrusted device or the user has decided
5016 * to disable the bounce page mechanisms, we don't need swiotlb.
5017 * Mark this and the pre-allocated bounce pages will be released
5018 * later.
5020 if (!has_untrusted_dev() || intel_no_bounce)
5021 swiotlb = 0;
5022 #endif
5023 dma_ops = &intel_dma_ops;
5025 init_iommu_pm_ops();
5027 down_read(&dmar_global_lock);
5028 for_each_active_iommu(iommu, drhd) {
5029 iommu_device_sysfs_add(&iommu->iommu, NULL,
5030 intel_iommu_groups,
5031 "%s", iommu->name);
5032 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5033 iommu_device_register(&iommu->iommu);
5035 up_read(&dmar_global_lock);
5037 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5038 if (si_domain && !hw_pass_through)
5039 register_memory_notifier(&intel_iommu_memory_nb);
5040 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5041 intel_iommu_cpu_dead);
5043 down_read(&dmar_global_lock);
5044 if (probe_acpi_namespace_devices())
5045 pr_warn("ACPI namespace devices didn't probe correctly\n");
5047 /* Finally, we enable the DMA remapping hardware. */
5048 for_each_iommu(iommu, drhd) {
5049 if (!drhd->ignored && !translation_pre_enabled(iommu))
5050 iommu_enable_translation(iommu);
5052 iommu_disable_protect_mem_regions(iommu);
5054 up_read(&dmar_global_lock);
5056 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5058 intel_iommu_enabled = 1;
5060 return 0;
5062 out_free_reserved_range:
5063 put_iova_domain(&reserved_iova_list);
5064 out_free_dmar:
5065 intel_iommu_free_dmars();
5066 up_write(&dmar_global_lock);
5067 iommu_exit_mempool();
5068 return ret;
5071 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5073 struct intel_iommu *iommu = opaque;
5075 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5076 return 0;
5080 * NB - intel-iommu lacks any sort of reference counting for the users of
5081 * dependent devices. If multiple endpoints have intersecting dependent
5082 * devices, unbinding the driver from any one of them will possibly leave
5083 * the others unable to operate.
5085 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5087 if (!iommu || !dev || !dev_is_pci(dev))
5088 return;
5090 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5093 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5095 struct dmar_domain *domain;
5096 struct intel_iommu *iommu;
5097 unsigned long flags;
5099 assert_spin_locked(&device_domain_lock);
5101 if (WARN_ON(!info))
5102 return;
5104 iommu = info->iommu;
5105 domain = info->domain;
5107 if (info->dev) {
5108 if (dev_is_pci(info->dev) && sm_supported(iommu))
5109 intel_pasid_tear_down_entry(iommu, info->dev,
5110 PASID_RID2PASID);
5112 iommu_disable_dev_iotlb(info);
5113 domain_context_clear(iommu, info->dev);
5114 intel_pasid_free_table(info->dev);
5117 unlink_domain_info(info);
5119 spin_lock_irqsave(&iommu->lock, flags);
5120 domain_detach_iommu(domain, iommu);
5121 spin_unlock_irqrestore(&iommu->lock, flags);
5123 /* free the private domain */
5124 if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
5125 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
5126 list_empty(&domain->devices))
5127 domain_exit(info->domain);
5129 free_devinfo_mem(info);
5132 static void dmar_remove_one_dev_info(struct device *dev)
5134 struct device_domain_info *info;
5135 unsigned long flags;
5137 spin_lock_irqsave(&device_domain_lock, flags);
5138 info = dev->archdata.iommu;
5139 if (info && info != DEFER_DEVICE_DOMAIN_INFO
5140 && info != DUMMY_DEVICE_DOMAIN_INFO)
5141 __dmar_remove_one_dev_info(info);
5142 spin_unlock_irqrestore(&device_domain_lock, flags);
5145 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5147 int adjust_width;
5149 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5150 domain_reserve_special_ranges(domain);
5152 /* calculate AGAW */
5153 domain->gaw = guest_width;
5154 adjust_width = guestwidth_to_adjustwidth(guest_width);
5155 domain->agaw = width_to_agaw(adjust_width);
5157 domain->iommu_coherency = 0;
5158 domain->iommu_snooping = 0;
5159 domain->iommu_superpage = 0;
5160 domain->max_addr = 0;
5162 /* always allocate the top pgd */
5163 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5164 if (!domain->pgd)
5165 return -ENOMEM;
5166 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5167 return 0;
5170 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5172 struct dmar_domain *dmar_domain;
5173 struct iommu_domain *domain;
5175 switch (type) {
5176 case IOMMU_DOMAIN_DMA:
5177 /* fallthrough */
5178 case IOMMU_DOMAIN_UNMANAGED:
5179 dmar_domain = alloc_domain(0);
5180 if (!dmar_domain) {
5181 pr_err("Can't allocate dmar_domain\n");
5182 return NULL;
5184 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5185 pr_err("Domain initialization failed\n");
5186 domain_exit(dmar_domain);
5187 return NULL;
5190 if (type == IOMMU_DOMAIN_DMA &&
5191 init_iova_flush_queue(&dmar_domain->iovad,
5192 iommu_flush_iova, iova_entry_free)) {
5193 pr_warn("iova flush queue initialization failed\n");
5194 intel_iommu_strict = 1;
5197 domain_update_iommu_cap(dmar_domain);
5199 domain = &dmar_domain->domain;
5200 domain->geometry.aperture_start = 0;
5201 domain->geometry.aperture_end =
5202 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5203 domain->geometry.force_aperture = true;
5205 return domain;
5206 case IOMMU_DOMAIN_IDENTITY:
5207 return &si_domain->domain;
5208 default:
5209 return NULL;
5212 return NULL;
5215 static void intel_iommu_domain_free(struct iommu_domain *domain)
5217 if (domain != &si_domain->domain)
5218 domain_exit(to_dmar_domain(domain));
5222 * Check whether a @domain could be attached to the @dev through the
5223 * aux-domain attach/detach APIs.
5225 static inline bool
5226 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5228 struct device_domain_info *info = dev->archdata.iommu;
5230 return info && info->auxd_enabled &&
5231 domain->type == IOMMU_DOMAIN_UNMANAGED;
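/*
 * A minimal sketch (not taken from this driver) of how a caller such as a
 * mediated-device parent driver would be expected to exercise the
 * aux-domain attach/detach path implemented below through the generic
 * IOMMU API; error handling is trimmed and 'dev' is assumed to be a
 * PASID-capable PCI function:
 *
 *	if (iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX))
 *		return -ENODEV;
 *
 *	domain = iommu_domain_alloc(dev->bus);
 *	if (!domain || iommu_aux_attach_device(domain, dev))
 *		goto err;
 *
 *	pasid = iommu_aux_get_pasid(domain, dev);
 *	(DMA issued on behalf of the mediated device is now tagged with
 *	 and isolated under 'pasid'.)
 *
 *	iommu_aux_detach_device(domain, dev);
 *	iommu_domain_free(domain);
 *	iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_AUX);
 */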
5234 static void auxiliary_link_device(struct dmar_domain *domain,
5235 struct device *dev)
5237 struct device_domain_info *info = dev->archdata.iommu;
5239 assert_spin_locked(&device_domain_lock);
5240 if (WARN_ON(!info))
5241 return;
5243 domain->auxd_refcnt++;
5244 list_add(&domain->auxd, &info->auxiliary_domains);
5247 static void auxiliary_unlink_device(struct dmar_domain *domain,
5248 struct device *dev)
5250 struct device_domain_info *info = dev->archdata.iommu;
5252 assert_spin_locked(&device_domain_lock);
5253 if (WARN_ON(!info))
5254 return;
5256 list_del(&domain->auxd);
5257 domain->auxd_refcnt--;
5259 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5260 intel_pasid_free_id(domain->default_pasid);
5263 static int aux_domain_add_dev(struct dmar_domain *domain,
5264 struct device *dev)
5266 int ret;
5267 u8 bus, devfn;
5268 unsigned long flags;
5269 struct intel_iommu *iommu;
5271 iommu = device_to_iommu(dev, &bus, &devfn);
5272 if (!iommu)
5273 return -ENODEV;
5275 if (domain->default_pasid <= 0) {
5276 int pasid;
5278 pasid = intel_pasid_alloc_id(domain, PASID_MIN,
5279 pci_max_pasids(to_pci_dev(dev)),
5280 GFP_KERNEL);
5281 if (pasid <= 0) {
5282 pr_err("Can't allocate default pasid\n");
5283 return -ENODEV;
5285 domain->default_pasid = pasid;
5288 spin_lock_irqsave(&device_domain_lock, flags);
5290 * iommu->lock must be held to attach domain to iommu and setup the
5291 * pasid entry for second level translation.
5293 spin_lock(&iommu->lock);
5294 ret = domain_attach_iommu(domain, iommu);
5295 if (ret)
5296 goto attach_failed;
5298 /* Setup the PASID entry for mediated devices: */
5299 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5300 domain->default_pasid);
5301 if (ret)
5302 goto table_failed;
5303 spin_unlock(&iommu->lock);
5305 auxiliary_link_device(domain, dev);
5307 spin_unlock_irqrestore(&device_domain_lock, flags);
5309 return 0;
5311 table_failed:
5312 domain_detach_iommu(domain, iommu);
5313 attach_failed:
5314 spin_unlock(&iommu->lock);
5315 spin_unlock_irqrestore(&device_domain_lock, flags);
5316 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5317 intel_pasid_free_id(domain->default_pasid);
5319 return ret;
5322 static void aux_domain_remove_dev(struct dmar_domain *domain,
5323 struct device *dev)
5325 struct device_domain_info *info;
5326 struct intel_iommu *iommu;
5327 unsigned long flags;
5329 if (!is_aux_domain(dev, &domain->domain))
5330 return;
5332 spin_lock_irqsave(&device_domain_lock, flags);
5333 info = dev->archdata.iommu;
5334 iommu = info->iommu;
5336 auxiliary_unlink_device(domain, dev);
5338 spin_lock(&iommu->lock);
5339 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5340 domain_detach_iommu(domain, iommu);
5341 spin_unlock(&iommu->lock);
5343 spin_unlock_irqrestore(&device_domain_lock, flags);
5346 static int prepare_domain_attach_device(struct iommu_domain *domain,
5347 struct device *dev)
5349 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5350 struct intel_iommu *iommu;
5351 int addr_width;
5352 u8 bus, devfn;
5354 iommu = device_to_iommu(dev, &bus, &devfn);
5355 if (!iommu)
5356 return -ENODEV;
5358 /* check if this iommu agaw is sufficient for max mapped address */
5359 addr_width = agaw_to_width(iommu->agaw);
5360 if (addr_width > cap_mgaw(iommu->cap))
5361 addr_width = cap_mgaw(iommu->cap);
5363 if (dmar_domain->max_addr > (1LL << addr_width)) {
5364 dev_err(dev, "%s: iommu width (%d) is not "
5365 "sufficient for the mapped address (%llx)\n",
5366 __func__, addr_width, dmar_domain->max_addr);
5367 return -EFAULT;
5369 dmar_domain->gaw = addr_width;
5372 * Knock out extra levels of page tables if necessary
5374 while (iommu->agaw < dmar_domain->agaw) {
5375 struct dma_pte *pte;
5377 pte = dmar_domain->pgd;
5378 if (dma_pte_present(pte)) {
5379 dmar_domain->pgd = (struct dma_pte *)
5380 phys_to_virt(dma_pte_addr(pte));
5381 free_pgtable_page(pte);
5383 dmar_domain->agaw--;
5386 return 0;
5389 static int intel_iommu_attach_device(struct iommu_domain *domain,
5390 struct device *dev)
5392 int ret;
5394 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5395 device_is_rmrr_locked(dev)) {
5396 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5397 return -EPERM;
5400 if (is_aux_domain(dev, domain))
5401 return -EPERM;
5403 /* normally dev is not mapped */
5404 if (unlikely(domain_context_mapped(dev))) {
5405 struct dmar_domain *old_domain;
5407 old_domain = find_domain(dev);
5408 if (old_domain)
5409 dmar_remove_one_dev_info(dev);
5412 ret = prepare_domain_attach_device(domain, dev);
5413 if (ret)
5414 return ret;
5416 return domain_add_dev_info(to_dmar_domain(domain), dev);
5419 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5420 struct device *dev)
5422 int ret;
5424 if (!is_aux_domain(dev, domain))
5425 return -EPERM;
5427 ret = prepare_domain_attach_device(domain, dev);
5428 if (ret)
5429 return ret;
5431 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5434 static void intel_iommu_detach_device(struct iommu_domain *domain,
5435 struct device *dev)
5437 dmar_remove_one_dev_info(dev);
5440 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5441 struct device *dev)
5443 aux_domain_remove_dev(to_dmar_domain(domain), dev);
5446 static int intel_iommu_map(struct iommu_domain *domain,
5447 unsigned long iova, phys_addr_t hpa,
5448 size_t size, int iommu_prot)
5450 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5451 u64 max_addr;
5452 int prot = 0;
5453 int ret;
5455 if (iommu_prot & IOMMU_READ)
5456 prot |= DMA_PTE_READ;
5457 if (iommu_prot & IOMMU_WRITE)
5458 prot |= DMA_PTE_WRITE;
5459 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5460 prot |= DMA_PTE_SNP;
5462 max_addr = iova + size;
5463 if (dmar_domain->max_addr < max_addr) {
5464 u64 end;
5466 /* check if minimum agaw is sufficient for mapped address */
5467 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5468 if (end < max_addr) {
5469 pr_err("%s: iommu width (%d) is not "
5470 "sufficient for the mapped address (%llx)\n",
5471 __func__, dmar_domain->gaw, max_addr);
5472 return -EFAULT;
5474 dmar_domain->max_addr = max_addr;
5476 /* Round up size to next multiple of VTD_PAGE_SIZE, if it and
5477 the low bits of hpa would take us onto the next page */
5478 size = aligned_nrpages(hpa, size);
5479 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5480 hpa >> VTD_PAGE_SHIFT, size, prot);
5481 return ret;
5484 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5485 unsigned long iova, size_t size,
5486 struct iommu_iotlb_gather *gather)
5488 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5489 struct page *freelist = NULL;
5490 unsigned long start_pfn, last_pfn;
5491 unsigned int npages;
5492 int iommu_id, level = 0;
5494 /* Cope with horrid API which requires us to unmap more than the
5495 size argument if it happens to be a large-page mapping. */
5496 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5498 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5499 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5501 start_pfn = iova >> VTD_PAGE_SHIFT;
5502 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5504 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5506 npages = last_pfn - start_pfn + 1;
5508 for_each_domain_iommu(iommu_id, dmar_domain)
5509 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5510 start_pfn, npages, !freelist, 0);
5512 dma_free_pagelist(freelist);
5514 if (dmar_domain->max_addr == iova + size)
5515 dmar_domain->max_addr = iova;
5517 return size;
5520 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5521 dma_addr_t iova)
5523 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5524 struct dma_pte *pte;
5525 int level = 0;
5526 u64 phys = 0;
5528 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5529 if (pte && dma_pte_present(pte))
5530 phys = dma_pte_addr(pte) +
5531 (iova & (BIT_MASK(level_to_offset_bits(level) +
5532 VTD_PAGE_SHIFT) - 1));
5534 return phys;
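/*
 * The map/unmap/iova_to_phys callbacks above are only reached through the
 * generic IOMMU API.  A minimal sketch of a caller mapping a single 4KiB
 * page into an unmanaged domain (error handling trimmed, 'dev' assumed to
 * sit behind an active VT-d unit):
 *
 *	struct iommu_domain *domain = iommu_domain_alloc(dev->bus);
 *
 *	iommu_attach_device(domain, dev);
 *	iommu_map(domain, iova, page_to_phys(page), SZ_4K,
 *		  IOMMU_READ | IOMMU_WRITE);
 *	WARN_ON(iommu_iova_to_phys(domain, iova) != page_to_phys(page));
 *	iommu_unmap(domain, iova, SZ_4K);
 *	iommu_detach_device(domain, dev);
 *	iommu_domain_free(domain);
 */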
5537 static inline bool scalable_mode_support(void)
5539 struct dmar_drhd_unit *drhd;
5540 struct intel_iommu *iommu;
5541 bool ret = true;
5543 rcu_read_lock();
5544 for_each_active_iommu(iommu, drhd) {
5545 if (!sm_supported(iommu)) {
5546 ret = false;
5547 break;
5550 rcu_read_unlock();
5552 return ret;
5555 static inline bool iommu_pasid_support(void)
5557 struct dmar_drhd_unit *drhd;
5558 struct intel_iommu *iommu;
5559 bool ret = true;
5561 rcu_read_lock();
5562 for_each_active_iommu(iommu, drhd) {
5563 if (!pasid_supported(iommu)) {
5564 ret = false;
5565 break;
5568 rcu_read_unlock();
5570 return ret;
5573 static bool intel_iommu_capable(enum iommu_cap cap)
5575 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5576 return domain_update_iommu_snooping(NULL) == 1;
5577 if (cap == IOMMU_CAP_INTR_REMAP)
5578 return irq_remapping_enabled == 1;
5580 return false;
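/*
 * intel_iommu_add_device() below reconciles the default domain the IOMMU
 * core picked for the group with what device_def_domain_type() wants for
 * this device: if they disagree it asks the core to switch the group's
 * domain type and, when that fails, falls back to a per-device private
 * identity or DMA domain and marks the original domain with
 * DOMAIN_FLAG_LOSE_CHILDREN.
 */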
5583 static int intel_iommu_add_device(struct device *dev)
5585 struct dmar_domain *dmar_domain;
5586 struct iommu_domain *domain;
5587 struct intel_iommu *iommu;
5588 struct iommu_group *group;
5589 u8 bus, devfn;
5590 int ret;
5592 iommu = device_to_iommu(dev, &bus, &devfn);
5593 if (!iommu)
5594 return -ENODEV;
5596 iommu_device_link(&iommu->iommu, dev);
5598 if (translation_pre_enabled(iommu))
5599 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5601 group = iommu_group_get_for_dev(dev);
5603 if (IS_ERR(group)) {
5604 ret = PTR_ERR(group);
5605 goto unlink;
5608 iommu_group_put(group);
5610 domain = iommu_get_domain_for_dev(dev);
5611 dmar_domain = to_dmar_domain(domain);
5612 if (domain->type == IOMMU_DOMAIN_DMA) {
5613 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5614 ret = iommu_request_dm_for_dev(dev);
5615 if (ret) {
5616 dmar_remove_one_dev_info(dev);
5617 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5618 domain_add_dev_info(si_domain, dev);
5619 dev_info(dev,
5620 "Device uses a private identity domain.\n");
5623 } else {
5624 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5625 ret = iommu_request_dma_domain_for_dev(dev);
5626 if (ret) {
5627 dmar_remove_one_dev_info(dev);
5628 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5629 if (!get_private_domain_for_dev(dev)) {
5630 dev_warn(dev,
5631 "Failed to get a private domain.\n");
5632 ret = -ENOMEM;
5633 goto unlink;
5636 dev_info(dev,
5637 "Device uses a private dma domain.\n");
5642 if (device_needs_bounce(dev)) {
5643 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5644 set_dma_ops(dev, &bounce_dma_ops);
5647 return 0;
5649 unlink:
5650 iommu_device_unlink(&iommu->iommu, dev);
5651 return ret;
5654 static void intel_iommu_remove_device(struct device *dev)
5656 struct intel_iommu *iommu;
5657 u8 bus, devfn;
5659 iommu = device_to_iommu(dev, &bus, &devfn);
5660 if (!iommu)
5661 return;
5663 dmar_remove_one_dev_info(dev);
5665 iommu_group_remove_device(dev);
5667 iommu_device_unlink(&iommu->iommu, dev);
5669 if (device_needs_bounce(dev))
5670 set_dma_ops(dev, NULL);
5673 static void intel_iommu_get_resv_regions(struct device *device,
5674 struct list_head *head)
5676 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5677 struct iommu_resv_region *reg;
5678 struct dmar_rmrr_unit *rmrr;
5679 struct device *i_dev;
5680 int i;
5682 down_read(&dmar_global_lock);
5683 for_each_rmrr_units(rmrr) {
5684 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5685 i, i_dev) {
5686 struct iommu_resv_region *resv;
5687 enum iommu_resv_type type;
5688 size_t length;
5690 if (i_dev != device &&
5691 !is_downstream_to_pci_bridge(device, i_dev))
5692 continue;
5694 length = rmrr->end_address - rmrr->base_address + 1;
5696 type = device_rmrr_is_relaxable(device) ?
5697 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5699 resv = iommu_alloc_resv_region(rmrr->base_address,
5700 length, prot, type);
5701 if (!resv)
5702 break;
5704 list_add_tail(&resv->list, head);
5707 up_read(&dmar_global_lock);
5709 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5710 if (dev_is_pci(device)) {
5711 struct pci_dev *pdev = to_pci_dev(device);
5713 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5714 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5715 IOMMU_RESV_DIRECT_RELAXABLE);
5716 if (reg)
5717 list_add_tail(&reg->list, head);
5720 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5722 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5723 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5724 0, IOMMU_RESV_MSI);
5725 if (!reg)
5726 return;
5727 list_add_tail(&reg->list, head);
5730 static void intel_iommu_put_resv_regions(struct device *dev,
5731 struct list_head *head)
5733 struct iommu_resv_region *entry, *next;
5735 list_for_each_entry_safe(entry, next, head, list)
5736 kfree(entry);
5739 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5741 struct device_domain_info *info;
5742 struct context_entry *context;
5743 struct dmar_domain *domain;
5744 unsigned long flags;
5745 u64 ctx_lo;
5746 int ret;
5748 domain = find_domain(dev);
5749 if (!domain)
5750 return -EINVAL;
5752 spin_lock_irqsave(&device_domain_lock, flags);
5753 spin_lock(&iommu->lock);
5755 ret = -EINVAL;
5756 info = dev->archdata.iommu;
5757 if (!info || !info->pasid_supported)
5758 goto out;
5760 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5761 if (WARN_ON(!context))
5762 goto out;
5764 ctx_lo = context[0].lo;
5766 if (!(ctx_lo & CONTEXT_PASIDE)) {
5767 ctx_lo |= CONTEXT_PASIDE;
5768 context[0].lo = ctx_lo;
5769 wmb();
5770 iommu->flush.flush_context(iommu,
5771 domain->iommu_did[iommu->seq_id],
5772 PCI_DEVID(info->bus, info->devfn),
5773 DMA_CCMD_MASK_NOBIT,
5774 DMA_CCMD_DEVICE_INVL);
5777 /* Enable PASID support in the device, if it wasn't already */
5778 if (!info->pasid_enabled)
5779 iommu_enable_dev_iotlb(info);
5781 ret = 0;
5783 out:
5784 spin_unlock(&iommu->lock);
5785 spin_unlock_irqrestore(&device_domain_lock, flags);
5787 return ret;
5790 static void intel_iommu_apply_resv_region(struct device *dev,
5791 struct iommu_domain *domain,
5792 struct iommu_resv_region *region)
5794 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5795 unsigned long start, end;
5797 start = IOVA_PFN(region->start);
5798 end = IOVA_PFN(region->start + region->length - 1);
5800 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5803 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5805 if (dev_is_pci(dev))
5806 return pci_device_group(dev);
5807 return generic_device_group(dev);
5810 #ifdef CONFIG_INTEL_IOMMU_SVM
5811 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5813 struct intel_iommu *iommu;
5814 u8 bus, devfn;
5816 if (iommu_dummy(dev)) {
5817 dev_warn(dev,
5818 "No IOMMU translation for device; cannot enable SVM\n");
5819 return NULL;
5822 iommu = device_to_iommu(dev, &bus, &devfn);
5823 if (!iommu) {
5824 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5825 return NULL;
5828 return iommu;
5830 #endif /* CONFIG_INTEL_IOMMU_SVM */
5832 static int intel_iommu_enable_auxd(struct device *dev)
5834 struct device_domain_info *info;
5835 struct intel_iommu *iommu;
5836 unsigned long flags;
5837 u8 bus, devfn;
5838 int ret;
5840 iommu = device_to_iommu(dev, &bus, &devfn);
5841 if (!iommu || dmar_disabled)
5842 return -EINVAL;
5844 if (!sm_supported(iommu) || !pasid_supported(iommu))
5845 return -EINVAL;
5847 ret = intel_iommu_enable_pasid(iommu, dev);
5848 if (ret)
5849 return -ENODEV;
5851 spin_lock_irqsave(&device_domain_lock, flags);
5852 info = dev->archdata.iommu;
5853 info->auxd_enabled = 1;
5854 spin_unlock_irqrestore(&device_domain_lock, flags);
5856 return 0;
5859 static int intel_iommu_disable_auxd(struct device *dev)
5861 struct device_domain_info *info;
5862 unsigned long flags;
5864 spin_lock_irqsave(&device_domain_lock, flags);
5865 info = dev->archdata.iommu;
5866 if (!WARN_ON(!info))
5867 info->auxd_enabled = 0;
5868 spin_unlock_irqrestore(&device_domain_lock, flags);
5870 return 0;
5874 * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
5875 * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
5876 * spec so that system software and tools can detect endpoint devices that
5877 * support Intel Scalable I/O Virtualization without a host driver dependency.
5879 * Returns the config space offset of the matching extended capability
5880 * structure within the device's PCI configuration space, or 0 if the
5881 * device does not support it.
5883 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5885 int pos;
5886 u16 vendor, id;
5888 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5889 while (pos) {
5890 pci_read_config_word(pdev, pos + 4, &vendor);
5891 pci_read_config_word(pdev, pos + 8, &id);
5892 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5893 return pos;
5895 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5898 return 0;
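/*
 * Layout assumed above: 0x23 is the PCIe Designated Vendor-Specific
 * Extended Capability (DVSEC) ID; the 16-bit vendor ID lives in DVSEC
 * header 1 at offset 4 and the 16-bit DVSEC ID in header 2 at offset 8,
 * so a DVSEC with vendor 8086h and ID 5 identifies an endpoint that
 * advertises scalable I/O virtualization per the spec referenced above.
 */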
5901 static bool
5902 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5904 if (feat == IOMMU_DEV_FEAT_AUX) {
5905 int ret;
5907 if (!dev_is_pci(dev) || dmar_disabled ||
5908 !scalable_mode_support() || !iommu_pasid_support())
5909 return false;
5911 ret = pci_pasid_features(to_pci_dev(dev));
5912 if (ret < 0)
5913 return false;
5915 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5918 return false;
5921 static int
5922 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5924 if (feat == IOMMU_DEV_FEAT_AUX)
5925 return intel_iommu_enable_auxd(dev);
5927 return -ENODEV;
5930 static int
5931 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5933 if (feat == IOMMU_DEV_FEAT_AUX)
5934 return intel_iommu_disable_auxd(dev);
5936 return -ENODEV;
5939 static bool
5940 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5942 struct device_domain_info *info = dev->archdata.iommu;
5944 if (feat == IOMMU_DEV_FEAT_AUX)
5945 return scalable_mode_support() && info && info->auxd_enabled;
5947 return false;
5950 static int
5951 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5953 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5955 return dmar_domain->default_pasid > 0 ?
5956 dmar_domain->default_pasid : -EINVAL;
5959 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5960 struct device *dev)
5962 return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
5966 * Check that the device does not live on an external facing PCI port that is
5967 * marked as untrusted. Such devices should not be able to apply quirks and
5968 * thus not be able to bypass the IOMMU restrictions.
5970 static bool risky_device(struct pci_dev *pdev)
5972 if (pdev->untrusted) {
5973 pci_info(pdev,
5974 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
5975 pdev->vendor, pdev->device);
5976 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
5977 return true;
5979 return false;
5982 const struct iommu_ops intel_iommu_ops = {
5983 .capable = intel_iommu_capable,
5984 .domain_alloc = intel_iommu_domain_alloc,
5985 .domain_free = intel_iommu_domain_free,
5986 .attach_dev = intel_iommu_attach_device,
5987 .detach_dev = intel_iommu_detach_device,
5988 .aux_attach_dev = intel_iommu_aux_attach_device,
5989 .aux_detach_dev = intel_iommu_aux_detach_device,
5990 .aux_get_pasid = intel_iommu_aux_get_pasid,
5991 .map = intel_iommu_map,
5992 .unmap = intel_iommu_unmap,
5993 .iova_to_phys = intel_iommu_iova_to_phys,
5994 .add_device = intel_iommu_add_device,
5995 .remove_device = intel_iommu_remove_device,
5996 .get_resv_regions = intel_iommu_get_resv_regions,
5997 .put_resv_regions = intel_iommu_put_resv_regions,
5998 .apply_resv_region = intel_iommu_apply_resv_region,
5999 .device_group = intel_iommu_device_group,
6000 .dev_has_feat = intel_iommu_dev_has_feat,
6001 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
6002 .dev_enable_feat = intel_iommu_dev_enable_feat,
6003 .dev_disable_feat = intel_iommu_dev_disable_feat,
6004 .is_attach_deferred = intel_iommu_is_attach_deferred,
6005 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
6008 static void quirk_iommu_igfx(struct pci_dev *dev)
6010 if (risky_device(dev))
6011 return;
6013 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6014 dmar_map_gfx = 0;
6017 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6018 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6019 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6020 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6021 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6022 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6023 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6024 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6026 /* Broadwell igfx malfunctions with dmar */
6027 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6028 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6029 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6030 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6031 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6032 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6033 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6034 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6035 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6036 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6037 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6038 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6039 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6040 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6041 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6042 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6043 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6044 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6045 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6046 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6047 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6048 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6049 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6050 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6052 static void quirk_iommu_rwbf(struct pci_dev *dev)
6054 if (risky_device(dev))
6055 return;
6058 * Mobile 4 Series Chipset neglects to set RWBF capability,
6059 * but needs it. Same seems to hold for the desktop versions.
6061 pci_info(dev, "Forcing write-buffer flush capability\n");
6062 rwbf_quirk = 1;
6065 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6066 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6067 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6068 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6069 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6070 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6071 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6073 #define GGC 0x52
6074 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
6075 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
6076 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
6077 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
6078 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
6079 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
6080 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
6081 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
6083 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6085 unsigned short ggc;
6087 if (risky_device(dev))
6088 return;
6090 if (pci_read_config_word(dev, GGC, &ggc))
6091 return;
6093 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6094 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6095 dmar_map_gfx = 0;
6096 } else if (dmar_map_gfx) {
6097 /* we have to ensure the gfx device is idle before we flush */
6098 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6099 intel_iommu_strict = 1;
6102 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6103 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6104 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6105 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6107 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6108 ISOCH DMAR unit for the Azalia sound device, but not give it any
6109 TLB entries, which causes it to deadlock. Check for that. We do
6110 this in a function called from init_dmars(), instead of in a PCI
6111 quirk, because we don't want to print the obnoxious "BIOS broken"
6112 message if VT-d is actually disabled.
6114 static void __init check_tylersburg_isoch(void)
6116 struct pci_dev *pdev;
6117 uint32_t vtisochctrl;
6119 /* If there's no Azalia in the system anyway, forget it. */
6120 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6121 if (!pdev)
6122 return;
6124 if (risky_device(pdev)) {
6125 pci_dev_put(pdev);
6126 return;
6129 pci_dev_put(pdev);
6131 /* System Management Registers. Might be hidden, in which case
6132 we can't do the sanity check. But that's OK, because the
6133 known-broken BIOSes _don't_ actually hide it, so far. */
6134 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6135 if (!pdev)
6136 return;
6138 if (risky_device(pdev)) {
6139 pci_dev_put(pdev);
6140 return;
6143 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6144 pci_dev_put(pdev);
6145 return;
6148 pci_dev_put(pdev);
6150 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6151 if (vtisochctrl & 1)
6152 return;
6154 /* Drop all bits other than the number of TLB entries */
6155 vtisochctrl &= 0x1c;
6157 /* If we have the recommended number of TLB entries (16), fine. */
6158 if (vtisochctrl == 0x10)
6159 return;
6161 /* Zero TLB entries? You get to ride the short bus to school. */
6162 if (!vtisochctrl) {
6163 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6164 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6165 dmi_get_system_info(DMI_BIOS_VENDOR),
6166 dmi_get_system_info(DMI_BIOS_VERSION),
6167 dmi_get_system_info(DMI_PRODUCT_VERSION));
6168 iommu_identity_mapping |= IDENTMAP_AZALIA;
6169 return;
6172 pr_warn("Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6173 vtisochctrl);