drivers/iommu/intel-iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright © 2006-2014 Intel Corporation.
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
53 #define ROOT_SIZE VTD_PAGE_SIZE
54 #define CONTEXT_SIZE VTD_PAGE_SIZE
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61 #define IOAPIC_RANGE_START (0xfee00000)
62 #define IOAPIC_RANGE_END (0xfeefffff)
63 #define IOVA_START_ADDR (0x1000)
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN (1)
82 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
84 /* page table handling */
85 #define LEVEL_STRIDE (9)
86 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
89 * This bitmap is used to advertise the page sizes our hardware supports
90 * to the IOMMU core, which will then use this information to split
91 * physically contiguous memory regions it is mapping into page sizes
92 * that we support.
94 * Traditionally the IOMMU core just handed us the mappings directly,
95 * after making sure the size is an order of a 4KiB page and that the
96 * mapping has natural alignment.
98 * To retain this behavior, we currently advertise that we support
99 * all page sizes that are an order of 4KiB.
101 * If at some point we'd like to utilize the IOMMU core's new behavior,
102 * we could change this to advertise the real page sizes we support.
104 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
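/*
 * Note: bit n of this bitmap advertises support for page size 2^n bytes,
 * so ~0xFFFUL claims every power-of-two size of 4KiB and above. The real
 * superpage level is still chosen per mapping by hardware_largepage_caps().
 */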
106 static inline int agaw_to_level(int agaw)
108 return agaw + 2;
111 static inline int agaw_to_width(int agaw)
113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
116 static inline int width_to_agaw(int width)
118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
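/*
 * Worked example for the helpers above: each AGAW step adds one 9-bit
 * page-table level on top of a 30-bit base, so agaw 1 is a 39-bit,
 * 3-level table; agaw 2 is 48-bit, 4-level; agaw 3 is 57-bit, 5-level
 * (widths are capped at MAX_AGAW_WIDTH).
 */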
121 static inline unsigned int level_to_offset_bits(int level)
123 return (level - 1) * LEVEL_STRIDE;
126 static inline int pfn_level_offset(unsigned long pfn, int level)
128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
131 static inline unsigned long level_mask(int level)
133 return -1UL << level_to_offset_bits(level);
136 static inline unsigned long level_size(int level)
138 return 1UL << level_to_offset_bits(level);
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
143 return (pfn + level_size(level) - 1) & level_mask(level);
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
148 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
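/*
 * Worked example: level 1 PTEs start at bit 0 of the PFN and map 4KiB
 * each, level 2 starts at bit 9 (2MiB superpages) and level 3 at bit 18
 * (1GiB superpages); lvl_to_nr_pages() accordingly returns 1, 512 and
 * 262144 4KiB pages for those levels.
 */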
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152 are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
164 return mm_to_dma_pfn(page_to_pfn(pg));
166 static inline unsigned long virt_to_dma_pfn(void *p)
168 return page_to_dma_pfn(virt_to_page(p));
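/*
 * Note: on x86, PAGE_SHIFT and VTD_PAGE_SHIFT are both 12, so the
 * mm <-> dma PFN conversions above are identities; they only change
 * anything if the CPU page size exceeds the 4KiB VT-d page size.
 */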
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
178 * set to 1 to panic the kernel if VT-d can't be successfully enabled
179 * (used when kernel is launched w/ TXT)
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189 * if marked present.
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
193 if (!(re->lo & 1))
194 return 0;
196 return re->lo & VTD_PAGE_MASK;
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201 * if marked present.
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
205 if (!(re->hi & 1))
206 return 0;
208 return re->hi & VTD_PAGE_MASK;
211 static inline void context_clear_pasid_enable(struct context_entry *context)
213 context->lo &= ~(1ULL << 11);
216 static inline bool context_pasid_enabled(struct context_entry *context)
218 return !!(context->lo & (1ULL << 11));
221 static inline void context_set_copied(struct context_entry *context)
223 context->hi |= (1ull << 3);
226 static inline bool context_copied(struct context_entry *context)
228 return !!(context->hi & (1ULL << 3));
231 static inline bool __context_present(struct context_entry *context)
233 return (context->lo & 1);
236 bool context_present(struct context_entry *context)
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
243 static inline void context_set_present(struct context_entry *context)
245 context->lo |= 1;
248 static inline void context_set_fault_enable(struct context_entry *context)
250 context->lo &= (((u64)-1) << 2) | 1;
253 static inline void context_set_translation_type(struct context_entry *context,
254 unsigned long value)
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
260 static inline void context_set_address_root(struct context_entry *context,
261 unsigned long value)
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
267 static inline void context_set_address_width(struct context_entry *context,
268 unsigned long value)
270 context->hi |= value & 7;
273 static inline void context_set_domain_id(struct context_entry *context,
274 unsigned long value)
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
279 static inline int context_domain_id(struct context_entry *c)
281 return((c->hi >> 8) & 0xffff);
284 static inline void context_clear_entry(struct context_entry *context)
286 context->lo = 0;
287 context->hi = 0;
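/*
 * Summary of the legacy context-entry bits as used by the helpers above:
 * lo bit 0 = present, bit 1 = fault-processing disable (cleared by
 * context_set_fault_enable()), bits 2-3 = translation type, bits 12-63 =
 * address root; hi bits 0-2 = address width, bits 8-23 = domain id.
 * Bit 11 of lo and bit 3 of hi are used by this driver to track the
 * PASID-enable and "copied from a previous kernel" states respectively.
 */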
291 * This domain is a static identity mapping domain.
292 * 1. This domain creates a static 1:1 mapping to all usable memory.
293 * 2. It maps to each iommu if successful.
294 * 3. Each iommu maps to this domain if successful.
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
299 /* si_domain contains multiple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY BIT(0)
303 * This is a DMA domain allocated through the iommu domain allocation
304 * interface. But one or more devices belonging to this domain have
305 * been chosen to use a private domain. We should avoid using the
306 * map/unmap/iova_to_phys APIs on it.
308 #define DOMAIN_FLAG_LOSE_CHILDREN BIT(1)
310 #define for_each_domain_iommu(idx, domain) \
311 for (idx = 0; idx < g_num_of_iommus; idx++) \
312 if (domain->iommu_refcnt[idx])
314 struct dmar_rmrr_unit {
315 struct list_head list; /* list of rmrr units */
316 struct acpi_dmar_header *hdr; /* ACPI header */
317 u64 base_address; /* reserved base address*/
318 u64 end_address; /* reserved end address */
319 struct dmar_dev_scope *devices; /* target devices */
320 int devices_cnt; /* target device count */
323 struct dmar_atsr_unit {
324 struct list_head list; /* list of ATSR units */
325 struct acpi_dmar_header *hdr; /* ACPI header */
326 struct dmar_dev_scope *devices; /* target devices */
327 int devices_cnt; /* target device count */
328 u8 include_all:1; /* include all ports */
331 static LIST_HEAD(dmar_atsr_units);
332 static LIST_HEAD(dmar_rmrr_units);
334 #define for_each_rmrr_units(rmrr) \
335 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
337 /* bitmap for indexing intel_iommus */
338 static int g_num_of_iommus;
340 static void domain_exit(struct dmar_domain *domain);
341 static void domain_remove_dev_info(struct dmar_domain *domain);
342 static void dmar_remove_one_dev_info(struct device *dev);
343 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
344 static void domain_context_clear(struct intel_iommu *iommu,
345 struct device *dev);
346 static int domain_detach_iommu(struct dmar_domain *domain,
347 struct intel_iommu *iommu);
348 static bool device_is_rmrr_locked(struct device *dev);
349 static int intel_iommu_attach_device(struct iommu_domain *domain,
350 struct device *dev);
351 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
352 dma_addr_t iova);
354 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
355 int dmar_disabled = 0;
356 #else
357 int dmar_disabled = 1;
358 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
360 int intel_iommu_sm;
361 int intel_iommu_enabled = 0;
362 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
364 static int dmar_map_gfx = 1;
365 static int dmar_forcedac;
366 static int intel_iommu_strict;
367 static int intel_iommu_superpage = 1;
368 static int iommu_identity_mapping;
369 static int intel_no_bounce;
371 #define IDENTMAP_ALL 1
372 #define IDENTMAP_GFX 2
373 #define IDENTMAP_AZALIA 4
375 int intel_iommu_gfx_mapped;
376 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
378 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
379 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
380 static DEFINE_SPINLOCK(device_domain_lock);
381 static LIST_HEAD(device_domain_list);
383 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
384 to_pci_dev(d)->untrusted)
387 * Iterate over elements in device_domain_list and call the specified
388 * callback @fn against each element.
390 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
391 void *data), void *data)
393 int ret = 0;
394 unsigned long flags;
395 struct device_domain_info *info;
397 spin_lock_irqsave(&device_domain_lock, flags);
398 list_for_each_entry(info, &device_domain_list, global) {
399 ret = fn(info, data);
400 if (ret) {
401 spin_unlock_irqrestore(&device_domain_lock, flags);
402 return ret;
405 spin_unlock_irqrestore(&device_domain_lock, flags);
407 return 0;
410 const struct iommu_ops intel_iommu_ops;
412 static bool translation_pre_enabled(struct intel_iommu *iommu)
414 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
417 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
419 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
422 static void init_translation_status(struct intel_iommu *iommu)
424 u32 gsts;
426 gsts = readl(iommu->reg + DMAR_GSTS_REG);
427 if (gsts & DMA_GSTS_TES)
428 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
431 /* Convert generic 'struct iommu_domain' to private 'struct dmar_domain' */
432 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
434 return container_of(dom, struct dmar_domain, domain);
437 static int __init intel_iommu_setup(char *str)
439 if (!str)
440 return -EINVAL;
441 while (*str) {
442 if (!strncmp(str, "on", 2)) {
443 dmar_disabled = 0;
444 pr_info("IOMMU enabled\n");
445 } else if (!strncmp(str, "off", 3)) {
446 dmar_disabled = 1;
447 no_platform_optin = 1;
448 pr_info("IOMMU disabled\n");
449 } else if (!strncmp(str, "igfx_off", 8)) {
450 dmar_map_gfx = 0;
451 pr_info("Disable GFX device mapping\n");
452 } else if (!strncmp(str, "forcedac", 8)) {
453 pr_info("Forcing DAC for PCI devices\n");
454 dmar_forcedac = 1;
455 } else if (!strncmp(str, "strict", 6)) {
456 pr_info("Disable batched IOTLB flush\n");
457 intel_iommu_strict = 1;
458 } else if (!strncmp(str, "sp_off", 6)) {
459 pr_info("Disable supported super page\n");
460 intel_iommu_superpage = 0;
461 } else if (!strncmp(str, "sm_on", 5)) {
462 pr_info("Intel-IOMMU: scalable mode supported\n");
463 intel_iommu_sm = 1;
464 } else if (!strncmp(str, "tboot_noforce", 13)) {
465 printk(KERN_INFO
466 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
467 intel_iommu_tboot_noforce = 1;
468 } else if (!strncmp(str, "nobounce", 8)) {
469 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
470 intel_no_bounce = 1;
473 str += strcspn(str, ",");
474 while (*str == ',')
475 str++;
477 return 0;
479 __setup("intel_iommu=", intel_iommu_setup);
481 static struct kmem_cache *iommu_domain_cache;
482 static struct kmem_cache *iommu_devinfo_cache;
484 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
486 struct dmar_domain **domains;
487 int idx = did >> 8;
489 domains = iommu->domains[idx];
490 if (!domains)
491 return NULL;
493 return domains[did & 0xff];
496 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
497 struct dmar_domain *domain)
499 struct dmar_domain **domains;
500 int idx = did >> 8;
502 if (!iommu->domains[idx]) {
503 size_t size = 256 * sizeof(struct dmar_domain *);
504 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
507 domains = iommu->domains[idx];
508 if (WARN_ON(!domains))
509 return;
510 else
511 domains[did & 0xff] = domain;
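/*
 * Note: iommu->domains is a lazily populated two-level array of domain
 * pointers. The high byte of the domain id (did >> 8) selects a
 * 256-entry chunk, allocated on first use above, and the low byte
 * (did & 0xff) selects the slot within that chunk.
 */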
514 void *alloc_pgtable_page(int node)
516 struct page *page;
517 void *vaddr = NULL;
519 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
520 if (page)
521 vaddr = page_address(page);
522 return vaddr;
525 void free_pgtable_page(void *vaddr)
527 free_page((unsigned long)vaddr);
530 static inline void *alloc_domain_mem(void)
532 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
535 static void free_domain_mem(void *vaddr)
537 kmem_cache_free(iommu_domain_cache, vaddr);
540 static inline void * alloc_devinfo_mem(void)
542 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
545 static inline void free_devinfo_mem(void *vaddr)
547 kmem_cache_free(iommu_devinfo_cache, vaddr);
550 static inline int domain_type_is_si(struct dmar_domain *domain)
552 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
555 static inline int domain_pfn_supported(struct dmar_domain *domain,
556 unsigned long pfn)
558 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
560 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
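/*
 * Worked example: a domain with agaw 2 has a 48-bit address width, so
 * addr_width is 48 - 12 = 36 and any IOVA PFN that fits in 36 bits
 * passes the check above; larger PFNs are rejected.
 */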
563 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
565 unsigned long sagaw;
566 int agaw = -1;
568 sagaw = cap_sagaw(iommu->cap);
569 for (agaw = width_to_agaw(max_gaw);
570 agaw >= 0; agaw--) {
571 if (test_bit(agaw, &sagaw))
572 break;
575 return agaw;
579 * Calculate max SAGAW for each iommu.
581 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
583 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
587 * calculate agaw for each iommu.
588 * "SAGAW" may be different across iommus, use a default agaw, and
589 * get a supported smaller agaw for iommus that don't support the default agaw.
591 int iommu_calculate_agaw(struct intel_iommu *iommu)
593 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
596 /* This function only returns a single iommu in a domain */
597 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
599 int iommu_id;
601 /* si_domain and vm domain should not get here. */
602 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
603 return NULL;
605 for_each_domain_iommu(iommu_id, domain)
606 break;
608 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
609 return NULL;
611 return g_iommus[iommu_id];
614 static void domain_update_iommu_coherency(struct dmar_domain *domain)
616 struct dmar_drhd_unit *drhd;
617 struct intel_iommu *iommu;
618 bool found = false;
619 int i;
621 domain->iommu_coherency = 1;
623 for_each_domain_iommu(i, domain) {
624 found = true;
625 if (!ecap_coherent(g_iommus[i]->ecap)) {
626 domain->iommu_coherency = 0;
627 break;
630 if (found)
631 return;
633 /* No hardware attached; use lowest common denominator */
634 rcu_read_lock();
635 for_each_active_iommu(iommu, drhd) {
636 if (!ecap_coherent(iommu->ecap)) {
637 domain->iommu_coherency = 0;
638 break;
641 rcu_read_unlock();
644 static int domain_update_iommu_snooping(struct intel_iommu *skip)
646 struct dmar_drhd_unit *drhd;
647 struct intel_iommu *iommu;
648 int ret = 1;
650 rcu_read_lock();
651 for_each_active_iommu(iommu, drhd) {
652 if (iommu != skip) {
653 if (!ecap_sc_support(iommu->ecap)) {
654 ret = 0;
655 break;
659 rcu_read_unlock();
661 return ret;
664 static int domain_update_iommu_superpage(struct intel_iommu *skip)
666 struct dmar_drhd_unit *drhd;
667 struct intel_iommu *iommu;
668 int mask = 0xf;
670 if (!intel_iommu_superpage) {
671 return 0;
674 /* set iommu_superpage to the smallest common denominator */
675 rcu_read_lock();
676 for_each_active_iommu(iommu, drhd) {
677 if (iommu != skip) {
678 mask &= cap_super_page_val(iommu->cap);
679 if (!mask)
680 break;
683 rcu_read_unlock();
685 return fls(mask);
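/*
 * Note (assuming the usual VT-d SLLPS encoding): bit 0 of
 * cap_super_page_val() advertises 2MiB pages and bit 1 advertises 1GiB
 * pages, so fls(mask) ends up as 0 (no superpages), 1 (up to 2MiB) or
 * 2 (up to 1GiB) once every active IOMMU has been AND-ed in.
 */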
688 /* Some capabilities may be different across iommus */
689 static void domain_update_iommu_cap(struct dmar_domain *domain)
691 domain_update_iommu_coherency(domain);
692 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
693 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
696 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
697 u8 devfn, int alloc)
699 struct root_entry *root = &iommu->root_entry[bus];
700 struct context_entry *context;
701 u64 *entry;
703 entry = &root->lo;
704 if (sm_supported(iommu)) {
705 if (devfn >= 0x80) {
706 devfn -= 0x80;
707 entry = &root->hi;
709 devfn *= 2;
711 if (*entry & 1)
712 context = phys_to_virt(*entry & VTD_PAGE_MASK);
713 else {
714 unsigned long phy_addr;
715 if (!alloc)
716 return NULL;
718 context = alloc_pgtable_page(iommu->node);
719 if (!context)
720 return NULL;
722 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
723 phy_addr = virt_to_phys((void *)context);
724 *entry = phy_addr | 1;
725 __iommu_flush_cache(iommu, entry, sizeof(*entry));
727 return &context[devfn];
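/*
 * Note, derived from the code above: in scalable mode each 64-bit half
 * of the root entry points at a separate context table -- root->lo for
 * devfn 0x00-0x7f and root->hi for devfn 0x80-0xff -- and scalable-mode
 * context entries are twice the legacy size, hence devfn is doubled
 * before indexing the table.
 */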
730 static int iommu_dummy(struct device *dev)
732 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
736 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
737 * sub-hierarchy of a candidate PCI-PCI bridge
738 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
739 * @bridge: the candidate PCI-PCI bridge
741 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
743 static bool
744 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
746 struct pci_dev *pdev, *pbridge;
748 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
749 return false;
751 pdev = to_pci_dev(dev);
752 pbridge = to_pci_dev(bridge);
754 if (pbridge->subordinate &&
755 pbridge->subordinate->number <= pdev->bus->number &&
756 pbridge->subordinate->busn_res.end >= pdev->bus->number)
757 return true;
759 return false;
762 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
764 struct dmar_drhd_unit *drhd = NULL;
765 struct intel_iommu *iommu;
766 struct device *tmp;
767 struct pci_dev *pdev = NULL;
768 u16 segment = 0;
769 int i;
771 if (iommu_dummy(dev))
772 return NULL;
774 if (dev_is_pci(dev)) {
775 struct pci_dev *pf_pdev;
777 pdev = pci_real_dma_dev(to_pci_dev(dev));
779 /* VFs aren't listed in scope tables; we need to look up
780 * the PF instead to find the IOMMU. */
781 pf_pdev = pci_physfn(pdev);
782 dev = &pf_pdev->dev;
783 segment = pci_domain_nr(pdev->bus);
784 } else if (has_acpi_companion(dev))
785 dev = &ACPI_COMPANION(dev)->dev;
787 rcu_read_lock();
788 for_each_active_iommu(iommu, drhd) {
789 if (pdev && segment != drhd->segment)
790 continue;
792 for_each_active_dev_scope(drhd->devices,
793 drhd->devices_cnt, i, tmp) {
794 if (tmp == dev) {
795 /* For a VF use its original BDF# not that of the PF
796 * which we used for the IOMMU lookup. Strictly speaking
797 * we could do this for all PCI devices; we only need to
798 * get the BDF# from the scope table for ACPI matches. */
799 if (pdev && pdev->is_virtfn)
800 goto got_pdev;
802 *bus = drhd->devices[i].bus;
803 *devfn = drhd->devices[i].devfn;
804 goto out;
807 if (is_downstream_to_pci_bridge(dev, tmp))
808 goto got_pdev;
811 if (pdev && drhd->include_all) {
812 got_pdev:
813 *bus = pdev->bus->number;
814 *devfn = pdev->devfn;
815 goto out;
818 iommu = NULL;
819 out:
820 rcu_read_unlock();
822 return iommu;
825 static void domain_flush_cache(struct dmar_domain *domain,
826 void *addr, int size)
828 if (!domain->iommu_coherency)
829 clflush_cache_range(addr, size);
832 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
834 struct context_entry *context;
835 int ret = 0;
836 unsigned long flags;
838 spin_lock_irqsave(&iommu->lock, flags);
839 context = iommu_context_addr(iommu, bus, devfn, 0);
840 if (context)
841 ret = context_present(context);
842 spin_unlock_irqrestore(&iommu->lock, flags);
843 return ret;
846 static void free_context_table(struct intel_iommu *iommu)
848 int i;
849 unsigned long flags;
850 struct context_entry *context;
852 spin_lock_irqsave(&iommu->lock, flags);
853 if (!iommu->root_entry) {
854 goto out;
856 for (i = 0; i < ROOT_ENTRY_NR; i++) {
857 context = iommu_context_addr(iommu, i, 0, 0);
858 if (context)
859 free_pgtable_page(context);
861 if (!sm_supported(iommu))
862 continue;
864 context = iommu_context_addr(iommu, i, 0x80, 0);
865 if (context)
866 free_pgtable_page(context);
869 free_pgtable_page(iommu->root_entry);
870 iommu->root_entry = NULL;
871 out:
872 spin_unlock_irqrestore(&iommu->lock, flags);
875 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
876 unsigned long pfn, int *target_level)
878 struct dma_pte *parent, *pte;
879 int level = agaw_to_level(domain->agaw);
880 int offset;
882 BUG_ON(!domain->pgd);
884 if (!domain_pfn_supported(domain, pfn))
885 /* Address beyond IOMMU's addressing capabilities. */
886 return NULL;
888 parent = domain->pgd;
890 while (1) {
891 void *tmp_page;
893 offset = pfn_level_offset(pfn, level);
894 pte = &parent[offset];
895 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
896 break;
897 if (level == *target_level)
898 break;
900 if (!dma_pte_present(pte)) {
901 uint64_t pteval;
903 tmp_page = alloc_pgtable_page(domain->nid);
905 if (!tmp_page)
906 return NULL;
908 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
909 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
910 if (cmpxchg64(&pte->val, 0ULL, pteval))
911 /* Someone else set it while we were thinking; use theirs. */
912 free_pgtable_page(tmp_page);
913 else
914 domain_flush_cache(domain, pte, sizeof(*pte));
916 if (level == 1)
917 break;
919 parent = phys_to_virt(dma_pte_addr(pte));
920 level--;
923 if (!*target_level)
924 *target_level = level;
926 return pte;
929 /* return address's pte at specific level */
930 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
931 unsigned long pfn,
932 int level, int *large_page)
934 struct dma_pte *parent, *pte;
935 int total = agaw_to_level(domain->agaw);
936 int offset;
938 parent = domain->pgd;
939 while (level <= total) {
940 offset = pfn_level_offset(pfn, total);
941 pte = &parent[offset];
942 if (level == total)
943 return pte;
945 if (!dma_pte_present(pte)) {
946 *large_page = total;
947 break;
950 if (dma_pte_superpage(pte)) {
951 *large_page = total;
952 return pte;
955 parent = phys_to_virt(dma_pte_addr(pte));
956 total--;
958 return NULL;
961 /* clear last level pte, a tlb flush should be followed */
962 static void dma_pte_clear_range(struct dmar_domain *domain,
963 unsigned long start_pfn,
964 unsigned long last_pfn)
966 unsigned int large_page;
967 struct dma_pte *first_pte, *pte;
969 BUG_ON(!domain_pfn_supported(domain, start_pfn));
970 BUG_ON(!domain_pfn_supported(domain, last_pfn));
971 BUG_ON(start_pfn > last_pfn);
973 /* we don't need lock here; nobody else touches the iova range */
974 do {
975 large_page = 1;
976 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
977 if (!pte) {
978 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
979 continue;
981 do {
982 dma_clear_pte(pte);
983 start_pfn += lvl_to_nr_pages(large_page);
984 pte++;
985 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
987 domain_flush_cache(domain, first_pte,
988 (void *)pte - (void *)first_pte);
990 } while (start_pfn && start_pfn <= last_pfn);
993 static void dma_pte_free_level(struct dmar_domain *domain, int level,
994 int retain_level, struct dma_pte *pte,
995 unsigned long pfn, unsigned long start_pfn,
996 unsigned long last_pfn)
998 pfn = max(start_pfn, pfn);
999 pte = &pte[pfn_level_offset(pfn, level)];
1001 do {
1002 unsigned long level_pfn;
1003 struct dma_pte *level_pte;
1005 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1006 goto next;
1008 level_pfn = pfn & level_mask(level);
1009 level_pte = phys_to_virt(dma_pte_addr(pte));
1011 if (level > 2) {
1012 dma_pte_free_level(domain, level - 1, retain_level,
1013 level_pte, level_pfn, start_pfn,
1014 last_pfn);
1018 * Free the page table if we're below the level we want to
1019 * retain and the range covers the entire table.
1021 if (level < retain_level && !(start_pfn > level_pfn ||
1022 last_pfn < level_pfn + level_size(level) - 1)) {
1023 dma_clear_pte(pte);
1024 domain_flush_cache(domain, pte, sizeof(*pte));
1025 free_pgtable_page(level_pte);
1027 next:
1028 pfn += level_size(level);
1029 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1033 * clear last level (leaf) ptes and free page table pages below the
1034 * level we wish to keep intact.
1036 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1037 unsigned long start_pfn,
1038 unsigned long last_pfn,
1039 int retain_level)
1041 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1042 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1043 BUG_ON(start_pfn > last_pfn);
1045 dma_pte_clear_range(domain, start_pfn, last_pfn);
1047 /* We don't need lock here; nobody else touches the iova range */
1048 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1049 domain->pgd, 0, start_pfn, last_pfn);
1051 /* free pgd */
1052 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1053 free_pgtable_page(domain->pgd);
1054 domain->pgd = NULL;
1058 /* When a page at a given level is being unlinked from its parent, we don't
1059 need to *modify* it at all. All we need to do is make a list of all the
1060 pages which can be freed just as soon as we've flushed the IOTLB and we
1061 know the hardware page-walk will no longer touch them.
1062 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1063 be freed. */
1064 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1065 int level, struct dma_pte *pte,
1066 struct page *freelist)
1068 struct page *pg;
1070 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1071 pg->freelist = freelist;
1072 freelist = pg;
1074 if (level == 1)
1075 return freelist;
1077 pte = page_address(pg);
1078 do {
1079 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1080 freelist = dma_pte_list_pagetables(domain, level - 1,
1081 pte, freelist);
1082 pte++;
1083 } while (!first_pte_in_page(pte));
1085 return freelist;
1088 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1089 struct dma_pte *pte, unsigned long pfn,
1090 unsigned long start_pfn,
1091 unsigned long last_pfn,
1092 struct page *freelist)
1094 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1096 pfn = max(start_pfn, pfn);
1097 pte = &pte[pfn_level_offset(pfn, level)];
1099 do {
1100 unsigned long level_pfn;
1102 if (!dma_pte_present(pte))
1103 goto next;
1105 level_pfn = pfn & level_mask(level);
1107 /* If range covers entire pagetable, free it */
1108 if (start_pfn <= level_pfn &&
1109 last_pfn >= level_pfn + level_size(level) - 1) {
1110 /* These subordinate page tables are going away entirely. Don't
1111 bother to clear them; we're just going to *free* them. */
1112 if (level > 1 && !dma_pte_superpage(pte))
1113 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1115 dma_clear_pte(pte);
1116 if (!first_pte)
1117 first_pte = pte;
1118 last_pte = pte;
1119 } else if (level > 1) {
1120 /* Recurse down into a level that isn't *entirely* obsolete */
1121 freelist = dma_pte_clear_level(domain, level - 1,
1122 phys_to_virt(dma_pte_addr(pte)),
1123 level_pfn, start_pfn, last_pfn,
1124 freelist);
1126 next:
1127 pfn += level_size(level);
1128 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1130 if (first_pte)
1131 domain_flush_cache(domain, first_pte,
1132 (void *)++last_pte - (void *)first_pte);
1134 return freelist;
1137 /* We can't just free the pages because the IOMMU may still be walking
1138 the page tables, and may have cached the intermediate levels. The
1139 pages can only be freed after the IOTLB flush has been done. */
1140 static struct page *domain_unmap(struct dmar_domain *domain,
1141 unsigned long start_pfn,
1142 unsigned long last_pfn)
1144 struct page *freelist;
1146 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1147 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1148 BUG_ON(start_pfn > last_pfn);
1150 /* we don't need lock here; nobody else touches the iova range */
1151 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1152 domain->pgd, 0, start_pfn, last_pfn, NULL);
1154 /* free pgd */
1155 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1156 struct page *pgd_page = virt_to_page(domain->pgd);
1157 pgd_page->freelist = freelist;
1158 freelist = pgd_page;
1160 domain->pgd = NULL;
1163 return freelist;
1166 static void dma_free_pagelist(struct page *freelist)
1168 struct page *pg;
1170 while ((pg = freelist)) {
1171 freelist = pg->freelist;
1172 free_pgtable_page(page_address(pg));
1176 static void iova_entry_free(unsigned long data)
1178 struct page *freelist = (struct page *)data;
1180 dma_free_pagelist(freelist);
1183 /* iommu handling */
1184 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1186 struct root_entry *root;
1187 unsigned long flags;
1189 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1190 if (!root) {
1191 pr_err("Allocating root entry for %s failed\n",
1192 iommu->name);
1193 return -ENOMEM;
1196 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1198 spin_lock_irqsave(&iommu->lock, flags);
1199 iommu->root_entry = root;
1200 spin_unlock_irqrestore(&iommu->lock, flags);
1202 return 0;
1205 static void iommu_set_root_entry(struct intel_iommu *iommu)
1207 u64 addr;
1208 u32 sts;
1209 unsigned long flag;
1211 addr = virt_to_phys(iommu->root_entry);
1212 if (sm_supported(iommu))
1213 addr |= DMA_RTADDR_SMT;
1215 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1216 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1218 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1220 /* Make sure hardware completes it */
1221 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1222 readl, (sts & DMA_GSTS_RTPS), sts);
1224 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1227 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1229 u32 val;
1230 unsigned long flag;
1232 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1233 return;
1235 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1236 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1238 /* Make sure hardware completes it */
1239 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1240 readl, (!(val & DMA_GSTS_WBFS)), val);
1242 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1245 /* return value determines if we need a write buffer flush */
1246 static void __iommu_flush_context(struct intel_iommu *iommu,
1247 u16 did, u16 source_id, u8 function_mask,
1248 u64 type)
1250 u64 val = 0;
1251 unsigned long flag;
1253 switch (type) {
1254 case DMA_CCMD_GLOBAL_INVL:
1255 val = DMA_CCMD_GLOBAL_INVL;
1256 break;
1257 case DMA_CCMD_DOMAIN_INVL:
1258 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1259 break;
1260 case DMA_CCMD_DEVICE_INVL:
1261 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1262 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1263 break;
1264 default:
1265 BUG();
1267 val |= DMA_CCMD_ICC;
1269 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1270 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1272 /* Make sure hardware completes it */
1273 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1274 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1276 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1279 /* return value determines if we need a write buffer flush */
1280 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1281 u64 addr, unsigned int size_order, u64 type)
1283 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1284 u64 val = 0, val_iva = 0;
1285 unsigned long flag;
1287 switch (type) {
1288 case DMA_TLB_GLOBAL_FLUSH:
1289 /* global flush doesn't need to set IVA_REG */
1290 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1291 break;
1292 case DMA_TLB_DSI_FLUSH:
1293 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1294 break;
1295 case DMA_TLB_PSI_FLUSH:
1296 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1297 /* IH bit is passed in as part of address */
1298 val_iva = size_order | addr;
1299 break;
1300 default:
1301 BUG();
1303 /* Note: set drain read/write */
1304 #if 0
1306 * This is probably just to be extra safe; it looks like we can
1307 * ignore it without any impact.
1309 if (cap_read_drain(iommu->cap))
1310 val |= DMA_TLB_READ_DRAIN;
1311 #endif
1312 if (cap_write_drain(iommu->cap))
1313 val |= DMA_TLB_WRITE_DRAIN;
1315 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1316 /* Note: Only uses first TLB reg currently */
1317 if (val_iva)
1318 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1319 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1321 /* Make sure hardware completes it */
1322 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1323 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1325 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1327 /* check IOTLB invalidation granularity */
1328 if (DMA_TLB_IAIG(val) == 0)
1329 pr_err("Flush IOTLB failed\n");
1330 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1331 pr_debug("TLB flush request %Lx, actual %Lx\n",
1332 (unsigned long long)DMA_TLB_IIRG(type),
1333 (unsigned long long)DMA_TLB_IAIG(val));
1336 static struct device_domain_info *
1337 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1338 u8 bus, u8 devfn)
1340 struct device_domain_info *info;
1342 assert_spin_locked(&device_domain_lock);
1344 if (!iommu->qi)
1345 return NULL;
1347 list_for_each_entry(info, &domain->devices, link)
1348 if (info->iommu == iommu && info->bus == bus &&
1349 info->devfn == devfn) {
1350 if (info->ats_supported && info->dev)
1351 return info;
1352 break;
1355 return NULL;
1358 static void domain_update_iotlb(struct dmar_domain *domain)
1360 struct device_domain_info *info;
1361 bool has_iotlb_device = false;
1363 assert_spin_locked(&device_domain_lock);
1365 list_for_each_entry(info, &domain->devices, link) {
1366 struct pci_dev *pdev;
1368 if (!info->dev || !dev_is_pci(info->dev))
1369 continue;
1371 pdev = to_pci_dev(info->dev);
1372 if (pdev->ats_enabled) {
1373 has_iotlb_device = true;
1374 break;
1378 domain->has_iotlb_device = has_iotlb_device;
1381 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1383 struct pci_dev *pdev;
1385 assert_spin_locked(&device_domain_lock);
1387 if (!info || !dev_is_pci(info->dev))
1388 return;
1390 pdev = to_pci_dev(info->dev);
1391 /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1392 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1393 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1394 * reserved, which should be set to 0.
1396 if (!ecap_dit(info->iommu->ecap))
1397 info->pfsid = 0;
1398 else {
1399 struct pci_dev *pf_pdev;
1401 /* pdev will be returned if device is not a vf */
1402 pf_pdev = pci_physfn(pdev);
1403 info->pfsid = pci_dev_id(pf_pdev);
1406 #ifdef CONFIG_INTEL_IOMMU_SVM
1407 /* The PCIe spec, in its wisdom, declares that the behaviour of
1408 the device if you enable PASID support after ATS support is
1409 undefined. So always enable PASID support on devices which
1410 have it, even if we can't yet know if we're ever going to
1411 use it. */
1412 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1413 info->pasid_enabled = 1;
1415 if (info->pri_supported &&
1416 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1417 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1418 info->pri_enabled = 1;
1419 #endif
1420 if (!pdev->untrusted && info->ats_supported &&
1421 pci_ats_page_aligned(pdev) &&
1422 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1423 info->ats_enabled = 1;
1424 domain_update_iotlb(info->domain);
1425 info->ats_qdep = pci_ats_queue_depth(pdev);
1429 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1431 struct pci_dev *pdev;
1433 assert_spin_locked(&device_domain_lock);
1435 if (!dev_is_pci(info->dev))
1436 return;
1438 pdev = to_pci_dev(info->dev);
1440 if (info->ats_enabled) {
1441 pci_disable_ats(pdev);
1442 info->ats_enabled = 0;
1443 domain_update_iotlb(info->domain);
1445 #ifdef CONFIG_INTEL_IOMMU_SVM
1446 if (info->pri_enabled) {
1447 pci_disable_pri(pdev);
1448 info->pri_enabled = 0;
1450 if (info->pasid_enabled) {
1451 pci_disable_pasid(pdev);
1452 info->pasid_enabled = 0;
1454 #endif
1457 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1458 u64 addr, unsigned mask)
1460 u16 sid, qdep;
1461 unsigned long flags;
1462 struct device_domain_info *info;
1464 if (!domain->has_iotlb_device)
1465 return;
1467 spin_lock_irqsave(&device_domain_lock, flags);
1468 list_for_each_entry(info, &domain->devices, link) {
1469 if (!info->ats_enabled)
1470 continue;
1472 sid = info->bus << 8 | info->devfn;
1473 qdep = info->ats_qdep;
1474 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1475 qdep, addr, mask);
1477 spin_unlock_irqrestore(&device_domain_lock, flags);
1480 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1481 struct dmar_domain *domain,
1482 unsigned long pfn, unsigned int pages,
1483 int ih, int map)
1485 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1486 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1487 u16 did = domain->iommu_did[iommu->seq_id];
1489 BUG_ON(pages == 0);
1491 if (ih)
1492 ih = 1 << 6;
1494 * Fall back to domain-selective flush if there is no PSI support or the
1495 * size is too big.
1496 * PSI requires the page size to be 2^x, and the base address to be
1497 * naturally aligned to the size
1499 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1500 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1501 DMA_TLB_DSI_FLUSH);
1502 else
1503 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1504 DMA_TLB_PSI_FLUSH);
1507 * In caching mode, changes of pages from non-present to present require
1508 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1510 if (!cap_caching_mode(iommu->cap) || !map)
1511 iommu_flush_dev_iotlb(domain, addr, mask);
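/*
 * Worked example: flushing 3 pages at pfn 0x1000 rounds up to a power of
 * two, so mask = ilog2(4) = 2 and the PSI covers the naturally aligned
 * 4-page window at address 0x1000000; if mask exceeded
 * cap_max_amask_val() we would have fallen back to the DSI flush above.
 */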
1514 /* Notification for newly created mappings */
1515 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1516 struct dmar_domain *domain,
1517 unsigned long pfn, unsigned int pages)
1519 /* It's a non-present to present mapping. Only flush if caching mode */
1520 if (cap_caching_mode(iommu->cap))
1521 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1522 else
1523 iommu_flush_write_buffer(iommu);
1526 static void iommu_flush_iova(struct iova_domain *iovad)
1528 struct dmar_domain *domain;
1529 int idx;
1531 domain = container_of(iovad, struct dmar_domain, iovad);
1533 for_each_domain_iommu(idx, domain) {
1534 struct intel_iommu *iommu = g_iommus[idx];
1535 u16 did = domain->iommu_did[iommu->seq_id];
1537 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1539 if (!cap_caching_mode(iommu->cap))
1540 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1541 0, MAX_AGAW_PFN_WIDTH);
1545 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1547 u32 pmen;
1548 unsigned long flags;
1550 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1551 return;
1553 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1554 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1555 pmen &= ~DMA_PMEN_EPM;
1556 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1558 /* wait for the protected region status bit to clear */
1559 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1560 readl, !(pmen & DMA_PMEN_PRS), pmen);
1562 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1565 static void iommu_enable_translation(struct intel_iommu *iommu)
1567 u32 sts;
1568 unsigned long flags;
1570 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1571 iommu->gcmd |= DMA_GCMD_TE;
1572 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1574 /* Make sure hardware completes it */
1575 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1576 readl, (sts & DMA_GSTS_TES), sts);
1578 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1581 static void iommu_disable_translation(struct intel_iommu *iommu)
1583 u32 sts;
1584 unsigned long flag;
1586 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1587 iommu->gcmd &= ~DMA_GCMD_TE;
1588 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1590 /* Make sure hardware completes it */
1591 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1592 readl, (!(sts & DMA_GSTS_TES)), sts);
1594 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1597 static int iommu_init_domains(struct intel_iommu *iommu)
1599 u32 ndomains, nlongs;
1600 size_t size;
1602 ndomains = cap_ndoms(iommu->cap);
1603 pr_debug("%s: Number of Domains supported <%d>\n",
1604 iommu->name, ndomains);
1605 nlongs = BITS_TO_LONGS(ndomains);
1607 spin_lock_init(&iommu->lock);
1609 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1610 if (!iommu->domain_ids) {
1611 pr_err("%s: Allocating domain id array failed\n",
1612 iommu->name);
1613 return -ENOMEM;
1616 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1617 iommu->domains = kzalloc(size, GFP_KERNEL);
1619 if (iommu->domains) {
1620 size = 256 * sizeof(struct dmar_domain *);
1621 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1624 if (!iommu->domains || !iommu->domains[0]) {
1625 pr_err("%s: Allocating domain array failed\n",
1626 iommu->name);
1627 kfree(iommu->domain_ids);
1628 kfree(iommu->domains);
1629 iommu->domain_ids = NULL;
1630 iommu->domains = NULL;
1631 return -ENOMEM;
1635 * If Caching mode is set, then invalid translations are tagged
1636 * with domain-id 0, hence we need to pre-allocate it. We also
1637 * use domain-id 0 as a marker for non-allocated domain-id, so
1638 * make sure it is not used for a real domain.
1640 set_bit(0, iommu->domain_ids);
1643 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1644 * entry for first-level or pass-through translation modes should
1645 * be programmed with a domain id different from those used for
1646 * second-level or nested translation. We reserve a domain id for
1647 * this purpose.
1649 if (sm_supported(iommu))
1650 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1652 return 0;
1655 static void disable_dmar_iommu(struct intel_iommu *iommu)
1657 struct device_domain_info *info, *tmp;
1658 unsigned long flags;
1660 if (!iommu->domains || !iommu->domain_ids)
1661 return;
1663 spin_lock_irqsave(&device_domain_lock, flags);
1664 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1665 if (info->iommu != iommu)
1666 continue;
1668 if (!info->dev || !info->domain)
1669 continue;
1671 __dmar_remove_one_dev_info(info);
1673 spin_unlock_irqrestore(&device_domain_lock, flags);
1675 if (iommu->gcmd & DMA_GCMD_TE)
1676 iommu_disable_translation(iommu);
1679 static void free_dmar_iommu(struct intel_iommu *iommu)
1681 if ((iommu->domains) && (iommu->domain_ids)) {
1682 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1683 int i;
1685 for (i = 0; i < elems; i++)
1686 kfree(iommu->domains[i]);
1687 kfree(iommu->domains);
1688 kfree(iommu->domain_ids);
1689 iommu->domains = NULL;
1690 iommu->domain_ids = NULL;
1693 g_iommus[iommu->seq_id] = NULL;
1695 /* free context mapping */
1696 free_context_table(iommu);
1698 #ifdef CONFIG_INTEL_IOMMU_SVM
1699 if (pasid_supported(iommu)) {
1700 if (ecap_prs(iommu->ecap))
1701 intel_svm_finish_prq(iommu);
1703 #endif
1706 static struct dmar_domain *alloc_domain(int flags)
1708 struct dmar_domain *domain;
1710 domain = alloc_domain_mem();
1711 if (!domain)
1712 return NULL;
1714 memset(domain, 0, sizeof(*domain));
1715 domain->nid = NUMA_NO_NODE;
1716 domain->flags = flags;
1717 domain->has_iotlb_device = false;
1718 INIT_LIST_HEAD(&domain->devices);
1720 return domain;
1723 /* Must be called with iommu->lock */
1724 static int domain_attach_iommu(struct dmar_domain *domain,
1725 struct intel_iommu *iommu)
1727 unsigned long ndomains;
1728 int num;
1730 assert_spin_locked(&device_domain_lock);
1731 assert_spin_locked(&iommu->lock);
1733 domain->iommu_refcnt[iommu->seq_id] += 1;
1734 domain->iommu_count += 1;
1735 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1736 ndomains = cap_ndoms(iommu->cap);
1737 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1739 if (num >= ndomains) {
1740 pr_err("%s: No free domain ids\n", iommu->name);
1741 domain->iommu_refcnt[iommu->seq_id] -= 1;
1742 domain->iommu_count -= 1;
1743 return -ENOSPC;
1746 set_bit(num, iommu->domain_ids);
1747 set_iommu_domain(iommu, num, domain);
1749 domain->iommu_did[iommu->seq_id] = num;
1750 domain->nid = iommu->node;
1752 domain_update_iommu_cap(domain);
1755 return 0;
1758 static int domain_detach_iommu(struct dmar_domain *domain,
1759 struct intel_iommu *iommu)
1761 int num, count;
1763 assert_spin_locked(&device_domain_lock);
1764 assert_spin_locked(&iommu->lock);
1766 domain->iommu_refcnt[iommu->seq_id] -= 1;
1767 count = --domain->iommu_count;
1768 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1769 num = domain->iommu_did[iommu->seq_id];
1770 clear_bit(num, iommu->domain_ids);
1771 set_iommu_domain(iommu, num, NULL);
1773 domain_update_iommu_cap(domain);
1774 domain->iommu_did[iommu->seq_id] = 0;
1777 return count;
1780 static struct iova_domain reserved_iova_list;
1781 static struct lock_class_key reserved_rbtree_key;
1783 static int dmar_init_reserved_ranges(void)
1785 struct pci_dev *pdev = NULL;
1786 struct iova *iova;
1787 int i;
1789 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1791 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1792 &reserved_rbtree_key);
1794 /* IOAPIC ranges shouldn't be accessed by DMA */
1795 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1796 IOVA_PFN(IOAPIC_RANGE_END));
1797 if (!iova) {
1798 pr_err("Reserve IOAPIC range failed\n");
1799 return -ENODEV;
1802 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1803 for_each_pci_dev(pdev) {
1804 struct resource *r;
1806 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1807 r = &pdev->resource[i];
1808 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1809 continue;
1810 iova = reserve_iova(&reserved_iova_list,
1811 IOVA_PFN(r->start),
1812 IOVA_PFN(r->end));
1813 if (!iova) {
1814 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1815 return -ENODEV;
1819 return 0;
1822 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1824 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1827 static inline int guestwidth_to_adjustwidth(int gaw)
1829 int agaw;
1830 int r = (gaw - 12) % 9;
1832 if (r == 0)
1833 agaw = gaw;
1834 else
1835 agaw = gaw + 9 - r;
1836 if (agaw > 64)
1837 agaw = 64;
1838 return agaw;
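/*
 * Worked example: the adjusted width rounds the guest width up to the
 * next value of the form 12 + 9*n, so gaw 39 and 48 map to themselves
 * while gaw 40 becomes 48; anything above 64 is clamped to 64.
 */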
1841 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1842 int guest_width)
1844 int adjust_width, agaw;
1845 unsigned long sagaw;
1846 int err;
1848 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1850 err = init_iova_flush_queue(&domain->iovad,
1851 iommu_flush_iova, iova_entry_free);
1852 if (err)
1853 return err;
1855 domain_reserve_special_ranges(domain);
1857 /* calculate AGAW */
1858 if (guest_width > cap_mgaw(iommu->cap))
1859 guest_width = cap_mgaw(iommu->cap);
1860 domain->gaw = guest_width;
1861 adjust_width = guestwidth_to_adjustwidth(guest_width);
1862 agaw = width_to_agaw(adjust_width);
1863 sagaw = cap_sagaw(iommu->cap);
1864 if (!test_bit(agaw, &sagaw)) {
1865 /* hardware doesn't support it, choose a bigger one */
1866 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1867 agaw = find_next_bit(&sagaw, 5, agaw);
1868 if (agaw >= 5)
1869 return -ENODEV;
1871 domain->agaw = agaw;
1873 if (ecap_coherent(iommu->ecap))
1874 domain->iommu_coherency = 1;
1875 else
1876 domain->iommu_coherency = 0;
1878 if (ecap_sc_support(iommu->ecap))
1879 domain->iommu_snooping = 1;
1880 else
1881 domain->iommu_snooping = 0;
1883 if (intel_iommu_superpage)
1884 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1885 else
1886 domain->iommu_superpage = 0;
1888 domain->nid = iommu->node;
1890 /* always allocate the top pgd */
1891 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1892 if (!domain->pgd)
1893 return -ENOMEM;
1894 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1895 return 0;
1898 static void domain_exit(struct dmar_domain *domain)
1901 /* Remove associated devices and clear attached or cached domains */
1902 domain_remove_dev_info(domain);
1904 /* destroy iovas */
1905 put_iova_domain(&domain->iovad);
1907 if (domain->pgd) {
1908 struct page *freelist;
1910 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1911 dma_free_pagelist(freelist);
1914 free_domain_mem(domain);
1918 * Get the PASID directory size for scalable mode context entry.
1919 * Value of X in the PDTS field of a scalable mode context entry
1920 * indicates PASID directory with 2^(X + 7) entries.
1922 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1924 int pds, max_pde;
1926 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1927 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1928 if (pds < 7)
1929 return 0;
1931 return pds - 7;
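/*
 * Worked example (assuming PASID_PDE_SHIFT is 6 and max_pasid holds the
 * number of supported PASIDs): a table supporting 2^20 PASIDs has
 * max_pde = 2^14, find_first_bit() yields 14 and the function returns 7,
 * i.e. a PDTS coding for a 2^(7+7) = 16384-entry PASID directory.
 */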
1935 * Set the RID_PASID field of a scalable mode context entry. The
1936 * IOMMU hardware will use the PASID value set in this field for
1937 * DMA translations of DMA requests without PASID.
1939 static inline void
1940 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1942 context->hi |= pasid & ((1 << 20) - 1);
1943 context->hi |= (1 << 20);
1947 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1948 * entry.
1950 static inline void context_set_sm_dte(struct context_entry *context)
1952 context->lo |= (1 << 2);
1956 * Set the PRE(Page Request Enable) field of a scalable mode context
1957 * entry.
1959 static inline void context_set_sm_pre(struct context_entry *context)
1961 context->lo |= (1 << 4);
1964 /* Convert value to context PASID directory size field coding. */
1965 #define context_pdts(pds) (((pds) & 0x7) << 9)
1967 static int domain_context_mapping_one(struct dmar_domain *domain,
1968 struct intel_iommu *iommu,
1969 struct pasid_table *table,
1970 u8 bus, u8 devfn)
1972 u16 did = domain->iommu_did[iommu->seq_id];
1973 int translation = CONTEXT_TT_MULTI_LEVEL;
1974 struct device_domain_info *info = NULL;
1975 struct context_entry *context;
1976 unsigned long flags;
1977 int ret;
1979 WARN_ON(did == 0);
1981 if (hw_pass_through && domain_type_is_si(domain))
1982 translation = CONTEXT_TT_PASS_THROUGH;
1984 pr_debug("Set context mapping for %02x:%02x.%d\n",
1985 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1987 BUG_ON(!domain->pgd);
1989 spin_lock_irqsave(&device_domain_lock, flags);
1990 spin_lock(&iommu->lock);
1992 ret = -ENOMEM;
1993 context = iommu_context_addr(iommu, bus, devfn, 1);
1994 if (!context)
1995 goto out_unlock;
1997 ret = 0;
1998 if (context_present(context))
1999 goto out_unlock;
2002 * For kdump cases, old valid entries may be cached due to the
2003 * in-flight DMA and copied pgtable, but there is no unmapping
2004 * behaviour for them, thus we need an explicit cache flush for
2005 * the newly-mapped device. For kdump, at this point, the device
2006 * is supposed to finish reset at its driver probe stage, so no
2007 * in-flight DMA will exist, and we don't need to worry about it
2008 * hereafter.
2010 if (context_copied(context)) {
2011 u16 did_old = context_domain_id(context);
2013 if (did_old < cap_ndoms(iommu->cap)) {
2014 iommu->flush.flush_context(iommu, did_old,
2015 (((u16)bus) << 8) | devfn,
2016 DMA_CCMD_MASK_NOBIT,
2017 DMA_CCMD_DEVICE_INVL);
2018 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2019 DMA_TLB_DSI_FLUSH);
2023 context_clear_entry(context);
2025 if (sm_supported(iommu)) {
2026 unsigned long pds;
2028 WARN_ON(!table);
2030 /* Setup the PASID DIR pointer: */
2031 pds = context_get_sm_pds(table);
2032 context->lo = (u64)virt_to_phys(table->table) |
2033 context_pdts(pds);
2035 /* Setup the RID_PASID field: */
2036 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2039 * Setup the Device-TLB enable bit and Page request
2040 * Enable bit:
2042 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2043 if (info && info->ats_supported)
2044 context_set_sm_dte(context);
2045 if (info && info->pri_supported)
2046 context_set_sm_pre(context);
2047 } else {
2048 struct dma_pte *pgd = domain->pgd;
2049 int agaw;
2051 context_set_domain_id(context, did);
2053 if (translation != CONTEXT_TT_PASS_THROUGH) {
2055 * Skip top levels of page tables for an iommu which has
2056 * a smaller agaw than the default. Unnecessary for PT mode.
2058 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2059 ret = -ENOMEM;
2060 pgd = phys_to_virt(dma_pte_addr(pgd));
2061 if (!dma_pte_present(pgd))
2062 goto out_unlock;
2065 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2066 if (info && info->ats_supported)
2067 translation = CONTEXT_TT_DEV_IOTLB;
2068 else
2069 translation = CONTEXT_TT_MULTI_LEVEL;
2071 context_set_address_root(context, virt_to_phys(pgd));
2072 context_set_address_width(context, agaw);
2073 } else {
2075 * In pass through mode, AW must be programmed to
2076 * indicate the largest AGAW value supported by
2077 * hardware. And ASR is ignored by hardware.
2079 context_set_address_width(context, iommu->msagaw);
2082 context_set_translation_type(context, translation);
2085 context_set_fault_enable(context);
2086 context_set_present(context);
2087 domain_flush_cache(domain, context, sizeof(*context));
2090 * It's a non-present to present mapping. If hardware doesn't cache
2091 * non-present entries we only need to flush the write-buffer. If it
2092 * _does_ cache non-present entries, then it does so in the special
2093 * domain #0, which we have to flush:
2095 if (cap_caching_mode(iommu->cap)) {
2096 iommu->flush.flush_context(iommu, 0,
2097 (((u16)bus) << 8) | devfn,
2098 DMA_CCMD_MASK_NOBIT,
2099 DMA_CCMD_DEVICE_INVL);
2100 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2101 } else {
2102 iommu_flush_write_buffer(iommu);
2104 iommu_enable_dev_iotlb(info);
2106 ret = 0;
2108 out_unlock:
2109 spin_unlock(&iommu->lock);
2110 spin_unlock_irqrestore(&device_domain_lock, flags);
2112 return ret;
2115 struct domain_context_mapping_data {
2116 struct dmar_domain *domain;
2117 struct intel_iommu *iommu;
2118 struct pasid_table *table;
2121 static int domain_context_mapping_cb(struct pci_dev *pdev,
2122 u16 alias, void *opaque)
2124 struct domain_context_mapping_data *data = opaque;
2126 return domain_context_mapping_one(data->domain, data->iommu,
2127 data->table, PCI_BUS_NUM(alias),
2128 alias & 0xff);
2131 static int
2132 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2134 struct domain_context_mapping_data data;
2135 struct pasid_table *table;
2136 struct intel_iommu *iommu;
2137 u8 bus, devfn;
2139 iommu = device_to_iommu(dev, &bus, &devfn);
2140 if (!iommu)
2141 return -ENODEV;
2143 table = intel_pasid_get_table(dev);
2145 if (!dev_is_pci(dev))
2146 return domain_context_mapping_one(domain, iommu, table,
2147 bus, devfn);
2149 data.domain = domain;
2150 data.iommu = iommu;
2151 data.table = table;
2153 return pci_for_each_dma_alias(to_pci_dev(dev),
2154 &domain_context_mapping_cb, &data);
2157 static int domain_context_mapped_cb(struct pci_dev *pdev,
2158 u16 alias, void *opaque)
2160 struct intel_iommu *iommu = opaque;
2162 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2165 static int domain_context_mapped(struct device *dev)
2167 struct intel_iommu *iommu;
2168 u8 bus, devfn;
2170 iommu = device_to_iommu(dev, &bus, &devfn);
2171 if (!iommu)
2172 return -ENODEV;
2174 if (!dev_is_pci(dev))
2175 return device_context_mapped(iommu, bus, devfn);
2177 return !pci_for_each_dma_alias(to_pci_dev(dev),
2178 domain_context_mapped_cb, iommu);
2181 /* Returns a number of VTD pages, but aligned to MM page size */
2182 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2183 size_t size)
2185 host_addr &= ~PAGE_MASK;
2186 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
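/*
 * Worked example (illustrative only), assuming 4KiB MM pages and 4KiB
 * VT-d pages:
 *
 *	aligned_nrpages(0x1234, 0x1800)
 *		= PAGE_ALIGN((0x1234 & ~PAGE_MASK) + 0x1800) >> VTD_PAGE_SHIFT
 *		= PAGE_ALIGN(0x234 + 0x1800) >> 12
 *		= 0x2000 >> 12
 *		= 2 VT-d pages
 */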
2189 /* Return largest possible superpage level for a given mapping */
2190 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2191 unsigned long iov_pfn,
2192 unsigned long phy_pfn,
2193 unsigned long pages)
2195 int support, level = 1;
2196 unsigned long pfnmerge;
2198 support = domain->iommu_superpage;
2200 /* To use a large page, the virtual *and* physical addresses
2201 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2202 of them will mean we have to use smaller pages. So just
2203 merge them and check both at once. */
2204 pfnmerge = iov_pfn | phy_pfn;
2206 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2207 pages >>= VTD_STRIDE_SHIFT;
2208 if (!pages)
2209 break;
2210 pfnmerge >>= VTD_STRIDE_SHIFT;
2211 level++;
2212 support--;
2214 return level;
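/*
 * Worked example (illustrative only), with VTD_STRIDE_SHIFT == 9 and 4KiB
 * base pages: for iov_pfn 0x200, phy_pfn 0x4400 and pages == 1024, the
 * merged PFN 0x4600 has its low 9 bits clear and 1024 >> 9 == 2 pages
 * remain, so level 2 (a 2MiB superpage) is usable provided
 * domain->iommu_superpage allows it; after the next shift the merged
 * value 0x23 is no longer aligned, so level 3 (1GiB) is not.
 */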
2217 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2218 struct scatterlist *sg, unsigned long phys_pfn,
2219 unsigned long nr_pages, int prot)
2221 struct dma_pte *first_pte = NULL, *pte = NULL;
2222 phys_addr_t uninitialized_var(pteval);
2223 unsigned long sg_res = 0;
2224 unsigned int largepage_lvl = 0;
2225 unsigned long lvl_pages = 0;
2227 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2229 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2230 return -EINVAL;
2232 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2234 if (!sg) {
2235 sg_res = nr_pages;
2236 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2239 while (nr_pages > 0) {
2240 uint64_t tmp;
2242 if (!sg_res) {
2243 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2245 sg_res = aligned_nrpages(sg->offset, sg->length);
2246 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2247 sg->dma_length = sg->length;
2248 pteval = (sg_phys(sg) - pgoff) | prot;
2249 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2252 if (!pte) {
2253 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2255 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2256 if (!pte)
2257 return -ENOMEM;
2258 /* It is a large page */
2259 if (largepage_lvl > 1) {
2260 unsigned long nr_superpages, end_pfn;
2262 pteval |= DMA_PTE_LARGE_PAGE;
2263 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2265 nr_superpages = sg_res / lvl_pages;
2266 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2269 * Ensure that old small page tables are
2270 * removed to make room for superpage(s).
2271 * We're adding new large pages, so make sure
2272 * we don't remove their parent tables.
2274 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2275 largepage_lvl + 1);
2276 } else {
2277 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2281 /* We don't need a lock here; nobody else
2282 * touches the iova range
2284 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2285 if (tmp) {
2286 static int dumps = 5;
2287 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2288 iov_pfn, tmp, (unsigned long long)pteval);
2289 if (dumps) {
2290 dumps--;
2291 debug_dma_dump_mappings(NULL);
2293 WARN_ON(1);
2296 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2298 BUG_ON(nr_pages < lvl_pages);
2299 BUG_ON(sg_res < lvl_pages);
2301 nr_pages -= lvl_pages;
2302 iov_pfn += lvl_pages;
2303 phys_pfn += lvl_pages;
2304 pteval += lvl_pages * VTD_PAGE_SIZE;
2305 sg_res -= lvl_pages;
2307 /* If the next PTE would be the first in a new page, then we
2308 need to flush the cache on the entries we've just written.
2309 And then we'll need to recalculate 'pte', so clear it and
2310 let it get set again in the if (!pte) block above.
2312 If we're done (!nr_pages) we need to flush the cache too.
2314 Also if we've been setting superpages, we may need to
2315 recalculate 'pte' and switch back to smaller pages for the
2316 end of the mapping, if the trailing size is not enough to
2317 use another superpage (i.e. sg_res < lvl_pages). */
2318 pte++;
2319 if (!nr_pages || first_pte_in_page(pte) ||
2320 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2321 domain_flush_cache(domain, first_pte,
2322 (void *)pte - (void *)first_pte);
2323 pte = NULL;
2326 if (!sg_res && nr_pages)
2327 sg = sg_next(sg);
2329 return 0;
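/*
 * Illustrative sketch, not part of the driver: a leaf PTE installed by
 * __domain_mapping() is just the page-aligned physical address with the
 * permission bits OR'd into the low bits, e.g.
 *
 *	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) |
 *		 DMA_PTE_READ | DMA_PTE_WRITE;
 *
 * plus DMA_PTE_LARGE_PAGE when a 2MiB/1GiB superpage entry is written.
 */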
2332 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2333 struct scatterlist *sg, unsigned long phys_pfn,
2334 unsigned long nr_pages, int prot)
2336 int iommu_id, ret;
2337 struct intel_iommu *iommu;
2339 /* Do the real mapping first */
2340 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2341 if (ret)
2342 return ret;
2344 for_each_domain_iommu(iommu_id, domain) {
2345 iommu = g_iommus[iommu_id];
2346 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2349 return 0;
2352 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2353 struct scatterlist *sg, unsigned long nr_pages,
2354 int prot)
2356 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2359 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2360 unsigned long phys_pfn, unsigned long nr_pages,
2361 int prot)
2363 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2366 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2368 unsigned long flags;
2369 struct context_entry *context;
2370 u16 did_old;
2372 if (!iommu)
2373 return;
2375 spin_lock_irqsave(&iommu->lock, flags);
2376 context = iommu_context_addr(iommu, bus, devfn, 0);
2377 if (!context) {
2378 spin_unlock_irqrestore(&iommu->lock, flags);
2379 return;
2381 did_old = context_domain_id(context);
2382 context_clear_entry(context);
2383 __iommu_flush_cache(iommu, context, sizeof(*context));
2384 spin_unlock_irqrestore(&iommu->lock, flags);
2385 iommu->flush.flush_context(iommu,
2386 did_old,
2387 (((u16)bus) << 8) | devfn,
2388 DMA_CCMD_MASK_NOBIT,
2389 DMA_CCMD_DEVICE_INVL);
2390 iommu->flush.flush_iotlb(iommu,
2391 did_old,
2394 DMA_TLB_DSI_FLUSH);
2397 static inline void unlink_domain_info(struct device_domain_info *info)
2399 assert_spin_locked(&device_domain_lock);
2400 list_del(&info->link);
2401 list_del(&info->global);
2402 if (info->dev)
2403 info->dev->archdata.iommu = NULL;
2406 static void domain_remove_dev_info(struct dmar_domain *domain)
2408 struct device_domain_info *info, *tmp;
2409 unsigned long flags;
2411 spin_lock_irqsave(&device_domain_lock, flags);
2412 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2413 __dmar_remove_one_dev_info(info);
2414 spin_unlock_irqrestore(&device_domain_lock, flags);
2417 static struct dmar_domain *find_domain(struct device *dev)
2419 struct device_domain_info *info;
2421 if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO ||
2422 dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO))
2423 return NULL;
2425 if (dev_is_pci(dev))
2426 dev = &pci_real_dma_dev(to_pci_dev(dev))->dev;
2428 /* No lock here, assumes no domain exit in normal case */
2429 info = dev->archdata.iommu;
2430 if (likely(info))
2431 return info->domain;
2433 return NULL;
2436 static struct dmar_domain *deferred_attach_domain(struct device *dev)
2438 if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2439 struct iommu_domain *domain;
2441 dev->archdata.iommu = NULL;
2442 domain = iommu_get_domain_for_dev(dev);
2443 if (domain)
2444 intel_iommu_attach_device(domain, dev);
2447 return find_domain(dev);
2450 static inline struct device_domain_info *
2451 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2453 struct device_domain_info *info;
2455 list_for_each_entry(info, &device_domain_list, global)
2456 if (info->iommu->segment == segment && info->bus == bus &&
2457 info->devfn == devfn)
2458 return info;
2460 return NULL;
2463 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2464 int bus, int devfn,
2465 struct device *dev,
2466 struct dmar_domain *domain)
2468 struct dmar_domain *found = NULL;
2469 struct device_domain_info *info;
2470 unsigned long flags;
2471 int ret;
2473 info = alloc_devinfo_mem();
2474 if (!info)
2475 return NULL;
2477 info->bus = bus;
2478 info->devfn = devfn;
2479 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2480 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2481 info->ats_qdep = 0;
2482 info->dev = dev;
2483 info->domain = domain;
2484 info->iommu = iommu;
2485 info->pasid_table = NULL;
2486 info->auxd_enabled = 0;
2487 INIT_LIST_HEAD(&info->auxiliary_domains);
2489 if (dev && dev_is_pci(dev)) {
2490 struct pci_dev *pdev = to_pci_dev(info->dev);
2492 if (!pdev->untrusted &&
2493 !pci_ats_disabled() &&
2494 ecap_dev_iotlb_support(iommu->ecap) &&
2495 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2496 dmar_find_matched_atsr_unit(pdev))
2497 info->ats_supported = 1;
2499 if (sm_supported(iommu)) {
2500 if (pasid_supported(iommu)) {
2501 int features = pci_pasid_features(pdev);
2502 if (features >= 0)
2503 info->pasid_supported = features | 1;
2506 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2507 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2508 info->pri_supported = 1;
2512 spin_lock_irqsave(&device_domain_lock, flags);
2513 if (dev)
2514 found = find_domain(dev);
2516 if (!found) {
2517 struct device_domain_info *info2;
2518 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2519 if (info2) {
2520 found = info2->domain;
2521 info2->dev = dev;
2525 if (found) {
2526 spin_unlock_irqrestore(&device_domain_lock, flags);
2527 free_devinfo_mem(info);
2528 /* Caller must free the original domain */
2529 return found;
2532 spin_lock(&iommu->lock);
2533 ret = domain_attach_iommu(domain, iommu);
2534 spin_unlock(&iommu->lock);
2536 if (ret) {
2537 spin_unlock_irqrestore(&device_domain_lock, flags);
2538 free_devinfo_mem(info);
2539 return NULL;
2542 list_add(&info->link, &domain->devices);
2543 list_add(&info->global, &device_domain_list);
2544 if (dev)
2545 dev->archdata.iommu = info;
2546 spin_unlock_irqrestore(&device_domain_lock, flags);
2548 /* PASID table is mandatory for a PCI device in scalable mode. */
2549 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2550 ret = intel_pasid_alloc_table(dev);
2551 if (ret) {
2552 dev_err(dev, "PASID table allocation failed\n");
2553 dmar_remove_one_dev_info(dev);
2554 return NULL;
2557 /* Setup the PASID entry for requests without PASID: */
2558 spin_lock(&iommu->lock);
2559 if (hw_pass_through && domain_type_is_si(domain))
2560 ret = intel_pasid_setup_pass_through(iommu, domain,
2561 dev, PASID_RID2PASID);
2562 else
2563 ret = intel_pasid_setup_second_level(iommu, domain,
2564 dev, PASID_RID2PASID);
2565 spin_unlock(&iommu->lock);
2566 if (ret) {
2567 dev_err(dev, "Setup RID2PASID failed\n");
2568 dmar_remove_one_dev_info(dev);
2569 return NULL;
2573 if (dev && domain_context_mapping(domain, dev)) {
2574 dev_err(dev, "Domain context map failed\n");
2575 dmar_remove_one_dev_info(dev);
2576 return NULL;
2579 return domain;
2582 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2584 *(u16 *)opaque = alias;
2585 return 0;
2588 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2590 struct device_domain_info *info;
2591 struct dmar_domain *domain = NULL;
2592 struct intel_iommu *iommu;
2593 u16 dma_alias;
2594 unsigned long flags;
2595 u8 bus, devfn;
2597 iommu = device_to_iommu(dev, &bus, &devfn);
2598 if (!iommu)
2599 return NULL;
2601 if (dev_is_pci(dev)) {
2602 struct pci_dev *pdev = to_pci_dev(dev);
2604 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2606 spin_lock_irqsave(&device_domain_lock, flags);
2607 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2608 PCI_BUS_NUM(dma_alias),
2609 dma_alias & 0xff);
2610 if (info) {
2611 iommu = info->iommu;
2612 domain = info->domain;
2614 spin_unlock_irqrestore(&device_domain_lock, flags);
2616 /* DMA alias already has a domain, use it */
2617 if (info)
2618 goto out;
2621 /* Allocate and initialize new domain for the device */
2622 domain = alloc_domain(0);
2623 if (!domain)
2624 return NULL;
2625 if (domain_init(domain, iommu, gaw)) {
2626 domain_exit(domain);
2627 return NULL;
2630 out:
2631 return domain;
2634 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2635 struct dmar_domain *domain)
2637 struct intel_iommu *iommu;
2638 struct dmar_domain *tmp;
2639 u16 req_id, dma_alias;
2640 u8 bus, devfn;
2642 iommu = device_to_iommu(dev, &bus, &devfn);
2643 if (!iommu)
2644 return NULL;
2646 req_id = ((u16)bus << 8) | devfn;
2648 if (dev_is_pci(dev)) {
2649 struct pci_dev *pdev = to_pci_dev(dev);
2651 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2653 /* register PCI DMA alias device */
2654 if (req_id != dma_alias) {
2655 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2656 dma_alias & 0xff, NULL, domain);
2658 if (!tmp || tmp != domain)
2659 return tmp;
2663 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2664 if (!tmp || tmp != domain)
2665 return tmp;
2667 return domain;
2670 static int iommu_domain_identity_map(struct dmar_domain *domain,
2671 unsigned long long start,
2672 unsigned long long end)
2674 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2675 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2677 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2678 dma_to_mm_pfn(last_vpfn))) {
2679 pr_err("Reserving iova failed\n");
2680 return -ENOMEM;
2683 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2685 * RMRR range might have overlap with physical memory range,
2686 * clear it first
2688 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2690 return __domain_mapping(domain, first_vpfn, NULL,
2691 first_vpfn, last_vpfn - first_vpfn + 1,
2692 DMA_PTE_READ|DMA_PTE_WRITE);
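/*
 * Worked example (illustrative only): identity-mapping the RMRR range
 * 0x7f000000 - 0x7f0fffff gives first_vpfn = 0x7f000 and last_vpfn =
 * 0x7f0ff, so 0x100 (256) pages are mapped with IOVA equal to the
 * physical address.
 */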
2695 static int domain_prepare_identity_map(struct device *dev,
2696 struct dmar_domain *domain,
2697 unsigned long long start,
2698 unsigned long long end)
2700 /* For _hardware_ passthrough, don't bother. But for software
2701 passthrough, we do it anyway -- it may indicate a memory
2702 range which is reserved in E820, and so didn't get set
2703 up to start with in si_domain */
2704 if (domain == si_domain && hw_pass_through) {
2705 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2706 start, end);
2707 return 0;
2710 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2712 if (end < start) {
2713 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2714 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2715 dmi_get_system_info(DMI_BIOS_VENDOR),
2716 dmi_get_system_info(DMI_BIOS_VERSION),
2717 dmi_get_system_info(DMI_PRODUCT_VERSION));
2718 return -EIO;
2721 if (end >> agaw_to_width(domain->agaw)) {
2722 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2723 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2724 agaw_to_width(domain->agaw),
2725 dmi_get_system_info(DMI_BIOS_VENDOR),
2726 dmi_get_system_info(DMI_BIOS_VERSION),
2727 dmi_get_system_info(DMI_PRODUCT_VERSION));
2728 return -EIO;
2731 return iommu_domain_identity_map(domain, start, end);
2734 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2736 static int __init si_domain_init(int hw)
2738 struct dmar_rmrr_unit *rmrr;
2739 struct device *dev;
2740 int i, nid, ret;
2742 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2743 if (!si_domain)
2744 return -EFAULT;
2746 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2747 domain_exit(si_domain);
2748 return -EFAULT;
2751 if (hw)
2752 return 0;
2754 for_each_online_node(nid) {
2755 unsigned long start_pfn, end_pfn;
2756 int i;
2758 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2759 ret = iommu_domain_identity_map(si_domain,
2760 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2761 if (ret)
2762 return ret;
2767 * Normally we use DMA domains for devices which have RMRRs. But we
2768 * lose this requirement for graphics and USB devices. Identity map
2769 * the RMRRs for graphics and USB devices so that they can use the
2770 * si_domain.
2772 for_each_rmrr_units(rmrr) {
2773 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2774 i, dev) {
2775 unsigned long long start = rmrr->base_address;
2776 unsigned long long end = rmrr->end_address;
2778 if (device_is_rmrr_locked(dev))
2779 continue;
2781 if (WARN_ON(end < start ||
2782 end >> agaw_to_width(si_domain->agaw)))
2783 continue;
2785 ret = iommu_domain_identity_map(si_domain, start, end);
2786 if (ret)
2787 return ret;
2791 return 0;
2794 static int identity_mapping(struct device *dev)
2796 struct device_domain_info *info;
2798 info = dev->archdata.iommu;
2799 if (info && info != DUMMY_DEVICE_DOMAIN_INFO && info != DEFER_DEVICE_DOMAIN_INFO)
2800 return (info->domain == si_domain);
2802 return 0;
2805 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2807 struct dmar_domain *ndomain;
2808 struct intel_iommu *iommu;
2809 u8 bus, devfn;
2811 iommu = device_to_iommu(dev, &bus, &devfn);
2812 if (!iommu)
2813 return -ENODEV;
2815 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2816 if (ndomain != domain)
2817 return -EBUSY;
2819 return 0;
2822 static bool device_has_rmrr(struct device *dev)
2824 struct dmar_rmrr_unit *rmrr;
2825 struct device *tmp;
2826 int i;
2828 rcu_read_lock();
2829 for_each_rmrr_units(rmrr) {
2831 * Return TRUE if this RMRR contains the device that
2832 * is passed in.
2834 for_each_active_dev_scope(rmrr->devices,
2835 rmrr->devices_cnt, i, tmp)
2836 if (tmp == dev ||
2837 is_downstream_to_pci_bridge(dev, tmp)) {
2838 rcu_read_unlock();
2839 return true;
2842 rcu_read_unlock();
2843 return false;
2847 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2848 * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2849 * @dev: device handle
2851 * We assume that PCI USB devices with RMRRs have them largely
2852 * for historical reasons and that the RMRR space is not actively used post
2853 * boot. This exclusion may change if vendors begin to abuse it.
2855 * The same exception is made for graphics devices, with the requirement that
2856 * any use of the RMRR regions will be torn down before assigning the device
2857 * to a guest.
2859 * Return: true if the RMRR is relaxable, false otherwise
2861 static bool device_rmrr_is_relaxable(struct device *dev)
2863 struct pci_dev *pdev;
2865 if (!dev_is_pci(dev))
2866 return false;
2868 pdev = to_pci_dev(dev);
2869 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2870 return true;
2871 else
2872 return false;
2876 * There are a couple cases where we need to restrict the functionality of
2877 * devices associated with RMRRs. The first is when evaluating a device for
2878 * identity mapping because problems exist when devices are moved in and out
2879 * of domains and their respective RMRR information is lost. This means that
2880 * a device with associated RMRRs will never be in a "passthrough" domain.
2881 * The second is use of the device through the IOMMU API. This interface
2882 * expects to have full control of the IOVA space for the device. We cannot
2883 * satisfy both the requirement that RMRR access is maintained and have an
2884 * unencumbered IOVA space. We also have no ability to quiesce the device's
2885 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2886 * We therefore prevent devices associated with an RMRR from participating in
2887 * the IOMMU API, which eliminates them from device assignment.
2889 * In both cases, devices which have relaxable RMRRs are not concerned by this
2890 * restriction. See device_rmrr_is_relaxable comment.
2892 static bool device_is_rmrr_locked(struct device *dev)
2894 if (!device_has_rmrr(dev))
2895 return false;
2897 if (device_rmrr_is_relaxable(dev))
2898 return false;
2900 return true;
2904 * Return the required default domain type for a specific device.
2906 * @dev: the device in question
2909 * Returns:
2910 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2911 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2912 * - 0: both identity and dynamic domains work for this device
2914 static int device_def_domain_type(struct device *dev)
2916 if (dev_is_pci(dev)) {
2917 struct pci_dev *pdev = to_pci_dev(dev);
2919 if (device_is_rmrr_locked(dev))
2920 return IOMMU_DOMAIN_DMA;
2923 * Prevent any device marked as untrusted from getting
2924 * placed into the statically identity mapping domain.
2926 if (pdev->untrusted)
2927 return IOMMU_DOMAIN_DMA;
2929 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2930 return IOMMU_DOMAIN_IDENTITY;
2932 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2933 return IOMMU_DOMAIN_IDENTITY;
2936 * We want to start off with all devices in the 1:1 domain, and
2937 * take them out later if we find they can't access all of memory.
2939 * However, we can't do this for PCI devices behind bridges,
2940 * because all PCI devices behind the same bridge will end up
2941 * with the same source-id on their transactions.
2943 * Practically speaking, we can't change things around for these
2944 * devices at run-time, because we can't be sure there'll be no
2945 * DMA transactions in flight for any of their siblings.
2947 * So PCI devices (unless they're on the root bus) as well as
2948 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2949 * the 1:1 domain, just in _case_ one of their siblings turns out
2950 * not to be able to map all of memory.
2952 if (!pci_is_pcie(pdev)) {
2953 if (!pci_is_root_bus(pdev->bus))
2954 return IOMMU_DOMAIN_DMA;
2955 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2956 return IOMMU_DOMAIN_DMA;
2957 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2958 return IOMMU_DOMAIN_DMA;
2959 } else {
2960 if (device_has_rmrr(dev))
2961 return IOMMU_DOMAIN_DMA;
2964 return (iommu_identity_mapping & IDENTMAP_ALL) ?
2965 IOMMU_DOMAIN_IDENTITY : 0;
2968 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2971 * Start from a sane IOMMU hardware state.
2972 * If queued invalidation was already initialized by us
2973 * (for example, while enabling interrupt remapping) then
2974 * things are already rolling from a sane state.
2976 if (!iommu->qi) {
2978 * Clear any previous faults.
2980 dmar_fault(-1, iommu);
2982 * Disable queued invalidation if supported and already enabled
2983 * before OS handover.
2985 dmar_disable_qi(iommu);
2988 if (dmar_enable_qi(iommu)) {
2990 * Queued invalidation is not enabled, use register-based invalidation
2992 iommu->flush.flush_context = __iommu_flush_context;
2993 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2994 pr_info("%s: Using Register based invalidation\n",
2995 iommu->name);
2996 } else {
2997 iommu->flush.flush_context = qi_flush_context;
2998 iommu->flush.flush_iotlb = qi_flush_iotlb;
2999 pr_info("%s: Using Queued invalidation\n", iommu->name);
3003 static int copy_context_table(struct intel_iommu *iommu,
3004 struct root_entry *old_re,
3005 struct context_entry **tbl,
3006 int bus, bool ext)
3008 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3009 struct context_entry *new_ce = NULL, ce;
3010 struct context_entry *old_ce = NULL;
3011 struct root_entry re;
3012 phys_addr_t old_ce_phys;
3014 tbl_idx = ext ? bus * 2 : bus;
3015 memcpy(&re, old_re, sizeof(re));
3017 for (devfn = 0; devfn < 256; devfn++) {
3018 /* First calculate the correct index */
3019 idx = (ext ? devfn * 2 : devfn) % 256;
3021 if (idx == 0) {
3022 /* First save what we may have and clean up */
3023 if (new_ce) {
3024 tbl[tbl_idx] = new_ce;
3025 __iommu_flush_cache(iommu, new_ce,
3026 VTD_PAGE_SIZE);
3027 pos = 1;
3030 if (old_ce)
3031 memunmap(old_ce);
3033 ret = 0;
3034 if (devfn < 0x80)
3035 old_ce_phys = root_entry_lctp(&re);
3036 else
3037 old_ce_phys = root_entry_uctp(&re);
3039 if (!old_ce_phys) {
3040 if (ext && devfn == 0) {
3041 /* No LCTP, try UCTP */
3042 devfn = 0x7f;
3043 continue;
3044 } else {
3045 goto out;
3049 ret = -ENOMEM;
3050 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3051 MEMREMAP_WB);
3052 if (!old_ce)
3053 goto out;
3055 new_ce = alloc_pgtable_page(iommu->node);
3056 if (!new_ce)
3057 goto out_unmap;
3059 ret = 0;
3062 /* Now copy the context entry */
3063 memcpy(&ce, old_ce + idx, sizeof(ce));
3065 if (!__context_present(&ce))
3066 continue;
3068 did = context_domain_id(&ce);
3069 if (did >= 0 && did < cap_ndoms(iommu->cap))
3070 set_bit(did, iommu->domain_ids);
3073 * We need a marker for copied context entries. This
3074 * marker needs to work for the old format as well as
3075 * for extended context entries.
3077 * Bit 67 of the context entry is used. In the old
3078 * format this bit is available to software, in the
3079 * extended format it is the PGE bit, but PGE is ignored
3080 * by HW if PASIDs are disabled (and thus still
3081 * available).
3083 * So disable PASIDs first and then mark the entry
3084 * copied. This means that we don't copy PASID
3085 * translations from the old kernel, but this is fine as
3086 * faults there are not fatal.
3088 context_clear_pasid_enable(&ce);
3089 context_set_copied(&ce);
3091 new_ce[idx] = ce;
3094 tbl[tbl_idx + pos] = new_ce;
3096 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3098 out_unmap:
3099 memunmap(old_ce);
3101 out:
3102 return ret;
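/*
 * Illustrative sketch, not part of the driver: a context entry is 128 bits
 * wide, with ce->lo holding bits 0-63 and ce->hi holding bits 64-127, so
 * the "copied" marker in bit 67 described above lands in bit 3 of the
 * high word, roughly:
 *
 *	ce.hi |= 1ULL << (67 - 64);
 */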
3105 static int copy_translation_tables(struct intel_iommu *iommu)
3107 struct context_entry **ctxt_tbls;
3108 struct root_entry *old_rt;
3109 phys_addr_t old_rt_phys;
3110 int ctxt_table_entries;
3111 unsigned long flags;
3112 u64 rtaddr_reg;
3113 int bus, ret;
3114 bool new_ext, ext;
3116 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3117 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3118 new_ext = !!ecap_ecs(iommu->ecap);
3121 * The RTT bit can only be changed when translation is disabled,
3122 * but disabling translation would open a window for data
3123 * corruption. So bail out and don't copy anything if we would
3124 * have to change the bit.
3126 if (new_ext != ext)
3127 return -EINVAL;
3129 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3130 if (!old_rt_phys)
3131 return -EINVAL;
3133 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3134 if (!old_rt)
3135 return -ENOMEM;
3137 /* This is too big for the stack - allocate it from slab */
3138 ctxt_table_entries = ext ? 512 : 256;
3139 ret = -ENOMEM;
3140 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3141 if (!ctxt_tbls)
3142 goto out_unmap;
3144 for (bus = 0; bus < 256; bus++) {
3145 ret = copy_context_table(iommu, &old_rt[bus],
3146 ctxt_tbls, bus, ext);
3147 if (ret) {
3148 pr_err("%s: Failed to copy context table for bus %d\n",
3149 iommu->name, bus);
3150 continue;
3154 spin_lock_irqsave(&iommu->lock, flags);
3156 /* Context tables are copied, now write them to the root_entry table */
3157 for (bus = 0; bus < 256; bus++) {
3158 int idx = ext ? bus * 2 : bus;
3159 u64 val;
3161 if (ctxt_tbls[idx]) {
3162 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3163 iommu->root_entry[bus].lo = val;
3166 if (!ext || !ctxt_tbls[idx + 1])
3167 continue;
3169 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3170 iommu->root_entry[bus].hi = val;
3173 spin_unlock_irqrestore(&iommu->lock, flags);
3175 kfree(ctxt_tbls);
3177 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3179 ret = 0;
3181 out_unmap:
3182 memunmap(old_rt);
3184 return ret;
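/*
 * Illustrative note (based on the indexing above, not on anything beyond
 * it): in extended (ECS) mode a context entry is twice as wide, so only
 * half as many fit in one 4KiB table. Each bus therefore gets two context
 * tables: devfns 0x00-0x7f are reached through the lower pointer
 * (root_entry.lo) and devfns 0x80-0xff through the upper pointer
 * (root_entry.hi), which is why the copy loops index with bus * 2 and
 * devfn * 2.
 */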
3187 static int __init init_dmars(void)
3189 struct dmar_drhd_unit *drhd;
3190 struct intel_iommu *iommu;
3191 int ret;
3194 * for each drhd
3195 * allocate root
3196 * initialize and program root entry to not present
3197 * endfor
3199 for_each_drhd_unit(drhd) {
3201 * No lock needed as this is only incremented in the single
3202 * threaded kernel __init code path; all other accesses are
3203 * read-only.
3205 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3206 g_num_of_iommus++;
3207 continue;
3209 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3212 /* Preallocate enough resources for IOMMU hot-addition */
3213 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3214 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3216 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3217 GFP_KERNEL);
3218 if (!g_iommus) {
3219 pr_err("Allocating global iommu array failed\n");
3220 ret = -ENOMEM;
3221 goto error;
3224 for_each_iommu(iommu, drhd) {
3225 if (drhd->ignored) {
3226 iommu_disable_translation(iommu);
3227 continue;
3231 * Find the max pasid size of all IOMMU's in the system.
3232 * We need to ensure the system pasid table is no bigger
3233 * than the smallest supported.
3235 if (pasid_supported(iommu)) {
3236 u32 temp = 2 << ecap_pss(iommu->ecap);
3238 intel_pasid_max_id = min_t(u32, temp,
3239 intel_pasid_max_id);
3242 g_iommus[iommu->seq_id] = iommu;
3244 intel_iommu_init_qi(iommu);
3246 ret = iommu_init_domains(iommu);
3247 if (ret)
3248 goto free_iommu;
3250 init_translation_status(iommu);
3252 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3253 iommu_disable_translation(iommu);
3254 clear_translation_pre_enabled(iommu);
3255 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3256 iommu->name);
3260 * TBD:
3261 * we could share the same root & context tables
3262 * among all IOMMUs. Need to split it later.
3264 ret = iommu_alloc_root_entry(iommu);
3265 if (ret)
3266 goto free_iommu;
3268 if (translation_pre_enabled(iommu)) {
3269 pr_info("Translation already enabled - trying to copy translation structures\n");
3271 ret = copy_translation_tables(iommu);
3272 if (ret) {
3274 * We found the IOMMU with translation
3275 * enabled - but failed to copy over the
3276 * old root-entry table. Try to proceed
3277 * by disabling translation now and
3278 * allocating a clean root-entry table.
3279 * This might cause DMAR faults, but
3280 * probably the dump will still succeed.
3282 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3283 iommu->name);
3284 iommu_disable_translation(iommu);
3285 clear_translation_pre_enabled(iommu);
3286 } else {
3287 pr_info("Copied translation tables from previous kernel for %s\n",
3288 iommu->name);
3292 if (!ecap_pass_through(iommu->ecap))
3293 hw_pass_through = 0;
3294 #ifdef CONFIG_INTEL_IOMMU_SVM
3295 if (pasid_supported(iommu))
3296 intel_svm_init(iommu);
3297 #endif
3301 * Now that qi is enabled on all iommus, set the root entry and flush
3302 * caches. This is required on some Intel X58 chipsets, otherwise the
3303 * flush_context function will loop forever and the boot hangs.
3305 for_each_active_iommu(iommu, drhd) {
3306 iommu_flush_write_buffer(iommu);
3307 iommu_set_root_entry(iommu);
3308 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3309 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3312 if (iommu_default_passthrough())
3313 iommu_identity_mapping |= IDENTMAP_ALL;
3315 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3316 dmar_map_gfx = 0;
3317 #endif
3319 if (!dmar_map_gfx)
3320 iommu_identity_mapping |= IDENTMAP_GFX;
3322 check_tylersburg_isoch();
3324 ret = si_domain_init(hw_pass_through);
3325 if (ret)
3326 goto free_iommu;
3329 * for each drhd
3330 * enable fault log
3331 * global invalidate context cache
3332 * global invalidate iotlb
3333 * enable translation
3335 for_each_iommu(iommu, drhd) {
3336 if (drhd->ignored) {
3338 * we always have to disable PMRs or DMA may fail on
3339 * this device
3341 if (force_on)
3342 iommu_disable_protect_mem_regions(iommu);
3343 continue;
3346 iommu_flush_write_buffer(iommu);
3348 #ifdef CONFIG_INTEL_IOMMU_SVM
3349 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3351 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3352 * could cause a lock race condition.
3354 up_write(&dmar_global_lock);
3355 ret = intel_svm_enable_prq(iommu);
3356 down_write(&dmar_global_lock);
3357 if (ret)
3358 goto free_iommu;
3360 #endif
3361 ret = dmar_set_interrupt(iommu);
3362 if (ret)
3363 goto free_iommu;
3366 return 0;
3368 free_iommu:
3369 for_each_active_iommu(iommu, drhd) {
3370 disable_dmar_iommu(iommu);
3371 free_dmar_iommu(iommu);
3374 kfree(g_iommus);
3376 error:
3377 return ret;
3380 /* This takes a number of _MM_ pages, not VTD pages */
3381 static unsigned long intel_alloc_iova(struct device *dev,
3382 struct dmar_domain *domain,
3383 unsigned long nrpages, uint64_t dma_mask)
3385 unsigned long iova_pfn;
3387 /* Restrict dma_mask to the width that the iommu can handle */
3388 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3389 /* Ensure we reserve the whole size-aligned region */
3390 nrpages = __roundup_pow_of_two(nrpages);
3392 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3394 * First try to allocate an io virtual address in
3395 * DMA_BIT_MASK(32) and if that fails then try allocating
3396 * from higher range
3398 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3399 IOVA_PFN(DMA_BIT_MASK(32)), false);
3400 if (iova_pfn)
3401 return iova_pfn;
3403 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3404 IOVA_PFN(dma_mask), true);
3405 if (unlikely(!iova_pfn)) {
3406 dev_err(dev, "Allocating %ld-page iova failed", nrpages);
3407 return 0;
3410 return iova_pfn;
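/*
 * Illustrative note: for a device advertising a 48-bit DMA mask, a 5-page
 * request is first rounded up to 8 pages so the reservation stays
 * size-aligned, and the allocator tries to place it below 4GiB; only if
 * that range is exhausted (or forcedac was requested) does it allocate
 * against the full 48-bit mask.
 */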
3413 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3415 struct dmar_domain *domain, *tmp;
3416 struct dmar_rmrr_unit *rmrr;
3417 struct device *i_dev;
3418 int i, ret;
3420 /* The device shouldn't be attached to any domain yet. */
3421 domain = find_domain(dev);
3422 if (domain)
3423 return NULL;
3425 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3426 if (!domain)
3427 goto out;
3429 /* We have a new domain - setup possible RMRRs for the device */
3430 rcu_read_lock();
3431 for_each_rmrr_units(rmrr) {
3432 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3433 i, i_dev) {
3434 if (i_dev != dev)
3435 continue;
3437 ret = domain_prepare_identity_map(dev, domain,
3438 rmrr->base_address,
3439 rmrr->end_address);
3440 if (ret)
3441 dev_err(dev, "Mapping reserved region failed\n");
3444 rcu_read_unlock();
3446 tmp = set_domain_for_dev(dev, domain);
3447 if (!tmp || domain != tmp) {
3448 domain_exit(domain);
3449 domain = tmp;
3452 out:
3453 if (!domain)
3454 dev_err(dev, "Allocating domain failed\n");
3455 else
3456 domain->domain.type = IOMMU_DOMAIN_DMA;
3458 return domain;
3461 /* Check if the device needs to go through the non-identity map and unmap process. */
3462 static bool iommu_need_mapping(struct device *dev)
3464 int ret;
3466 if (iommu_dummy(dev))
3467 return false;
3469 ret = identity_mapping(dev);
3470 if (ret) {
3471 u64 dma_mask = *dev->dma_mask;
3473 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3474 dma_mask = dev->coherent_dma_mask;
3476 if (dma_mask >= dma_direct_get_required_mask(dev))
3477 return false;
3480 * The device's DMA mask does not cover all of memory, so remove it
3481 * from si_domain and fall back to non-identity mapping.
3483 dmar_remove_one_dev_info(dev);
3484 ret = iommu_request_dma_domain_for_dev(dev);
3485 if (ret) {
3486 struct iommu_domain *domain;
3487 struct dmar_domain *dmar_domain;
3489 domain = iommu_get_domain_for_dev(dev);
3490 if (domain) {
3491 dmar_domain = to_dmar_domain(domain);
3492 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3494 dmar_remove_one_dev_info(dev);
3495 get_private_domain_for_dev(dev);
3498 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3501 return true;
3504 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3505 size_t size, int dir, u64 dma_mask)
3507 struct dmar_domain *domain;
3508 phys_addr_t start_paddr;
3509 unsigned long iova_pfn;
3510 int prot = 0;
3511 int ret;
3512 struct intel_iommu *iommu;
3513 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3515 BUG_ON(dir == DMA_NONE);
3517 domain = deferred_attach_domain(dev);
3518 if (!domain)
3519 return DMA_MAPPING_ERROR;
3521 iommu = domain_get_iommu(domain);
3522 size = aligned_nrpages(paddr, size);
3524 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3525 if (!iova_pfn)
3526 goto error;
3529 * Check if DMAR supports zero-length reads on write-only
3530 * mappings.
3532 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3533 !cap_zlr(iommu->cap))
3534 prot |= DMA_PTE_READ;
3535 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3536 prot |= DMA_PTE_WRITE;
3538 * paddr to (paddr + size) might cover only part of a page, so we map the
3539 * whole page. Note: if two parts of one page are mapped separately, we
3540 * might have two guest addresses mapping to the same host paddr, but this
3541 * is not a big problem
3543 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3544 mm_to_dma_pfn(paddr_pfn), size, prot);
3545 if (ret)
3546 goto error;
3548 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3549 start_paddr += paddr & ~PAGE_MASK;
3551 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3553 return start_paddr;
3555 error:
3556 if (iova_pfn)
3557 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3558 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3559 size, (unsigned long long)paddr, dir);
3560 return DMA_MAPPING_ERROR;
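/*
 * Illustrative note: the prot bits above follow directly from the DMA
 * direction. DMA_FROM_DEVICE on hardware with zero-length-read support
 * (cap_zlr) gets only DMA_PTE_WRITE; without ZLR support DMA_PTE_READ is
 * added as well. DMA_TO_DEVICE gets DMA_PTE_READ, and DMA_BIDIRECTIONAL
 * gets both bits.
 */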
3563 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3564 unsigned long offset, size_t size,
3565 enum dma_data_direction dir,
3566 unsigned long attrs)
3568 if (iommu_need_mapping(dev))
3569 return __intel_map_single(dev, page_to_phys(page) + offset,
3570 size, dir, *dev->dma_mask);
3571 return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3574 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3575 size_t size, enum dma_data_direction dir,
3576 unsigned long attrs)
3578 if (iommu_need_mapping(dev))
3579 return __intel_map_single(dev, phys_addr, size, dir,
3580 *dev->dma_mask);
3581 return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3584 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3586 struct dmar_domain *domain;
3587 unsigned long start_pfn, last_pfn;
3588 unsigned long nrpages;
3589 unsigned long iova_pfn;
3590 struct intel_iommu *iommu;
3591 struct page *freelist;
3592 struct pci_dev *pdev = NULL;
3594 domain = find_domain(dev);
3595 BUG_ON(!domain);
3597 iommu = domain_get_iommu(domain);
3599 iova_pfn = IOVA_PFN(dev_addr);
3601 nrpages = aligned_nrpages(dev_addr, size);
3602 start_pfn = mm_to_dma_pfn(iova_pfn);
3603 last_pfn = start_pfn + nrpages - 1;
3605 if (dev_is_pci(dev))
3606 pdev = to_pci_dev(dev);
3608 freelist = domain_unmap(domain, start_pfn, last_pfn);
3609 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3610 !has_iova_flush_queue(&domain->iovad)) {
3611 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3612 nrpages, !freelist, 0);
3613 /* free iova */
3614 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3615 dma_free_pagelist(freelist);
3616 } else {
3617 queue_iova(&domain->iovad, iova_pfn, nrpages,
3618 (unsigned long)freelist);
3620 * queue up the release of the unmap to save the roughly 1/6th of
3621 * the CPU time used up by the iotlb flush operation...
3625 trace_unmap_single(dev, dev_addr, size);
3628 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3629 size_t size, enum dma_data_direction dir,
3630 unsigned long attrs)
3632 if (iommu_need_mapping(dev))
3633 intel_unmap(dev, dev_addr, size);
3634 else
3635 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3638 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3639 size_t size, enum dma_data_direction dir, unsigned long attrs)
3641 if (iommu_need_mapping(dev))
3642 intel_unmap(dev, dev_addr, size);
3645 static void *intel_alloc_coherent(struct device *dev, size_t size,
3646 dma_addr_t *dma_handle, gfp_t flags,
3647 unsigned long attrs)
3649 struct page *page = NULL;
3650 int order;
3652 if (!iommu_need_mapping(dev))
3653 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3655 size = PAGE_ALIGN(size);
3656 order = get_order(size);
3658 if (gfpflags_allow_blocking(flags)) {
3659 unsigned int count = size >> PAGE_SHIFT;
3661 page = dma_alloc_from_contiguous(dev, count, order,
3662 flags & __GFP_NOWARN);
3665 if (!page)
3666 page = alloc_pages(flags, order);
3667 if (!page)
3668 return NULL;
3669 memset(page_address(page), 0, size);
3671 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3672 DMA_BIDIRECTIONAL,
3673 dev->coherent_dma_mask);
3674 if (*dma_handle != DMA_MAPPING_ERROR)
3675 return page_address(page);
3676 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3677 __free_pages(page, order);
3679 return NULL;
3682 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3683 dma_addr_t dma_handle, unsigned long attrs)
3685 int order;
3686 struct page *page = virt_to_page(vaddr);
3688 if (!iommu_need_mapping(dev))
3689 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3691 size = PAGE_ALIGN(size);
3692 order = get_order(size);
3694 intel_unmap(dev, dma_handle, size);
3695 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3696 __free_pages(page, order);
3699 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3700 int nelems, enum dma_data_direction dir,
3701 unsigned long attrs)
3703 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3704 unsigned long nrpages = 0;
3705 struct scatterlist *sg;
3706 int i;
3708 if (!iommu_need_mapping(dev))
3709 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3711 for_each_sg(sglist, sg, nelems, i) {
3712 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3715 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3717 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3720 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3721 enum dma_data_direction dir, unsigned long attrs)
3723 int i;
3724 struct dmar_domain *domain;
3725 size_t size = 0;
3726 int prot = 0;
3727 unsigned long iova_pfn;
3728 int ret;
3729 struct scatterlist *sg;
3730 unsigned long start_vpfn;
3731 struct intel_iommu *iommu;
3733 BUG_ON(dir == DMA_NONE);
3734 if (!iommu_need_mapping(dev))
3735 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3737 domain = deferred_attach_domain(dev);
3738 if (!domain)
3739 return 0;
3741 iommu = domain_get_iommu(domain);
3743 for_each_sg(sglist, sg, nelems, i)
3744 size += aligned_nrpages(sg->offset, sg->length);
3746 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3747 *dev->dma_mask);
3748 if (!iova_pfn) {
3749 sglist->dma_length = 0;
3750 return 0;
3754 * Check if DMAR supports zero-length reads on write-only
3755 * mappings.
3757 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3758 !cap_zlr(iommu->cap))
3759 prot |= DMA_PTE_READ;
3760 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3761 prot |= DMA_PTE_WRITE;
3763 start_vpfn = mm_to_dma_pfn(iova_pfn);
3765 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3766 if (unlikely(ret)) {
3767 dma_pte_free_pagetable(domain, start_vpfn,
3768 start_vpfn + size - 1,
3769 agaw_to_level(domain->agaw) + 1);
3770 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3771 return 0;
3774 trace_map_sg(dev, iova_pfn << PAGE_SHIFT,
3775 sg_phys(sglist), size << VTD_PAGE_SHIFT);
3777 return nelems;
3780 static u64 intel_get_required_mask(struct device *dev)
3782 if (!iommu_need_mapping(dev))
3783 return dma_direct_get_required_mask(dev);
3784 return DMA_BIT_MASK(32);
3787 static const struct dma_map_ops intel_dma_ops = {
3788 .alloc = intel_alloc_coherent,
3789 .free = intel_free_coherent,
3790 .map_sg = intel_map_sg,
3791 .unmap_sg = intel_unmap_sg,
3792 .map_page = intel_map_page,
3793 .unmap_page = intel_unmap_page,
3794 .map_resource = intel_map_resource,
3795 .unmap_resource = intel_unmap_resource,
3796 .dma_supported = dma_direct_supported,
3797 .mmap = dma_common_mmap,
3798 .get_sgtable = dma_common_get_sgtable,
3799 .get_required_mask = intel_get_required_mask,
3802 static void
3803 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3804 enum dma_data_direction dir, enum dma_sync_target target)
3806 struct dmar_domain *domain;
3807 phys_addr_t tlb_addr;
3809 domain = find_domain(dev);
3810 if (WARN_ON(!domain))
3811 return;
3813 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3814 if (is_swiotlb_buffer(tlb_addr))
3815 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3818 static dma_addr_t
3819 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3820 enum dma_data_direction dir, unsigned long attrs,
3821 u64 dma_mask)
3823 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3824 struct dmar_domain *domain;
3825 struct intel_iommu *iommu;
3826 unsigned long iova_pfn;
3827 unsigned long nrpages;
3828 phys_addr_t tlb_addr;
3829 int prot = 0;
3830 int ret;
3832 domain = deferred_attach_domain(dev);
3833 if (WARN_ON(dir == DMA_NONE || !domain))
3834 return DMA_MAPPING_ERROR;
3836 iommu = domain_get_iommu(domain);
3837 if (WARN_ON(!iommu))
3838 return DMA_MAPPING_ERROR;
3840 nrpages = aligned_nrpages(0, size);
3841 iova_pfn = intel_alloc_iova(dev, domain,
3842 dma_to_mm_pfn(nrpages), dma_mask);
3843 if (!iova_pfn)
3844 return DMA_MAPPING_ERROR;
3847 * Check if DMAR supports zero-length reads on write-only
3848 * mappings.
3850 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3851 !cap_zlr(iommu->cap))
3852 prot |= DMA_PTE_READ;
3853 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3854 prot |= DMA_PTE_WRITE;
3857 * If both the physical buffer start address and size are
3858 * page aligned, we don't need to use a bounce page.
3860 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3861 tlb_addr = swiotlb_tbl_map_single(dev,
3862 __phys_to_dma(dev, io_tlb_start),
3863 paddr, size, aligned_size, dir, attrs);
3864 if (tlb_addr == DMA_MAPPING_ERROR) {
3865 goto swiotlb_error;
3866 } else {
3867 /* Cleanup the padding area. */
3868 void *padding_start = phys_to_virt(tlb_addr);
3869 size_t padding_size = aligned_size;
3871 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3872 (dir == DMA_TO_DEVICE ||
3873 dir == DMA_BIDIRECTIONAL)) {
3874 padding_start += size;
3875 padding_size -= size;
3878 memset(padding_start, 0, padding_size);
3880 } else {
3881 tlb_addr = paddr;
3884 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3885 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3886 if (ret)
3887 goto mapping_error;
3889 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3891 return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3893 mapping_error:
3894 if (is_swiotlb_buffer(tlb_addr))
3895 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3896 aligned_size, dir, attrs);
3897 swiotlb_error:
3898 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3899 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3900 size, (unsigned long long)paddr, dir);
3902 return DMA_MAPPING_ERROR;
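/*
 * Worked example (illustrative only): bounce-mapping a DMA_TO_DEVICE
 * buffer at paddr 0x10000234 with size 0xe00 is not VTD_PAGE_SIZE
 * aligned, so the data goes through a swiotlb slot of aligned_size =
 * 0x1000 bytes; the trailing 0x200 bytes of the slot (beyond the copied
 * data) are zeroed so the device cannot observe stale bounce-buffer
 * contents, and the IOMMU then maps that whole 4KiB slot.
 */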
3905 static void
3906 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3907 enum dma_data_direction dir, unsigned long attrs)
3909 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3910 struct dmar_domain *domain;
3911 phys_addr_t tlb_addr;
3913 domain = find_domain(dev);
3914 if (WARN_ON(!domain))
3915 return;
3917 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3918 if (WARN_ON(!tlb_addr))
3919 return;
3921 intel_unmap(dev, dev_addr, size);
3922 if (is_swiotlb_buffer(tlb_addr))
3923 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3924 aligned_size, dir, attrs);
3926 trace_bounce_unmap_single(dev, dev_addr, size);
3929 static dma_addr_t
3930 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3931 size_t size, enum dma_data_direction dir, unsigned long attrs)
3933 return bounce_map_single(dev, page_to_phys(page) + offset,
3934 size, dir, attrs, *dev->dma_mask);
3937 static dma_addr_t
3938 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3939 enum dma_data_direction dir, unsigned long attrs)
3941 return bounce_map_single(dev, phys_addr, size,
3942 dir, attrs, *dev->dma_mask);
3945 static void
3946 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3947 enum dma_data_direction dir, unsigned long attrs)
3949 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3952 static void
3953 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3954 enum dma_data_direction dir, unsigned long attrs)
3956 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3959 static void
3960 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3961 enum dma_data_direction dir, unsigned long attrs)
3963 struct scatterlist *sg;
3964 int i;
3966 for_each_sg(sglist, sg, nelems, i)
3967 bounce_unmap_page(dev, sg->dma_address,
3968 sg_dma_len(sg), dir, attrs);
3971 static int
3972 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3973 enum dma_data_direction dir, unsigned long attrs)
3975 int i;
3976 struct scatterlist *sg;
3978 for_each_sg(sglist, sg, nelems, i) {
3979 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3980 sg->offset, sg->length,
3981 dir, attrs);
3982 if (sg->dma_address == DMA_MAPPING_ERROR)
3983 goto out_unmap;
3984 sg_dma_len(sg) = sg->length;
3987 return nelems;
3989 out_unmap:
3990 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3991 return 0;
3994 static void
3995 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3996 size_t size, enum dma_data_direction dir)
3998 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
4001 static void
4002 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4003 size_t size, enum dma_data_direction dir)
4005 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4008 static void
4009 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4010 int nelems, enum dma_data_direction dir)
4012 struct scatterlist *sg;
4013 int i;
4015 for_each_sg(sglist, sg, nelems, i)
4016 bounce_sync_single(dev, sg_dma_address(sg),
4017 sg_dma_len(sg), dir, SYNC_FOR_CPU);
4020 static void
4021 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4022 int nelems, enum dma_data_direction dir)
4024 struct scatterlist *sg;
4025 int i;
4027 for_each_sg(sglist, sg, nelems, i)
4028 bounce_sync_single(dev, sg_dma_address(sg),
4029 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4032 static const struct dma_map_ops bounce_dma_ops = {
4033 .alloc = intel_alloc_coherent,
4034 .free = intel_free_coherent,
4035 .map_sg = bounce_map_sg,
4036 .unmap_sg = bounce_unmap_sg,
4037 .map_page = bounce_map_page,
4038 .unmap_page = bounce_unmap_page,
4039 .sync_single_for_cpu = bounce_sync_single_for_cpu,
4040 .sync_single_for_device = bounce_sync_single_for_device,
4041 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
4042 .sync_sg_for_device = bounce_sync_sg_for_device,
4043 .map_resource = bounce_map_resource,
4044 .unmap_resource = bounce_unmap_resource,
4045 .dma_supported = dma_direct_supported,
4048 static inline int iommu_domain_cache_init(void)
4050 int ret = 0;
4052 iommu_domain_cache = kmem_cache_create("iommu_domain",
4053 sizeof(struct dmar_domain),
4055 SLAB_HWCACHE_ALIGN,
4057 NULL);
4058 if (!iommu_domain_cache) {
4059 pr_err("Couldn't create iommu_domain cache\n");
4060 ret = -ENOMEM;
4063 return ret;
4066 static inline int iommu_devinfo_cache_init(void)
4068 int ret = 0;
4070 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4071 sizeof(struct device_domain_info),
4073 SLAB_HWCACHE_ALIGN,
4074 NULL);
4075 if (!iommu_devinfo_cache) {
4076 pr_err("Couldn't create devinfo cache\n");
4077 ret = -ENOMEM;
4080 return ret;
4083 static int __init iommu_init_mempool(void)
4085 int ret;
4086 ret = iova_cache_get();
4087 if (ret)
4088 return ret;
4090 ret = iommu_domain_cache_init();
4091 if (ret)
4092 goto domain_error;
4094 ret = iommu_devinfo_cache_init();
4095 if (!ret)
4096 return ret;
4098 kmem_cache_destroy(iommu_domain_cache);
4099 domain_error:
4100 iova_cache_put();
4102 return -ENOMEM;
4105 static void __init iommu_exit_mempool(void)
4107 kmem_cache_destroy(iommu_devinfo_cache);
4108 kmem_cache_destroy(iommu_domain_cache);
4109 iova_cache_put();
4112 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4114 struct dmar_drhd_unit *drhd;
4115 u32 vtbar;
4116 int rc;
4118 /* We know that this device on this chipset has its own IOMMU.
4119 * If we find it under a different IOMMU, then the BIOS is lying
4120 * to us. Hope that the IOMMU for this device is actually
4121 * disabled, and it needs no translation...
4123 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4124 if (rc) {
4125 /* "can't" happen */
4126 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4127 return;
4129 vtbar &= 0xffff0000;
4131 /* we know that this iommu should be at offset 0xa000 from vtbar */
4132 drhd = dmar_find_matched_drhd_unit(pdev);
4133 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
4134 TAINT_FIRMWARE_WORKAROUND,
4135 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
4136 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4138 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4140 static void __init init_no_remapping_devices(void)
4142 struct dmar_drhd_unit *drhd;
4143 struct device *dev;
4144 int i;
4146 for_each_drhd_unit(drhd) {
4147 if (!drhd->include_all) {
4148 for_each_active_dev_scope(drhd->devices,
4149 drhd->devices_cnt, i, dev)
4150 break;
4151 /* ignore DMAR unit if no devices exist */
4152 if (i == drhd->devices_cnt)
4153 drhd->ignored = 1;
4157 for_each_active_drhd_unit(drhd) {
4158 if (drhd->include_all)
4159 continue;
4161 for_each_active_dev_scope(drhd->devices,
4162 drhd->devices_cnt, i, dev)
4163 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4164 break;
4165 if (i < drhd->devices_cnt)
4166 continue;
4168 /* This IOMMU has *only* gfx devices. Either bypass it or
4169 set the gfx_mapped flag, as appropriate */
4170 if (!dmar_map_gfx) {
4171 drhd->ignored = 1;
4172 for_each_active_dev_scope(drhd->devices,
4173 drhd->devices_cnt, i, dev)
4174 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4179 #ifdef CONFIG_SUSPEND
4180 static int init_iommu_hw(void)
4182 struct dmar_drhd_unit *drhd;
4183 struct intel_iommu *iommu = NULL;
4185 for_each_active_iommu(iommu, drhd)
4186 if (iommu->qi)
4187 dmar_reenable_qi(iommu);
4189 for_each_iommu(iommu, drhd) {
4190 if (drhd->ignored) {
4192 * we always have to disable PMRs or DMA may fail on
4193 * this device
4195 if (force_on)
4196 iommu_disable_protect_mem_regions(iommu);
4197 continue;
4200 iommu_flush_write_buffer(iommu);
4202 iommu_set_root_entry(iommu);
4204 iommu->flush.flush_context(iommu, 0, 0, 0,
4205 DMA_CCMD_GLOBAL_INVL);
4206 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4207 iommu_enable_translation(iommu);
4208 iommu_disable_protect_mem_regions(iommu);
4211 return 0;
4214 static void iommu_flush_all(void)
4216 struct dmar_drhd_unit *drhd;
4217 struct intel_iommu *iommu;
4219 for_each_active_iommu(iommu, drhd) {
4220 iommu->flush.flush_context(iommu, 0, 0, 0,
4221 DMA_CCMD_GLOBAL_INVL);
4222 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4223 DMA_TLB_GLOBAL_FLUSH);
4227 static int iommu_suspend(void)
4229 struct dmar_drhd_unit *drhd;
4230 struct intel_iommu *iommu = NULL;
4231 unsigned long flag;
4233 for_each_active_iommu(iommu, drhd) {
4234 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4235 GFP_ATOMIC);
4236 if (!iommu->iommu_state)
4237 goto nomem;
4240 iommu_flush_all();
4242 for_each_active_iommu(iommu, drhd) {
4243 iommu_disable_translation(iommu);
4245 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4247 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4248 readl(iommu->reg + DMAR_FECTL_REG);
4249 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4250 readl(iommu->reg + DMAR_FEDATA_REG);
4251 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4252 readl(iommu->reg + DMAR_FEADDR_REG);
4253 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4254 readl(iommu->reg + DMAR_FEUADDR_REG);
4256 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4258 return 0;
4260 nomem:
4261 for_each_active_iommu(iommu, drhd)
4262 kfree(iommu->iommu_state);
4264 return -ENOMEM;
4267 static void iommu_resume(void)
4269 struct dmar_drhd_unit *drhd;
4270 struct intel_iommu *iommu = NULL;
4271 unsigned long flag;
4273 if (init_iommu_hw()) {
4274 if (force_on)
4275 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4276 else
4277 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4278 return;
4281 for_each_active_iommu(iommu, drhd) {
4283 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4285 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4286 iommu->reg + DMAR_FECTL_REG);
4287 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4288 iommu->reg + DMAR_FEDATA_REG);
4289 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4290 iommu->reg + DMAR_FEADDR_REG);
4291 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4292 iommu->reg + DMAR_FEUADDR_REG);
4294 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4297 for_each_active_iommu(iommu, drhd)
4298 kfree(iommu->iommu_state);
4301 static struct syscore_ops iommu_syscore_ops = {
4302 .resume = iommu_resume,
4303 .suspend = iommu_suspend,
4306 static void __init init_iommu_pm_ops(void)
4308 register_syscore_ops(&iommu_syscore_ops);
4311 #else
4312 static inline void init_iommu_pm_ops(void) {}
4313 #endif /* CONFIG_SUSPEND */
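/*
 * Editorial sketch (not part of the driver): the syscore pattern used by
 * iommu_suspend()/iommu_resume() above in its minimal form. The callback
 * names are hypothetical; ->suspend saves state late in system suspend and
 * must not sleep, while ->resume restores it before normal device resume.
 */
#if 0	/* illustration only */
static int example_syscore_suspend(void)
{
	/* save register state here */
	return 0;
}

static void example_syscore_resume(void)
{
	/* restore the state saved by ->suspend */
}

static struct syscore_ops example_syscore_ops = {
	.suspend = example_syscore_suspend,
	.resume  = example_syscore_resume,
};

/* registered once, e.g. register_syscore_ops(&example_syscore_ops); */
#endif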
4315 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4317 struct acpi_dmar_reserved_memory *rmrr;
4318 struct dmar_rmrr_unit *rmrru;
4319 int ret;
4321 rmrr = (struct acpi_dmar_reserved_memory *)header;
4322 ret = arch_rmrr_sanity_check(rmrr);
4323 if (ret)
4324 return ret;
4326 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4327 if (!rmrru)
4328 goto out;
4330 rmrru->hdr = header;
4332 rmrru->base_address = rmrr->base_address;
4333 rmrru->end_address = rmrr->end_address;
4335 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4336 ((void *)rmrr) + rmrr->header.length,
4337 &rmrru->devices_cnt);
4338 if (rmrru->devices_cnt && rmrru->devices == NULL)
4339 goto free_rmrru;
4341 list_add(&rmrru->list, &dmar_rmrr_units);
4343 return 0;
4344 free_rmrru:
4345 kfree(rmrru);
4346 out:
4347 return -ENOMEM;
4350 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4352 struct dmar_atsr_unit *atsru;
4353 struct acpi_dmar_atsr *tmp;
4355 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4356 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4357 if (atsr->segment != tmp->segment)
4358 continue;
4359 if (atsr->header.length != tmp->header.length)
4360 continue;
4361 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4362 return atsru;
4365 return NULL;
4368 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4370 struct acpi_dmar_atsr *atsr;
4371 struct dmar_atsr_unit *atsru;
4373 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4374 return 0;
4376 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4377 atsru = dmar_find_atsr(atsr);
4378 if (atsru)
4379 return 0;
4381 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4382 if (!atsru)
4383 return -ENOMEM;
4386 * If memory is allocated from slab by ACPI _DSM method, we need to
4387 * copy the memory content because the memory buffer will be freed
4388 * on return.
4390 atsru->hdr = (void *)(atsru + 1);
4391 memcpy(atsru->hdr, hdr, hdr->length);
4392 atsru->include_all = atsr->flags & 0x1;
4393 if (!atsru->include_all) {
4394 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4395 (void *)atsr + atsr->header.length,
4396 &atsru->devices_cnt);
4397 if (atsru->devices_cnt && atsru->devices == NULL) {
4398 kfree(atsru);
4399 return -ENOMEM;
4403 list_add_rcu(&atsru->list, &dmar_atsr_units);
4405 return 0;
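/*
 * Editorial sketch (not part of the driver): the single-allocation copy that
 * dmar_parse_one_atsr() above relies on. Because an ACPI-provided buffer may
 * be freed once the parse callback returns, the unit structure and a copy of
 * the header are allocated together and ->hdr points at the trailing bytes.
 * Names below are hypothetical.
 */
#if 0	/* illustration only */
struct example_unit {
	struct list_head	list;
	struct acpi_dmar_header	*hdr;	/* points into the same allocation */
};

static struct example_unit *example_dup_header(struct acpi_dmar_header *hdr)
{
	struct example_unit *u = kzalloc(sizeof(*u) + hdr->length, GFP_KERNEL);

	if (!u)
		return NULL;
	u->hdr = (void *)(u + 1);
	memcpy(u->hdr, hdr, hdr->length);
	return u;
}
#endif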
4408 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4410 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4411 kfree(atsru);
4414 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4416 struct acpi_dmar_atsr *atsr;
4417 struct dmar_atsr_unit *atsru;
4419 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4420 atsru = dmar_find_atsr(atsr);
4421 if (atsru) {
4422 list_del_rcu(&atsru->list);
4423 synchronize_rcu();
4424 intel_iommu_free_atsr(atsru);
4427 return 0;
4430 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4432 int i;
4433 struct device *dev;
4434 struct acpi_dmar_atsr *atsr;
4435 struct dmar_atsr_unit *atsru;
4437 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4438 atsru = dmar_find_atsr(atsr);
4439 if (!atsru)
4440 return 0;
4442 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4443 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4444 i, dev)
4445 return -EBUSY;
4448 return 0;
4451 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4453 int sp, ret;
4454 struct intel_iommu *iommu = dmaru->iommu;
4456 if (g_iommus[iommu->seq_id])
4457 return 0;
4459 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4460 pr_warn("%s: Doesn't support hardware pass through.\n",
4461 iommu->name);
4462 return -ENXIO;
4464 if (!ecap_sc_support(iommu->ecap) &&
4465 domain_update_iommu_snooping(iommu)) {
4466 pr_warn("%s: Doesn't support snooping.\n",
4467 iommu->name);
4468 return -ENXIO;
4470 sp = domain_update_iommu_superpage(iommu) - 1;
4471 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4472 pr_warn("%s: Doesn't support large page.\n",
4473 iommu->name);
4474 return -ENXIO;
4478 * Disable translation if already enabled prior to OS handover.
4480 if (iommu->gcmd & DMA_GCMD_TE)
4481 iommu_disable_translation(iommu);
4483 g_iommus[iommu->seq_id] = iommu;
4484 ret = iommu_init_domains(iommu);
4485 if (ret == 0)
4486 ret = iommu_alloc_root_entry(iommu);
4487 if (ret)
4488 goto out;
4490 #ifdef CONFIG_INTEL_IOMMU_SVM
4491 if (pasid_supported(iommu))
4492 intel_svm_init(iommu);
4493 #endif
4495 if (dmaru->ignored) {
4497 * we always have to disable PMRs or DMA may fail on this device
4499 if (force_on)
4500 iommu_disable_protect_mem_regions(iommu);
4501 return 0;
4504 intel_iommu_init_qi(iommu);
4505 iommu_flush_write_buffer(iommu);
4507 #ifdef CONFIG_INTEL_IOMMU_SVM
4508 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4509 ret = intel_svm_enable_prq(iommu);
4510 if (ret)
4511 goto disable_iommu;
4513 #endif
4514 ret = dmar_set_interrupt(iommu);
4515 if (ret)
4516 goto disable_iommu;
4518 iommu_set_root_entry(iommu);
4519 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4520 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4521 iommu_enable_translation(iommu);
4523 iommu_disable_protect_mem_regions(iommu);
4524 return 0;
4526 disable_iommu:
4527 disable_dmar_iommu(iommu);
4528 out:
4529 free_dmar_iommu(iommu);
4530 return ret;
4533 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4535 int ret = 0;
4536 struct intel_iommu *iommu = dmaru->iommu;
4538 if (!intel_iommu_enabled)
4539 return 0;
4540 if (iommu == NULL)
4541 return -EINVAL;
4543 if (insert) {
4544 ret = intel_iommu_add(dmaru);
4545 } else {
4546 disable_dmar_iommu(iommu);
4547 free_dmar_iommu(iommu);
4550 return ret;
4553 static void intel_iommu_free_dmars(void)
4555 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4556 struct dmar_atsr_unit *atsru, *atsr_n;
4558 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4559 list_del(&rmrru->list);
4560 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4561 kfree(rmrru);
4564 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4565 list_del(&atsru->list);
4566 intel_iommu_free_atsr(atsru);
4570 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4572 int i, ret = 1;
4573 struct pci_bus *bus;
4574 struct pci_dev *bridge = NULL;
4575 struct device *tmp;
4576 struct acpi_dmar_atsr *atsr;
4577 struct dmar_atsr_unit *atsru;
4579 dev = pci_physfn(dev);
4580 for (bus = dev->bus; bus; bus = bus->parent) {
4581 bridge = bus->self;
4582 /* If it's an integrated device, allow ATS */
4583 if (!bridge)
4584 return 1;
4585 /* Connected via non-PCIe: no ATS */
4586 if (!pci_is_pcie(bridge) ||
4587 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4588 return 0;
4589 /* If we found the root port, look it up in the ATSR */
4590 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4591 break;
4594 rcu_read_lock();
4595 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4596 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4597 if (atsr->segment != pci_domain_nr(dev->bus))
4598 continue;
4600 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4601 if (tmp == &bridge->dev)
4602 goto out;
4604 if (atsru->include_all)
4605 goto out;
4607 ret = 0;
4608 out:
4609 rcu_read_unlock();
4611 return ret;
4614 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4616 int ret;
4617 struct dmar_rmrr_unit *rmrru;
4618 struct dmar_atsr_unit *atsru;
4619 struct acpi_dmar_atsr *atsr;
4620 struct acpi_dmar_reserved_memory *rmrr;
4622 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4623 return 0;
4625 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4626 rmrr = container_of(rmrru->hdr,
4627 struct acpi_dmar_reserved_memory, header);
4628 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4629 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4630 ((void *)rmrr) + rmrr->header.length,
4631 rmrr->segment, rmrru->devices,
4632 rmrru->devices_cnt);
4633 if (ret < 0)
4634 return ret;
4635 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4636 dmar_remove_dev_scope(info, rmrr->segment,
4637 rmrru->devices, rmrru->devices_cnt);
4641 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4642 if (atsru->include_all)
4643 continue;
4645 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4646 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4647 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4648 (void *)atsr + atsr->header.length,
4649 atsr->segment, atsru->devices,
4650 atsru->devices_cnt);
4651 if (ret > 0)
4652 break;
4653 else if (ret < 0)
4654 return ret;
4655 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4656 if (dmar_remove_dev_scope(info, atsr->segment,
4657 atsru->devices, atsru->devices_cnt))
4658 break;
4662 return 0;
4665 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4666 unsigned long val, void *v)
4668 struct memory_notify *mhp = v;
4669 unsigned long long start, end;
4670 unsigned long start_vpfn, last_vpfn;
4672 switch (val) {
4673 case MEM_GOING_ONLINE:
4674 start = mhp->start_pfn << PAGE_SHIFT;
4675 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4676 if (iommu_domain_identity_map(si_domain, start, end)) {
4677 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4678 start, end);
4679 return NOTIFY_BAD;
4681 break;
4683 case MEM_OFFLINE:
4684 case MEM_CANCEL_ONLINE:
4685 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4686 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4687 while (start_vpfn <= last_vpfn) {
4688 struct iova *iova;
4689 struct dmar_drhd_unit *drhd;
4690 struct intel_iommu *iommu;
4691 struct page *freelist;
4693 iova = find_iova(&si_domain->iovad, start_vpfn);
4694 if (iova == NULL) {
4695 pr_debug("Failed get IOVA for PFN %lx\n",
4696 start_vpfn);
4697 break;
4700 iova = split_and_remove_iova(&si_domain->iovad, iova,
4701 start_vpfn, last_vpfn);
4702 if (iova == NULL) {
4703 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4704 start_vpfn, last_vpfn);
4705 return NOTIFY_BAD;
4708 freelist = domain_unmap(si_domain, iova->pfn_lo,
4709 iova->pfn_hi);
4711 rcu_read_lock();
4712 for_each_active_iommu(iommu, drhd)
4713 iommu_flush_iotlb_psi(iommu, si_domain,
4714 iova->pfn_lo, iova_size(iova),
4715 !freelist, 0);
4716 rcu_read_unlock();
4717 dma_free_pagelist(freelist);
4719 start_vpfn = iova->pfn_hi + 1;
4720 free_iova_mem(iova);
4722 break;
4725 return NOTIFY_OK;
4728 static struct notifier_block intel_iommu_memory_nb = {
4729 .notifier_call = intel_iommu_memory_notifier,
4730 .priority = 0
4733 static void free_all_cpu_cached_iovas(unsigned int cpu)
4735 int i;
4737 for (i = 0; i < g_num_of_iommus; i++) {
4738 struct intel_iommu *iommu = g_iommus[i];
4739 struct dmar_domain *domain;
4740 int did;
4742 if (!iommu)
4743 continue;
4745 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4746 domain = get_iommu_domain(iommu, (u16)did);
4748 if (!domain)
4749 continue;
4750 free_cpu_cached_iovas(cpu, &domain->iovad);
4755 static int intel_iommu_cpu_dead(unsigned int cpu)
4757 free_all_cpu_cached_iovas(cpu);
4758 return 0;
4761 static void intel_disable_iommus(void)
4763 struct intel_iommu *iommu = NULL;
4764 struct dmar_drhd_unit *drhd;
4766 for_each_iommu(iommu, drhd)
4767 iommu_disable_translation(iommu);
4770 void intel_iommu_shutdown(void)
4772 struct dmar_drhd_unit *drhd;
4773 struct intel_iommu *iommu = NULL;
4775 if (no_iommu || dmar_disabled)
4776 return;
4778 down_write(&dmar_global_lock);
4780 /* Disable PMRs explicitly here. */
4781 for_each_iommu(iommu, drhd)
4782 iommu_disable_protect_mem_regions(iommu);
4784 /* Make sure the IOMMUs are switched off */
4785 intel_disable_iommus();
4787 up_write(&dmar_global_lock);
4790 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4792 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4794 return container_of(iommu_dev, struct intel_iommu, iommu);
4797 static ssize_t intel_iommu_show_version(struct device *dev,
4798 struct device_attribute *attr,
4799 char *buf)
4801 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4802 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4803 return sprintf(buf, "%d:%d\n",
4804 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4806 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4808 static ssize_t intel_iommu_show_address(struct device *dev,
4809 struct device_attribute *attr,
4810 char *buf)
4812 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4813 return sprintf(buf, "%llx\n", iommu->reg_phys);
4815 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4817 static ssize_t intel_iommu_show_cap(struct device *dev,
4818 struct device_attribute *attr,
4819 char *buf)
4821 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4822 return sprintf(buf, "%llx\n", iommu->cap);
4824 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4826 static ssize_t intel_iommu_show_ecap(struct device *dev,
4827 struct device_attribute *attr,
4828 char *buf)
4830 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4831 return sprintf(buf, "%llx\n", iommu->ecap);
4833 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4835 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4836 struct device_attribute *attr,
4837 char *buf)
4839 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4840 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4842 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4844 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4845 struct device_attribute *attr,
4846 char *buf)
4848 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4849 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4850 cap_ndoms(iommu->cap)));
4852 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4854 static struct attribute *intel_iommu_attrs[] = {
4855 &dev_attr_version.attr,
4856 &dev_attr_address.attr,
4857 &dev_attr_cap.attr,
4858 &dev_attr_ecap.attr,
4859 &dev_attr_domains_supported.attr,
4860 &dev_attr_domains_used.attr,
4861 NULL,
4864 static struct attribute_group intel_iommu_group = {
4865 .name = "intel-iommu",
4866 .attrs = intel_iommu_attrs,
4869 const struct attribute_group *intel_iommu_groups[] = {
4870 &intel_iommu_group,
4871 NULL,
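/*
 * Editorial sketch (not part of the driver): how the attributes above surface
 * to userspace once iommu_device_sysfs_add() registers each DMAR unit. The
 * path below assumes the usual /sys/class/iommu/<unit>/intel-iommu/ layout
 * and a unit named "dmar0"; both are assumptions about the target system.
 */
#if 0	/* userspace illustration only */
#include <stdio.h>

int main(void)
{
	char line[64];
	FILE *f = fopen("/sys/class/iommu/dmar0/intel-iommu/cap", "r");

	if (f && fgets(line, sizeof(line), f))
		printf("VT-d capability register: %s", line);
	if (f)
		fclose(f);
	return 0;
}
#endif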
4874 static inline bool has_untrusted_dev(void)
4876 struct pci_dev *pdev = NULL;
4878 for_each_pci_dev(pdev)
4879 if (pdev->untrusted)
4880 return true;
4882 return false;
4885 static int __init platform_optin_force_iommu(void)
4887 if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
4888 return 0;
4890 if (no_iommu || dmar_disabled)
4891 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4894 * If Intel-IOMMU is disabled by default, we will apply identity
4895 * map for all devices except those marked as being untrusted.
4897 if (dmar_disabled)
4898 iommu_identity_mapping |= IDENTMAP_ALL;
4900 dmar_disabled = 0;
4901 no_iommu = 0;
4903 return 1;
4906 static int __init probe_acpi_namespace_devices(void)
4908 struct dmar_drhd_unit *drhd;
4909 /* To avoid a -Wunused-but-set-variable warning. */
4910 struct intel_iommu *iommu __maybe_unused;
4911 struct device *dev;
4912 int i, ret = 0;
4914 for_each_active_iommu(iommu, drhd) {
4915 for_each_active_dev_scope(drhd->devices,
4916 drhd->devices_cnt, i, dev) {
4917 struct acpi_device_physical_node *pn;
4918 struct iommu_group *group;
4919 struct acpi_device *adev;
4921 if (dev->bus != &acpi_bus_type)
4922 continue;
4924 adev = to_acpi_device(dev);
4925 mutex_lock(&adev->physical_node_lock);
4926 list_for_each_entry(pn,
4927 &adev->physical_node_list, node) {
4928 group = iommu_group_get(pn->dev);
4929 if (group) {
4930 iommu_group_put(group);
4931 continue;
4934 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4935 ret = iommu_probe_device(pn->dev);
4936 if (ret)
4937 break;
4939 mutex_unlock(&adev->physical_node_lock);
4941 if (ret)
4942 return ret;
4946 return 0;
4949 int __init intel_iommu_init(void)
4951 int ret = -ENODEV;
4952 struct dmar_drhd_unit *drhd;
4953 struct intel_iommu *iommu;
4956 * Intel IOMMU is required for a TXT/tboot launch or platform
4957 * opt in, so enforce that.
4959 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4961 if (iommu_init_mempool()) {
4962 if (force_on)
4963 panic("tboot: Failed to initialize iommu memory\n");
4964 return -ENOMEM;
4967 down_write(&dmar_global_lock);
4968 if (dmar_table_init()) {
4969 if (force_on)
4970 panic("tboot: Failed to initialize DMAR table\n");
4971 goto out_free_dmar;
4974 if (dmar_dev_scope_init() < 0) {
4975 if (force_on)
4976 panic("tboot: Failed to initialize DMAR device scope\n");
4977 goto out_free_dmar;
4980 up_write(&dmar_global_lock);
4983 * The bus notifier takes the dmar_global_lock, so lockdep will
4984 * complain later when we register it under the lock.
4986 dmar_register_bus_notifier();
4988 down_write(&dmar_global_lock);
4990 if (no_iommu || dmar_disabled) {
4992 * We exit the function here to ensure IOMMU's remapping and
4993 * mempool aren't set up, which means that the IOMMU's PMRs
4994 * won't be disabled via the call to init_dmars(). So disable
4995 * them explicitly here. The PMRs were set up by tboot prior to
4996 * calling SENTER, but the kernel is expected to reset/tear
4997 * down the PMRs.
4999 if (intel_iommu_tboot_noforce) {
5000 for_each_iommu(iommu, drhd)
5001 iommu_disable_protect_mem_regions(iommu);
5005 * Make sure the IOMMUs are switched off, even when we
5006 * boot into a kexec kernel and the previous kernel left
5007 * them enabled
5009 intel_disable_iommus();
5010 goto out_free_dmar;
5013 if (list_empty(&dmar_rmrr_units))
5014 pr_info("No RMRR found\n");
5016 if (list_empty(&dmar_atsr_units))
5017 pr_info("No ATSR found\n");
5019 if (dmar_init_reserved_ranges()) {
5020 if (force_on)
5021 panic("tboot: Failed to reserve iommu ranges\n");
5022 goto out_free_reserved_range;
5025 if (dmar_map_gfx)
5026 intel_iommu_gfx_mapped = 1;
5028 init_no_remapping_devices();
5030 ret = init_dmars();
5031 if (ret) {
5032 if (force_on)
5033 panic("tboot: Failed to initialize DMARs\n");
5034 pr_err("Initialization failed\n");
5035 goto out_free_reserved_range;
5037 up_write(&dmar_global_lock);
5039 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
5041 * If the system has no untrusted device or the user has decided
5042 * to disable the bounce page mechanisms, we don't need swiotlb.
5043 * Mark this so that the pre-allocated bounce pages are released
5044 * later.
5046 if (!has_untrusted_dev() || intel_no_bounce)
5047 swiotlb = 0;
5048 #endif
5049 dma_ops = &intel_dma_ops;
5051 init_iommu_pm_ops();
5053 for_each_active_iommu(iommu, drhd) {
5054 iommu_device_sysfs_add(&iommu->iommu, NULL,
5055 intel_iommu_groups,
5056 "%s", iommu->name);
5057 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5058 iommu_device_register(&iommu->iommu);
5061 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5062 if (si_domain && !hw_pass_through)
5063 register_memory_notifier(&intel_iommu_memory_nb);
5064 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5065 intel_iommu_cpu_dead);
5067 down_read(&dmar_global_lock);
5068 if (probe_acpi_namespace_devices())
5069 pr_warn("ACPI name space devices didn't probe correctly\n");
5070 up_read(&dmar_global_lock);
5072 /* Finally, we enable the DMA remapping hardware. */
5073 for_each_iommu(iommu, drhd) {
5074 if (!drhd->ignored && !translation_pre_enabled(iommu))
5075 iommu_enable_translation(iommu);
5077 iommu_disable_protect_mem_regions(iommu);
5079 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5081 intel_iommu_enabled = 1;
5082 intel_iommu_debugfs_init();
5084 return 0;
5086 out_free_reserved_range:
5087 put_iova_domain(&reserved_iova_list);
5088 out_free_dmar:
5089 intel_iommu_free_dmars();
5090 up_write(&dmar_global_lock);
5091 iommu_exit_mempool();
5092 return ret;
5095 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5097 struct intel_iommu *iommu = opaque;
5099 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5100 return 0;
5104 * NB - intel-iommu lacks any sort of reference counting for the users of
5105 * dependent devices. If multiple endpoints have intersecting dependent
5106 * devices, unbinding the driver from any one of them will possibly leave
5107 * the others unable to operate.
5109 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5111 if (!iommu || !dev || !dev_is_pci(dev))
5112 return;
5114 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5117 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5119 struct dmar_domain *domain;
5120 struct intel_iommu *iommu;
5121 unsigned long flags;
5123 assert_spin_locked(&device_domain_lock);
5125 if (WARN_ON(!info))
5126 return;
5128 iommu = info->iommu;
5129 domain = info->domain;
5131 if (info->dev) {
5132 if (dev_is_pci(info->dev) && sm_supported(iommu))
5133 intel_pasid_tear_down_entry(iommu, info->dev,
5134 PASID_RID2PASID);
5136 iommu_disable_dev_iotlb(info);
5137 domain_context_clear(iommu, info->dev);
5138 intel_pasid_free_table(info->dev);
5141 unlink_domain_info(info);
5143 spin_lock_irqsave(&iommu->lock, flags);
5144 domain_detach_iommu(domain, iommu);
5145 spin_unlock_irqrestore(&iommu->lock, flags);
5147 /* free the private domain */
5148 if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
5149 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
5150 list_empty(&domain->devices))
5151 domain_exit(info->domain);
5153 free_devinfo_mem(info);
5156 static void dmar_remove_one_dev_info(struct device *dev)
5158 struct device_domain_info *info;
5159 unsigned long flags;
5161 spin_lock_irqsave(&device_domain_lock, flags);
5162 info = dev->archdata.iommu;
5163 if (info && info != DEFER_DEVICE_DOMAIN_INFO
5164 && info != DUMMY_DEVICE_DOMAIN_INFO)
5165 __dmar_remove_one_dev_info(info);
5166 spin_unlock_irqrestore(&device_domain_lock, flags);
5169 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5171 int adjust_width;
5173 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5174 domain_reserve_special_ranges(domain);
5176 /* calculate AGAW */
5177 domain->gaw = guest_width;
5178 adjust_width = guestwidth_to_adjustwidth(guest_width);
5179 domain->agaw = width_to_agaw(adjust_width);
5181 domain->iommu_coherency = 0;
5182 domain->iommu_snooping = 0;
5183 domain->iommu_superpage = 0;
5184 domain->max_addr = 0;
5186 /* always allocate the top pgd */
5187 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5188 if (!domain->pgd)
5189 return -ENOMEM;
5190 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5191 return 0;
5194 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5196 struct dmar_domain *dmar_domain;
5197 struct iommu_domain *domain;
5199 switch (type) {
5200 case IOMMU_DOMAIN_DMA:
5201 /* fallthrough */
5202 case IOMMU_DOMAIN_UNMANAGED:
5203 dmar_domain = alloc_domain(0);
5204 if (!dmar_domain) {
5205 pr_err("Can't allocate dmar_domain\n");
5206 return NULL;
5208 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5209 pr_err("Domain initialization failed\n");
5210 domain_exit(dmar_domain);
5211 return NULL;
5214 if (type == IOMMU_DOMAIN_DMA &&
5215 init_iova_flush_queue(&dmar_domain->iovad,
5216 iommu_flush_iova, iova_entry_free)) {
5217 pr_warn("iova flush queue initialization failed\n");
5218 intel_iommu_strict = 1;
5221 domain_update_iommu_cap(dmar_domain);
5223 domain = &dmar_domain->domain;
5224 domain->geometry.aperture_start = 0;
5225 domain->geometry.aperture_end =
5226 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5227 domain->geometry.force_aperture = true;
5229 return domain;
5230 case IOMMU_DOMAIN_IDENTITY:
5231 return &si_domain->domain;
5232 default:
5233 return NULL;
5236 return NULL;
5239 static void intel_iommu_domain_free(struct iommu_domain *domain)
5241 if (domain != &si_domain->domain)
5242 domain_exit(to_dmar_domain(domain));
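/*
 * Editorial sketch (not part of the driver): the domain_alloc/attach/map
 * callbacks above are reached through the generic IOMMU API rather than
 * called directly. "pdev" and "page" below are hypothetical parameters;
 * the function names are the standard IOMMU core entry points of this
 * kernel generation.
 */
#if 0	/* illustration only */
static void example_use_iommu_api(struct pci_dev *pdev, struct page *page)
{
	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);

	if (!dom)
		return;
	if (!iommu_attach_device(dom, &pdev->dev)) {
		iommu_map(dom, 0x100000, page_to_phys(page), PAGE_SIZE,
			  IOMMU_READ | IOMMU_WRITE);
		iommu_unmap(dom, 0x100000, PAGE_SIZE);
		iommu_detach_device(dom, &pdev->dev);
	}
	iommu_domain_free(dom);
}
#endif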
5246 * Check whether a @domain could be attached to the @dev through the
5247 * aux-domain attach/detach APIs.
5249 static inline bool
5250 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5252 struct device_domain_info *info = dev->archdata.iommu;
5254 return info && info->auxd_enabled &&
5255 domain->type == IOMMU_DOMAIN_UNMANAGED;
5258 static void auxiliary_link_device(struct dmar_domain *domain,
5259 struct device *dev)
5261 struct device_domain_info *info = dev->archdata.iommu;
5263 assert_spin_locked(&device_domain_lock);
5264 if (WARN_ON(!info))
5265 return;
5267 domain->auxd_refcnt++;
5268 list_add(&domain->auxd, &info->auxiliary_domains);
5271 static void auxiliary_unlink_device(struct dmar_domain *domain,
5272 struct device *dev)
5274 struct device_domain_info *info = dev->archdata.iommu;
5276 assert_spin_locked(&device_domain_lock);
5277 if (WARN_ON(!info))
5278 return;
5280 list_del(&domain->auxd);
5281 domain->auxd_refcnt--;
5283 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5284 intel_pasid_free_id(domain->default_pasid);
5287 static int aux_domain_add_dev(struct dmar_domain *domain,
5288 struct device *dev)
5290 int ret;
5291 u8 bus, devfn;
5292 unsigned long flags;
5293 struct intel_iommu *iommu;
5295 iommu = device_to_iommu(dev, &bus, &devfn);
5296 if (!iommu)
5297 return -ENODEV;
5299 if (domain->default_pasid <= 0) {
5300 int pasid;
5302 pasid = intel_pasid_alloc_id(domain, PASID_MIN,
5303 pci_max_pasids(to_pci_dev(dev)),
5304 GFP_KERNEL);
5305 if (pasid <= 0) {
5306 pr_err("Can't allocate default pasid\n");
5307 return -ENODEV;
5309 domain->default_pasid = pasid;
5312 spin_lock_irqsave(&device_domain_lock, flags);
5314 * iommu->lock must be held to attach domain to iommu and setup the
5315 * pasid entry for second level translation.
5317 spin_lock(&iommu->lock);
5318 ret = domain_attach_iommu(domain, iommu);
5319 if (ret)
5320 goto attach_failed;
5322 /* Setup the PASID entry for mediated devices: */
5323 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5324 domain->default_pasid);
5325 if (ret)
5326 goto table_failed;
5327 spin_unlock(&iommu->lock);
5329 auxiliary_link_device(domain, dev);
5331 spin_unlock_irqrestore(&device_domain_lock, flags);
5333 return 0;
5335 table_failed:
5336 domain_detach_iommu(domain, iommu);
5337 attach_failed:
5338 spin_unlock(&iommu->lock);
5339 spin_unlock_irqrestore(&device_domain_lock, flags);
5340 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5341 intel_pasid_free_id(domain->default_pasid);
5343 return ret;
5346 static void aux_domain_remove_dev(struct dmar_domain *domain,
5347 struct device *dev)
5349 struct device_domain_info *info;
5350 struct intel_iommu *iommu;
5351 unsigned long flags;
5353 if (!is_aux_domain(dev, &domain->domain))
5354 return;
5356 spin_lock_irqsave(&device_domain_lock, flags);
5357 info = dev->archdata.iommu;
5358 iommu = info->iommu;
5360 auxiliary_unlink_device(domain, dev);
5362 spin_lock(&iommu->lock);
5363 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5364 domain_detach_iommu(domain, iommu);
5365 spin_unlock(&iommu->lock);
5367 spin_unlock_irqrestore(&device_domain_lock, flags);
5370 static int prepare_domain_attach_device(struct iommu_domain *domain,
5371 struct device *dev)
5373 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5374 struct intel_iommu *iommu;
5375 int addr_width;
5376 u8 bus, devfn;
5378 iommu = device_to_iommu(dev, &bus, &devfn);
5379 if (!iommu)
5380 return -ENODEV;
5382 /* check if this iommu agaw is sufficient for max mapped address */
5383 addr_width = agaw_to_width(iommu->agaw);
5384 if (addr_width > cap_mgaw(iommu->cap))
5385 addr_width = cap_mgaw(iommu->cap);
5387 if (dmar_domain->max_addr > (1LL << addr_width)) {
5388 dev_err(dev, "%s: iommu width (%d) is not "
5389 "sufficient for the mapped address (%llx)\n",
5390 __func__, addr_width, dmar_domain->max_addr);
5391 return -EFAULT;
5393 dmar_domain->gaw = addr_width;
5396 * Knock out extra levels of page tables if necessary
5398 while (iommu->agaw < dmar_domain->agaw) {
5399 struct dma_pte *pte;
5401 pte = dmar_domain->pgd;
5402 if (dma_pte_present(pte)) {
5403 dmar_domain->pgd = (struct dma_pte *)
5404 phys_to_virt(dma_pte_addr(pte));
5405 free_pgtable_page(pte);
5407 dmar_domain->agaw--;
5410 return 0;
5413 static int intel_iommu_attach_device(struct iommu_domain *domain,
5414 struct device *dev)
5416 int ret;
5418 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5419 device_is_rmrr_locked(dev)) {
5420 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5421 return -EPERM;
5424 if (is_aux_domain(dev, domain))
5425 return -EPERM;
5427 /* normally dev is not mapped */
5428 if (unlikely(domain_context_mapped(dev))) {
5429 struct dmar_domain *old_domain;
5431 old_domain = find_domain(dev);
5432 if (old_domain)
5433 dmar_remove_one_dev_info(dev);
5436 ret = prepare_domain_attach_device(domain, dev);
5437 if (ret)
5438 return ret;
5440 return domain_add_dev_info(to_dmar_domain(domain), dev);
5443 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5444 struct device *dev)
5446 int ret;
5448 if (!is_aux_domain(dev, domain))
5449 return -EPERM;
5451 ret = prepare_domain_attach_device(domain, dev);
5452 if (ret)
5453 return ret;
5455 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5458 static void intel_iommu_detach_device(struct iommu_domain *domain,
5459 struct device *dev)
5461 dmar_remove_one_dev_info(dev);
5464 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5465 struct device *dev)
5467 aux_domain_remove_dev(to_dmar_domain(domain), dev);
5470 static int intel_iommu_map(struct iommu_domain *domain,
5471 unsigned long iova, phys_addr_t hpa,
5472 size_t size, int iommu_prot, gfp_t gfp)
5474 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5475 u64 max_addr;
5476 int prot = 0;
5477 int ret;
5479 if (iommu_prot & IOMMU_READ)
5480 prot |= DMA_PTE_READ;
5481 if (iommu_prot & IOMMU_WRITE)
5482 prot |= DMA_PTE_WRITE;
5483 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5484 prot |= DMA_PTE_SNP;
5486 max_addr = iova + size;
5487 if (dmar_domain->max_addr < max_addr) {
5488 u64 end;
5490 /* check if minimum agaw is sufficient for mapped address */
5491 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5492 if (end < max_addr) {
5493 pr_err("%s: iommu width (%d) is not "
5494 "sufficient for the mapped address (%llx)\n",
5495 __func__, dmar_domain->gaw, max_addr);
5496 return -EFAULT;
5498 dmar_domain->max_addr = max_addr;
5500 /* Round up size to next multiple of PAGE_SIZE, if it and
5501 the low bits of hpa would take us onto the next page */
5502 size = aligned_nrpages(hpa, size);
5503 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5504 hpa >> VTD_PAGE_SHIFT, size, prot);
5505 return ret;
5508 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5509 unsigned long iova, size_t size,
5510 struct iommu_iotlb_gather *gather)
5512 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5513 struct page *freelist = NULL;
5514 unsigned long start_pfn, last_pfn;
5515 unsigned int npages;
5516 int iommu_id, level = 0;
5518 /* Cope with horrid API which requires us to unmap more than the
5519 size argument if it happens to be a large-page mapping. */
5520 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5522 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5523 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5525 start_pfn = iova >> VTD_PAGE_SHIFT;
5526 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5528 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5530 npages = last_pfn - start_pfn + 1;
5532 for_each_domain_iommu(iommu_id, dmar_domain)
5533 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5534 start_pfn, npages, !freelist, 0);
5536 dma_free_pagelist(freelist);
5538 if (dmar_domain->max_addr == iova + size)
5539 dmar_domain->max_addr = iova;
5541 return size;
5544 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5545 dma_addr_t iova)
5547 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5548 struct dma_pte *pte;
5549 int level = 0;
5550 u64 phys = 0;
5552 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5553 if (pte)
5554 phys = dma_pte_addr(pte);
5556 return phys;
5559 static inline bool scalable_mode_support(void)
5561 struct dmar_drhd_unit *drhd;
5562 struct intel_iommu *iommu;
5563 bool ret = true;
5565 rcu_read_lock();
5566 for_each_active_iommu(iommu, drhd) {
5567 if (!sm_supported(iommu)) {
5568 ret = false;
5569 break;
5572 rcu_read_unlock();
5574 return ret;
5577 static inline bool iommu_pasid_support(void)
5579 struct dmar_drhd_unit *drhd;
5580 struct intel_iommu *iommu;
5581 bool ret = true;
5583 rcu_read_lock();
5584 for_each_active_iommu(iommu, drhd) {
5585 if (!pasid_supported(iommu)) {
5586 ret = false;
5587 break;
5590 rcu_read_unlock();
5592 return ret;
5595 static bool intel_iommu_capable(enum iommu_cap cap)
5597 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5598 return domain_update_iommu_snooping(NULL) == 1;
5599 if (cap == IOMMU_CAP_INTR_REMAP)
5600 return irq_remapping_enabled == 1;
5602 return false;
5605 static int intel_iommu_add_device(struct device *dev)
5607 struct dmar_domain *dmar_domain;
5608 struct iommu_domain *domain;
5609 struct intel_iommu *iommu;
5610 struct iommu_group *group;
5611 u8 bus, devfn;
5612 int ret;
5614 iommu = device_to_iommu(dev, &bus, &devfn);
5615 if (!iommu)
5616 return -ENODEV;
5618 iommu_device_link(&iommu->iommu, dev);
5620 if (translation_pre_enabled(iommu))
5621 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5623 group = iommu_group_get_for_dev(dev);
5625 if (IS_ERR(group)) {
5626 ret = PTR_ERR(group);
5627 goto unlink;
5630 iommu_group_put(group);
5632 domain = iommu_get_domain_for_dev(dev);
5633 dmar_domain = to_dmar_domain(domain);
5634 if (domain->type == IOMMU_DOMAIN_DMA) {
5635 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5636 ret = iommu_request_dm_for_dev(dev);
5637 if (ret) {
5638 dmar_remove_one_dev_info(dev);
5639 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5640 domain_add_dev_info(si_domain, dev);
5641 dev_info(dev,
5642 "Device uses a private identity domain.\n");
5645 } else {
5646 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5647 ret = iommu_request_dma_domain_for_dev(dev);
5648 if (ret) {
5649 dmar_remove_one_dev_info(dev);
5650 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5651 if (!get_private_domain_for_dev(dev)) {
5652 dev_warn(dev,
5653 "Failed to get a private domain.\n");
5654 ret = -ENOMEM;
5655 goto unlink;
5658 dev_info(dev,
5659 "Device uses a private dma domain.\n");
5664 if (device_needs_bounce(dev)) {
5665 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5666 set_dma_ops(dev, &bounce_dma_ops);
5669 return 0;
5671 unlink:
5672 iommu_device_unlink(&iommu->iommu, dev);
5673 return ret;
5676 static void intel_iommu_remove_device(struct device *dev)
5678 struct intel_iommu *iommu;
5679 u8 bus, devfn;
5681 iommu = device_to_iommu(dev, &bus, &devfn);
5682 if (!iommu)
5683 return;
5685 dmar_remove_one_dev_info(dev);
5687 iommu_group_remove_device(dev);
5689 iommu_device_unlink(&iommu->iommu, dev);
5691 if (device_needs_bounce(dev))
5692 set_dma_ops(dev, NULL);
5695 static void intel_iommu_get_resv_regions(struct device *device,
5696 struct list_head *head)
5698 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5699 struct iommu_resv_region *reg;
5700 struct dmar_rmrr_unit *rmrr;
5701 struct device *i_dev;
5702 int i;
5704 down_read(&dmar_global_lock);
5705 for_each_rmrr_units(rmrr) {
5706 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5707 i, i_dev) {
5708 struct iommu_resv_region *resv;
5709 enum iommu_resv_type type;
5710 size_t length;
5712 if (i_dev != device &&
5713 !is_downstream_to_pci_bridge(device, i_dev))
5714 continue;
5716 length = rmrr->end_address - rmrr->base_address + 1;
5718 type = device_rmrr_is_relaxable(device) ?
5719 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5721 resv = iommu_alloc_resv_region(rmrr->base_address,
5722 length, prot, type);
5723 if (!resv)
5724 break;
5726 list_add_tail(&resv->list, head);
5729 up_read(&dmar_global_lock);
5731 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5732 if (dev_is_pci(device)) {
5733 struct pci_dev *pdev = to_pci_dev(device);
5735 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5736 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5737 IOMMU_RESV_DIRECT_RELAXABLE);
5738 if (reg)
5739 list_add_tail(&reg->list, head);
5742 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5744 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5745 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5746 0, IOMMU_RESV_MSI);
5747 if (!reg)
5748 return;
5749 list_add_tail(&reg->list, head);
5752 static void intel_iommu_put_resv_regions(struct device *dev,
5753 struct list_head *head)
5755 struct iommu_resv_region *entry, *next;
5757 list_for_each_entry_safe(entry, next, head, list)
5758 kfree(entry);
5761 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5763 struct device_domain_info *info;
5764 struct context_entry *context;
5765 struct dmar_domain *domain;
5766 unsigned long flags;
5767 u64 ctx_lo;
5768 int ret;
5770 domain = find_domain(dev);
5771 if (!domain)
5772 return -EINVAL;
5774 spin_lock_irqsave(&device_domain_lock, flags);
5775 spin_lock(&iommu->lock);
5777 ret = -EINVAL;
5778 info = dev->archdata.iommu;
5779 if (!info || !info->pasid_supported)
5780 goto out;
5782 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5783 if (WARN_ON(!context))
5784 goto out;
5786 ctx_lo = context[0].lo;
5788 if (!(ctx_lo & CONTEXT_PASIDE)) {
5789 ctx_lo |= CONTEXT_PASIDE;
5790 context[0].lo = ctx_lo;
5791 wmb();
5792 iommu->flush.flush_context(iommu,
5793 domain->iommu_did[iommu->seq_id],
5794 PCI_DEVID(info->bus, info->devfn),
5795 DMA_CCMD_MASK_NOBIT,
5796 DMA_CCMD_DEVICE_INVL);
5799 /* Enable PASID support in the device, if it wasn't already */
5800 if (!info->pasid_enabled)
5801 iommu_enable_dev_iotlb(info);
5803 ret = 0;
5805 out:
5806 spin_unlock(&iommu->lock);
5807 spin_unlock_irqrestore(&device_domain_lock, flags);
5809 return ret;
5812 static void intel_iommu_apply_resv_region(struct device *dev,
5813 struct iommu_domain *domain,
5814 struct iommu_resv_region *region)
5816 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5817 unsigned long start, end;
5819 start = IOVA_PFN(region->start);
5820 end = IOVA_PFN(region->start + region->length - 1);
5822 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5825 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5827 if (dev_is_pci(dev))
5828 return pci_device_group(dev);
5829 return generic_device_group(dev);
5832 #ifdef CONFIG_INTEL_IOMMU_SVM
5833 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5835 struct intel_iommu *iommu;
5836 u8 bus, devfn;
5838 if (iommu_dummy(dev)) {
5839 dev_warn(dev,
5840 "No IOMMU translation for device; cannot enable SVM\n");
5841 return NULL;
5844 iommu = device_to_iommu(dev, &bus, &devfn);
5845 if (!iommu) {
5846 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5847 return NULL;
5850 return iommu;
5852 #endif /* CONFIG_INTEL_IOMMU_SVM */
5854 static int intel_iommu_enable_auxd(struct device *dev)
5856 struct device_domain_info *info;
5857 struct intel_iommu *iommu;
5858 unsigned long flags;
5859 u8 bus, devfn;
5860 int ret;
5862 iommu = device_to_iommu(dev, &bus, &devfn);
5863 if (!iommu || dmar_disabled)
5864 return -EINVAL;
5866 if (!sm_supported(iommu) || !pasid_supported(iommu))
5867 return -EINVAL;
5869 ret = intel_iommu_enable_pasid(iommu, dev);
5870 if (ret)
5871 return -ENODEV;
5873 spin_lock_irqsave(&device_domain_lock, flags);
5874 info = dev->archdata.iommu;
5875 info->auxd_enabled = 1;
5876 spin_unlock_irqrestore(&device_domain_lock, flags);
5878 return 0;
5881 static int intel_iommu_disable_auxd(struct device *dev)
5883 struct device_domain_info *info;
5884 unsigned long flags;
5886 spin_lock_irqsave(&device_domain_lock, flags);
5887 info = dev->archdata.iommu;
5888 if (!WARN_ON(!info))
5889 info->auxd_enabled = 0;
5890 spin_unlock_irqrestore(&device_domain_lock, flags);
5892 return 0;
5896 * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
5897 * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
5898 * spec so that system software and tools can detect endpoint devices that
5899 * support Intel Scalable I/O Virtualization without a host driver dependency.
5901 * Returns the address of the matching extended capability structure within
5902 * the device's PCI configuration space or 0 if the device does not support
5903 * it.
5905 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5907 int pos;
5908 u16 vendor, id;
5910 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5911 while (pos) {
5912 pci_read_config_word(pdev, pos + 4, &vendor);
5913 pci_read_config_word(pdev, pos + 8, &id);
5914 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5915 return pos;
5917 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5920 return 0;
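/*
 * Editorial sketch (not part of the driver): the DVSEC layout that the
 * offsets in siov_find_pci_dvsec() above rely on. 0x23 is the Designated
 * Vendor-Specific Extended Capability ID; the 16-bit vendor ID lives at
 * offset 4 and the 16-bit DVSEC ID at offset 8 of the capability, so an
 * Intel vendor ID with DVSEC ID 5 marks a Scalable IOV capable endpoint.
 */
#if 0	/* illustration only */
#define EXAMPLE_DVSEC_CAP_ID		0x23	/* DVSEC extended capability */
#define EXAMPLE_DVSEC_VENDOR_OFF	4	/* u16 DVSEC vendor ID */
#define EXAMPLE_DVSEC_ID_OFF		8	/* u16 DVSEC ID */
#endif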
5923 static bool
5924 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5926 if (feat == IOMMU_DEV_FEAT_AUX) {
5927 int ret;
5929 if (!dev_is_pci(dev) || dmar_disabled ||
5930 !scalable_mode_support() || !iommu_pasid_support())
5931 return false;
5933 ret = pci_pasid_features(to_pci_dev(dev));
5934 if (ret < 0)
5935 return false;
5937 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5940 return false;
5943 static int
5944 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5946 if (feat == IOMMU_DEV_FEAT_AUX)
5947 return intel_iommu_enable_auxd(dev);
5949 return -ENODEV;
5952 static int
5953 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5955 if (feat == IOMMU_DEV_FEAT_AUX)
5956 return intel_iommu_disable_auxd(dev);
5958 return -ENODEV;
5961 static bool
5962 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5964 struct device_domain_info *info = dev->archdata.iommu;
5966 if (feat == IOMMU_DEV_FEAT_AUX)
5967 return scalable_mode_support() && info && info->auxd_enabled;
5969 return false;
5972 static int
5973 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5975 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5977 return dmar_domain->default_pasid > 0 ?
5978 dmar_domain->default_pasid : -EINVAL;
5981 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5982 struct device *dev)
5984 return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
5987 const struct iommu_ops intel_iommu_ops = {
5988 .capable = intel_iommu_capable,
5989 .domain_alloc = intel_iommu_domain_alloc,
5990 .domain_free = intel_iommu_domain_free,
5991 .attach_dev = intel_iommu_attach_device,
5992 .detach_dev = intel_iommu_detach_device,
5993 .aux_attach_dev = intel_iommu_aux_attach_device,
5994 .aux_detach_dev = intel_iommu_aux_detach_device,
5995 .aux_get_pasid = intel_iommu_aux_get_pasid,
5996 .map = intel_iommu_map,
5997 .unmap = intel_iommu_unmap,
5998 .iova_to_phys = intel_iommu_iova_to_phys,
5999 .add_device = intel_iommu_add_device,
6000 .remove_device = intel_iommu_remove_device,
6001 .get_resv_regions = intel_iommu_get_resv_regions,
6002 .put_resv_regions = intel_iommu_put_resv_regions,
6003 .apply_resv_region = intel_iommu_apply_resv_region,
6004 .device_group = intel_iommu_device_group,
6005 .dev_has_feat = intel_iommu_dev_has_feat,
6006 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
6007 .dev_enable_feat = intel_iommu_dev_enable_feat,
6008 .dev_disable_feat = intel_iommu_dev_disable_feat,
6009 .is_attach_deferred = intel_iommu_is_attach_deferred,
6010 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
6013 static void quirk_iommu_igfx(struct pci_dev *dev)
6015 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6016 dmar_map_gfx = 0;
6019 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6020 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6021 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6022 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6023 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6024 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6025 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6026 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6028 /* Broadwell igfx malfunctions with dmar */
6029 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6030 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6031 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6032 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6033 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6034 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6035 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6036 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6037 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6038 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6039 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6040 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6041 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6042 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6043 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6044 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6045 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6046 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6047 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6048 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6049 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6050 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6051 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6052 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6054 static void quirk_iommu_rwbf(struct pci_dev *dev)
6057 * Mobile 4 Series Chipset neglects to set RWBF capability,
6058 * but needs it. Same seems to hold for the desktop versions.
6060 pci_info(dev, "Forcing write-buffer flush capability\n");
6061 rwbf_quirk = 1;
6064 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6065 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6066 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6067 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6068 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6069 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6070 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6072 #define GGC 0x52
6073 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
6074 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
6075 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
6076 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
6077 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
6078 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
6079 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
6080 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
6082 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6084 unsigned short ggc;
6086 if (pci_read_config_word(dev, GGC, &ggc))
6087 return;
6089 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6090 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6091 dmar_map_gfx = 0;
6092 } else if (dmar_map_gfx) {
6093 /* we have to ensure the gfx device is idle before we flush */
6094 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6095 intel_iommu_strict = 1;
6098 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6099 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6100 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6101 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
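/*
 * Editorial sketch (not part of the driver): decoding of the GGC field that
 * quirk_calpella_no_shadow_gtt() above tests. The size field sits in bits
 * 11:8 of the register at config offset 0x52; encodings with the 0x8 bit set
 * (e.g. GGC_MEMORY_SIZE_2M_VT, 0x9 << 8) advertise a VT-enabled shadow GTT,
 * so graphics stays translated but batched IOTLB flushing is turned off,
 * while encodings without it (e.g. GGC_MEMORY_SIZE_2M, 0x3 << 8) cause the
 * graphics DMAR unit to be bypassed.
 */
#if 0	/* illustration only */
static bool example_ggc_has_shadow_gtt(unsigned short ggc)
{
	return (ggc & GGC_MEMORY_VT_ENABLED) != 0;
}
#endif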
6103 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6104 ISOCH DMAR unit for the Azalia sound device, but not give it any
6105 TLB entries, which causes it to deadlock. Check for that. We do
6106 this in a function called from init_dmars(), instead of in a PCI
6107 quirk, because we don't want to print the obnoxious "BIOS broken"
6108 message if VT-d is actually disabled.
6110 static void __init check_tylersburg_isoch(void)
6112 struct pci_dev *pdev;
6113 uint32_t vtisochctrl;
6115 /* If there's no Azalia in the system anyway, forget it. */
6116 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6117 if (!pdev)
6118 return;
6119 pci_dev_put(pdev);
6121 /* System Management Registers. Might be hidden, in which case
6122 we can't do the sanity check. But that's OK, because the
6123 known-broken BIOSes _don't_ actually hide it, so far. */
6124 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6125 if (!pdev)
6126 return;
6128 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6129 pci_dev_put(pdev);
6130 return;
6133 pci_dev_put(pdev);
6135 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6136 if (vtisochctrl & 1)
6137 return;
6139 /* Drop all bits other than the number of TLB entries */
6140 vtisochctrl &= 0x1c;
6142 /* If we have the recommended number of TLB entries (16), fine. */
6143 if (vtisochctrl == 0x10)
6144 return;
6146 /* Zero TLB entries? You get to ride the short bus to school. */
6147 if (!vtisochctrl) {
6148 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6149 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6150 dmi_get_system_info(DMI_BIOS_VENDOR),
6151 dmi_get_system_info(DMI_BIOS_VERSION),
6152 dmi_get_system_info(DMI_PRODUCT_VERSION));
6153 iommu_identity_mapping |= IDENTMAP_AZALIA;
6154 return;
6157 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6158 vtisochctrl);