drivers/iommu/intel-iommu.c (blob 0c8d81f56a306800ca0d065716e632f48ad3eec3)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright © 2006-2014 Intel Corporation.
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
53 #define ROOT_SIZE VTD_PAGE_SIZE
54 #define CONTEXT_SIZE VTD_PAGE_SIZE
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61 #define IOAPIC_RANGE_START (0xfee00000)
62 #define IOAPIC_RANGE_END (0xfeefffff)
63 #define IOVA_START_ADDR (0x1000)
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
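/*
 * Illustrative example, assuming 4KiB VT-d pages (VTD_PAGE_SHIFT == 12):
 * for gaw == 48, __DOMAIN_MAX_PFN(48) is (1ULL << 36) - 1 and
 * DOMAIN_MAX_ADDR(48) ends just below 256TiB. DOMAIN_MAX_PFN() yields
 * the same PFN on 64-bit builds, while on 32-bit builds it is clamped
 * to ULONG_MAX so that PFNs always fit in an unsigned long.
 */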
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN (1)
82 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
84 /* page table handling */
85 #define LEVEL_STRIDE (9)
86 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
89 * This bitmap is used to advertise the page sizes our hardware supports
90 * to the IOMMU core, which will then use this information to split
91 * physically contiguous memory regions it is mapping into page sizes
92 * that we support.
94 * Traditionally the IOMMU core just handed us the mappings directly,
95 * after making sure the size is an order of a 4KiB page and that the
96 * mapping has natural alignment.
98 * To retain this behavior, we currently advertise that we support
99 * all page sizes that are an order of 4KiB.
101 * If at some point we'd like to utilize the IOMMU core's new behavior,
102 * we could change this to advertise the real page sizes we support.
104 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
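/*
 * Illustrative example: in the bitmap handed to the IOMMU core, bit 12
 * stands for 4KiB, bit 21 for 2MiB and bit 30 for 1GiB. ~0xFFFUL sets
 * every bit from 12 upwards, i.e. it advertises all power-of-two sizes
 * that are multiples of 4KiB rather than only the superpage sizes the
 * hardware really implements.
 */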
106 static inline int agaw_to_level(int agaw)
108 return agaw + 2;
111 static inline int agaw_to_width(int agaw)
113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
116 static inline int width_to_agaw(int width)
118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
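/*
 * Worked example (illustrative): with LEVEL_STRIDE == 9, agaw values
 * 1, 2 and 3 correspond to 39-, 48- and 57-bit address widths and to
 * 3-, 4- and 5-level page tables, e.g. agaw_to_width(2) == 48,
 * width_to_agaw(48) == 2 and agaw_to_level(2) == 4.
 */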
121 static inline unsigned int level_to_offset_bits(int level)
123 return (level - 1) * LEVEL_STRIDE;
126 static inline int pfn_level_offset(unsigned long pfn, int level)
128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
131 static inline unsigned long level_mask(int level)
133 return -1UL << level_to_offset_bits(level);
136 static inline unsigned long level_size(int level)
138 return 1UL << level_to_offset_bits(level);
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
143 return (pfn + level_size(level) - 1) & level_mask(level);
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
148 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152 are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
164 return mm_to_dma_pfn(page_to_pfn(pg));
166 static inline unsigned long virt_to_dma_pfn(void *p)
168 return page_to_dma_pfn(virt_to_page(p));
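/*
 * Illustrative example: with 4KiB CPU pages, PAGE_SHIFT ==
 * VTD_PAGE_SHIFT == 12 and these conversions are no-ops. On a
 * (hypothetical here) 64KiB-page kernel, PAGE_SHIFT == 16 and one mm
 * PFN corresponds to 16 consecutive VT-d PFNs, which is why VT-d pages
 * must never be larger than mm pages.
 */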
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
178 * set to 1 to panic the kernel if VT-d can't be enabled successfully
179 * (used when the kernel is launched w/ TXT)
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189 * if marked present.
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
193 if (!(re->lo & 1))
194 return 0;
196 return re->lo & VTD_PAGE_MASK;
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201 * if marked present.
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
205 if (!(re->hi & 1))
206 return 0;
208 return re->hi & VTD_PAGE_MASK;
211 static inline void context_clear_pasid_enable(struct context_entry *context)
213 context->lo &= ~(1ULL << 11);
216 static inline bool context_pasid_enabled(struct context_entry *context)
218 return !!(context->lo & (1ULL << 11));
221 static inline void context_set_copied(struct context_entry *context)
223 context->hi |= (1ull << 3);
226 static inline bool context_copied(struct context_entry *context)
228 return !!(context->hi & (1ULL << 3));
231 static inline bool __context_present(struct context_entry *context)
233 return (context->lo & 1);
236 bool context_present(struct context_entry *context)
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
243 static inline void context_set_present(struct context_entry *context)
245 context->lo |= 1;
248 static inline void context_set_fault_enable(struct context_entry *context)
250 context->lo &= (((u64)-1) << 2) | 1;
253 static inline void context_set_translation_type(struct context_entry *context,
254 unsigned long value)
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
260 static inline void context_set_address_root(struct context_entry *context,
261 unsigned long value)
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
267 static inline void context_set_address_width(struct context_entry *context,
268 unsigned long value)
270 context->hi |= value & 7;
273 static inline void context_set_domain_id(struct context_entry *context,
274 unsigned long value)
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
279 static inline int context_domain_id(struct context_entry *c)
281 return((c->hi >> 8) & 0xffff);
284 static inline void context_clear_entry(struct context_entry *context)
286 context->lo = 0;
287 context->hi = 0;
291 * This domain is a static identity mapping domain.
292 * 1. This domain creates a static 1:1 mapping to all usable memory.
293 * 2. It maps to each iommu if successful.
294 * 3. Each iommu maps to this domain if successful.
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
299 /* si_domain contains multiple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY BIT(0)
303 * This is a DMA domain allocated through the iommu domain allocation
304 * interface. But one or more devices belonging to this domain have
305 * been chosen to use a private domain. We should avoid using the
306 * map/unmap/iova_to_phys APIs on it.
308 #define DOMAIN_FLAG_LOSE_CHILDREN BIT(1)
310 #define for_each_domain_iommu(idx, domain) \
311 for (idx = 0; idx < g_num_of_iommus; idx++) \
312 if (domain->iommu_refcnt[idx])
314 struct dmar_rmrr_unit {
315 struct list_head list; /* list of rmrr units */
316 struct acpi_dmar_header *hdr; /* ACPI header */
317 u64 base_address; /* reserved base address*/
318 u64 end_address; /* reserved end address */
319 struct dmar_dev_scope *devices; /* target devices */
320 int devices_cnt; /* target device count */
323 struct dmar_atsr_unit {
324 struct list_head list; /* list of ATSR units */
325 struct acpi_dmar_header *hdr; /* ACPI header */
326 struct dmar_dev_scope *devices; /* target devices */
327 int devices_cnt; /* target device count */
328 u8 include_all:1; /* include all ports */
331 static LIST_HEAD(dmar_atsr_units);
332 static LIST_HEAD(dmar_rmrr_units);
334 #define for_each_rmrr_units(rmrr) \
335 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
337 /* bitmap for indexing intel_iommus */
338 static int g_num_of_iommus;
340 static void domain_exit(struct dmar_domain *domain);
341 static void domain_remove_dev_info(struct dmar_domain *domain);
342 static void dmar_remove_one_dev_info(struct device *dev);
343 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
344 static void domain_context_clear(struct intel_iommu *iommu,
345 struct device *dev);
346 static int domain_detach_iommu(struct dmar_domain *domain,
347 struct intel_iommu *iommu);
348 static bool device_is_rmrr_locked(struct device *dev);
349 static int intel_iommu_attach_device(struct iommu_domain *domain,
350 struct device *dev);
351 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
352 dma_addr_t iova);
354 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
355 int dmar_disabled = 0;
356 #else
357 int dmar_disabled = 1;
358 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
360 int intel_iommu_sm;
361 int intel_iommu_enabled = 0;
362 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
364 static int dmar_map_gfx = 1;
365 static int dmar_forcedac;
366 static int intel_iommu_strict;
367 static int intel_iommu_superpage = 1;
368 static int iommu_identity_mapping;
369 static int intel_no_bounce;
371 #define IDENTMAP_ALL 1
372 #define IDENTMAP_GFX 2
373 #define IDENTMAP_AZALIA 4
375 int intel_iommu_gfx_mapped;
376 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
378 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
379 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
380 static DEFINE_SPINLOCK(device_domain_lock);
381 static LIST_HEAD(device_domain_list);
383 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
384 to_pci_dev(d)->untrusted)
387 * Iterate over elements in device_domain_list and call the specified
388 * callback @fn against each element.
390 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
391 void *data), void *data)
393 int ret = 0;
394 unsigned long flags;
395 struct device_domain_info *info;
397 spin_lock_irqsave(&device_domain_lock, flags);
398 list_for_each_entry(info, &device_domain_list, global) {
399 ret = fn(info, data);
400 if (ret) {
401 spin_unlock_irqrestore(&device_domain_lock, flags);
402 return ret;
405 spin_unlock_irqrestore(&device_domain_lock, flags);
407 return 0;
410 const struct iommu_ops intel_iommu_ops;
412 static bool translation_pre_enabled(struct intel_iommu *iommu)
414 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
417 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
419 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
422 static void init_translation_status(struct intel_iommu *iommu)
424 u32 gsts;
426 gsts = readl(iommu->reg + DMAR_GSTS_REG);
427 if (gsts & DMA_GSTS_TES)
428 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
431 /* Convert generic 'struct iommu_domain' to private 'struct dmar_domain' */
432 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
434 return container_of(dom, struct dmar_domain, domain);
437 static int __init intel_iommu_setup(char *str)
439 if (!str)
440 return -EINVAL;
441 while (*str) {
442 if (!strncmp(str, "on", 2)) {
443 dmar_disabled = 0;
444 pr_info("IOMMU enabled\n");
445 } else if (!strncmp(str, "off", 3)) {
446 dmar_disabled = 1;
447 no_platform_optin = 1;
448 pr_info("IOMMU disabled\n");
449 } else if (!strncmp(str, "igfx_off", 8)) {
450 dmar_map_gfx = 0;
451 pr_info("Disable GFX device mapping\n");
452 } else if (!strncmp(str, "forcedac", 8)) {
453 pr_info("Forcing DAC for PCI devices\n");
454 dmar_forcedac = 1;
455 } else if (!strncmp(str, "strict", 6)) {
456 pr_info("Disable batched IOTLB flush\n");
457 intel_iommu_strict = 1;
458 } else if (!strncmp(str, "sp_off", 6)) {
459 pr_info("Disable supported super page\n");
460 intel_iommu_superpage = 0;
461 } else if (!strncmp(str, "sm_on", 5)) {
462 pr_info("Intel-IOMMU: scalable mode supported\n");
463 intel_iommu_sm = 1;
464 } else if (!strncmp(str, "tboot_noforce", 13)) {
465 printk(KERN_INFO
466 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
467 intel_iommu_tboot_noforce = 1;
468 } else if (!strncmp(str, "nobounce", 8)) {
469 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
470 intel_no_bounce = 1;
473 str += strcspn(str, ",");
474 while (*str == ',')
475 str++;
477 return 0;
479 __setup("intel_iommu=", intel_iommu_setup);
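/*
 * Illustrative example of a kernel command line handled by the parser
 * above; options are comma separated and may be combined:
 *
 *	intel_iommu=on,sm_on,strict
 *
 * enables the IOMMU, enables scalable mode and disables batched IOTLB
 * flushing.
 */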
481 static struct kmem_cache *iommu_domain_cache;
482 static struct kmem_cache *iommu_devinfo_cache;
484 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
486 struct dmar_domain **domains;
487 int idx = did >> 8;
489 domains = iommu->domains[idx];
490 if (!domains)
491 return NULL;
493 return domains[did & 0xff];
496 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
497 struct dmar_domain *domain)
499 struct dmar_domain **domains;
500 int idx = did >> 8;
502 if (!iommu->domains[idx]) {
503 size_t size = 256 * sizeof(struct dmar_domain *);
504 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
507 domains = iommu->domains[idx];
508 if (WARN_ON(!domains))
509 return;
510 else
511 domains[did & 0xff] = domain;
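/*
 * Illustrative example: domain IDs index a two-level table of
 * 256-entry pages. For did == 0x1234, idx == 0x12 selects the page in
 * iommu->domains[] and did & 0xff == 0x34 selects the slot within it,
 * so pages are only allocated for ID ranges that are actually used.
 */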
514 void *alloc_pgtable_page(int node)
516 struct page *page;
517 void *vaddr = NULL;
519 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
520 if (page)
521 vaddr = page_address(page);
522 return vaddr;
525 void free_pgtable_page(void *vaddr)
527 free_page((unsigned long)vaddr);
530 static inline void *alloc_domain_mem(void)
532 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
535 static void free_domain_mem(void *vaddr)
537 kmem_cache_free(iommu_domain_cache, vaddr);
540 static inline void * alloc_devinfo_mem(void)
542 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
545 static inline void free_devinfo_mem(void *vaddr)
547 kmem_cache_free(iommu_devinfo_cache, vaddr);
550 static inline int domain_type_is_si(struct dmar_domain *domain)
552 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
555 static inline int domain_pfn_supported(struct dmar_domain *domain,
556 unsigned long pfn)
558 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
560 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
563 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
565 unsigned long sagaw;
566 int agaw = -1;
568 sagaw = cap_sagaw(iommu->cap);
569 for (agaw = width_to_agaw(max_gaw);
570 agaw >= 0; agaw--) {
571 if (test_bit(agaw, &sagaw))
572 break;
575 return agaw;
579 * Calculate max SAGAW for each iommu.
581 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
583 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
587 * Calculate agaw for each iommu.
588 * "SAGAW" may be different across iommus; use a default agaw, and
589 * fall back to a smaller supported agaw for iommus that don't support it.
591 int iommu_calculate_agaw(struct intel_iommu *iommu)
593 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
596 /* This function only returns a single iommu in a domain */
597 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
599 int iommu_id;
601 /* si_domain and vm domain should not get here. */
602 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
603 return NULL;
605 for_each_domain_iommu(iommu_id, domain)
606 break;
608 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
609 return NULL;
611 return g_iommus[iommu_id];
614 static void domain_update_iommu_coherency(struct dmar_domain *domain)
616 struct dmar_drhd_unit *drhd;
617 struct intel_iommu *iommu;
618 bool found = false;
619 int i;
621 domain->iommu_coherency = 1;
623 for_each_domain_iommu(i, domain) {
624 found = true;
625 if (!ecap_coherent(g_iommus[i]->ecap)) {
626 domain->iommu_coherency = 0;
627 break;
630 if (found)
631 return;
633 /* No hardware attached; use lowest common denominator */
634 rcu_read_lock();
635 for_each_active_iommu(iommu, drhd) {
636 if (!ecap_coherent(iommu->ecap)) {
637 domain->iommu_coherency = 0;
638 break;
641 rcu_read_unlock();
644 static int domain_update_iommu_snooping(struct intel_iommu *skip)
646 struct dmar_drhd_unit *drhd;
647 struct intel_iommu *iommu;
648 int ret = 1;
650 rcu_read_lock();
651 for_each_active_iommu(iommu, drhd) {
652 if (iommu != skip) {
653 if (!ecap_sc_support(iommu->ecap)) {
654 ret = 0;
655 break;
659 rcu_read_unlock();
661 return ret;
664 static int domain_update_iommu_superpage(struct intel_iommu *skip)
666 struct dmar_drhd_unit *drhd;
667 struct intel_iommu *iommu;
668 int mask = 0xf;
670 if (!intel_iommu_superpage) {
671 return 0;
674 /* set iommu_superpage to the smallest common denominator */
675 rcu_read_lock();
676 for_each_active_iommu(iommu, drhd) {
677 if (iommu != skip) {
678 mask &= cap_super_page_val(iommu->cap);
679 if (!mask)
680 break;
683 rcu_read_unlock();
685 return fls(mask);
688 /* Some capabilities may be different across iommus */
689 static void domain_update_iommu_cap(struct dmar_domain *domain)
691 domain_update_iommu_coherency(domain);
692 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
693 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
696 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
697 u8 devfn, int alloc)
699 struct root_entry *root = &iommu->root_entry[bus];
700 struct context_entry *context;
701 u64 *entry;
703 entry = &root->lo;
704 if (sm_supported(iommu)) {
705 if (devfn >= 0x80) {
706 devfn -= 0x80;
707 entry = &root->hi;
709 devfn *= 2;
711 if (*entry & 1)
712 context = phys_to_virt(*entry & VTD_PAGE_MASK);
713 else {
714 unsigned long phy_addr;
715 if (!alloc)
716 return NULL;
718 context = alloc_pgtable_page(iommu->node);
719 if (!context)
720 return NULL;
722 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
723 phy_addr = virt_to_phys((void *)context);
724 *entry = phy_addr | 1;
725 __iommu_flush_cache(iommu, entry, sizeof(*entry));
727 return &context[devfn];
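/*
 * Illustrative example for scalable mode: each root entry holds two
 * context-table pointers, root->lo for devfn 0x00-0x7f and root->hi
 * for devfn 0x80-0xff, and every context entry occupies two slots
 * (hence devfn *= 2). A lookup for devfn 0x85 therefore goes through
 * root->hi with a rebased devfn of 0x05 and a context index of 0x0a.
 */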
730 static int iommu_dummy(struct device *dev)
732 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
736 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
737 * sub-hierarchy of a candidate PCI-PCI bridge
738 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
739 * @bridge: the candidate PCI-PCI bridge
741 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
743 static bool
744 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
746 struct pci_dev *pdev, *pbridge;
748 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
749 return false;
751 pdev = to_pci_dev(dev);
752 pbridge = to_pci_dev(bridge);
754 if (pbridge->subordinate &&
755 pbridge->subordinate->number <= pdev->bus->number &&
756 pbridge->subordinate->busn_res.end >= pdev->bus->number)
757 return true;
759 return false;
762 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
764 struct dmar_drhd_unit *drhd = NULL;
765 struct intel_iommu *iommu;
766 struct device *tmp;
767 struct pci_dev *pdev = NULL;
768 u16 segment = 0;
769 int i;
771 if (iommu_dummy(dev))
772 return NULL;
774 if (dev_is_pci(dev)) {
775 struct pci_dev *pf_pdev;
777 pdev = to_pci_dev(dev);
779 #ifdef CONFIG_X86
780 /* VMD child devices currently cannot be handled individually */
781 if (is_vmd(pdev->bus))
782 return NULL;
783 #endif
785 /* VFs aren't listed in scope tables; we need to look up
786 * the PF instead to find the IOMMU. */
787 pf_pdev = pci_physfn(pdev);
788 dev = &pf_pdev->dev;
789 segment = pci_domain_nr(pdev->bus);
790 } else if (has_acpi_companion(dev))
791 dev = &ACPI_COMPANION(dev)->dev;
793 rcu_read_lock();
794 for_each_active_iommu(iommu, drhd) {
795 if (pdev && segment != drhd->segment)
796 continue;
798 for_each_active_dev_scope(drhd->devices,
799 drhd->devices_cnt, i, tmp) {
800 if (tmp == dev) {
801 /* For a VF use its original BDF# not that of the PF
802 * which we used for the IOMMU lookup. Strictly speaking
803 * we could do this for all PCI devices; we only need to
804 * get the BDF# from the scope table for ACPI matches. */
805 if (pdev && pdev->is_virtfn)
806 goto got_pdev;
808 *bus = drhd->devices[i].bus;
809 *devfn = drhd->devices[i].devfn;
810 goto out;
813 if (is_downstream_to_pci_bridge(dev, tmp))
814 goto got_pdev;
817 if (pdev && drhd->include_all) {
818 got_pdev:
819 *bus = pdev->bus->number;
820 *devfn = pdev->devfn;
821 goto out;
824 iommu = NULL;
825 out:
826 rcu_read_unlock();
828 return iommu;
831 static void domain_flush_cache(struct dmar_domain *domain,
832 void *addr, int size)
834 if (!domain->iommu_coherency)
835 clflush_cache_range(addr, size);
838 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
840 struct context_entry *context;
841 int ret = 0;
842 unsigned long flags;
844 spin_lock_irqsave(&iommu->lock, flags);
845 context = iommu_context_addr(iommu, bus, devfn, 0);
846 if (context)
847 ret = context_present(context);
848 spin_unlock_irqrestore(&iommu->lock, flags);
849 return ret;
852 static void free_context_table(struct intel_iommu *iommu)
854 int i;
855 unsigned long flags;
856 struct context_entry *context;
858 spin_lock_irqsave(&iommu->lock, flags);
859 if (!iommu->root_entry) {
860 goto out;
862 for (i = 0; i < ROOT_ENTRY_NR; i++) {
863 context = iommu_context_addr(iommu, i, 0, 0);
864 if (context)
865 free_pgtable_page(context);
867 if (!sm_supported(iommu))
868 continue;
870 context = iommu_context_addr(iommu, i, 0x80, 0);
871 if (context)
872 free_pgtable_page(context);
875 free_pgtable_page(iommu->root_entry);
876 iommu->root_entry = NULL;
877 out:
878 spin_unlock_irqrestore(&iommu->lock, flags);
881 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
882 unsigned long pfn, int *target_level)
884 struct dma_pte *parent, *pte;
885 int level = agaw_to_level(domain->agaw);
886 int offset;
888 BUG_ON(!domain->pgd);
890 if (!domain_pfn_supported(domain, pfn))
891 /* Address beyond IOMMU's addressing capabilities. */
892 return NULL;
894 parent = domain->pgd;
896 while (1) {
897 void *tmp_page;
899 offset = pfn_level_offset(pfn, level);
900 pte = &parent[offset];
901 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
902 break;
903 if (level == *target_level)
904 break;
906 if (!dma_pte_present(pte)) {
907 uint64_t pteval;
909 tmp_page = alloc_pgtable_page(domain->nid);
911 if (!tmp_page)
912 return NULL;
914 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
915 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
916 if (cmpxchg64(&pte->val, 0ULL, pteval))
917 /* Someone else set it while we were thinking; use theirs. */
918 free_pgtable_page(tmp_page);
919 else
920 domain_flush_cache(domain, pte, sizeof(*pte));
922 if (level == 1)
923 break;
925 parent = phys_to_virt(dma_pte_addr(pte));
926 level--;
929 if (!*target_level)
930 *target_level = level;
932 return pte;
935 /* return address's pte at specific level */
936 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
937 unsigned long pfn,
938 int level, int *large_page)
940 struct dma_pte *parent, *pte;
941 int total = agaw_to_level(domain->agaw);
942 int offset;
944 parent = domain->pgd;
945 while (level <= total) {
946 offset = pfn_level_offset(pfn, total);
947 pte = &parent[offset];
948 if (level == total)
949 return pte;
951 if (!dma_pte_present(pte)) {
952 *large_page = total;
953 break;
956 if (dma_pte_superpage(pte)) {
957 *large_page = total;
958 return pte;
961 parent = phys_to_virt(dma_pte_addr(pte));
962 total--;
964 return NULL;
967 /* clear last level pte; a tlb flush should follow */
968 static void dma_pte_clear_range(struct dmar_domain *domain,
969 unsigned long start_pfn,
970 unsigned long last_pfn)
972 unsigned int large_page;
973 struct dma_pte *first_pte, *pte;
975 BUG_ON(!domain_pfn_supported(domain, start_pfn));
976 BUG_ON(!domain_pfn_supported(domain, last_pfn));
977 BUG_ON(start_pfn > last_pfn);
979 /* we don't need lock here; nobody else touches the iova range */
980 do {
981 large_page = 1;
982 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
983 if (!pte) {
984 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
985 continue;
987 do {
988 dma_clear_pte(pte);
989 start_pfn += lvl_to_nr_pages(large_page);
990 pte++;
991 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
993 domain_flush_cache(domain, first_pte,
994 (void *)pte - (void *)first_pte);
996 } while (start_pfn && start_pfn <= last_pfn);
999 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1000 int retain_level, struct dma_pte *pte,
1001 unsigned long pfn, unsigned long start_pfn,
1002 unsigned long last_pfn)
1004 pfn = max(start_pfn, pfn);
1005 pte = &pte[pfn_level_offset(pfn, level)];
1007 do {
1008 unsigned long level_pfn;
1009 struct dma_pte *level_pte;
1011 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1012 goto next;
1014 level_pfn = pfn & level_mask(level);
1015 level_pte = phys_to_virt(dma_pte_addr(pte));
1017 if (level > 2) {
1018 dma_pte_free_level(domain, level - 1, retain_level,
1019 level_pte, level_pfn, start_pfn,
1020 last_pfn);
1024 * Free the page table if we're below the level we want to
1025 * retain and the range covers the entire table.
1027 if (level < retain_level && !(start_pfn > level_pfn ||
1028 last_pfn < level_pfn + level_size(level) - 1)) {
1029 dma_clear_pte(pte);
1030 domain_flush_cache(domain, pte, sizeof(*pte));
1031 free_pgtable_page(level_pte);
1033 next:
1034 pfn += level_size(level);
1035 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1039 * clear last level (leaf) ptes and free page table pages below the
1040 * level we wish to keep intact.
1042 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1043 unsigned long start_pfn,
1044 unsigned long last_pfn,
1045 int retain_level)
1047 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1048 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1049 BUG_ON(start_pfn > last_pfn);
1051 dma_pte_clear_range(domain, start_pfn, last_pfn);
1053 /* We don't need lock here; nobody else touches the iova range */
1054 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1055 domain->pgd, 0, start_pfn, last_pfn);
1057 /* free pgd */
1058 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1059 free_pgtable_page(domain->pgd);
1060 domain->pgd = NULL;
1064 /* When a page at a given level is being unlinked from its parent, we don't
1065 need to *modify* it at all. All we need to do is make a list of all the
1066 pages which can be freed just as soon as we've flushed the IOTLB and we
1067 know the hardware page-walk will no longer touch them.
1068 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1069 be freed. */
1070 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1071 int level, struct dma_pte *pte,
1072 struct page *freelist)
1074 struct page *pg;
1076 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1077 pg->freelist = freelist;
1078 freelist = pg;
1080 if (level == 1)
1081 return freelist;
1083 pte = page_address(pg);
1084 do {
1085 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1086 freelist = dma_pte_list_pagetables(domain, level - 1,
1087 pte, freelist);
1088 pte++;
1089 } while (!first_pte_in_page(pte));
1091 return freelist;
1094 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1095 struct dma_pte *pte, unsigned long pfn,
1096 unsigned long start_pfn,
1097 unsigned long last_pfn,
1098 struct page *freelist)
1100 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1102 pfn = max(start_pfn, pfn);
1103 pte = &pte[pfn_level_offset(pfn, level)];
1105 do {
1106 unsigned long level_pfn;
1108 if (!dma_pte_present(pte))
1109 goto next;
1111 level_pfn = pfn & level_mask(level);
1113 /* If range covers entire pagetable, free it */
1114 if (start_pfn <= level_pfn &&
1115 last_pfn >= level_pfn + level_size(level) - 1) {
1116 /* These subordinate page tables are going away entirely. Don't
1117 bother to clear them; we're just going to *free* them. */
1118 if (level > 1 && !dma_pte_superpage(pte))
1119 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1121 dma_clear_pte(pte);
1122 if (!first_pte)
1123 first_pte = pte;
1124 last_pte = pte;
1125 } else if (level > 1) {
1126 /* Recurse down into a level that isn't *entirely* obsolete */
1127 freelist = dma_pte_clear_level(domain, level - 1,
1128 phys_to_virt(dma_pte_addr(pte)),
1129 level_pfn, start_pfn, last_pfn,
1130 freelist);
1132 next:
1133 pfn += level_size(level);
1134 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1136 if (first_pte)
1137 domain_flush_cache(domain, first_pte,
1138 (void *)++last_pte - (void *)first_pte);
1140 return freelist;
1143 /* We can't just free the pages because the IOMMU may still be walking
1144 the page tables, and may have cached the intermediate levels. The
1145 pages can only be freed after the IOTLB flush has been done. */
1146 static struct page *domain_unmap(struct dmar_domain *domain,
1147 unsigned long start_pfn,
1148 unsigned long last_pfn)
1150 struct page *freelist;
1152 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1153 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1154 BUG_ON(start_pfn > last_pfn);
1156 /* we don't need lock here; nobody else touches the iova range */
1157 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1158 domain->pgd, 0, start_pfn, last_pfn, NULL);
1160 /* free pgd */
1161 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1162 struct page *pgd_page = virt_to_page(domain->pgd);
1163 pgd_page->freelist = freelist;
1164 freelist = pgd_page;
1166 domain->pgd = NULL;
1169 return freelist;
1172 static void dma_free_pagelist(struct page *freelist)
1174 struct page *pg;
1176 while ((pg = freelist)) {
1177 freelist = pg->freelist;
1178 free_pgtable_page(page_address(pg));
1182 static void iova_entry_free(unsigned long data)
1184 struct page *freelist = (struct page *)data;
1186 dma_free_pagelist(freelist);
1189 /* iommu handling */
1190 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1192 struct root_entry *root;
1193 unsigned long flags;
1195 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1196 if (!root) {
1197 pr_err("Allocating root entry for %s failed\n",
1198 iommu->name);
1199 return -ENOMEM;
1202 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1204 spin_lock_irqsave(&iommu->lock, flags);
1205 iommu->root_entry = root;
1206 spin_unlock_irqrestore(&iommu->lock, flags);
1208 return 0;
1211 static void iommu_set_root_entry(struct intel_iommu *iommu)
1213 u64 addr;
1214 u32 sts;
1215 unsigned long flag;
1217 addr = virt_to_phys(iommu->root_entry);
1218 if (sm_supported(iommu))
1219 addr |= DMA_RTADDR_SMT;
1221 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1222 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1224 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1226 /* Make sure hardware completes it */
1227 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1228 readl, (sts & DMA_GSTS_RTPS), sts);
1230 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1233 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1235 u32 val;
1236 unsigned long flag;
1238 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1239 return;
1241 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1242 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1244 /* Make sure hardware completes it */
1245 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1246 readl, (!(val & DMA_GSTS_WBFS)), val);
1248 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1251 /* return value determines whether we need a write buffer flush */
1252 static void __iommu_flush_context(struct intel_iommu *iommu,
1253 u16 did, u16 source_id, u8 function_mask,
1254 u64 type)
1256 u64 val = 0;
1257 unsigned long flag;
1259 switch (type) {
1260 case DMA_CCMD_GLOBAL_INVL:
1261 val = DMA_CCMD_GLOBAL_INVL;
1262 break;
1263 case DMA_CCMD_DOMAIN_INVL:
1264 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1265 break;
1266 case DMA_CCMD_DEVICE_INVL:
1267 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1268 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1269 break;
1270 default:
1271 BUG();
1273 val |= DMA_CCMD_ICC;
1275 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1276 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1278 /* Make sure hardware completes it */
1279 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1280 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1282 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1285 /* return value determines whether we need a write buffer flush */
1286 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1287 u64 addr, unsigned int size_order, u64 type)
1289 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1290 u64 val = 0, val_iva = 0;
1291 unsigned long flag;
1293 switch (type) {
1294 case DMA_TLB_GLOBAL_FLUSH:
1295 /* global flush doesn't need to set IVA_REG */
1296 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1297 break;
1298 case DMA_TLB_DSI_FLUSH:
1299 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1300 break;
1301 case DMA_TLB_PSI_FLUSH:
1302 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1303 /* IH bit is passed in as part of address */
1304 val_iva = size_order | addr;
1305 break;
1306 default:
1307 BUG();
1309 /* Note: set drain read/write */
1310 #if 0
1312 * This is probably meant to be extra safe. It looks like we can
1313 * ignore it without any impact.
1315 if (cap_read_drain(iommu->cap))
1316 val |= DMA_TLB_READ_DRAIN;
1317 #endif
1318 if (cap_write_drain(iommu->cap))
1319 val |= DMA_TLB_WRITE_DRAIN;
1321 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1322 /* Note: Only uses first TLB reg currently */
1323 if (val_iva)
1324 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1325 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1327 /* Make sure hardware completes it */
1328 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1329 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1331 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1333 /* check IOTLB invalidation granularity */
1334 if (DMA_TLB_IAIG(val) == 0)
1335 pr_err("Flush IOTLB failed\n");
1336 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1337 pr_debug("TLB flush request %Lx, actual %Lx\n",
1338 (unsigned long long)DMA_TLB_IIRG(type),
1339 (unsigned long long)DMA_TLB_IAIG(val));
1342 static struct device_domain_info *
1343 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1344 u8 bus, u8 devfn)
1346 struct device_domain_info *info;
1348 assert_spin_locked(&device_domain_lock);
1350 if (!iommu->qi)
1351 return NULL;
1353 list_for_each_entry(info, &domain->devices, link)
1354 if (info->iommu == iommu && info->bus == bus &&
1355 info->devfn == devfn) {
1356 if (info->ats_supported && info->dev)
1357 return info;
1358 break;
1361 return NULL;
1364 static void domain_update_iotlb(struct dmar_domain *domain)
1366 struct device_domain_info *info;
1367 bool has_iotlb_device = false;
1369 assert_spin_locked(&device_domain_lock);
1371 list_for_each_entry(info, &domain->devices, link) {
1372 struct pci_dev *pdev;
1374 if (!info->dev || !dev_is_pci(info->dev))
1375 continue;
1377 pdev = to_pci_dev(info->dev);
1378 if (pdev->ats_enabled) {
1379 has_iotlb_device = true;
1380 break;
1384 domain->has_iotlb_device = has_iotlb_device;
1387 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1389 struct pci_dev *pdev;
1391 assert_spin_locked(&device_domain_lock);
1393 if (!info || !dev_is_pci(info->dev))
1394 return;
1396 pdev = to_pci_dev(info->dev);
1397 /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1398 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1399 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1400 * reserved, which should be set to 0.
1402 if (!ecap_dit(info->iommu->ecap))
1403 info->pfsid = 0;
1404 else {
1405 struct pci_dev *pf_pdev;
1407 /* pdev will be returned if device is not a vf */
1408 pf_pdev = pci_physfn(pdev);
1409 info->pfsid = pci_dev_id(pf_pdev);
1412 #ifdef CONFIG_INTEL_IOMMU_SVM
1413 /* The PCIe spec, in its wisdom, declares that the behaviour of
1414 the device if you enable PASID support after ATS support is
1415 undefined. So always enable PASID support on devices which
1416 have it, even if we can't yet know if we're ever going to
1417 use it. */
1418 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1419 info->pasid_enabled = 1;
1421 if (info->pri_supported &&
1422 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1423 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1424 info->pri_enabled = 1;
1425 #endif
1426 if (!pdev->untrusted && info->ats_supported &&
1427 pci_ats_page_aligned(pdev) &&
1428 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1429 info->ats_enabled = 1;
1430 domain_update_iotlb(info->domain);
1431 info->ats_qdep = pci_ats_queue_depth(pdev);
1435 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1437 struct pci_dev *pdev;
1439 assert_spin_locked(&device_domain_lock);
1441 if (!dev_is_pci(info->dev))
1442 return;
1444 pdev = to_pci_dev(info->dev);
1446 if (info->ats_enabled) {
1447 pci_disable_ats(pdev);
1448 info->ats_enabled = 0;
1449 domain_update_iotlb(info->domain);
1451 #ifdef CONFIG_INTEL_IOMMU_SVM
1452 if (info->pri_enabled) {
1453 pci_disable_pri(pdev);
1454 info->pri_enabled = 0;
1456 if (info->pasid_enabled) {
1457 pci_disable_pasid(pdev);
1458 info->pasid_enabled = 0;
1460 #endif
1463 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1464 u64 addr, unsigned mask)
1466 u16 sid, qdep;
1467 unsigned long flags;
1468 struct device_domain_info *info;
1470 if (!domain->has_iotlb_device)
1471 return;
1473 spin_lock_irqsave(&device_domain_lock, flags);
1474 list_for_each_entry(info, &domain->devices, link) {
1475 if (!info->ats_enabled)
1476 continue;
1478 sid = info->bus << 8 | info->devfn;
1479 qdep = info->ats_qdep;
1480 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1481 qdep, addr, mask);
1483 spin_unlock_irqrestore(&device_domain_lock, flags);
1486 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1487 struct dmar_domain *domain,
1488 unsigned long pfn, unsigned int pages,
1489 int ih, int map)
1491 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1492 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1493 u16 did = domain->iommu_did[iommu->seq_id];
1495 BUG_ON(pages == 0);
1497 if (ih)
1498 ih = 1 << 6;
1500 * Fall back to domain-selective flush if there is no PSI support or the
1501 * size is too big.
1502 * PSI requires the page size to be 2^x, and the base address to be
1503 * naturally aligned to that size.
1505 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1506 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1507 DMA_TLB_DSI_FLUSH);
1508 else
1509 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1510 DMA_TLB_PSI_FLUSH);
1513 * In caching mode, changes of pages from non-present to present require
1514 * flush. However, device IOTLB doesn't need to be flushed in this case.
1516 if (!cap_caching_mode(iommu->cap) || !map)
1517 iommu_flush_dev_iotlb(domain, addr, mask);
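/*
 * Illustrative example: flushing 3 pages rounds up to mask ==
 * ilog2(4) == 2, i.e. a page-selective invalidation of the naturally
 * aligned 4-page region containing them, provided PSI is supported
 * and cap_max_amask_val() >= 2; otherwise the code above falls back
 * to a domain-selective flush.
 */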
1520 /* Notification for newly created mappings */
1521 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1522 struct dmar_domain *domain,
1523 unsigned long pfn, unsigned int pages)
1525 /* It's a non-present to present mapping. Only flush if caching mode */
1526 if (cap_caching_mode(iommu->cap))
1527 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1528 else
1529 iommu_flush_write_buffer(iommu);
1532 static void iommu_flush_iova(struct iova_domain *iovad)
1534 struct dmar_domain *domain;
1535 int idx;
1537 domain = container_of(iovad, struct dmar_domain, iovad);
1539 for_each_domain_iommu(idx, domain) {
1540 struct intel_iommu *iommu = g_iommus[idx];
1541 u16 did = domain->iommu_did[iommu->seq_id];
1543 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1545 if (!cap_caching_mode(iommu->cap))
1546 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1547 0, MAX_AGAW_PFN_WIDTH);
1551 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1553 u32 pmen;
1554 unsigned long flags;
1556 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1557 return;
1559 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1560 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1561 pmen &= ~DMA_PMEN_EPM;
1562 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1564 /* wait for the protected region status bit to clear */
1565 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1566 readl, !(pmen & DMA_PMEN_PRS), pmen);
1568 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1571 static void iommu_enable_translation(struct intel_iommu *iommu)
1573 u32 sts;
1574 unsigned long flags;
1576 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1577 iommu->gcmd |= DMA_GCMD_TE;
1578 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1580 /* Make sure hardware completes it */
1581 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1582 readl, (sts & DMA_GSTS_TES), sts);
1584 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1587 static void iommu_disable_translation(struct intel_iommu *iommu)
1589 u32 sts;
1590 unsigned long flag;
1592 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1593 iommu->gcmd &= ~DMA_GCMD_TE;
1594 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1596 /* Make sure hardware completes it */
1597 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1598 readl, (!(sts & DMA_GSTS_TES)), sts);
1600 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1603 static int iommu_init_domains(struct intel_iommu *iommu)
1605 u32 ndomains, nlongs;
1606 size_t size;
1608 ndomains = cap_ndoms(iommu->cap);
1609 pr_debug("%s: Number of Domains supported <%d>\n",
1610 iommu->name, ndomains);
1611 nlongs = BITS_TO_LONGS(ndomains);
1613 spin_lock_init(&iommu->lock);
1615 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1616 if (!iommu->domain_ids) {
1617 pr_err("%s: Allocating domain id array failed\n",
1618 iommu->name);
1619 return -ENOMEM;
1622 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1623 iommu->domains = kzalloc(size, GFP_KERNEL);
1625 if (iommu->domains) {
1626 size = 256 * sizeof(struct dmar_domain *);
1627 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1630 if (!iommu->domains || !iommu->domains[0]) {
1631 pr_err("%s: Allocating domain array failed\n",
1632 iommu->name);
1633 kfree(iommu->domain_ids);
1634 kfree(iommu->domains);
1635 iommu->domain_ids = NULL;
1636 iommu->domains = NULL;
1637 return -ENOMEM;
1641 * If Caching mode is set, then invalid translations are tagged
1642 * with domain-id 0, hence we need to pre-allocate it. We also
1643 * use domain-id 0 as a marker for non-allocated domain-id, so
1644 * make sure it is not used for a real domain.
1646 set_bit(0, iommu->domain_ids);
1649 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1650 * entry for first-level or pass-through translation modes should
1651 * be programmed with a domain id different from those used for
1652 * second-level or nested translation. We reserve a domain id for
1653 * this purpose.
1655 if (sm_supported(iommu))
1656 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1658 return 0;
1661 static void disable_dmar_iommu(struct intel_iommu *iommu)
1663 struct device_domain_info *info, *tmp;
1664 unsigned long flags;
1666 if (!iommu->domains || !iommu->domain_ids)
1667 return;
1669 spin_lock_irqsave(&device_domain_lock, flags);
1670 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1671 if (info->iommu != iommu)
1672 continue;
1674 if (!info->dev || !info->domain)
1675 continue;
1677 __dmar_remove_one_dev_info(info);
1679 spin_unlock_irqrestore(&device_domain_lock, flags);
1681 if (iommu->gcmd & DMA_GCMD_TE)
1682 iommu_disable_translation(iommu);
1685 static void free_dmar_iommu(struct intel_iommu *iommu)
1687 if ((iommu->domains) && (iommu->domain_ids)) {
1688 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1689 int i;
1691 for (i = 0; i < elems; i++)
1692 kfree(iommu->domains[i]);
1693 kfree(iommu->domains);
1694 kfree(iommu->domain_ids);
1695 iommu->domains = NULL;
1696 iommu->domain_ids = NULL;
1699 g_iommus[iommu->seq_id] = NULL;
1701 /* free context mapping */
1702 free_context_table(iommu);
1704 #ifdef CONFIG_INTEL_IOMMU_SVM
1705 if (pasid_supported(iommu)) {
1706 if (ecap_prs(iommu->ecap))
1707 intel_svm_finish_prq(iommu);
1709 #endif
1712 static struct dmar_domain *alloc_domain(int flags)
1714 struct dmar_domain *domain;
1716 domain = alloc_domain_mem();
1717 if (!domain)
1718 return NULL;
1720 memset(domain, 0, sizeof(*domain));
1721 domain->nid = NUMA_NO_NODE;
1722 domain->flags = flags;
1723 domain->has_iotlb_device = false;
1724 INIT_LIST_HEAD(&domain->devices);
1726 return domain;
1729 /* Must be called with iommu->lock */
1730 static int domain_attach_iommu(struct dmar_domain *domain,
1731 struct intel_iommu *iommu)
1733 unsigned long ndomains;
1734 int num;
1736 assert_spin_locked(&device_domain_lock);
1737 assert_spin_locked(&iommu->lock);
1739 domain->iommu_refcnt[iommu->seq_id] += 1;
1740 domain->iommu_count += 1;
1741 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1742 ndomains = cap_ndoms(iommu->cap);
1743 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1745 if (num >= ndomains) {
1746 pr_err("%s: No free domain ids\n", iommu->name);
1747 domain->iommu_refcnt[iommu->seq_id] -= 1;
1748 domain->iommu_count -= 1;
1749 return -ENOSPC;
1752 set_bit(num, iommu->domain_ids);
1753 set_iommu_domain(iommu, num, domain);
1755 domain->iommu_did[iommu->seq_id] = num;
1756 domain->nid = iommu->node;
1758 domain_update_iommu_cap(domain);
1761 return 0;
1764 static int domain_detach_iommu(struct dmar_domain *domain,
1765 struct intel_iommu *iommu)
1767 int num, count;
1769 assert_spin_locked(&device_domain_lock);
1770 assert_spin_locked(&iommu->lock);
1772 domain->iommu_refcnt[iommu->seq_id] -= 1;
1773 count = --domain->iommu_count;
1774 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1775 num = domain->iommu_did[iommu->seq_id];
1776 clear_bit(num, iommu->domain_ids);
1777 set_iommu_domain(iommu, num, NULL);
1779 domain_update_iommu_cap(domain);
1780 domain->iommu_did[iommu->seq_id] = 0;
1783 return count;
1786 static struct iova_domain reserved_iova_list;
1787 static struct lock_class_key reserved_rbtree_key;
1789 static int dmar_init_reserved_ranges(void)
1791 struct pci_dev *pdev = NULL;
1792 struct iova *iova;
1793 int i;
1795 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1797 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1798 &reserved_rbtree_key);
1800 /* IOAPIC ranges shouldn't be accessed by DMA */
1801 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1802 IOVA_PFN(IOAPIC_RANGE_END));
1803 if (!iova) {
1804 pr_err("Reserve IOAPIC range failed\n");
1805 return -ENODEV;
1808 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1809 for_each_pci_dev(pdev) {
1810 struct resource *r;
1812 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1813 r = &pdev->resource[i];
1814 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1815 continue;
1816 iova = reserve_iova(&reserved_iova_list,
1817 IOVA_PFN(r->start),
1818 IOVA_PFN(r->end));
1819 if (!iova) {
1820 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1821 return -ENODEV;
1825 return 0;
1828 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1830 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1833 static inline int guestwidth_to_adjustwidth(int gaw)
1835 int agaw;
1836 int r = (gaw - 12) % 9;
1838 if (r == 0)
1839 agaw = gaw;
1840 else
1841 agaw = gaw + 9 - r;
1842 if (agaw > 64)
1843 agaw = 64;
1844 return agaw;
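/*
 * Worked example (illustrative): the adjusted width is rounded up to
 * the next value of the form 12 + 9 * n and capped at 64, so
 * guestwidth_to_adjustwidth(48) == 48, guestwidth_to_adjustwidth(40)
 * == 48 and guestwidth_to_adjustwidth(36) == 39.
 */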
1847 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1848 int guest_width)
1850 int adjust_width, agaw;
1851 unsigned long sagaw;
1852 int err;
1854 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1856 err = init_iova_flush_queue(&domain->iovad,
1857 iommu_flush_iova, iova_entry_free);
1858 if (err)
1859 return err;
1861 domain_reserve_special_ranges(domain);
1863 /* calculate AGAW */
1864 if (guest_width > cap_mgaw(iommu->cap))
1865 guest_width = cap_mgaw(iommu->cap);
1866 domain->gaw = guest_width;
1867 adjust_width = guestwidth_to_adjustwidth(guest_width);
1868 agaw = width_to_agaw(adjust_width);
1869 sagaw = cap_sagaw(iommu->cap);
1870 if (!test_bit(agaw, &sagaw)) {
1871 /* hardware doesn't support it, choose a bigger one */
1872 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1873 agaw = find_next_bit(&sagaw, 5, agaw);
1874 if (agaw >= 5)
1875 return -ENODEV;
1877 domain->agaw = agaw;
1879 if (ecap_coherent(iommu->ecap))
1880 domain->iommu_coherency = 1;
1881 else
1882 domain->iommu_coherency = 0;
1884 if (ecap_sc_support(iommu->ecap))
1885 domain->iommu_snooping = 1;
1886 else
1887 domain->iommu_snooping = 0;
1889 if (intel_iommu_superpage)
1890 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1891 else
1892 domain->iommu_superpage = 0;
1894 domain->nid = iommu->node;
1896 /* always allocate the top pgd */
1897 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1898 if (!domain->pgd)
1899 return -ENOMEM;
1900 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1901 return 0;
1904 static void domain_exit(struct dmar_domain *domain)
1907 /* Remove associated devices and clear attached or cached domains */
1908 domain_remove_dev_info(domain);
1910 /* destroy iovas */
1911 put_iova_domain(&domain->iovad);
1913 if (domain->pgd) {
1914 struct page *freelist;
1916 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1917 dma_free_pagelist(freelist);
1920 free_domain_mem(domain);
1924 * Get the PASID directory size for scalable mode context entry.
1925 * Value of X in the PDTS field of a scalable mode context entry
1926 * indicates PASID directory with 2^(X + 7) entries.
1928 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1930 int pds, max_pde;
1932 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1933 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1934 if (pds < 7)
1935 return 0;
1937 return pds - 7;
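/*
 * Illustrative example, assuming PASID_PDE_SHIFT == 6 (one directory
 * entry covers 64 PASIDs): a table supporting 2^20 PASIDs needs a
 * 2^14-entry PASID directory, so max_pde has only bit 14 set, pds ==
 * 14 and the function returns 7, i.e. PDTS == 7 encodes 2^(7 + 7)
 * directory entries.
 */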
1941 * Set the RID_PASID field of a scalable mode context entry. The
1942 * IOMMU hardware will use the PASID value set in this field for
1943 * DMA translations of DMA requests without PASID.
1945 static inline void
1946 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1948 context->hi |= pasid & ((1 << 20) - 1);
1949 context->hi |= (1 << 20);
1953 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1954 * entry.
1956 static inline void context_set_sm_dte(struct context_entry *context)
1958 context->lo |= (1 << 2);
1962 * Set the PRE(Page Request Enable) field of a scalable mode context
1963 * entry.
1965 static inline void context_set_sm_pre(struct context_entry *context)
1967 context->lo |= (1 << 4);
1970 /* Convert value to context PASID directory size field coding. */
1971 #define context_pdts(pds) (((pds) & 0x7) << 9)
1973 static int domain_context_mapping_one(struct dmar_domain *domain,
1974 struct intel_iommu *iommu,
1975 struct pasid_table *table,
1976 u8 bus, u8 devfn)
1978 u16 did = domain->iommu_did[iommu->seq_id];
1979 int translation = CONTEXT_TT_MULTI_LEVEL;
1980 struct device_domain_info *info = NULL;
1981 struct context_entry *context;
1982 unsigned long flags;
1983 int ret;
1985 WARN_ON(did == 0);
1987 if (hw_pass_through && domain_type_is_si(domain))
1988 translation = CONTEXT_TT_PASS_THROUGH;
1990 pr_debug("Set context mapping for %02x:%02x.%d\n",
1991 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1993 BUG_ON(!domain->pgd);
1995 spin_lock_irqsave(&device_domain_lock, flags);
1996 spin_lock(&iommu->lock);
1998 ret = -ENOMEM;
1999 context = iommu_context_addr(iommu, bus, devfn, 1);
2000 if (!context)
2001 goto out_unlock;
2003 ret = 0;
2004 if (context_present(context))
2005 goto out_unlock;
2008 * For kdump cases, old valid entries may be cached due to the
2009 * in-flight DMA and copied pgtable, but there is no unmapping
2010 * behaviour for them, thus we need an explicit cache flush for
2011 * the newly-mapped device. For kdump, at this point, the device
2012 * is supposed to finish reset at its driver probe stage, so no
2013 * in-flight DMA will exist, and we don't need to worry anymore
2014 * hereafter.
2016 if (context_copied(context)) {
2017 u16 did_old = context_domain_id(context);
2019 if (did_old < cap_ndoms(iommu->cap)) {
2020 iommu->flush.flush_context(iommu, did_old,
2021 (((u16)bus) << 8) | devfn,
2022 DMA_CCMD_MASK_NOBIT,
2023 DMA_CCMD_DEVICE_INVL);
2024 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2025 DMA_TLB_DSI_FLUSH);
2029 context_clear_entry(context);
2031 if (sm_supported(iommu)) {
2032 unsigned long pds;
2034 WARN_ON(!table);
2036 /* Setup the PASID DIR pointer: */
2037 pds = context_get_sm_pds(table);
2038 context->lo = (u64)virt_to_phys(table->table) |
2039 context_pdts(pds);
2041 /* Setup the RID_PASID field: */
2042 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2045 * Setup the Device-TLB enable bit and Page request
2046 * Enable bit:
2048 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2049 if (info && info->ats_supported)
2050 context_set_sm_dte(context);
2051 if (info && info->pri_supported)
2052 context_set_sm_pre(context);
2053 } else {
2054 struct dma_pte *pgd = domain->pgd;
2055 int agaw;
2057 context_set_domain_id(context, did);
2059 if (translation != CONTEXT_TT_PASS_THROUGH) {
2061 * Skip top levels of page tables for iommu which has
2062 * less agaw than default. Unnecessary for PT mode.
2064 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2065 ret = -ENOMEM;
2066 pgd = phys_to_virt(dma_pte_addr(pgd));
2067 if (!dma_pte_present(pgd))
2068 goto out_unlock;
2071 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2072 if (info && info->ats_supported)
2073 translation = CONTEXT_TT_DEV_IOTLB;
2074 else
2075 translation = CONTEXT_TT_MULTI_LEVEL;
2077 context_set_address_root(context, virt_to_phys(pgd));
2078 context_set_address_width(context, agaw);
2079 } else {
2081 * In pass through mode, AW must be programmed to
2082 * indicate the largest AGAW value supported by
2083 * hardware. And ASR is ignored by hardware.
2085 context_set_address_width(context, iommu->msagaw);
2088 context_set_translation_type(context, translation);
2091 context_set_fault_enable(context);
2092 context_set_present(context);
2093 domain_flush_cache(domain, context, sizeof(*context));
2096 * It's a non-present to present mapping. If hardware doesn't cache
2097 * non-present entries we only need to flush the write-buffer. If it
2098 * _does_ cache non-present entries, then it does so in the special
2099 * domain #0, which we have to flush:
2101 if (cap_caching_mode(iommu->cap)) {
2102 iommu->flush.flush_context(iommu, 0,
2103 (((u16)bus) << 8) | devfn,
2104 DMA_CCMD_MASK_NOBIT,
2105 DMA_CCMD_DEVICE_INVL);
2106 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2107 } else {
2108 iommu_flush_write_buffer(iommu);
2110 iommu_enable_dev_iotlb(info);
2112 ret = 0;
2114 out_unlock:
2115 spin_unlock(&iommu->lock);
2116 spin_unlock_irqrestore(&device_domain_lock, flags);
2118 return ret;
2121 struct domain_context_mapping_data {
2122 struct dmar_domain *domain;
2123 struct intel_iommu *iommu;
2124 struct pasid_table *table;
2127 static int domain_context_mapping_cb(struct pci_dev *pdev,
2128 u16 alias, void *opaque)
2130 struct domain_context_mapping_data *data = opaque;
2132 return domain_context_mapping_one(data->domain, data->iommu,
2133 data->table, PCI_BUS_NUM(alias),
2134 alias & 0xff);
2137 static int
2138 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2140 struct domain_context_mapping_data data;
2141 struct pasid_table *table;
2142 struct intel_iommu *iommu;
2143 u8 bus, devfn;
2145 iommu = device_to_iommu(dev, &bus, &devfn);
2146 if (!iommu)
2147 return -ENODEV;
2149 table = intel_pasid_get_table(dev);
2151 if (!dev_is_pci(dev))
2152 return domain_context_mapping_one(domain, iommu, table,
2153 bus, devfn);
2155 data.domain = domain;
2156 data.iommu = iommu;
2157 data.table = table;
2159 return pci_for_each_dma_alias(to_pci_dev(dev),
2160 &domain_context_mapping_cb, &data);
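/*
 * Note on the callbacks below: pci_for_each_dma_alias() stops walking
 * as soon as the callback returns a non-zero value, so
 * domain_context_mapped_cb() returns the logical negation of
 * device_context_mapped() (the walk aborts at the first alias that is
 * not yet mapped) and domain_context_mapped() negates the result back.
 */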
2163 static int domain_context_mapped_cb(struct pci_dev *pdev,
2164 u16 alias, void *opaque)
2166 struct intel_iommu *iommu = opaque;
2168 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2171 static int domain_context_mapped(struct device *dev)
2173 struct intel_iommu *iommu;
2174 u8 bus, devfn;
2176 iommu = device_to_iommu(dev, &bus, &devfn);
2177 if (!iommu)
2178 return -ENODEV;
2180 if (!dev_is_pci(dev))
2181 return device_context_mapped(iommu, bus, devfn);
2183 return !pci_for_each_dma_alias(to_pci_dev(dev),
2184 domain_context_mapped_cb, iommu);
2187 /* Returns a number of VTD pages, but aligned to MM page size */
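/*
 * Worked example, assuming PAGE_SIZE == VTD_PAGE_SIZE == 4KiB: a buffer
 * at page offset 0x800 with size 0x1000 spans two pages, and indeed
 * PAGE_ALIGN(0x800 + 0x1000) >> VTD_PAGE_SHIFT == 2.
 */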
2188 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2189 size_t size)
2191 host_addr &= ~PAGE_MASK;
2192 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2195 /* Return largest possible superpage level for a given mapping */
2196 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2197 unsigned long iov_pfn,
2198 unsigned long phy_pfn,
2199 unsigned long pages)
2201 int support, level = 1;
2202 unsigned long pfnmerge;
2204 support = domain->iommu_superpage;
2206 /* To use a large page, the virtual *and* physical addresses
2207 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2208 of them will mean we have to use smaller pages. So just
2209 merge them and check both at once. */
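/*
 * Example: with the 9-bit VT-d stride, a level-2 (2MiB) superpage is
 * only usable when the low 9 bits of (iov_pfn | phy_pfn) are all zero
 * and at least 512 pages remain to fill it; otherwise the loop stops
 * at level 1 (4KiB pages).
 */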
2210 pfnmerge = iov_pfn | phy_pfn;
2212 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2213 pages >>= VTD_STRIDE_SHIFT;
2214 if (!pages)
2215 break;
2216 pfnmerge >>= VTD_STRIDE_SHIFT;
2217 level++;
2218 support--;
2220 return level;
2223 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2224 struct scatterlist *sg, unsigned long phys_pfn,
2225 unsigned long nr_pages, int prot)
2227 struct dma_pte *first_pte = NULL, *pte = NULL;
2228 phys_addr_t uninitialized_var(pteval);
2229 unsigned long sg_res = 0;
2230 unsigned int largepage_lvl = 0;
2231 unsigned long lvl_pages = 0;
2233 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2235 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2236 return -EINVAL;
2238 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2240 if (!sg) {
2241 sg_res = nr_pages;
2242 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2245 while (nr_pages > 0) {
2246 uint64_t tmp;
2248 if (!sg_res) {
2249 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2251 sg_res = aligned_nrpages(sg->offset, sg->length);
2252 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2253 sg->dma_length = sg->length;
2254 pteval = (sg_phys(sg) - pgoff) | prot;
2255 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2258 if (!pte) {
2259 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2261 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2262 if (!pte)
2263 return -ENOMEM;
2264 /* It is a large page */
2265 if (largepage_lvl > 1) {
2266 unsigned long nr_superpages, end_pfn;
2268 pteval |= DMA_PTE_LARGE_PAGE;
2269 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2271 nr_superpages = sg_res / lvl_pages;
2272 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
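/*
 * e.g. mapping 1024 contiguous pages as level-2 (2MiB, assuming a 4KiB
 * base page) superpages gives lvl_pages == 512, nr_superpages == 2,
 * and end_pfn spanning the whole 1024-page range.
 */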
2274 /*
2275 * Ensure that old small page tables are
2276 * removed to make room for superpage(s).
2277 * We're adding new large pages, so make sure
2278 * we don't remove their parent tables.
2279 */
2280 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2281 largepage_lvl + 1);
2282 } else {
2283 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2287 /* We don't need lock here, nobody else
2288 * touches the iova range
2289 */
2290 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2291 if (tmp) {
2292 static int dumps = 5;
2293 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2294 iov_pfn, tmp, (unsigned long long)pteval);
2295 if (dumps) {
2296 dumps--;
2297 debug_dma_dump_mappings(NULL);
2299 WARN_ON(1);
2302 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2304 BUG_ON(nr_pages < lvl_pages);
2305 BUG_ON(sg_res < lvl_pages);
2307 nr_pages -= lvl_pages;
2308 iov_pfn += lvl_pages;
2309 phys_pfn += lvl_pages;
2310 pteval += lvl_pages * VTD_PAGE_SIZE;
2311 sg_res -= lvl_pages;
2313 /* If the next PTE would be the first in a new page, then we
2314 need to flush the cache on the entries we've just written.
2315 And then we'll need to recalculate 'pte', so clear it and
2316 let it get set again in the if (!pte) block above.
2318 If we're done (!nr_pages) we need to flush the cache too.
2320 Also if we've been setting superpages, we may need to
2321 recalculate 'pte' and switch back to smaller pages for the
2322 end of the mapping, if the trailing size is not enough to
2323 use another superpage (i.e. sg_res < lvl_pages). */
2324 pte++;
2325 if (!nr_pages || first_pte_in_page(pte) ||
2326 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2327 domain_flush_cache(domain, first_pte,
2328 (void *)pte - (void *)first_pte);
2329 pte = NULL;
2332 if (!sg_res && nr_pages)
2333 sg = sg_next(sg);
2335 return 0;
2338 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2339 struct scatterlist *sg, unsigned long phys_pfn,
2340 unsigned long nr_pages, int prot)
2342 int iommu_id, ret;
2343 struct intel_iommu *iommu;
2345 /* Do the real mapping first */
2346 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2347 if (ret)
2348 return ret;
2350 for_each_domain_iommu(iommu_id, domain) {
2351 iommu = g_iommus[iommu_id];
2352 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2355 return 0;
2358 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2359 struct scatterlist *sg, unsigned long nr_pages,
2360 int prot)
2362 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2365 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2366 unsigned long phys_pfn, unsigned long nr_pages,
2367 int prot)
2369 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2372 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2374 unsigned long flags;
2375 struct context_entry *context;
2376 u16 did_old;
2378 if (!iommu)
2379 return;
2381 spin_lock_irqsave(&iommu->lock, flags);
2382 context = iommu_context_addr(iommu, bus, devfn, 0);
2383 if (!context) {
2384 spin_unlock_irqrestore(&iommu->lock, flags);
2385 return;
2387 did_old = context_domain_id(context);
2388 context_clear_entry(context);
2389 __iommu_flush_cache(iommu, context, sizeof(*context));
2390 spin_unlock_irqrestore(&iommu->lock, flags);
2391 iommu->flush.flush_context(iommu,
2392 did_old,
2393 (((u16)bus) << 8) | devfn,
2394 DMA_CCMD_MASK_NOBIT,
2395 DMA_CCMD_DEVICE_INVL);
2396 iommu->flush.flush_iotlb(iommu,
2397 did_old,
2400 DMA_TLB_DSI_FLUSH);
2403 static inline void unlink_domain_info(struct device_domain_info *info)
2405 assert_spin_locked(&device_domain_lock);
2406 list_del(&info->link);
2407 list_del(&info->global);
2408 if (info->dev)
2409 info->dev->archdata.iommu = NULL;
2412 static void domain_remove_dev_info(struct dmar_domain *domain)
2414 struct device_domain_info *info, *tmp;
2415 unsigned long flags;
2417 spin_lock_irqsave(&device_domain_lock, flags);
2418 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2419 __dmar_remove_one_dev_info(info);
2420 spin_unlock_irqrestore(&device_domain_lock, flags);
2423 static struct dmar_domain *find_domain(struct device *dev)
2425 struct device_domain_info *info;
2427 if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO ||
2428 dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO))
2429 return NULL;
2431 /* No lock here, assumes no domain exit in normal case */
2432 info = dev->archdata.iommu;
2433 if (likely(info))
2434 return info->domain;
2436 return NULL;
2439 static struct dmar_domain *deferred_attach_domain(struct device *dev)
2441 if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2442 struct iommu_domain *domain;
2444 dev->archdata.iommu = NULL;
2445 domain = iommu_get_domain_for_dev(dev);
2446 if (domain)
2447 intel_iommu_attach_device(domain, dev);
2450 return find_domain(dev);
2453 static inline struct device_domain_info *
2454 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2456 struct device_domain_info *info;
2458 list_for_each_entry(info, &device_domain_list, global)
2459 if (info->iommu->segment == segment && info->bus == bus &&
2460 info->devfn == devfn)
2461 return info;
2463 return NULL;
2466 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2467 int bus, int devfn,
2468 struct device *dev,
2469 struct dmar_domain *domain)
2471 struct dmar_domain *found = NULL;
2472 struct device_domain_info *info;
2473 unsigned long flags;
2474 int ret;
2476 info = alloc_devinfo_mem();
2477 if (!info)
2478 return NULL;
2480 info->bus = bus;
2481 info->devfn = devfn;
2482 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2483 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2484 info->ats_qdep = 0;
2485 info->dev = dev;
2486 info->domain = domain;
2487 info->iommu = iommu;
2488 info->pasid_table = NULL;
2489 info->auxd_enabled = 0;
2490 INIT_LIST_HEAD(&info->auxiliary_domains);
2492 if (dev && dev_is_pci(dev)) {
2493 struct pci_dev *pdev = to_pci_dev(info->dev);
2495 if (!pdev->untrusted &&
2496 !pci_ats_disabled() &&
2497 ecap_dev_iotlb_support(iommu->ecap) &&
2498 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2499 dmar_find_matched_atsr_unit(pdev))
2500 info->ats_supported = 1;
2502 if (sm_supported(iommu)) {
2503 if (pasid_supported(iommu)) {
2504 int features = pci_pasid_features(pdev);
2505 if (features >= 0)
2506 info->pasid_supported = features | 1;
2509 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2510 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2511 info->pri_supported = 1;
2515 spin_lock_irqsave(&device_domain_lock, flags);
2516 if (dev)
2517 found = find_domain(dev);
2519 if (!found) {
2520 struct device_domain_info *info2;
2521 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2522 if (info2) {
2523 found = info2->domain;
2524 info2->dev = dev;
2528 if (found) {
2529 spin_unlock_irqrestore(&device_domain_lock, flags);
2530 free_devinfo_mem(info);
2531 /* Caller must free the original domain */
2532 return found;
2535 spin_lock(&iommu->lock);
2536 ret = domain_attach_iommu(domain, iommu);
2537 spin_unlock(&iommu->lock);
2539 if (ret) {
2540 spin_unlock_irqrestore(&device_domain_lock, flags);
2541 free_devinfo_mem(info);
2542 return NULL;
2545 list_add(&info->link, &domain->devices);
2546 list_add(&info->global, &device_domain_list);
2547 if (dev)
2548 dev->archdata.iommu = info;
2549 spin_unlock_irqrestore(&device_domain_lock, flags);
2551 /* PASID table is mandatory for a PCI device in scalable mode. */
2552 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2553 ret = intel_pasid_alloc_table(dev);
2554 if (ret) {
2555 dev_err(dev, "PASID table allocation failed\n");
2556 dmar_remove_one_dev_info(dev);
2557 return NULL;
2560 /* Setup the PASID entry for requests without PASID: */
2561 spin_lock(&iommu->lock);
2562 if (hw_pass_through && domain_type_is_si(domain))
2563 ret = intel_pasid_setup_pass_through(iommu, domain,
2564 dev, PASID_RID2PASID);
2565 else
2566 ret = intel_pasid_setup_second_level(iommu, domain,
2567 dev, PASID_RID2PASID);
2568 spin_unlock(&iommu->lock);
2569 if (ret) {
2570 dev_err(dev, "Setup RID2PASID failed\n");
2571 dmar_remove_one_dev_info(dev);
2572 return NULL;
2576 if (dev && domain_context_mapping(domain, dev)) {
2577 dev_err(dev, "Domain context map failed\n");
2578 dmar_remove_one_dev_info(dev);
2579 return NULL;
2582 return domain;
2585 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2587 *(u16 *)opaque = alias;
2588 return 0;
2591 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2593 struct device_domain_info *info;
2594 struct dmar_domain *domain = NULL;
2595 struct intel_iommu *iommu;
2596 u16 dma_alias;
2597 unsigned long flags;
2598 u8 bus, devfn;
2600 iommu = device_to_iommu(dev, &bus, &devfn);
2601 if (!iommu)
2602 return NULL;
2604 if (dev_is_pci(dev)) {
2605 struct pci_dev *pdev = to_pci_dev(dev);
2607 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2609 spin_lock_irqsave(&device_domain_lock, flags);
2610 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2611 PCI_BUS_NUM(dma_alias),
2612 dma_alias & 0xff);
2613 if (info) {
2614 iommu = info->iommu;
2615 domain = info->domain;
2617 spin_unlock_irqrestore(&device_domain_lock, flags);
2619 /* DMA alias already has a domain, use it */
2620 if (info)
2621 goto out;
2624 /* Allocate and initialize new domain for the device */
2625 domain = alloc_domain(0);
2626 if (!domain)
2627 return NULL;
2628 if (domain_init(domain, iommu, gaw)) {
2629 domain_exit(domain);
2630 return NULL;
2633 out:
2634 return domain;
2637 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2638 struct dmar_domain *domain)
2640 struct intel_iommu *iommu;
2641 struct dmar_domain *tmp;
2642 u16 req_id, dma_alias;
2643 u8 bus, devfn;
2645 iommu = device_to_iommu(dev, &bus, &devfn);
2646 if (!iommu)
2647 return NULL;
2649 req_id = ((u16)bus << 8) | devfn;
2651 if (dev_is_pci(dev)) {
2652 struct pci_dev *pdev = to_pci_dev(dev);
2654 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2656 /* register PCI DMA alias device */
2657 if (req_id != dma_alias) {
2658 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2659 dma_alias & 0xff, NULL, domain);
2661 if (!tmp || tmp != domain)
2662 return tmp;
2666 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2667 if (!tmp || tmp != domain)
2668 return tmp;
2670 return domain;
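/*
 * iommu_domain_identity_map() below reserves the IOVA range and then
 * installs a 1:1 mapping (IOVA == physical address) for [start, end];
 * this is how RMRR regions and the si_domain ranges get mapped.
 */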
2673 static int iommu_domain_identity_map(struct dmar_domain *domain,
2674 unsigned long long start,
2675 unsigned long long end)
2677 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2678 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2680 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2681 dma_to_mm_pfn(last_vpfn))) {
2682 pr_err("Reserving iova failed\n");
2683 return -ENOMEM;
2686 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2687 /*
2688 * RMRR range might have overlap with physical memory range,
2689 * clear it first
2690 */
2691 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2693 return __domain_mapping(domain, first_vpfn, NULL,
2694 first_vpfn, last_vpfn - first_vpfn + 1,
2695 DMA_PTE_READ|DMA_PTE_WRITE);
2698 static int domain_prepare_identity_map(struct device *dev,
2699 struct dmar_domain *domain,
2700 unsigned long long start,
2701 unsigned long long end)
2703 /* For _hardware_ passthrough, don't bother. But for software
2704 passthrough, we do it anyway -- it may indicate a memory
2705 range which is reserved in E820, and which therefore didn't get set
2706 up to start with in si_domain */
2707 if (domain == si_domain && hw_pass_through) {
2708 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2709 start, end);
2710 return 0;
2713 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2715 if (end < start) {
2716 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2717 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2718 dmi_get_system_info(DMI_BIOS_VENDOR),
2719 dmi_get_system_info(DMI_BIOS_VERSION),
2720 dmi_get_system_info(DMI_PRODUCT_VERSION));
2721 return -EIO;
2724 if (end >> agaw_to_width(domain->agaw)) {
2725 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2726 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2727 agaw_to_width(domain->agaw),
2728 dmi_get_system_info(DMI_BIOS_VENDOR),
2729 dmi_get_system_info(DMI_BIOS_VERSION),
2730 dmi_get_system_info(DMI_PRODUCT_VERSION));
2731 return -EIO;
2734 return iommu_domain_identity_map(domain, start, end);
2737 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2739 static int __init si_domain_init(int hw)
2741 struct dmar_rmrr_unit *rmrr;
2742 struct device *dev;
2743 int i, nid, ret;
2745 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2746 if (!si_domain)
2747 return -EFAULT;
2749 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2750 domain_exit(si_domain);
2751 return -EFAULT;
2754 if (hw)
2755 return 0;
2757 for_each_online_node(nid) {
2758 unsigned long start_pfn, end_pfn;
2759 int i;
2761 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2762 ret = iommu_domain_identity_map(si_domain,
2763 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2764 if (ret)
2765 return ret;
2769 /*
2770 * Normally we use DMA domains for devices which have RMRRs. But we
2771 * lose this requirement for graphics and USB devices. Identity map
2772 * the RMRRs for graphics and USB devices so that they can use the
2773 * si_domain.
2774 */
2775 for_each_rmrr_units(rmrr) {
2776 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2777 i, dev) {
2778 unsigned long long start = rmrr->base_address;
2779 unsigned long long end = rmrr->end_address;
2781 if (device_is_rmrr_locked(dev))
2782 continue;
2784 if (WARN_ON(end < start ||
2785 end >> agaw_to_width(si_domain->agaw)))
2786 continue;
2788 ret = iommu_domain_identity_map(si_domain, start, end);
2789 if (ret)
2790 return ret;
2794 return 0;
2797 static int identity_mapping(struct device *dev)
2799 struct device_domain_info *info;
2801 info = dev->archdata.iommu;
2802 if (info && info != DUMMY_DEVICE_DOMAIN_INFO && info != DEFER_DEVICE_DOMAIN_INFO)
2803 return (info->domain == si_domain);
2805 return 0;
2808 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2810 struct dmar_domain *ndomain;
2811 struct intel_iommu *iommu;
2812 u8 bus, devfn;
2814 iommu = device_to_iommu(dev, &bus, &devfn);
2815 if (!iommu)
2816 return -ENODEV;
2818 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2819 if (ndomain != domain)
2820 return -EBUSY;
2822 return 0;
2825 static bool device_has_rmrr(struct device *dev)
2827 struct dmar_rmrr_unit *rmrr;
2828 struct device *tmp;
2829 int i;
2831 rcu_read_lock();
2832 for_each_rmrr_units(rmrr) {
2833 /*
2834 * Return TRUE if this RMRR contains the device that
2835 * is passed in.
2836 */
2837 for_each_active_dev_scope(rmrr->devices,
2838 rmrr->devices_cnt, i, tmp)
2839 if (tmp == dev ||
2840 is_downstream_to_pci_bridge(dev, tmp)) {
2841 rcu_read_unlock();
2842 return true;
2845 rcu_read_unlock();
2846 return false;
2849 /**
2850 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2851 * is relaxable (i.e. is allowed to be not enforced under some conditions)
2852 * @dev: device handle
2854 * We assume that PCI USB devices with RMRRs have them largely
2855 * for historical reasons and that the RMRR space is not actively used post
2856 * boot. This exclusion may change if vendors begin to abuse it.
2858 * The same exception is made for graphics devices, with the requirement that
2859 * any use of the RMRR regions will be torn down before assigning the device
2860 * to a guest.
2862 * Return: true if the RMRR is relaxable, false otherwise
2863 */
2864 static bool device_rmrr_is_relaxable(struct device *dev)
2866 struct pci_dev *pdev;
2868 if (!dev_is_pci(dev))
2869 return false;
2871 pdev = to_pci_dev(dev);
2872 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2873 return true;
2874 else
2875 return false;
2878 /*
2879 * There are a couple of cases where we need to restrict the functionality of
2880 * devices associated with RMRRs. The first is when evaluating a device for
2881 * identity mapping because problems exist when devices are moved in and out
2882 * of domains and their respective RMRR information is lost. This means that
2883 * a device with associated RMRRs will never be in a "passthrough" domain.
2884 * The second is use of the device through the IOMMU API. This interface
2885 * expects to have full control of the IOVA space for the device. We cannot
2886 * satisfy both the requirement that RMRR access is maintained and have an
2887 * unencumbered IOVA space. We also have no ability to quiesce the device's
2888 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2889 * We therefore prevent devices associated with an RMRR from participating in
2890 * the IOMMU API, which eliminates them from device assignment.
2892 * In both cases, devices which have relaxable RMRRs are not concerned by this
2893 * restriction. See device_rmrr_is_relaxable comment.
2894 */
2895 static bool device_is_rmrr_locked(struct device *dev)
2897 if (!device_has_rmrr(dev))
2898 return false;
2900 if (device_rmrr_is_relaxable(dev))
2901 return false;
2903 return true;
2906 /*
2907 * Return the required default domain type for a specific device.
2909 * @dev: the device in query
2912 * Returns:
2913 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2914 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2915 * - 0: both identity and dynamic domains work for this device
2916 */
2917 static int device_def_domain_type(struct device *dev)
2919 if (dev_is_pci(dev)) {
2920 struct pci_dev *pdev = to_pci_dev(dev);
2922 if (device_is_rmrr_locked(dev))
2923 return IOMMU_DOMAIN_DMA;
2925 /*
2926 * Prevent any device marked as untrusted from getting
2927 * placed into the static identity mapping domain.
2928 */
2929 if (pdev->untrusted)
2930 return IOMMU_DOMAIN_DMA;
2932 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2933 return IOMMU_DOMAIN_IDENTITY;
2935 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2936 return IOMMU_DOMAIN_IDENTITY;
2938 /*
2939 * We want to start off with all devices in the 1:1 domain, and
2940 * take them out later if we find they can't access all of memory.
2942 * However, we can't do this for PCI devices behind bridges,
2943 * because all PCI devices behind the same bridge will end up
2944 * with the same source-id on their transactions.
2946 * Practically speaking, we can't change things around for these
2947 * devices at run-time, because we can't be sure there'll be no
2948 * DMA transactions in flight for any of their siblings.
2950 * So PCI devices (unless they're on the root bus) as well as
2951 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2952 * the 1:1 domain, just in _case_ one of their siblings turns out
2953 * not to be able to map all of memory.
2954 */
2955 if (!pci_is_pcie(pdev)) {
2956 if (!pci_is_root_bus(pdev->bus))
2957 return IOMMU_DOMAIN_DMA;
2958 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2959 return IOMMU_DOMAIN_DMA;
2960 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2961 return IOMMU_DOMAIN_DMA;
2962 } else {
2963 if (device_has_rmrr(dev))
2964 return IOMMU_DOMAIN_DMA;
2967 return (iommu_identity_mapping & IDENTMAP_ALL) ?
2968 IOMMU_DOMAIN_IDENTITY : 0;
2971 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2973 /*
2974 * Start from the sane iommu hardware state.
2975 * If the queued invalidation is already initialized by us
2976 * (for example, while enabling interrupt-remapping) then
2977 * things are already rolling from a sane state.
2978 */
2979 if (!iommu->qi) {
2980 /*
2981 * Clear any previous faults.
2982 */
2983 dmar_fault(-1, iommu);
2984 /*
2985 * Disable queued invalidation if supported and already enabled
2986 * before OS handover.
2987 */
2988 dmar_disable_qi(iommu);
2991 if (dmar_enable_qi(iommu)) {
2992 /*
2993 * Queued invalidation is not enabled, use register-based invalidation
2994 */
2995 iommu->flush.flush_context = __iommu_flush_context;
2996 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2997 pr_info("%s: Using Register based invalidation\n",
2998 iommu->name);
2999 } else {
3000 iommu->flush.flush_context = qi_flush_context;
3001 iommu->flush.flush_iotlb = qi_flush_iotlb;
3002 pr_info("%s: Using Queued invalidation\n", iommu->name);
3006 static int copy_context_table(struct intel_iommu *iommu,
3007 struct root_entry *old_re,
3008 struct context_entry **tbl,
3009 int bus, bool ext)
3011 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3012 struct context_entry *new_ce = NULL, ce;
3013 struct context_entry *old_ce = NULL;
3014 struct root_entry re;
3015 phys_addr_t old_ce_phys;
3017 tbl_idx = ext ? bus * 2 : bus;
3018 memcpy(&re, old_re, sizeof(re));
3020 for (devfn = 0; devfn < 256; devfn++) {
3021 /* First calculate the correct index */
3022 idx = (ext ? devfn * 2 : devfn) % 256;
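/*
 * Extended context entries are twice the size of legacy ones
 * (256 bits instead of 128), so a 4KiB context table only holds 128
 * of them: devfns 0-127 go to one table and 128-255 to a second one,
 * hence the "* 2" and the "% 256" wrap above.
 */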
3024 if (idx == 0) {
3025 /* First save what we may have and clean up */
3026 if (new_ce) {
3027 tbl[tbl_idx] = new_ce;
3028 __iommu_flush_cache(iommu, new_ce,
3029 VTD_PAGE_SIZE);
3030 pos = 1;
3033 if (old_ce)
3034 memunmap(old_ce);
3036 ret = 0;
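/*
 * In the extended root-entry format the lower half of the root entry
 * points at the context table for devfn 0-127 (LCTP) and the upper
 * half at the one for devfn 128-255 (UCTP), hence the 0x80 split below.
 */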
3037 if (devfn < 0x80)
3038 old_ce_phys = root_entry_lctp(&re);
3039 else
3040 old_ce_phys = root_entry_uctp(&re);
3042 if (!old_ce_phys) {
3043 if (ext && devfn == 0) {
3044 /* No LCTP, try UCTP */
3045 devfn = 0x7f;
3046 continue;
3047 } else {
3048 goto out;
3052 ret = -ENOMEM;
3053 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3054 MEMREMAP_WB);
3055 if (!old_ce)
3056 goto out;
3058 new_ce = alloc_pgtable_page(iommu->node);
3059 if (!new_ce)
3060 goto out_unmap;
3062 ret = 0;
3065 /* Now copy the context entry */
3066 memcpy(&ce, old_ce + idx, sizeof(ce));
3068 if (!__context_present(&ce))
3069 continue;
3071 did = context_domain_id(&ce);
3072 if (did >= 0 && did < cap_ndoms(iommu->cap))
3073 set_bit(did, iommu->domain_ids);
3075 /*
3076 * We need a marker for copied context entries. This
3077 * marker needs to work for the old format as well as
3078 * for extended context entries.
3080 * Bit 67 of the context entry is used. In the old
3081 * format this bit is available to software, in the
3082 * extended format it is the PGE bit, but PGE is ignored
3083 * by HW if PASIDs are disabled (and thus still
3084 * available).
3086 * So disable PASIDs first and then mark the entry
3087 * copied. This means that we don't copy PASID
3088 * translations from the old kernel, but this is fine as
3089 * faults there are not fatal.
3090 */
3091 context_clear_pasid_enable(&ce);
3092 context_set_copied(&ce);
3094 new_ce[idx] = ce;
3097 tbl[tbl_idx + pos] = new_ce;
3099 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3101 out_unmap:
3102 memunmap(old_ce);
3104 out:
3105 return ret;
3108 static int copy_translation_tables(struct intel_iommu *iommu)
3110 struct context_entry **ctxt_tbls;
3111 struct root_entry *old_rt;
3112 phys_addr_t old_rt_phys;
3113 int ctxt_table_entries;
3114 unsigned long flags;
3115 u64 rtaddr_reg;
3116 int bus, ret;
3117 bool new_ext, ext;
3119 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3120 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3121 new_ext = !!ecap_ecs(iommu->ecap);
3123 /*
3124 * The RTT bit can only be changed when translation is disabled,
3125 * but disabling translation would open a window for data
3126 * corruption. So bail out and don't copy anything if we would
3127 * have to change the bit.
3128 */
3129 if (new_ext != ext)
3130 return -EINVAL;
3132 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3133 if (!old_rt_phys)
3134 return -EINVAL;
3136 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3137 if (!old_rt)
3138 return -ENOMEM;
3140 /* This is too big for the stack - allocate it from slab */
3141 ctxt_table_entries = ext ? 512 : 256;
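/* Two context tables per bus in extended mode, one per bus otherwise. */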
3142 ret = -ENOMEM;
3143 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3144 if (!ctxt_tbls)
3145 goto out_unmap;
3147 for (bus = 0; bus < 256; bus++) {
3148 ret = copy_context_table(iommu, &old_rt[bus],
3149 ctxt_tbls, bus, ext);
3150 if (ret) {
3151 pr_err("%s: Failed to copy context table for bus %d\n",
3152 iommu->name, bus);
3153 continue;
3157 spin_lock_irqsave(&iommu->lock, flags);
3159 /* Context tables are copied, now write them to the root_entry table */
3160 for (bus = 0; bus < 256; bus++) {
3161 int idx = ext ? bus * 2 : bus;
3162 u64 val;
3164 if (ctxt_tbls[idx]) {
3165 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3166 iommu->root_entry[bus].lo = val;
3169 if (!ext || !ctxt_tbls[idx + 1])
3170 continue;
3172 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3173 iommu->root_entry[bus].hi = val;
3176 spin_unlock_irqrestore(&iommu->lock, flags);
3178 kfree(ctxt_tbls);
3180 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3182 ret = 0;
3184 out_unmap:
3185 memunmap(old_rt);
3187 return ret;
3190 static int __init init_dmars(void)
3192 struct dmar_drhd_unit *drhd;
3193 struct intel_iommu *iommu;
3194 int ret;
3196 /*
3197 * for each drhd
3198 * allocate root
3199 * initialize and program root entry to not present
3200 * endfor
3201 */
3202 for_each_drhd_unit(drhd) {
3203 /*
3204 * lock not needed as this is only incremented in the single
3205 * threaded kernel __init code path; all other accesses are read
3206 * only
3207 */
3208 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3209 g_num_of_iommus++;
3210 continue;
3212 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3215 /* Preallocate enough resources for IOMMU hot-addition */
3216 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3217 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3219 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3220 GFP_KERNEL);
3221 if (!g_iommus) {
3222 pr_err("Allocating global iommu array failed\n");
3223 ret = -ENOMEM;
3224 goto error;
3227 for_each_iommu(iommu, drhd) {
3228 if (drhd->ignored) {
3229 iommu_disable_translation(iommu);
3230 continue;
3233 /*
3234 * Find the max pasid size of all IOMMUs in the system.
3235 * We need to ensure the system pasid table is no bigger
3236 * than the smallest supported.
3237 */
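/*
 * ecap_pss() reports the supported PASID width minus one, so the
 * number of PASIDs is 2^(pss + 1), i.e. the "2 << pss" below.
 */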
3238 if (pasid_supported(iommu)) {
3239 u32 temp = 2 << ecap_pss(iommu->ecap);
3241 intel_pasid_max_id = min_t(u32, temp,
3242 intel_pasid_max_id);
3245 g_iommus[iommu->seq_id] = iommu;
3247 intel_iommu_init_qi(iommu);
3249 ret = iommu_init_domains(iommu);
3250 if (ret)
3251 goto free_iommu;
3253 init_translation_status(iommu);
3255 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3256 iommu_disable_translation(iommu);
3257 clear_translation_pre_enabled(iommu);
3258 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3259 iommu->name);
3262 /*
3263 * TBD:
3264 * we could share the same root & context tables
3265 * among all IOMMUs. Need to split it later.
3266 */
3267 ret = iommu_alloc_root_entry(iommu);
3268 if (ret)
3269 goto free_iommu;
3271 if (translation_pre_enabled(iommu)) {
3272 pr_info("Translation already enabled - trying to copy translation structures\n");
3274 ret = copy_translation_tables(iommu);
3275 if (ret) {
3276 /*
3277 * We found the IOMMU with translation
3278 * enabled - but failed to copy over the
3279 * old root-entry table. Try to proceed
3280 * by disabling translation now and
3281 * allocating a clean root-entry table.
3282 * This might cause DMAR faults, but
3283 * probably the dump will still succeed.
3284 */
3285 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3286 iommu->name);
3287 iommu_disable_translation(iommu);
3288 clear_translation_pre_enabled(iommu);
3289 } else {
3290 pr_info("Copied translation tables from previous kernel for %s\n",
3291 iommu->name);
3295 if (!ecap_pass_through(iommu->ecap))
3296 hw_pass_through = 0;
3297 #ifdef CONFIG_INTEL_IOMMU_SVM
3298 if (pasid_supported(iommu))
3299 intel_svm_init(iommu);
3300 #endif
3303 /*
3304 * Now that qi is enabled on all iommus, set the root entry and flush
3305 * caches. This is required on some Intel X58 chipsets, otherwise the
3306 * flush_context function will loop forever and the boot hangs.
3307 */
3308 for_each_active_iommu(iommu, drhd) {
3309 iommu_flush_write_buffer(iommu);
3310 iommu_set_root_entry(iommu);
3311 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3312 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3315 if (iommu_default_passthrough())
3316 iommu_identity_mapping |= IDENTMAP_ALL;
3318 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3319 dmar_map_gfx = 0;
3320 #endif
3322 if (!dmar_map_gfx)
3323 iommu_identity_mapping |= IDENTMAP_GFX;
3325 check_tylersburg_isoch();
3327 ret = si_domain_init(hw_pass_through);
3328 if (ret)
3329 goto free_iommu;
3331 /*
3332 * for each drhd
3333 * enable fault log
3334 * global invalidate context cache
3335 * global invalidate iotlb
3336 * enable translation
3337 */
3338 for_each_iommu(iommu, drhd) {
3339 if (drhd->ignored) {
3340 /*
3341 * we always have to disable PMRs or DMA may fail on
3342 * this device
3343 */
3344 if (force_on)
3345 iommu_disable_protect_mem_regions(iommu);
3346 continue;
3349 iommu_flush_write_buffer(iommu);
3351 #ifdef CONFIG_INTEL_IOMMU_SVM
3352 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3353 /*
3354 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3355 * could cause a lock race, so drop the lock around it.
3356 */
3357 up_write(&dmar_global_lock);
3358 ret = intel_svm_enable_prq(iommu);
3359 down_write(&dmar_global_lock);
3360 if (ret)
3361 goto free_iommu;
3363 #endif
3364 ret = dmar_set_interrupt(iommu);
3365 if (ret)
3366 goto free_iommu;
3369 return 0;
3371 free_iommu:
3372 for_each_active_iommu(iommu, drhd) {
3373 disable_dmar_iommu(iommu);
3374 free_dmar_iommu(iommu);
3377 kfree(g_iommus);
3379 error:
3380 return ret;
3383 /* This takes a number of _MM_ pages, not VTD pages */
3384 static unsigned long intel_alloc_iova(struct device *dev,
3385 struct dmar_domain *domain,
3386 unsigned long nrpages, uint64_t dma_mask)
3388 unsigned long iova_pfn;
3390 /* Restrict dma_mask to the width that the iommu can handle */
3391 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3392 /* Ensure we reserve the whole size-aligned region */
3393 nrpages = __roundup_pow_of_two(nrpages);
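/*
 * e.g. a 5-page request is rounded up to 8 pages so that the
 * size-aligned IOVA allocation below covers the whole region.
 */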
3395 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3396 /*
3397 * First try to allocate an io virtual address in
3398 * DMA_BIT_MASK(32) and if that fails then try allocating
3399 * from higher range
3400 */
3401 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3402 IOVA_PFN(DMA_BIT_MASK(32)), false);
3403 if (iova_pfn)
3404 return iova_pfn;
3406 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3407 IOVA_PFN(dma_mask), true);
3408 if (unlikely(!iova_pfn)) {
3409 dev_err(dev, "Allocating %ld-page iova failed", nrpages);
3410 return 0;
3413 return iova_pfn;
3416 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3418 struct dmar_domain *domain, *tmp;
3419 struct dmar_rmrr_unit *rmrr;
3420 struct device *i_dev;
3421 int i, ret;
3423 /* Device shouldn't be attached to any domain. */
3424 domain = find_domain(dev);
3425 if (domain)
3426 return NULL;
3428 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3429 if (!domain)
3430 goto out;
3432 /* We have a new domain - setup possible RMRRs for the device */
3433 rcu_read_lock();
3434 for_each_rmrr_units(rmrr) {
3435 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3436 i, i_dev) {
3437 if (i_dev != dev)
3438 continue;
3440 ret = domain_prepare_identity_map(dev, domain,
3441 rmrr->base_address,
3442 rmrr->end_address);
3443 if (ret)
3444 dev_err(dev, "Mapping reserved region failed\n");
3447 rcu_read_unlock();
3449 tmp = set_domain_for_dev(dev, domain);
3450 if (!tmp || domain != tmp) {
3451 domain_exit(domain);
3452 domain = tmp;
3455 out:
3456 if (!domain)
3457 dev_err(dev, "Allocating domain failed\n");
3458 else
3459 domain->domain.type = IOMMU_DOMAIN_DMA;
3461 return domain;
3464 /* Check if the dev needs to go through the non-identity map and unmap process. */
3465 static bool iommu_need_mapping(struct device *dev)
3467 int ret;
3469 if (iommu_dummy(dev))
3470 return false;
3472 ret = identity_mapping(dev);
3473 if (ret) {
3474 u64 dma_mask = *dev->dma_mask;
3476 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3477 dma_mask = dev->coherent_dma_mask;
3479 if (dma_mask >= dma_direct_get_required_mask(dev))
3480 return false;
3482 /*
3483 * The 32 bit DMA device is removed from si_domain and falls back to
3484 * non-identity mapping.
3485 */
3486 dmar_remove_one_dev_info(dev);
3487 ret = iommu_request_dma_domain_for_dev(dev);
3488 if (ret) {
3489 struct iommu_domain *domain;
3490 struct dmar_domain *dmar_domain;
3492 domain = iommu_get_domain_for_dev(dev);
3493 if (domain) {
3494 dmar_domain = to_dmar_domain(domain);
3495 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3497 dmar_remove_one_dev_info(dev);
3498 get_private_domain_for_dev(dev);
3501 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3504 return true;
3507 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3508 size_t size, int dir, u64 dma_mask)
3510 struct dmar_domain *domain;
3511 phys_addr_t start_paddr;
3512 unsigned long iova_pfn;
3513 int prot = 0;
3514 int ret;
3515 struct intel_iommu *iommu;
3516 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3518 BUG_ON(dir == DMA_NONE);
3520 domain = deferred_attach_domain(dev);
3521 if (!domain)
3522 return DMA_MAPPING_ERROR;
3524 iommu = domain_get_iommu(domain);
3525 size = aligned_nrpages(paddr, size);
3527 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3528 if (!iova_pfn)
3529 goto error;
3531 /*
3532 * Check if DMAR supports zero-length reads on write only
3533 * mappings.
3534 */
3535 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3536 !cap_zlr(iommu->cap))
3537 prot |= DMA_PTE_READ;
3538 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3539 prot |= DMA_PTE_WRITE;
3540 /*
3541 * [paddr, paddr + size) might span partial pages, so map the whole
3542 * page. Note: if two parts of one page are separately mapped, we
3543 * might have two guest_addr mappings to the same host paddr, but this
3544 * is not a big problem
3545 */
3546 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3547 mm_to_dma_pfn(paddr_pfn), size, prot);
3548 if (ret)
3549 goto error;
3551 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3552 start_paddr += paddr & ~PAGE_MASK;
3554 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3556 return start_paddr;
3558 error:
3559 if (iova_pfn)
3560 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3561 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3562 size, (unsigned long long)paddr, dir);
3563 return DMA_MAPPING_ERROR;
3566 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3567 unsigned long offset, size_t size,
3568 enum dma_data_direction dir,
3569 unsigned long attrs)
3571 if (iommu_need_mapping(dev))
3572 return __intel_map_single(dev, page_to_phys(page) + offset,
3573 size, dir, *dev->dma_mask);
3574 return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3577 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3578 size_t size, enum dma_data_direction dir,
3579 unsigned long attrs)
3581 if (iommu_need_mapping(dev))
3582 return __intel_map_single(dev, phys_addr, size, dir,
3583 *dev->dma_mask);
3584 return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3587 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3589 struct dmar_domain *domain;
3590 unsigned long start_pfn, last_pfn;
3591 unsigned long nrpages;
3592 unsigned long iova_pfn;
3593 struct intel_iommu *iommu;
3594 struct page *freelist;
3595 struct pci_dev *pdev = NULL;
3597 domain = find_domain(dev);
3598 BUG_ON(!domain);
3600 iommu = domain_get_iommu(domain);
3602 iova_pfn = IOVA_PFN(dev_addr);
3604 nrpages = aligned_nrpages(dev_addr, size);
3605 start_pfn = mm_to_dma_pfn(iova_pfn);
3606 last_pfn = start_pfn + nrpages - 1;
3608 if (dev_is_pci(dev))
3609 pdev = to_pci_dev(dev);
3611 freelist = domain_unmap(domain, start_pfn, last_pfn);
3612 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3613 !has_iova_flush_queue(&domain->iovad)) {
3614 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3615 nrpages, !freelist, 0);
3616 /* free iova */
3617 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3618 dma_free_pagelist(freelist);
3619 } else {
3620 queue_iova(&domain->iovad, iova_pfn, nrpages,
3621 (unsigned long)freelist);
3622 /*
3623 * queue up the release of the unmap to save the 1/6th of the
3624 * cpu used up by the iotlb flush operation...
3625 */
3628 trace_unmap_single(dev, dev_addr, size);
3631 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3632 size_t size, enum dma_data_direction dir,
3633 unsigned long attrs)
3635 if (iommu_need_mapping(dev))
3636 intel_unmap(dev, dev_addr, size);
3637 else
3638 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3641 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3642 size_t size, enum dma_data_direction dir, unsigned long attrs)
3644 if (iommu_need_mapping(dev))
3645 intel_unmap(dev, dev_addr, size);
3648 static void *intel_alloc_coherent(struct device *dev, size_t size,
3649 dma_addr_t *dma_handle, gfp_t flags,
3650 unsigned long attrs)
3652 struct page *page = NULL;
3653 int order;
3655 if (!iommu_need_mapping(dev))
3656 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3658 size = PAGE_ALIGN(size);
3659 order = get_order(size);
3661 if (gfpflags_allow_blocking(flags)) {
3662 unsigned int count = size >> PAGE_SHIFT;
3664 page = dma_alloc_from_contiguous(dev, count, order,
3665 flags & __GFP_NOWARN);
3668 if (!page)
3669 page = alloc_pages(flags, order);
3670 if (!page)
3671 return NULL;
3672 memset(page_address(page), 0, size);
3674 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3675 DMA_BIDIRECTIONAL,
3676 dev->coherent_dma_mask);
3677 if (*dma_handle != DMA_MAPPING_ERROR)
3678 return page_address(page);
3679 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3680 __free_pages(page, order);
3682 return NULL;
3685 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3686 dma_addr_t dma_handle, unsigned long attrs)
3688 int order;
3689 struct page *page = virt_to_page(vaddr);
3691 if (!iommu_need_mapping(dev))
3692 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3694 size = PAGE_ALIGN(size);
3695 order = get_order(size);
3697 intel_unmap(dev, dma_handle, size);
3698 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3699 __free_pages(page, order);
3702 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3703 int nelems, enum dma_data_direction dir,
3704 unsigned long attrs)
3706 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3707 unsigned long nrpages = 0;
3708 struct scatterlist *sg;
3709 int i;
3711 if (!iommu_need_mapping(dev))
3712 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3714 for_each_sg(sglist, sg, nelems, i) {
3715 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3718 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3720 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3723 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3724 enum dma_data_direction dir, unsigned long attrs)
3726 int i;
3727 struct dmar_domain *domain;
3728 size_t size = 0;
3729 int prot = 0;
3730 unsigned long iova_pfn;
3731 int ret;
3732 struct scatterlist *sg;
3733 unsigned long start_vpfn;
3734 struct intel_iommu *iommu;
3736 BUG_ON(dir == DMA_NONE);
3737 if (!iommu_need_mapping(dev))
3738 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3740 domain = deferred_attach_domain(dev);
3741 if (!domain)
3742 return 0;
3744 iommu = domain_get_iommu(domain);
3746 for_each_sg(sglist, sg, nelems, i)
3747 size += aligned_nrpages(sg->offset, sg->length);
3749 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3750 *dev->dma_mask);
3751 if (!iova_pfn) {
3752 sglist->dma_length = 0;
3753 return 0;
3756 /*
3757 * Check if DMAR supports zero-length reads on write only
3758 * mappings.
3759 */
3760 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3761 !cap_zlr(iommu->cap))
3762 prot |= DMA_PTE_READ;
3763 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3764 prot |= DMA_PTE_WRITE;
3766 start_vpfn = mm_to_dma_pfn(iova_pfn);
3768 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3769 if (unlikely(ret)) {
3770 dma_pte_free_pagetable(domain, start_vpfn,
3771 start_vpfn + size - 1,
3772 agaw_to_level(domain->agaw) + 1);
3773 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3774 return 0;
3777 trace_map_sg(dev, iova_pfn << PAGE_SHIFT,
3778 sg_phys(sglist), size << VTD_PAGE_SHIFT);
3780 return nelems;
3783 static u64 intel_get_required_mask(struct device *dev)
3785 if (!iommu_need_mapping(dev))
3786 return dma_direct_get_required_mask(dev);
3787 return DMA_BIT_MASK(32);
3790 static const struct dma_map_ops intel_dma_ops = {
3791 .alloc = intel_alloc_coherent,
3792 .free = intel_free_coherent,
3793 .map_sg = intel_map_sg,
3794 .unmap_sg = intel_unmap_sg,
3795 .map_page = intel_map_page,
3796 .unmap_page = intel_unmap_page,
3797 .map_resource = intel_map_resource,
3798 .unmap_resource = intel_unmap_resource,
3799 .dma_supported = dma_direct_supported,
3800 .mmap = dma_common_mmap,
3801 .get_sgtable = dma_common_get_sgtable,
3802 .get_required_mask = intel_get_required_mask,
3805 static void
3806 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3807 enum dma_data_direction dir, enum dma_sync_target target)
3809 struct dmar_domain *domain;
3810 phys_addr_t tlb_addr;
3812 domain = find_domain(dev);
3813 if (WARN_ON(!domain))
3814 return;
3816 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3817 if (is_swiotlb_buffer(tlb_addr))
3818 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3821 static dma_addr_t
3822 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3823 enum dma_data_direction dir, unsigned long attrs,
3824 u64 dma_mask)
3826 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3827 struct dmar_domain *domain;
3828 struct intel_iommu *iommu;
3829 unsigned long iova_pfn;
3830 unsigned long nrpages;
3831 phys_addr_t tlb_addr;
3832 int prot = 0;
3833 int ret;
3835 domain = deferred_attach_domain(dev);
3836 if (WARN_ON(dir == DMA_NONE || !domain))
3837 return DMA_MAPPING_ERROR;
3839 iommu = domain_get_iommu(domain);
3840 if (WARN_ON(!iommu))
3841 return DMA_MAPPING_ERROR;
3843 nrpages = aligned_nrpages(0, size);
3844 iova_pfn = intel_alloc_iova(dev, domain,
3845 dma_to_mm_pfn(nrpages), dma_mask);
3846 if (!iova_pfn)
3847 return DMA_MAPPING_ERROR;
3849 /*
3850 * Check if DMAR supports zero-length reads on write only
3851 * mappings.
3852 */
3853 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3854 !cap_zlr(iommu->cap))
3855 prot |= DMA_PTE_READ;
3856 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3857 prot |= DMA_PTE_WRITE;
3859 /*
3860 * If both the physical buffer start address and size are
3861 * page aligned, we don't need to use a bounce page.
3862 */
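/*
 * The bounce path exists for untrusted devices: IOMMU mappings have
 * VTD_PAGE_SIZE granularity, so a sub-page buffer would otherwise
 * expose the rest of its page to the device. Bouncing through swiotlb
 * limits the exposure to the padded bounce slot.
 */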
3863 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3864 tlb_addr = swiotlb_tbl_map_single(dev,
3865 __phys_to_dma(dev, io_tlb_start),
3866 paddr, size, aligned_size, dir, attrs);
3867 if (tlb_addr == DMA_MAPPING_ERROR) {
3868 goto swiotlb_error;
3869 } else {
3870 /* Cleanup the padding area. */
3871 void *padding_start = phys_to_virt(tlb_addr);
3872 size_t padding_size = aligned_size;
3874 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3875 (dir == DMA_TO_DEVICE ||
3876 dir == DMA_BIDIRECTIONAL)) {
3877 padding_start += size;
3878 padding_size -= size;
3881 memset(padding_start, 0, padding_size);
3883 } else {
3884 tlb_addr = paddr;
3887 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3888 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3889 if (ret)
3890 goto mapping_error;
3892 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3894 return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3896 mapping_error:
3897 if (is_swiotlb_buffer(tlb_addr))
3898 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3899 aligned_size, dir, attrs);
3900 swiotlb_error:
3901 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3902 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3903 size, (unsigned long long)paddr, dir);
3905 return DMA_MAPPING_ERROR;
3908 static void
3909 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3910 enum dma_data_direction dir, unsigned long attrs)
3912 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3913 struct dmar_domain *domain;
3914 phys_addr_t tlb_addr;
3916 domain = find_domain(dev);
3917 if (WARN_ON(!domain))
3918 return;
3920 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3921 if (WARN_ON(!tlb_addr))
3922 return;
3924 intel_unmap(dev, dev_addr, size);
3925 if (is_swiotlb_buffer(tlb_addr))
3926 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3927 aligned_size, dir, attrs);
3929 trace_bounce_unmap_single(dev, dev_addr, size);
3932 static dma_addr_t
3933 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3934 size_t size, enum dma_data_direction dir, unsigned long attrs)
3936 return bounce_map_single(dev, page_to_phys(page) + offset,
3937 size, dir, attrs, *dev->dma_mask);
3940 static dma_addr_t
3941 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3942 enum dma_data_direction dir, unsigned long attrs)
3944 return bounce_map_single(dev, phys_addr, size,
3945 dir, attrs, *dev->dma_mask);
3948 static void
3949 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3950 enum dma_data_direction dir, unsigned long attrs)
3952 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3955 static void
3956 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3957 enum dma_data_direction dir, unsigned long attrs)
3959 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3962 static void
3963 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3964 enum dma_data_direction dir, unsigned long attrs)
3966 struct scatterlist *sg;
3967 int i;
3969 for_each_sg(sglist, sg, nelems, i)
3970 bounce_unmap_page(dev, sg->dma_address,
3971 sg_dma_len(sg), dir, attrs);
3974 static int
3975 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3976 enum dma_data_direction dir, unsigned long attrs)
3978 int i;
3979 struct scatterlist *sg;
3981 for_each_sg(sglist, sg, nelems, i) {
3982 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3983 sg->offset, sg->length,
3984 dir, attrs);
3985 if (sg->dma_address == DMA_MAPPING_ERROR)
3986 goto out_unmap;
3987 sg_dma_len(sg) = sg->length;
3990 return nelems;
3992 out_unmap:
3993 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3994 return 0;
3997 static void
3998 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3999 size_t size, enum dma_data_direction dir)
4001 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
4004 static void
4005 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4006 size_t size, enum dma_data_direction dir)
4008 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4011 static void
4012 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4013 int nelems, enum dma_data_direction dir)
4015 struct scatterlist *sg;
4016 int i;
4018 for_each_sg(sglist, sg, nelems, i)
4019 bounce_sync_single(dev, sg_dma_address(sg),
4020 sg_dma_len(sg), dir, SYNC_FOR_CPU);
4023 static void
4024 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4025 int nelems, enum dma_data_direction dir)
4027 struct scatterlist *sg;
4028 int i;
4030 for_each_sg(sglist, sg, nelems, i)
4031 bounce_sync_single(dev, sg_dma_address(sg),
4032 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4035 static const struct dma_map_ops bounce_dma_ops = {
4036 .alloc = intel_alloc_coherent,
4037 .free = intel_free_coherent,
4038 .map_sg = bounce_map_sg,
4039 .unmap_sg = bounce_unmap_sg,
4040 .map_page = bounce_map_page,
4041 .unmap_page = bounce_unmap_page,
4042 .sync_single_for_cpu = bounce_sync_single_for_cpu,
4043 .sync_single_for_device = bounce_sync_single_for_device,
4044 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
4045 .sync_sg_for_device = bounce_sync_sg_for_device,
4046 .map_resource = bounce_map_resource,
4047 .unmap_resource = bounce_unmap_resource,
4048 .dma_supported = dma_direct_supported,
4051 static inline int iommu_domain_cache_init(void)
4053 int ret = 0;
4055 iommu_domain_cache = kmem_cache_create("iommu_domain",
4056 sizeof(struct dmar_domain),
4058 SLAB_HWCACHE_ALIGN,
4060 NULL);
4061 if (!iommu_domain_cache) {
4062 pr_err("Couldn't create iommu_domain cache\n");
4063 ret = -ENOMEM;
4066 return ret;
4069 static inline int iommu_devinfo_cache_init(void)
4071 int ret = 0;
4073 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4074 sizeof(struct device_domain_info),
4076 SLAB_HWCACHE_ALIGN,
4077 NULL);
4078 if (!iommu_devinfo_cache) {
4079 pr_err("Couldn't create devinfo cache\n");
4080 ret = -ENOMEM;
4083 return ret;
4086 static int __init iommu_init_mempool(void)
4088 int ret;
4089 ret = iova_cache_get();
4090 if (ret)
4091 return ret;
4093 ret = iommu_domain_cache_init();
4094 if (ret)
4095 goto domain_error;
4097 ret = iommu_devinfo_cache_init();
4098 if (!ret)
4099 return ret;
4101 kmem_cache_destroy(iommu_domain_cache);
4102 domain_error:
4103 iova_cache_put();
4105 return -ENOMEM;
4108 static void __init iommu_exit_mempool(void)
4110 kmem_cache_destroy(iommu_devinfo_cache);
4111 kmem_cache_destroy(iommu_domain_cache);
4112 iova_cache_put();
4115 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4117 struct dmar_drhd_unit *drhd;
4118 u32 vtbar;
4119 int rc;
4121 /* We know that this device on this chipset has its own IOMMU.
4122 * If we find it under a different IOMMU, then the BIOS is lying
4123 * to us. Hope that the IOMMU for this device is actually
4124 * disabled, and it needs no translation...
4125 */
4126 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4127 if (rc) {
4128 /* "can't" happen */
4129 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4130 return;
4132 vtbar &= 0xffff0000;
4134 /* we know that this iommu should be at offset 0xa000 from vtbar */
4135 drhd = dmar_find_matched_drhd_unit(pdev);
4136 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
4137 TAINT_FIRMWARE_WORKAROUND,
4138 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
4139 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4141 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4143 static void __init init_no_remapping_devices(void)
4145 struct dmar_drhd_unit *drhd;
4146 struct device *dev;
4147 int i;
4149 for_each_drhd_unit(drhd) {
4150 if (!drhd->include_all) {
4151 for_each_active_dev_scope(drhd->devices,
4152 drhd->devices_cnt, i, dev)
4153 break;
4154 /* ignore DMAR unit if no devices exist */
4155 if (i == drhd->devices_cnt)
4156 drhd->ignored = 1;
4160 for_each_active_drhd_unit(drhd) {
4161 if (drhd->include_all)
4162 continue;
4164 for_each_active_dev_scope(drhd->devices,
4165 drhd->devices_cnt, i, dev)
4166 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4167 break;
4168 if (i < drhd->devices_cnt)
4169 continue;
4171 /* This IOMMU has *only* gfx devices. Either bypass it or
4172 set the gfx_mapped flag, as appropriate */
4173 if (!dmar_map_gfx) {
4174 drhd->ignored = 1;
4175 for_each_active_dev_scope(drhd->devices,
4176 drhd->devices_cnt, i, dev)
4177 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4182 #ifdef CONFIG_SUSPEND
4183 static int init_iommu_hw(void)
4185 struct dmar_drhd_unit *drhd;
4186 struct intel_iommu *iommu = NULL;
4188 for_each_active_iommu(iommu, drhd)
4189 if (iommu->qi)
4190 dmar_reenable_qi(iommu);
4192 for_each_iommu(iommu, drhd) {
4193 if (drhd->ignored) {
4194 /*
4195 * we always have to disable PMRs or DMA may fail on
4196 * this device
4197 */
4198 if (force_on)
4199 iommu_disable_protect_mem_regions(iommu);
4200 continue;
4203 iommu_flush_write_buffer(iommu);
4205 iommu_set_root_entry(iommu);
4207 iommu->flush.flush_context(iommu, 0, 0, 0,
4208 DMA_CCMD_GLOBAL_INVL);
4209 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4210 iommu_enable_translation(iommu);
4211 iommu_disable_protect_mem_regions(iommu);
4214 return 0;
4217 static void iommu_flush_all(void)
4219 struct dmar_drhd_unit *drhd;
4220 struct intel_iommu *iommu;
4222 for_each_active_iommu(iommu, drhd) {
4223 iommu->flush.flush_context(iommu, 0, 0, 0,
4224 DMA_CCMD_GLOBAL_INVL);
4225 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4226 DMA_TLB_GLOBAL_FLUSH);
4230 static int iommu_suspend(void)
4232 struct dmar_drhd_unit *drhd;
4233 struct intel_iommu *iommu = NULL;
4234 unsigned long flag;
4236 for_each_active_iommu(iommu, drhd) {
4237 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4238 GFP_ATOMIC);
4239 if (!iommu->iommu_state)
4240 goto nomem;
4243 iommu_flush_all();
4245 for_each_active_iommu(iommu, drhd) {
4246 iommu_disable_translation(iommu);
4248 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4250 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4251 readl(iommu->reg + DMAR_FECTL_REG);
4252 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4253 readl(iommu->reg + DMAR_FEDATA_REG);
4254 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4255 readl(iommu->reg + DMAR_FEADDR_REG);
4256 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4257 readl(iommu->reg + DMAR_FEUADDR_REG);
4259 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4261 return 0;
4263 nomem:
4264 for_each_active_iommu(iommu, drhd)
4265 kfree(iommu->iommu_state);
4267 return -ENOMEM;
4270 static void iommu_resume(void)
4272 struct dmar_drhd_unit *drhd;
4273 struct intel_iommu *iommu = NULL;
4274 unsigned long flag;
4276 if (init_iommu_hw()) {
4277 if (force_on)
4278 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4279 else
4280 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4281 return;
4284 for_each_active_iommu(iommu, drhd) {
4286 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4288 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4289 iommu->reg + DMAR_FECTL_REG);
4290 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4291 iommu->reg + DMAR_FEDATA_REG);
4292 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4293 iommu->reg + DMAR_FEADDR_REG);
4294 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4295 iommu->reg + DMAR_FEUADDR_REG);
4297 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4300 for_each_active_iommu(iommu, drhd)
4301 kfree(iommu->iommu_state);
4304 static struct syscore_ops iommu_syscore_ops = {
4305 .resume = iommu_resume,
4306 .suspend = iommu_suspend,
4309 static void __init init_iommu_pm_ops(void)
4311 register_syscore_ops(&iommu_syscore_ops);
4314 #else
4315 static inline void init_iommu_pm_ops(void) {}
4316 #endif /* CONFIG_SUSPEND */
4318 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4320 struct acpi_dmar_reserved_memory *rmrr;
4321 struct dmar_rmrr_unit *rmrru;
4322 int ret;
4324 rmrr = (struct acpi_dmar_reserved_memory *)header;
4325 ret = arch_rmrr_sanity_check(rmrr);
4326 if (ret)
4327 return ret;
4329 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4330 if (!rmrru)
4331 goto out;
4333 rmrru->hdr = header;
4335 rmrru->base_address = rmrr->base_address;
4336 rmrru->end_address = rmrr->end_address;
4338 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4339 ((void *)rmrr) + rmrr->header.length,
4340 &rmrru->devices_cnt);
4341 if (rmrru->devices_cnt && rmrru->devices == NULL)
4342 goto free_rmrru;
4344 list_add(&rmrru->list, &dmar_rmrr_units);
4346 return 0;
4347 free_rmrru:
4348 kfree(rmrru);
4349 out:
4350 return -ENOMEM;
4353 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4355 struct dmar_atsr_unit *atsru;
4356 struct acpi_dmar_atsr *tmp;
4358 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4359 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4360 if (atsr->segment != tmp->segment)
4361 continue;
4362 if (atsr->header.length != tmp->header.length)
4363 continue;
4364 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4365 return atsru;
4368 return NULL;
4371 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4373 struct acpi_dmar_atsr *atsr;
4374 struct dmar_atsr_unit *atsru;
4376 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4377 return 0;
4379 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4380 atsru = dmar_find_atsr(atsr);
4381 if (atsru)
4382 return 0;
4384 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4385 if (!atsru)
4386 return -ENOMEM;
4388 /*
4389 * If memory is allocated from slab by ACPI _DSM method, we need to
4390 * copy the memory content because the memory buffer will be freed
4391 * on return.
4392 */
4393 atsru->hdr = (void *)(atsru + 1);
4394 memcpy(atsru->hdr, hdr, hdr->length);
4395 atsru->include_all = atsr->flags & 0x1;
4396 if (!atsru->include_all) {
4397 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4398 (void *)atsr + atsr->header.length,
4399 &atsru->devices_cnt);
4400 if (atsru->devices_cnt && atsru->devices == NULL) {
4401 kfree(atsru);
4402 return -ENOMEM;
4406 list_add_rcu(&atsru->list, &dmar_atsr_units);
4408 return 0;
4411 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4413 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4414 kfree(atsru);
4417 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4419 struct acpi_dmar_atsr *atsr;
4420 struct dmar_atsr_unit *atsru;
4422 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4423 atsru = dmar_find_atsr(atsr);
4424 if (atsru) {
4425 list_del_rcu(&atsru->list);
4426 synchronize_rcu();
4427 intel_iommu_free_atsr(atsru);
4430 return 0;
4433 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4435 int i;
4436 struct device *dev;
4437 struct acpi_dmar_atsr *atsr;
4438 struct dmar_atsr_unit *atsru;
4440 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4441 atsru = dmar_find_atsr(atsr);
4442 if (!atsru)
4443 return 0;
4445 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4446 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4447 i, dev)
4448 return -EBUSY;
4451 return 0;
4454 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4456 int sp, ret;
4457 struct intel_iommu *iommu = dmaru->iommu;
4459 if (g_iommus[iommu->seq_id])
4460 return 0;
4462 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4463 pr_warn("%s: Doesn't support hardware pass through.\n",
4464 iommu->name);
4465 return -ENXIO;
4467 if (!ecap_sc_support(iommu->ecap) &&
4468 domain_update_iommu_snooping(iommu)) {
4469 pr_warn("%s: Doesn't support snooping.\n",
4470 iommu->name);
4471 return -ENXIO;
4473 sp = domain_update_iommu_superpage(iommu) - 1;
4474 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4475 pr_warn("%s: Doesn't support large page.\n",
4476 iommu->name);
4477 return -ENXIO;
4480 /*
4481 * Disable translation if already enabled prior to OS handover.
4482 */
4483 if (iommu->gcmd & DMA_GCMD_TE)
4484 iommu_disable_translation(iommu);
4486 g_iommus[iommu->seq_id] = iommu;
4487 ret = iommu_init_domains(iommu);
4488 if (ret == 0)
4489 ret = iommu_alloc_root_entry(iommu);
4490 if (ret)
4491 goto out;
4493 #ifdef CONFIG_INTEL_IOMMU_SVM
4494 if (pasid_supported(iommu))
4495 intel_svm_init(iommu);
4496 #endif
4498 if (dmaru->ignored) {
4499 /*
4500 * we always have to disable PMRs or DMA may fail on this device
4501 */
4502 if (force_on)
4503 iommu_disable_protect_mem_regions(iommu);
4504 return 0;
4507 intel_iommu_init_qi(iommu);
4508 iommu_flush_write_buffer(iommu);
4510 #ifdef CONFIG_INTEL_IOMMU_SVM
4511 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4512 ret = intel_svm_enable_prq(iommu);
4513 if (ret)
4514 goto disable_iommu;
4516 #endif
4517 ret = dmar_set_interrupt(iommu);
4518 if (ret)
4519 goto disable_iommu;
4521 iommu_set_root_entry(iommu);
4522 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4523 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4524 iommu_enable_translation(iommu);
4526 iommu_disable_protect_mem_regions(iommu);
4527 return 0;
4529 disable_iommu:
4530 disable_dmar_iommu(iommu);
4531 out:
4532 free_dmar_iommu(iommu);
4533 return ret;
4536 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4538 int ret = 0;
4539 struct intel_iommu *iommu = dmaru->iommu;
4541 if (!intel_iommu_enabled)
4542 return 0;
4543 if (iommu == NULL)
4544 return -EINVAL;
4546 if (insert) {
4547 ret = intel_iommu_add(dmaru);
4548 } else {
4549 disable_dmar_iommu(iommu);
4550 free_dmar_iommu(iommu);
4553 return ret;
4556 static void intel_iommu_free_dmars(void)
4558 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4559 struct dmar_atsr_unit *atsru, *atsr_n;
4561 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4562 list_del(&rmrru->list);
4563 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4564 kfree(rmrru);
4567 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4568 list_del(&atsru->list);
4569 intel_iommu_free_atsr(atsru);
4573 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4575 int i, ret = 1;
4576 struct pci_bus *bus;
4577 struct pci_dev *bridge = NULL;
4578 struct device *tmp;
4579 struct acpi_dmar_atsr *atsr;
4580 struct dmar_atsr_unit *atsru;
4582 dev = pci_physfn(dev);
4583 for (bus = dev->bus; bus; bus = bus->parent) {
4584 bridge = bus->self;
4585 /* If it's an integrated device, allow ATS */
4586 if (!bridge)
4587 return 1;
4588 /* Connected via non-PCIe: no ATS */
4589 if (!pci_is_pcie(bridge) ||
4590 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4591 return 0;
4592 /* If we found the root port, look it up in the ATSR */
4593 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4594 break;
4597 rcu_read_lock();
4598 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4599 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4600 if (atsr->segment != pci_domain_nr(dev->bus))
4601 continue;
4603 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4604 if (tmp == &bridge->dev)
4605 goto out;
4607 if (atsru->include_all)
4608 goto out;
4610 ret = 0;
4611 out:
4612 rcu_read_unlock();
4614 return ret;
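/*
 * Editor-added illustrative sketch: a caller would typically use the result
 * of dmar_find_matched_atsr_unit() to decide whether ATS may be enabled on a
 * PCIe endpoint, roughly as the device-IOTLB enable path in this driver does.
 * The helper name below is hypothetical.
 */
#if 0	/* example only */
static void example_maybe_enable_ats(struct pci_dev *pdev)
{
	/* Only enable ATS when the device's root port is covered by an ATSR */
	if (dmar_find_matched_atsr_unit(pdev))
		pci_enable_ats(pdev, VTD_PAGE_SHIFT);
}
#endif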
4617 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4619 int ret;
4620 struct dmar_rmrr_unit *rmrru;
4621 struct dmar_atsr_unit *atsru;
4622 struct acpi_dmar_atsr *atsr;
4623 struct acpi_dmar_reserved_memory *rmrr;
4625 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4626 return 0;
4628 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4629 rmrr = container_of(rmrru->hdr,
4630 struct acpi_dmar_reserved_memory, header);
4631 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4632 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4633 ((void *)rmrr) + rmrr->header.length,
4634 rmrr->segment, rmrru->devices,
4635 rmrru->devices_cnt);
4636 if (ret < 0)
4637 return ret;
4638 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4639 dmar_remove_dev_scope(info, rmrr->segment,
4640 rmrru->devices, rmrru->devices_cnt);
4644 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4645 if (atsru->include_all)
4646 continue;
4648 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4649 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4650 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4651 (void *)atsr + atsr->header.length,
4652 atsr->segment, atsru->devices,
4653 atsru->devices_cnt);
4654 if (ret > 0)
4655 break;
4656 else if (ret < 0)
4657 return ret;
4658 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4659 if (dmar_remove_dev_scope(info, atsr->segment,
4660 atsru->devices, atsru->devices_cnt))
4661 break;
4665 return 0;
4668 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4669 unsigned long val, void *v)
4671 struct memory_notify *mhp = v;
4672 unsigned long long start, end;
4673 unsigned long start_vpfn, last_vpfn;
4675 switch (val) {
4676 case MEM_GOING_ONLINE:
4677 start = mhp->start_pfn << PAGE_SHIFT;
4678 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4679 if (iommu_domain_identity_map(si_domain, start, end)) {
4680 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4681 start, end);
4682 return NOTIFY_BAD;
4684 break;
4686 case MEM_OFFLINE:
4687 case MEM_CANCEL_ONLINE:
4688 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4689 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4690 while (start_vpfn <= last_vpfn) {
4691 struct iova *iova;
4692 struct dmar_drhd_unit *drhd;
4693 struct intel_iommu *iommu;
4694 struct page *freelist;
4696 iova = find_iova(&si_domain->iovad, start_vpfn);
4697 if (iova == NULL) {
4698 pr_debug("Failed get IOVA for PFN %lx\n",
4699 start_vpfn);
4700 break;
4703 iova = split_and_remove_iova(&si_domain->iovad, iova,
4704 start_vpfn, last_vpfn);
4705 if (iova == NULL) {
4706 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4707 start_vpfn, last_vpfn);
4708 return NOTIFY_BAD;
4711 freelist = domain_unmap(si_domain, iova->pfn_lo,
4712 iova->pfn_hi);
4714 rcu_read_lock();
4715 for_each_active_iommu(iommu, drhd)
4716 iommu_flush_iotlb_psi(iommu, si_domain,
4717 iova->pfn_lo, iova_size(iova),
4718 !freelist, 0);
4719 rcu_read_unlock();
4720 dma_free_pagelist(freelist);
4722 start_vpfn = iova->pfn_hi + 1;
4723 free_iova_mem(iova);
4725 break;
4728 return NOTIFY_OK;
4731 static struct notifier_block intel_iommu_memory_nb = {
4732 .notifier_call = intel_iommu_memory_notifier,
4733 .priority = 0
4736 static void free_all_cpu_cached_iovas(unsigned int cpu)
4738 int i;
4740 for (i = 0; i < g_num_of_iommus; i++) {
4741 struct intel_iommu *iommu = g_iommus[i];
4742 struct dmar_domain *domain;
4743 int did;
4745 if (!iommu)
4746 continue;
4748 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4749 domain = get_iommu_domain(iommu, (u16)did);
4751 if (!domain)
4752 continue;
4753 free_cpu_cached_iovas(cpu, &domain->iovad);
4758 static int intel_iommu_cpu_dead(unsigned int cpu)
4760 free_all_cpu_cached_iovas(cpu);
4761 return 0;
4764 static void intel_disable_iommus(void)
4766 struct intel_iommu *iommu = NULL;
4767 struct dmar_drhd_unit *drhd;
4769 for_each_iommu(iommu, drhd)
4770 iommu_disable_translation(iommu);
4773 void intel_iommu_shutdown(void)
4775 struct dmar_drhd_unit *drhd;
4776 struct intel_iommu *iommu = NULL;
4778 if (no_iommu || dmar_disabled)
4779 return;
4781 down_write(&dmar_global_lock);
4783 /* Disable PMRs explicitly here. */
4784 for_each_iommu(iommu, drhd)
4785 iommu_disable_protect_mem_regions(iommu);
4787 /* Make sure the IOMMUs are switched off */
4788 intel_disable_iommus();
4790 up_write(&dmar_global_lock);
4793 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4795 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4797 return container_of(iommu_dev, struct intel_iommu, iommu);
4800 static ssize_t intel_iommu_show_version(struct device *dev,
4801 struct device_attribute *attr,
4802 char *buf)
4804 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4805 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4806 return sprintf(buf, "%d:%d\n",
4807 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4809 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4811 static ssize_t intel_iommu_show_address(struct device *dev,
4812 struct device_attribute *attr,
4813 char *buf)
4815 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4816 return sprintf(buf, "%llx\n", iommu->reg_phys);
4818 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4820 static ssize_t intel_iommu_show_cap(struct device *dev,
4821 struct device_attribute *attr,
4822 char *buf)
4824 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4825 return sprintf(buf, "%llx\n", iommu->cap);
4827 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4829 static ssize_t intel_iommu_show_ecap(struct device *dev,
4830 struct device_attribute *attr,
4831 char *buf)
4833 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4834 return sprintf(buf, "%llx\n", iommu->ecap);
4836 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4838 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4839 struct device_attribute *attr,
4840 char *buf)
4842 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4843 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4845 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4847 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4848 struct device_attribute *attr,
4849 char *buf)
4851 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4852 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4853 cap_ndoms(iommu->cap)));
4855 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4857 static struct attribute *intel_iommu_attrs[] = {
4858 &dev_attr_version.attr,
4859 &dev_attr_address.attr,
4860 &dev_attr_cap.attr,
4861 &dev_attr_ecap.attr,
4862 &dev_attr_domains_supported.attr,
4863 &dev_attr_domains_used.attr,
4864 NULL,
4867 static struct attribute_group intel_iommu_group = {
4868 .name = "intel-iommu",
4869 .attrs = intel_iommu_attrs,
4872 const struct attribute_group *intel_iommu_groups[] = {
4873 &intel_iommu_group,
4874 NULL,
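/*
 * Editor-added note: these attributes typically show up in sysfs under
 * /sys/class/iommu/dmar<N>/intel-iommu/ (version, address, cap, ecap,
 * domains_supported, domains_used). The sketch below is an illustrative,
 * hypothetical extension showing the same show()/DEVICE_ATTR() pattern;
 * "seq_id" is an example name, not an existing attribute.
 */
#if 0	/* example only */
static ssize_t intel_iommu_show_seq_id(struct device *dev,
				       struct device_attribute *attr,
				       char *buf)
{
	struct intel_iommu *iommu = dev_to_intel_iommu(dev);

	return sprintf(buf, "%d\n", iommu->seq_id);
}
static DEVICE_ATTR(seq_id, S_IRUGO, intel_iommu_show_seq_id, NULL);
/* ...then add &dev_attr_seq_id.attr to intel_iommu_attrs[] before NULL */
#endif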
4877 static inline bool has_untrusted_dev(void)
4879 struct pci_dev *pdev = NULL;
4881 for_each_pci_dev(pdev)
4882 if (pdev->untrusted)
4883 return true;
4885 return false;
4888 static int __init platform_optin_force_iommu(void)
4890 if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
4891 return 0;
4893 if (no_iommu || dmar_disabled)
4894 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4896 /*
4897 * If Intel-IOMMU is disabled by default, we will apply identity
4898 * map for all devices except those marked as being untrusted.
4899 */
4900 if (dmar_disabled)
4901 iommu_identity_mapping |= IDENTMAP_ALL;
4903 dmar_disabled = 0;
4904 no_iommu = 0;
4906 return 1;
4909 static int __init probe_acpi_namespace_devices(void)
4911 struct dmar_drhd_unit *drhd;
4912 /* To avoid a -Wunused-but-set-variable warning. */
4913 struct intel_iommu *iommu __maybe_unused;
4914 struct device *dev;
4915 int i, ret = 0;
4917 for_each_active_iommu(iommu, drhd) {
4918 for_each_active_dev_scope(drhd->devices,
4919 drhd->devices_cnt, i, dev) {
4920 struct acpi_device_physical_node *pn;
4921 struct iommu_group *group;
4922 struct acpi_device *adev;
4924 if (dev->bus != &acpi_bus_type)
4925 continue;
4927 adev = to_acpi_device(dev);
4928 mutex_lock(&adev->physical_node_lock);
4929 list_for_each_entry(pn,
4930 &adev->physical_node_list, node) {
4931 group = iommu_group_get(pn->dev);
4932 if (group) {
4933 iommu_group_put(group);
4934 continue;
4937 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4938 ret = iommu_probe_device(pn->dev);
4939 if (ret)
4940 break;
4942 mutex_unlock(&adev->physical_node_lock);
4944 if (ret)
4945 return ret;
4949 return 0;
4952 int __init intel_iommu_init(void)
4954 int ret = -ENODEV;
4955 struct dmar_drhd_unit *drhd;
4956 struct intel_iommu *iommu;
4958 /*
4959 * Intel IOMMU is required for a TXT/tboot launch or platform
4960 * opt in, so enforce that.
4961 */
4962 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4964 if (iommu_init_mempool()) {
4965 if (force_on)
4966 panic("tboot: Failed to initialize iommu memory\n");
4967 return -ENOMEM;
4970 down_write(&dmar_global_lock);
4971 if (dmar_table_init()) {
4972 if (force_on)
4973 panic("tboot: Failed to initialize DMAR table\n");
4974 goto out_free_dmar;
4977 if (dmar_dev_scope_init() < 0) {
4978 if (force_on)
4979 panic("tboot: Failed to initialize DMAR device scope\n");
4980 goto out_free_dmar;
4983 up_write(&dmar_global_lock);
4985 /*
4986 * The bus notifier takes the dmar_global_lock, so lockdep will
4987 * complain later when we register it under the lock.
4988 */
4989 dmar_register_bus_notifier();
4991 down_write(&dmar_global_lock);
4993 if (no_iommu || dmar_disabled) {
4994 /*
4995 * We exit the function here to ensure IOMMU's remapping and
4996 * mempool aren't set up, which means that the IOMMU's PMRs
4997 * won't be disabled via the call to init_dmars(). So disable
4998 * it explicitly here. The PMRs were set up by tboot prior to
4999 * calling SENTER, but the kernel is expected to reset/tear
5000 * down the PMRs.
5001 */
5002 if (intel_iommu_tboot_noforce) {
5003 for_each_iommu(iommu, drhd)
5004 iommu_disable_protect_mem_regions(iommu);
5007 /*
5008 * Make sure the IOMMUs are switched off, even when we
5009 * boot into a kexec kernel and the previous kernel left
5010 * them enabled.
5011 */
5012 intel_disable_iommus();
5013 goto out_free_dmar;
5016 if (list_empty(&dmar_rmrr_units))
5017 pr_info("No RMRR found\n");
5019 if (list_empty(&dmar_atsr_units))
5020 pr_info("No ATSR found\n");
5022 if (dmar_init_reserved_ranges()) {
5023 if (force_on)
5024 panic("tboot: Failed to reserve iommu ranges\n");
5025 goto out_free_reserved_range;
5028 if (dmar_map_gfx)
5029 intel_iommu_gfx_mapped = 1;
5031 init_no_remapping_devices();
5033 ret = init_dmars();
5034 if (ret) {
5035 if (force_on)
5036 panic("tboot: Failed to initialize DMARs\n");
5037 pr_err("Initialization failed\n");
5038 goto out_free_reserved_range;
5040 up_write(&dmar_global_lock);
5042 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
5043 /*
5044 * If the system has no untrusted device or the user has decided
5045 * to disable the bounce page mechanism, we don't need swiotlb.
5046 * Mark this so that the pre-allocated bounce pages can be released
5047 * later.
5048 */
5049 if (!has_untrusted_dev() || intel_no_bounce)
5050 swiotlb = 0;
5051 #endif
5052 dma_ops = &intel_dma_ops;
5054 init_iommu_pm_ops();
5056 for_each_active_iommu(iommu, drhd) {
5057 iommu_device_sysfs_add(&iommu->iommu, NULL,
5058 intel_iommu_groups,
5059 "%s", iommu->name);
5060 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5061 iommu_device_register(&iommu->iommu);
5064 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5065 if (si_domain && !hw_pass_through)
5066 register_memory_notifier(&intel_iommu_memory_nb);
5067 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5068 intel_iommu_cpu_dead);
5070 down_read(&dmar_global_lock);
5071 if (probe_acpi_namespace_devices())
5072 pr_warn("ACPI name space devices didn't probe correctly\n");
5073 up_read(&dmar_global_lock);
5075 /* Finally, we enable the DMA remapping hardware. */
5076 for_each_iommu(iommu, drhd) {
5077 if (!drhd->ignored && !translation_pre_enabled(iommu))
5078 iommu_enable_translation(iommu);
5080 iommu_disable_protect_mem_regions(iommu);
5082 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5084 intel_iommu_enabled = 1;
5085 intel_iommu_debugfs_init();
5087 return 0;
5089 out_free_reserved_range:
5090 put_iova_domain(&reserved_iova_list);
5091 out_free_dmar:
5092 intel_iommu_free_dmars();
5093 up_write(&dmar_global_lock);
5094 iommu_exit_mempool();
5095 return ret;
5098 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5100 struct intel_iommu *iommu = opaque;
5102 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5103 return 0;
5106 /*
5107 * NB - intel-iommu lacks any sort of reference counting for the users of
5108 * dependent devices. If multiple endpoints have intersecting dependent
5109 * devices, unbinding the driver from any one of them will possibly leave
5110 * the others unable to operate.
5111 */
5112 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5114 if (!iommu || !dev || !dev_is_pci(dev))
5115 return;
5117 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5120 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5122 struct dmar_domain *domain;
5123 struct intel_iommu *iommu;
5124 unsigned long flags;
5126 assert_spin_locked(&device_domain_lock);
5128 if (WARN_ON(!info))
5129 return;
5131 iommu = info->iommu;
5132 domain = info->domain;
5134 if (info->dev) {
5135 if (dev_is_pci(info->dev) && sm_supported(iommu))
5136 intel_pasid_tear_down_entry(iommu, info->dev,
5137 PASID_RID2PASID);
5139 iommu_disable_dev_iotlb(info);
5140 domain_context_clear(iommu, info->dev);
5141 intel_pasid_free_table(info->dev);
5144 unlink_domain_info(info);
5146 spin_lock_irqsave(&iommu->lock, flags);
5147 domain_detach_iommu(domain, iommu);
5148 spin_unlock_irqrestore(&iommu->lock, flags);
5150 /* free the private domain */
5151 if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
5152 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
5153 list_empty(&domain->devices))
5154 domain_exit(info->domain);
5156 free_devinfo_mem(info);
5159 static void dmar_remove_one_dev_info(struct device *dev)
5161 struct device_domain_info *info;
5162 unsigned long flags;
5164 spin_lock_irqsave(&device_domain_lock, flags);
5165 info = dev->archdata.iommu;
5166 if (info)
5167 __dmar_remove_one_dev_info(info);
5168 spin_unlock_irqrestore(&device_domain_lock, flags);
5171 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5173 int adjust_width;
5175 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5176 domain_reserve_special_ranges(domain);
5178 /* calculate AGAW */
5179 domain->gaw = guest_width;
5180 adjust_width = guestwidth_to_adjustwidth(guest_width);
5181 domain->agaw = width_to_agaw(adjust_width);
5183 domain->iommu_coherency = 0;
5184 domain->iommu_snooping = 0;
5185 domain->iommu_superpage = 0;
5186 domain->max_addr = 0;
5188 /* always allocate the top pgd */
5189 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5190 if (!domain->pgd)
5191 return -ENOMEM;
5192 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5193 return 0;
5196 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5198 struct dmar_domain *dmar_domain;
5199 struct iommu_domain *domain;
5201 switch (type) {
5202 case IOMMU_DOMAIN_DMA:
5203 /* fallthrough */
5204 case IOMMU_DOMAIN_UNMANAGED:
5205 dmar_domain = alloc_domain(0);
5206 if (!dmar_domain) {
5207 pr_err("Can't allocate dmar_domain\n");
5208 return NULL;
5210 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5211 pr_err("Domain initialization failed\n");
5212 domain_exit(dmar_domain);
5213 return NULL;
5216 if (type == IOMMU_DOMAIN_DMA &&
5217 init_iova_flush_queue(&dmar_domain->iovad,
5218 iommu_flush_iova, iova_entry_free)) {
5219 pr_warn("iova flush queue initialization failed\n");
5220 intel_iommu_strict = 1;
5223 domain_update_iommu_cap(dmar_domain);
5225 domain = &dmar_domain->domain;
5226 domain->geometry.aperture_start = 0;
5227 domain->geometry.aperture_end =
5228 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5229 domain->geometry.force_aperture = true;
5231 return domain;
5232 case IOMMU_DOMAIN_IDENTITY:
5233 return &si_domain->domain;
5234 default:
5235 return NULL;
5238 return NULL;
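/*
 * Editor-added illustrative sketch: how an external user (e.g. a VFIO-style
 * driver) exercises the domain_alloc/attach/map/unmap callbacks above through
 * the generic IOMMU API. The device pointer, IOVA and physical address are
 * hypothetical.
 */
#if 0	/* example only */
static int example_map_one_page(struct device *dev, phys_addr_t paddr)
{
	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
	int ret;

	if (!dom)
		return -ENOMEM;

	ret = iommu_attach_device(dom, dev);
	if (ret)
		goto out_free;

	ret = iommu_map(dom, 0x100000, paddr, PAGE_SIZE,
			IOMMU_READ | IOMMU_WRITE);
	if (ret)
		goto out_detach;

	/* ... use the mapping, then tear everything down ... */
	iommu_unmap(dom, 0x100000, PAGE_SIZE);
out_detach:
	iommu_detach_device(dom, dev);
out_free:
	iommu_domain_free(dom);
	return ret;
}
#endif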
5241 static void intel_iommu_domain_free(struct iommu_domain *domain)
5243 if (domain != &si_domain->domain)
5244 domain_exit(to_dmar_domain(domain));
5247 /*
5248 * Check whether a @domain could be attached to the @dev through the
5249 * aux-domain attach/detach APIs.
5250 */
5251 static inline bool
5252 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5254 struct device_domain_info *info = dev->archdata.iommu;
5256 return info && info->auxd_enabled &&
5257 domain->type == IOMMU_DOMAIN_UNMANAGED;
5260 static void auxiliary_link_device(struct dmar_domain *domain,
5261 struct device *dev)
5263 struct device_domain_info *info = dev->archdata.iommu;
5265 assert_spin_locked(&device_domain_lock);
5266 if (WARN_ON(!info))
5267 return;
5269 domain->auxd_refcnt++;
5270 list_add(&domain->auxd, &info->auxiliary_domains);
5273 static void auxiliary_unlink_device(struct dmar_domain *domain,
5274 struct device *dev)
5276 struct device_domain_info *info = dev->archdata.iommu;
5278 assert_spin_locked(&device_domain_lock);
5279 if (WARN_ON(!info))
5280 return;
5282 list_del(&domain->auxd);
5283 domain->auxd_refcnt--;
5285 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5286 intel_pasid_free_id(domain->default_pasid);
5289 static int aux_domain_add_dev(struct dmar_domain *domain,
5290 struct device *dev)
5292 int ret;
5293 u8 bus, devfn;
5294 unsigned long flags;
5295 struct intel_iommu *iommu;
5297 iommu = device_to_iommu(dev, &bus, &devfn);
5298 if (!iommu)
5299 return -ENODEV;
5301 if (domain->default_pasid <= 0) {
5302 int pasid;
5304 pasid = intel_pasid_alloc_id(domain, PASID_MIN,
5305 pci_max_pasids(to_pci_dev(dev)),
5306 GFP_KERNEL);
5307 if (pasid <= 0) {
5308 pr_err("Can't allocate default pasid\n");
5309 return -ENODEV;
5311 domain->default_pasid = pasid;
5314 spin_lock_irqsave(&device_domain_lock, flags);
5315 /*
5316 * iommu->lock must be held to attach the domain to the iommu and to
5317 * set up the pasid entry for second level translation.
5318 */
5319 spin_lock(&iommu->lock);
5320 ret = domain_attach_iommu(domain, iommu);
5321 if (ret)
5322 goto attach_failed;
5324 /* Set up the PASID entry for mediated devices: */
5325 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5326 domain->default_pasid);
5327 if (ret)
5328 goto table_failed;
5329 spin_unlock(&iommu->lock);
5331 auxiliary_link_device(domain, dev);
5333 spin_unlock_irqrestore(&device_domain_lock, flags);
5335 return 0;
5337 table_failed:
5338 domain_detach_iommu(domain, iommu);
5339 attach_failed:
5340 spin_unlock(&iommu->lock);
5341 spin_unlock_irqrestore(&device_domain_lock, flags);
5342 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5343 intel_pasid_free_id(domain->default_pasid);
5345 return ret;
5348 static void aux_domain_remove_dev(struct dmar_domain *domain,
5349 struct device *dev)
5351 struct device_domain_info *info;
5352 struct intel_iommu *iommu;
5353 unsigned long flags;
5355 if (!is_aux_domain(dev, &domain->domain))
5356 return;
5358 spin_lock_irqsave(&device_domain_lock, flags);
5359 info = dev->archdata.iommu;
5360 iommu = info->iommu;
5362 auxiliary_unlink_device(domain, dev);
5364 spin_lock(&iommu->lock);
5365 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5366 domain_detach_iommu(domain, iommu);
5367 spin_unlock(&iommu->lock);
5369 spin_unlock_irqrestore(&device_domain_lock, flags);
5372 static int prepare_domain_attach_device(struct iommu_domain *domain,
5373 struct device *dev)
5375 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5376 struct intel_iommu *iommu;
5377 int addr_width;
5378 u8 bus, devfn;
5380 iommu = device_to_iommu(dev, &bus, &devfn);
5381 if (!iommu)
5382 return -ENODEV;
5384 /* check if this iommu agaw is sufficient for max mapped address */
5385 addr_width = agaw_to_width(iommu->agaw);
5386 if (addr_width > cap_mgaw(iommu->cap))
5387 addr_width = cap_mgaw(iommu->cap);
5389 if (dmar_domain->max_addr > (1LL << addr_width)) {
5390 dev_err(dev, "%s: iommu width (%d) is not "
5391 "sufficient for the mapped address (%llx)\n",
5392 __func__, addr_width, dmar_domain->max_addr);
5393 return -EFAULT;
5395 dmar_domain->gaw = addr_width;
5397 /*
5398 * Knock out extra levels of page tables if necessary.
5399 */
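/*
 * Editor note (illustrative): for instance, a domain built with a 5-level
 * table (agaw 3) attached to an IOMMU that only supports 4 levels (agaw 2)
 * has its unused top-level directory freed below; the table referenced by
 * the first top-level entry becomes the new pgd.
 */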
5400 while (iommu->agaw < dmar_domain->agaw) {
5401 struct dma_pte *pte;
5403 pte = dmar_domain->pgd;
5404 if (dma_pte_present(pte)) {
5405 dmar_domain->pgd = (struct dma_pte *)
5406 phys_to_virt(dma_pte_addr(pte));
5407 free_pgtable_page(pte);
5409 dmar_domain->agaw--;
5412 return 0;
5415 static int intel_iommu_attach_device(struct iommu_domain *domain,
5416 struct device *dev)
5418 int ret;
5420 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5421 device_is_rmrr_locked(dev)) {
5422 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5423 return -EPERM;
5426 if (is_aux_domain(dev, domain))
5427 return -EPERM;
5429 /* normally dev is not mapped */
5430 if (unlikely(domain_context_mapped(dev))) {
5431 struct dmar_domain *old_domain;
5433 old_domain = find_domain(dev);
5434 if (old_domain)
5435 dmar_remove_one_dev_info(dev);
5438 ret = prepare_domain_attach_device(domain, dev);
5439 if (ret)
5440 return ret;
5442 return domain_add_dev_info(to_dmar_domain(domain), dev);
5445 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5446 struct device *dev)
5448 int ret;
5450 if (!is_aux_domain(dev, domain))
5451 return -EPERM;
5453 ret = prepare_domain_attach_device(domain, dev);
5454 if (ret)
5455 return ret;
5457 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5460 static void intel_iommu_detach_device(struct iommu_domain *domain,
5461 struct device *dev)
5463 dmar_remove_one_dev_info(dev);
5466 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5467 struct device *dev)
5469 aux_domain_remove_dev(to_dmar_domain(domain), dev);
5472 static int intel_iommu_map(struct iommu_domain *domain,
5473 unsigned long iova, phys_addr_t hpa,
5474 size_t size, int iommu_prot, gfp_t gfp)
5476 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5477 u64 max_addr;
5478 int prot = 0;
5479 int ret;
5481 if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5482 return -EINVAL;
5484 if (iommu_prot & IOMMU_READ)
5485 prot |= DMA_PTE_READ;
5486 if (iommu_prot & IOMMU_WRITE)
5487 prot |= DMA_PTE_WRITE;
5488 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5489 prot |= DMA_PTE_SNP;
5491 max_addr = iova + size;
5492 if (dmar_domain->max_addr < max_addr) {
5493 u64 end;
5495 /* check if minimum agaw is sufficient for mapped address */
5496 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5497 if (end < max_addr) {
5498 pr_err("%s: iommu width (%d) is not "
5499 "sufficient for the mapped address (%llx)\n",
5500 __func__, dmar_domain->gaw, max_addr);
5501 return -EFAULT;
5503 dmar_domain->max_addr = max_addr;
5505 /* Round up size to next multiple of PAGE_SIZE, if it and
5506 the low bits of hpa would take us onto the next page */
5507 size = aligned_nrpages(hpa, size);
5508 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5509 hpa >> VTD_PAGE_SHIFT, size, prot);
5510 return ret;
5513 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5514 unsigned long iova, size_t size,
5515 struct iommu_iotlb_gather *gather)
5517 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5518 struct page *freelist = NULL;
5519 unsigned long start_pfn, last_pfn;
5520 unsigned int npages;
5521 int iommu_id, level = 0;
5523 /* Cope with horrid API which requires us to unmap more than the
5524 size argument if it happens to be a large-page mapping. */
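/*
 * Editor note (illustrative): e.g. a 4KiB unmap request that falls inside a
 * 2MiB superpage mapping is widened below to the full 2MiB, and that
 * enlarged size is what gets returned to the caller.
 */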
5525 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5526 if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5527 return 0;
5529 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5530 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5532 start_pfn = iova >> VTD_PAGE_SHIFT;
5533 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5535 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5537 npages = last_pfn - start_pfn + 1;
5539 for_each_domain_iommu(iommu_id, dmar_domain)
5540 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5541 start_pfn, npages, !freelist, 0);
5543 dma_free_pagelist(freelist);
5545 if (dmar_domain->max_addr == iova + size)
5546 dmar_domain->max_addr = iova;
5548 return size;
5551 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5552 dma_addr_t iova)
5554 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5555 struct dma_pte *pte;
5556 int level = 0;
5557 u64 phys = 0;
5559 if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5560 return 0;
5562 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5563 if (pte)
5564 phys = dma_pte_addr(pte);
5566 return phys;
5569 static inline bool scalable_mode_support(void)
5571 struct dmar_drhd_unit *drhd;
5572 struct intel_iommu *iommu;
5573 bool ret = true;
5575 rcu_read_lock();
5576 for_each_active_iommu(iommu, drhd) {
5577 if (!sm_supported(iommu)) {
5578 ret = false;
5579 break;
5582 rcu_read_unlock();
5584 return ret;
5587 static inline bool iommu_pasid_support(void)
5589 struct dmar_drhd_unit *drhd;
5590 struct intel_iommu *iommu;
5591 bool ret = true;
5593 rcu_read_lock();
5594 for_each_active_iommu(iommu, drhd) {
5595 if (!pasid_supported(iommu)) {
5596 ret = false;
5597 break;
5600 rcu_read_unlock();
5602 return ret;
5605 static bool intel_iommu_capable(enum iommu_cap cap)
5607 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5608 return domain_update_iommu_snooping(NULL) == 1;
5609 if (cap == IOMMU_CAP_INTR_REMAP)
5610 return irq_remapping_enabled == 1;
5612 return false;
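/*
 * Editor-added illustrative sketch: consumers such as VFIO query these
 * capabilities through the generic helper rather than calling the op
 * directly.
 */
#if 0	/* example only */
static bool example_pci_bus_is_coherent(void)
{
	return iommu_capable(&pci_bus_type, IOMMU_CAP_CACHE_COHERENCY);
}
#endif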
5615 static int intel_iommu_add_device(struct device *dev)
5617 struct dmar_domain *dmar_domain;
5618 struct iommu_domain *domain;
5619 struct intel_iommu *iommu;
5620 struct iommu_group *group;
5621 u8 bus, devfn;
5622 int ret;
5624 iommu = device_to_iommu(dev, &bus, &devfn);
5625 if (!iommu)
5626 return -ENODEV;
5628 iommu_device_link(&iommu->iommu, dev);
5630 if (translation_pre_enabled(iommu))
5631 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5633 group = iommu_group_get_for_dev(dev);
5635 if (IS_ERR(group))
5636 return PTR_ERR(group);
5638 iommu_group_put(group);
5640 domain = iommu_get_domain_for_dev(dev);
5641 dmar_domain = to_dmar_domain(domain);
5642 if (domain->type == IOMMU_DOMAIN_DMA) {
5643 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5644 ret = iommu_request_dm_for_dev(dev);
5645 if (ret) {
5646 dmar_remove_one_dev_info(dev);
5647 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5648 domain_add_dev_info(si_domain, dev);
5649 dev_info(dev,
5650 "Device uses a private identity domain.\n");
5653 } else {
5654 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5655 ret = iommu_request_dma_domain_for_dev(dev);
5656 if (ret) {
5657 dmar_remove_one_dev_info(dev);
5658 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5659 if (!get_private_domain_for_dev(dev)) {
5660 dev_warn(dev,
5661 "Failed to get a private domain.\n");
5662 return -ENOMEM;
5665 dev_info(dev,
5666 "Device uses a private dma domain.\n");
5671 if (device_needs_bounce(dev)) {
5672 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5673 set_dma_ops(dev, &bounce_dma_ops);
5676 return 0;
5679 static void intel_iommu_remove_device(struct device *dev)
5681 struct intel_iommu *iommu;
5682 u8 bus, devfn;
5684 iommu = device_to_iommu(dev, &bus, &devfn);
5685 if (!iommu)
5686 return;
5688 dmar_remove_one_dev_info(dev);
5690 iommu_group_remove_device(dev);
5692 iommu_device_unlink(&iommu->iommu, dev);
5694 if (device_needs_bounce(dev))
5695 set_dma_ops(dev, NULL);
5698 static void intel_iommu_get_resv_regions(struct device *device,
5699 struct list_head *head)
5701 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5702 struct iommu_resv_region *reg;
5703 struct dmar_rmrr_unit *rmrr;
5704 struct device *i_dev;
5705 int i;
5707 down_read(&dmar_global_lock);
5708 for_each_rmrr_units(rmrr) {
5709 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5710 i, i_dev) {
5711 struct iommu_resv_region *resv;
5712 enum iommu_resv_type type;
5713 size_t length;
5715 if (i_dev != device &&
5716 !is_downstream_to_pci_bridge(device, i_dev))
5717 continue;
5719 length = rmrr->end_address - rmrr->base_address + 1;
5721 type = device_rmrr_is_relaxable(device) ?
5722 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5724 resv = iommu_alloc_resv_region(rmrr->base_address,
5725 length, prot, type);
5726 if (!resv)
5727 break;
5729 list_add_tail(&resv->list, head);
5732 up_read(&dmar_global_lock);
5734 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5735 if (dev_is_pci(device)) {
5736 struct pci_dev *pdev = to_pci_dev(device);
5738 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5739 reg = iommu_alloc_resv_region(0, 1UL << 24, 0,
5740 IOMMU_RESV_DIRECT);
5741 if (reg)
5742 list_add_tail(&reg->list, head);
5745 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5747 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5748 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5749 0, IOMMU_RESV_MSI);
5750 if (!reg)
5751 return;
5752 list_add_tail(&reg->list, head);
5755 static void intel_iommu_put_resv_regions(struct device *dev,
5756 struct list_head *head)
5758 struct iommu_resv_region *entry, *next;
5760 list_for_each_entry_safe(entry, next, head, list)
5761 kfree(entry);
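/*
 * Editor-added illustrative sketch: walking the reserved regions reported by
 * the two callbacks above through the generic helpers. The device pointer is
 * hypothetical; the list must be returned with iommu_put_resv_regions().
 */
#if 0	/* example only */
static void example_dump_resv_regions(struct device *dev)
{
	struct iommu_resv_region *region;
	LIST_HEAD(resv);

	iommu_get_resv_regions(dev, &resv);
	list_for_each_entry(region, &resv, list)
		dev_info(dev, "reserved region [%pa, +%zx] type %d\n",
			 &region->start, region->length, region->type);
	iommu_put_resv_regions(dev, &resv);
}
#endif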
5764 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5766 struct device_domain_info *info;
5767 struct context_entry *context;
5768 struct dmar_domain *domain;
5769 unsigned long flags;
5770 u64 ctx_lo;
5771 int ret;
5773 domain = find_domain(dev);
5774 if (!domain)
5775 return -EINVAL;
5777 spin_lock_irqsave(&device_domain_lock, flags);
5778 spin_lock(&iommu->lock);
5780 ret = -EINVAL;
5781 info = dev->archdata.iommu;
5782 if (!info || !info->pasid_supported)
5783 goto out;
5785 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5786 if (WARN_ON(!context))
5787 goto out;
5789 ctx_lo = context[0].lo;
5791 if (!(ctx_lo & CONTEXT_PASIDE)) {
5792 ctx_lo |= CONTEXT_PASIDE;
5793 context[0].lo = ctx_lo;
5794 wmb();
5795 iommu->flush.flush_context(iommu,
5796 domain->iommu_did[iommu->seq_id],
5797 PCI_DEVID(info->bus, info->devfn),
5798 DMA_CCMD_MASK_NOBIT,
5799 DMA_CCMD_DEVICE_INVL);
5802 /* Enable PASID support in the device, if it wasn't already */
5803 if (!info->pasid_enabled)
5804 iommu_enable_dev_iotlb(info);
5806 ret = 0;
5808 out:
5809 spin_unlock(&iommu->lock);
5810 spin_unlock_irqrestore(&device_domain_lock, flags);
5812 return ret;
5815 static void intel_iommu_apply_resv_region(struct device *dev,
5816 struct iommu_domain *domain,
5817 struct iommu_resv_region *region)
5819 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5820 unsigned long start, end;
5822 start = IOVA_PFN(region->start);
5823 end = IOVA_PFN(region->start + region->length - 1);
5825 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5828 #ifdef CONFIG_INTEL_IOMMU_SVM
5829 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5831 struct intel_iommu *iommu;
5832 u8 bus, devfn;
5834 if (iommu_dummy(dev)) {
5835 dev_warn(dev,
5836 "No IOMMU translation for device; cannot enable SVM\n");
5837 return NULL;
5840 iommu = device_to_iommu(dev, &bus, &devfn);
5841 if (!iommu) {
5842 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5843 return NULL;
5846 return iommu;
5848 #endif /* CONFIG_INTEL_IOMMU_SVM */
5850 static int intel_iommu_enable_auxd(struct device *dev)
5852 struct device_domain_info *info;
5853 struct intel_iommu *iommu;
5854 unsigned long flags;
5855 u8 bus, devfn;
5856 int ret;
5858 iommu = device_to_iommu(dev, &bus, &devfn);
5859 if (!iommu || dmar_disabled)
5860 return -EINVAL;
5862 if (!sm_supported(iommu) || !pasid_supported(iommu))
5863 return -EINVAL;
5865 ret = intel_iommu_enable_pasid(iommu, dev);
5866 if (ret)
5867 return -ENODEV;
5869 spin_lock_irqsave(&device_domain_lock, flags);
5870 info = dev->archdata.iommu;
5871 info->auxd_enabled = 1;
5872 spin_unlock_irqrestore(&device_domain_lock, flags);
5874 return 0;
5877 static int intel_iommu_disable_auxd(struct device *dev)
5879 struct device_domain_info *info;
5880 unsigned long flags;
5882 spin_lock_irqsave(&device_domain_lock, flags);
5883 info = dev->archdata.iommu;
5884 if (!WARN_ON(!info))
5885 info->auxd_enabled = 0;
5886 spin_unlock_irqrestore(&device_domain_lock, flags);
5888 return 0;
5891 /*
5892 * A PCI Express designated vendor-specific extended capability is defined
5893 * in section 3.7 of the Intel scalable I/O virtualization technical spec
5894 * for system software and tools to detect endpoint devices supporting the
5895 * Intel scalable IO virtualization without host driver dependency.
5896 *
5897 * Returns the address of the matching extended capability structure within
5898 * the device's PCI configuration space or 0 if the device does not support
5899 * it.
5900 */
5901 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5903 int pos;
5904 u16 vendor, id;
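/*
 * Editor note: 0x23 is the PCI Express DVSEC extended capability ID; the
 * vendor ID and DVSEC ID fields read below live at offsets 4 and 8 of each
 * such capability, per the PCIe spec.
 */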
5906 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5907 while (pos) {
5908 pci_read_config_word(pdev, pos + 4, &vendor);
5909 pci_read_config_word(pdev, pos + 8, &id);
5910 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5911 return pos;
5913 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5916 return 0;
5919 static bool
5920 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5922 if (feat == IOMMU_DEV_FEAT_AUX) {
5923 int ret;
5925 if (!dev_is_pci(dev) || dmar_disabled ||
5926 !scalable_mode_support() || !iommu_pasid_support())
5927 return false;
5929 ret = pci_pasid_features(to_pci_dev(dev));
5930 if (ret < 0)
5931 return false;
5933 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5936 return false;
5939 static int
5940 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5942 if (feat == IOMMU_DEV_FEAT_AUX)
5943 return intel_iommu_enable_auxd(dev);
5945 return -ENODEV;
5948 static int
5949 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5951 if (feat == IOMMU_DEV_FEAT_AUX)
5952 return intel_iommu_disable_auxd(dev);
5954 return -ENODEV;
5957 static bool
5958 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5960 struct device_domain_info *info = dev->archdata.iommu;
5962 if (feat == IOMMU_DEV_FEAT_AUX)
5963 return scalable_mode_support() && info && info->auxd_enabled;
5965 return false;
5968 static int
5969 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5971 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5973 return dmar_domain->default_pasid > 0 ?
5974 dmar_domain->default_pasid : -EINVAL;
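/*
 * Editor-added illustrative sketch: the auxiliary-domain flow a mediated
 * device driver would go through using the generic wrappers that end up in
 * the aux_* callbacks above. Domain and device pointers are hypothetical.
 */
#if 0	/* example only */
static int example_aux_attach(struct iommu_domain *dom, struct device *dev)
{
	int ret, pasid;

	ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX);
	if (ret)
		return ret;

	ret = iommu_aux_attach_device(dom, dev);
	if (ret)
		return ret;

	/* PASID allocated for the domain in aux_domain_add_dev() above */
	pasid = iommu_aux_get_pasid(dom, dev);
	dev_info(dev, "aux domain attached, pasid %d\n", pasid);
	return 0;
}
#endif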
5977 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5978 struct device *dev)
5980 return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
5983 const struct iommu_ops intel_iommu_ops = {
5984 .capable = intel_iommu_capable,
5985 .domain_alloc = intel_iommu_domain_alloc,
5986 .domain_free = intel_iommu_domain_free,
5987 .attach_dev = intel_iommu_attach_device,
5988 .detach_dev = intel_iommu_detach_device,
5989 .aux_attach_dev = intel_iommu_aux_attach_device,
5990 .aux_detach_dev = intel_iommu_aux_detach_device,
5991 .aux_get_pasid = intel_iommu_aux_get_pasid,
5992 .map = intel_iommu_map,
5993 .unmap = intel_iommu_unmap,
5994 .iova_to_phys = intel_iommu_iova_to_phys,
5995 .add_device = intel_iommu_add_device,
5996 .remove_device = intel_iommu_remove_device,
5997 .get_resv_regions = intel_iommu_get_resv_regions,
5998 .put_resv_regions = intel_iommu_put_resv_regions,
5999 .apply_resv_region = intel_iommu_apply_resv_region,
6000 .device_group = pci_device_group,
6001 .dev_has_feat = intel_iommu_dev_has_feat,
6002 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
6003 .dev_enable_feat = intel_iommu_dev_enable_feat,
6004 .dev_disable_feat = intel_iommu_dev_disable_feat,
6005 .is_attach_deferred = intel_iommu_is_attach_deferred,
6006 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
6009 static void quirk_iommu_igfx(struct pci_dev *dev)
6011 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6012 dmar_map_gfx = 0;
6015 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6016 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6017 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6018 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6019 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6020 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6021 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6022 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6024 /* Broadwell igfx malfunctions with dmar */
6025 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6026 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6027 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6028 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6029 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6030 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6031 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6032 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6033 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6034 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6035 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6036 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6037 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6038 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6039 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6040 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6041 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6042 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6043 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6044 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6045 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6046 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6047 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6048 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6050 static void quirk_iommu_rwbf(struct pci_dev *dev)
6052 /*
6053 * Mobile 4 Series Chipset neglects to set RWBF capability,
6054 * but needs it. Same seems to hold for the desktop versions.
6055 */
6056 pci_info(dev, "Forcing write-buffer flush capability\n");
6057 rwbf_quirk = 1;
6060 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6061 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6062 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6063 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6064 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6065 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6066 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6068 #define GGC 0x52
6069 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
6070 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
6071 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
6072 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
6073 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
6074 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
6075 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
6076 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
6078 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6080 unsigned short ggc;
6082 if (pci_read_config_word(dev, GGC, &ggc))
6083 return;
6085 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6086 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6087 dmar_map_gfx = 0;
6088 } else if (dmar_map_gfx) {
6089 /* we have to ensure the gfx device is idle before we flush */
6090 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6091 intel_iommu_strict = 1;
6094 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6095 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6096 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6097 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6099 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6100 ISOCH DMAR unit for the Azalia sound device, but not give it any
6101 TLB entries, which causes it to deadlock. Check for that. We do
6102 this in a function called from init_dmars(), instead of in a PCI
6103 quirk, because we don't want to print the obnoxious "BIOS broken"
6104 message if VT-d is actually disabled.
6105 */
6106 static void __init check_tylersburg_isoch(void)
6108 struct pci_dev *pdev;
6109 uint32_t vtisochctrl;
6111 /* If there's no Azalia in the system anyway, forget it. */
6112 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6113 if (!pdev)
6114 return;
6115 pci_dev_put(pdev);
6117 /* System Management Registers. Might be hidden, in which case
6118 we can't do the sanity check. But that's OK, because the
6119 known-broken BIOSes _don't_ actually hide it, so far. */
6120 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6121 if (!pdev)
6122 return;
6124 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6125 pci_dev_put(pdev);
6126 return;
6129 pci_dev_put(pdev);
6131 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6132 if (vtisochctrl & 1)
6133 return;
6135 /* Drop all bits other than the number of TLB entries */
6136 vtisochctrl &= 0x1c;
6138 /* If we have the recommended number of TLB entries (16), fine. */
6139 if (vtisochctrl == 0x10)
6140 return;
6142 /* Zero TLB entries? You get to ride the short bus to school. */
6143 if (!vtisochctrl) {
6144 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6145 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6146 dmi_get_system_info(DMI_BIOS_VENDOR),
6147 dmi_get_system_info(DMI_BIOS_VERSION),
6148 dmi_get_system_info(DMI_PRODUCT_VERSION));
6149 iommu_identity_mapping |= IDENTMAP_AZALIA;
6150 return;
6153 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6154 vtisochctrl);