drivers/iommu/intel-iommu.c
1 /*
2 * Copyright © 2006-2014 Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
13 * Authors: David Woodhouse <dwmw2@infradead.org>,
14 * Ashok Raj <ashok.raj@intel.com>,
15 * Shaohua Li <shaohua.li@intel.com>,
16 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17 * Fenghua Yu <fenghua.yu@intel.com>
20 #include <linux/init.h>
21 #include <linux/bitmap.h>
22 #include <linux/debugfs.h>
23 #include <linux/export.h>
24 #include <linux/slab.h>
25 #include <linux/irq.h>
26 #include <linux/interrupt.h>
27 #include <linux/spinlock.h>
28 #include <linux/pci.h>
29 #include <linux/dmar.h>
30 #include <linux/dma-mapping.h>
31 #include <linux/mempool.h>
32 #include <linux/memory.h>
33 #include <linux/timer.h>
34 #include <linux/iova.h>
35 #include <linux/iommu.h>
36 #include <linux/intel-iommu.h>
37 #include <linux/syscore_ops.h>
38 #include <linux/tboot.h>
39 #include <linux/dmi.h>
40 #include <linux/pci-ats.h>
41 #include <linux/memblock.h>
42 #include <linux/dma-contiguous.h>
43 #include <asm/irq_remapping.h>
44 #include <asm/cacheflush.h>
45 #include <asm/iommu.h>
47 #include "irq_remapping.h"
48 #include "pci.h"
50 #define ROOT_SIZE VTD_PAGE_SIZE
51 #define CONTEXT_SIZE VTD_PAGE_SIZE
53 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
54 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
55 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
56 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
58 #define IOAPIC_RANGE_START (0xfee00000)
59 #define IOAPIC_RANGE_END (0xfeefffff)
60 #define IOVA_START_ADDR (0x1000)
62 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
64 #define MAX_AGAW_WIDTH 64
65 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
67 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
68 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
70 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
71 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
72 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
73 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
74 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
76 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
77 #define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
78 #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64))
80 /* page table handling */
81 #define LEVEL_STRIDE (9)
82 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
85 * This bitmap is used to advertise the page sizes our hardware supports
86 * to the IOMMU core, which will then use this information to split
87 * physically contiguous memory regions it is mapping into page sizes
88 * that we support.
90 * Traditionally the IOMMU core just handed us the mappings directly,
91 * after making sure the size is an order of a 4KiB page and that the
92 * mapping has natural alignment.
94 * To retain this behavior, we currently advertise that we support
95 * all page sizes that are an order of 4KiB.
97 * If at some point we'd like to utilize the IOMMU core's new behavior,
98 * we could change this to advertise the real page sizes we support.
100 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
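/*
 * A quick reading of the value above: ~0xFFFUL sets every bit from 12
 * upwards, so 4KiB, 8KiB, 16KiB, ... are all advertised as supported,
 * i.e. any power-of-two multiple of 4KiB, matching the "all page sizes
 * that are an order of 4KiB" behaviour described in the comment.
 */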
102 static inline int agaw_to_level(int agaw)
104 return agaw + 2;
107 static inline int agaw_to_width(int agaw)
109 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
112 static inline int width_to_agaw(int width)
114 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
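/*
 * Worked example for the helpers above: the default domain width of 48
 * bits gives width_to_agaw(48) = DIV_ROUND_UP(48 - 30, 9) = 2, and
 * agaw_to_level(2) = 4, i.e. a 4-level page table; agaw_to_width(2)
 * maps back to 30 + 2 * 9 = 48 bits.
 */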
117 static inline unsigned int level_to_offset_bits(int level)
119 return (level - 1) * LEVEL_STRIDE;
122 static inline int pfn_level_offset(unsigned long pfn, int level)
124 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
127 static inline unsigned long level_mask(int level)
129 return -1UL << level_to_offset_bits(level);
132 static inline unsigned long level_size(int level)
134 return 1UL << level_to_offset_bits(level);
137 static inline unsigned long align_to_level(unsigned long pfn, int level)
139 return (pfn + level_size(level) - 1) & level_mask(level);
142 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
144 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
147 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
148 are never going to work. */
149 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
151 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
154 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
156 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 static inline unsigned long page_to_dma_pfn(struct page *pg)
160 return mm_to_dma_pfn(page_to_pfn(pg));
162 static inline unsigned long virt_to_dma_pfn(void *p)
164 return page_to_dma_pfn(virt_to_page(p));
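/*
 * The mm <-> dma pfn conversions above only shift when the kernel's
 * PAGE_SHIFT differs from VTD_PAGE_SHIFT (12). On x86 with 4KiB pages
 * the two are equal and the conversions are no-ops; they exist so that
 * one MM page can cover several 4KiB VT-d pages when the kernel page
 * size is larger.
 */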
167 /* global iommu list, set NULL for ignored DMAR units */
168 static struct intel_iommu **g_iommus;
170 static void __init check_tylersburg_isoch(void);
171 static int rwbf_quirk;
174 * set to 1 to panic the kernel if VT-d cannot be successfully enabled
175 * (used when the kernel is launched with TXT)
177 static int force_on = 0;
180 * 0: Present
181 * 1-11: Reserved
182 * 12-63: Context Ptr (12 - (haw-1))
183 * 64-127: Reserved
185 struct root_entry {
186 u64 val;
187 u64 rsvd1;
189 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
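/*
 * With a 4KiB root table and 16-byte entries, ROOT_ENTRY_NR is 256: one
 * root entry per PCI bus number, each pointing (when present) at the
 * context table for that bus.
 */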
190 static inline bool root_present(struct root_entry *root)
192 return (root->val & 1);
194 static inline void set_root_present(struct root_entry *root)
196 root->val |= 1;
198 static inline void set_root_value(struct root_entry *root, unsigned long value)
200 root->val |= value & VTD_PAGE_MASK;
203 static inline struct context_entry *
204 get_context_addr_from_root(struct root_entry *root)
206 return (struct context_entry *)
207 (root_present(root)?phys_to_virt(
208 root->val & VTD_PAGE_MASK) :
209 NULL);
213 * low 64 bits:
214 * 0: present
215 * 1: fault processing disable
216 * 2-3: translation type
217 * 12-63: address space root
218 * high 64 bits:
219 * 0-2: address width
220 * 3-6: available
221 * 8-23: domain id
223 struct context_entry {
224 u64 lo;
225 u64 hi;
228 static inline bool context_present(struct context_entry *context)
230 return (context->lo & 1);
232 static inline void context_set_present(struct context_entry *context)
234 context->lo |= 1;
237 static inline void context_set_fault_enable(struct context_entry *context)
239 context->lo &= (((u64)-1) << 2) | 1;
242 static inline void context_set_translation_type(struct context_entry *context,
243 unsigned long value)
245 context->lo &= (((u64)-1) << 4) | 3;
246 context->lo |= (value & 3) << 2;
249 static inline void context_set_address_root(struct context_entry *context,
250 unsigned long value)
252 context->lo |= value & VTD_PAGE_MASK;
255 static inline void context_set_address_width(struct context_entry *context,
256 unsigned long value)
258 context->hi |= value & 7;
261 static inline void context_set_domain_id(struct context_entry *context,
262 unsigned long value)
264 context->hi |= (value & ((1 << 16) - 1)) << 8;
267 static inline void context_clear_entry(struct context_entry *context)
269 context->lo = 0;
270 context->hi = 0;
274 * 0: readable
275 * 1: writable
276 * 2-6: reserved
277 * 7: super page
278 * 8-10: available
279 * 11: snoop behavior
280 * 12-63: Host physical address
282 struct dma_pte {
283 u64 val;
286 static inline void dma_clear_pte(struct dma_pte *pte)
288 pte->val = 0;
291 static inline u64 dma_pte_addr(struct dma_pte *pte)
293 #ifdef CONFIG_64BIT
294 return pte->val & VTD_PAGE_MASK;
295 #else
296 /* Must have a full atomic 64-bit read */
297 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
298 #endif
301 static inline bool dma_pte_present(struct dma_pte *pte)
303 return (pte->val & 3) != 0;
306 static inline bool dma_pte_superpage(struct dma_pte *pte)
308 return (pte->val & (1 << 7));
311 static inline int first_pte_in_page(struct dma_pte *pte)
313 return !((unsigned long)pte & ~VTD_PAGE_MASK);
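/*
 * Each page-table page is a single 4KiB page holding 512 eight-byte PTEs
 * (LEVEL_STRIDE bits per level), so first_pte_in_page() is simply a
 * 4KiB-alignment check: true only for the first entry of a table. The
 * clear and free loops below rely on it to know when they have walked
 * off the end of the current table.
 */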
317 * This domain is a statically identity mapping domain.
318 * 1. This domain creates a static 1:1 mapping to all usable memory.
319 * 2. It maps to each iommu if successful.
320 * 3. Each iommu maps to this domain if successful.
322 static struct dmar_domain *si_domain;
323 static int hw_pass_through = 1;
325 /* devices under the same p2p bridge are owned in one domain */
326 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
328 /* domain represents a virtual machine, more than one device
329 * across iommus may be owned in one domain, e.g. kvm guest.
331 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
333 /* si_domain contains multiple devices */
334 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 2)
336 /* define the limit of IOMMUs supported in each domain */
337 #ifdef CONFIG_X86
338 # define IOMMU_UNITS_SUPPORTED MAX_IO_APICS
339 #else
340 # define IOMMU_UNITS_SUPPORTED 64
341 #endif
343 struct dmar_domain {
344 int id; /* domain id */
345 int nid; /* node id */
346 DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
347 /* bitmap of iommus this domain uses*/
349 struct list_head devices; /* all devices' list */
350 struct iova_domain iovad; /* iova's that belong to this domain */
352 struct dma_pte *pgd; /* virtual address */
353 int gaw; /* max guest address width */
355 /* adjusted guest address width, 0 is level 2 30-bit */
356 int agaw;
358 int flags; /* flags to find out type of domain */
360 int iommu_coherency;/* indicate coherency of iommu access */
361 int iommu_snooping; /* indicate snooping control feature*/
362 int iommu_count; /* reference count of iommu */
363 int iommu_superpage;/* Level of superpages supported:
364 0 == 4KiB (no superpages), 1 == 2MiB,
365 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
366 spinlock_t iommu_lock; /* protect iommu set in domain */
367 u64 max_addr; /* maximum mapped address */
370 /* PCI domain-device relationship */
371 struct device_domain_info {
372 struct list_head link; /* link to domain siblings */
373 struct list_head global; /* link to global list */
374 u8 bus; /* PCI bus number */
375 u8 devfn; /* PCI devfn number */
376 struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
377 struct intel_iommu *iommu; /* IOMMU used by this device */
378 struct dmar_domain *domain; /* pointer to domain */
381 struct dmar_rmrr_unit {
382 struct list_head list; /* list of rmrr units */
383 struct acpi_dmar_header *hdr; /* ACPI header */
384 u64 base_address; /* reserved base address*/
385 u64 end_address; /* reserved end address */
386 struct dmar_dev_scope *devices; /* target devices */
387 int devices_cnt; /* target device count */
390 struct dmar_atsr_unit {
391 struct list_head list; /* list of ATSR units */
392 struct acpi_dmar_header *hdr; /* ACPI header */
393 struct dmar_dev_scope *devices; /* target devices */
394 int devices_cnt; /* target device count */
395 u8 include_all:1; /* include all ports */
398 static LIST_HEAD(dmar_atsr_units);
399 static LIST_HEAD(dmar_rmrr_units);
401 #define for_each_rmrr_units(rmrr) \
402 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
404 static void flush_unmaps_timeout(unsigned long data);
406 static DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
408 #define HIGH_WATER_MARK 250
409 struct deferred_flush_tables {
410 int next;
411 struct iova *iova[HIGH_WATER_MARK];
412 struct dmar_domain *domain[HIGH_WATER_MARK];
413 struct page *freelist[HIGH_WATER_MARK];
416 static struct deferred_flush_tables *deferred_flush;
418 /* bitmap for indexing intel_iommus */
419 static int g_num_of_iommus;
421 static DEFINE_SPINLOCK(async_umap_flush_lock);
422 static LIST_HEAD(unmaps_to_do);
424 static int timer_on;
425 static long list_size;
427 static void domain_exit(struct dmar_domain *domain);
428 static void domain_remove_dev_info(struct dmar_domain *domain);
429 static void domain_remove_one_dev_info(struct dmar_domain *domain,
430 struct device *dev);
431 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
432 struct device *dev);
434 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
435 int dmar_disabled = 0;
436 #else
437 int dmar_disabled = 1;
438 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
440 int intel_iommu_enabled = 0;
441 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
443 static int dmar_map_gfx = 1;
444 static int dmar_forcedac;
445 static int intel_iommu_strict;
446 static int intel_iommu_superpage = 1;
448 int intel_iommu_gfx_mapped;
449 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
451 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
452 static DEFINE_SPINLOCK(device_domain_lock);
453 static LIST_HEAD(device_domain_list);
455 static struct iommu_ops intel_iommu_ops;
457 static int __init intel_iommu_setup(char *str)
459 if (!str)
460 return -EINVAL;
461 while (*str) {
462 if (!strncmp(str, "on", 2)) {
463 dmar_disabled = 0;
464 printk(KERN_INFO "Intel-IOMMU: enabled\n");
465 } else if (!strncmp(str, "off", 3)) {
466 dmar_disabled = 1;
467 printk(KERN_INFO "Intel-IOMMU: disabled\n");
468 } else if (!strncmp(str, "igfx_off", 8)) {
469 dmar_map_gfx = 0;
470 printk(KERN_INFO
471 "Intel-IOMMU: disable GFX device mapping\n");
472 } else if (!strncmp(str, "forcedac", 8)) {
473 printk(KERN_INFO
474 "Intel-IOMMU: Forcing DAC for PCI devices\n");
475 dmar_forcedac = 1;
476 } else if (!strncmp(str, "strict", 6)) {
477 printk(KERN_INFO
478 "Intel-IOMMU: disable batched IOTLB flush\n");
479 intel_iommu_strict = 1;
480 } else if (!strncmp(str, "sp_off", 6)) {
481 printk(KERN_INFO
482 "Intel-IOMMU: disable supported super page\n");
483 intel_iommu_superpage = 0;
486 str += strcspn(str, ",");
487 while (*str == ',')
488 str++;
490 return 0;
492 __setup("intel_iommu=", intel_iommu_setup);
494 static struct kmem_cache *iommu_domain_cache;
495 static struct kmem_cache *iommu_devinfo_cache;
496 static struct kmem_cache *iommu_iova_cache;
498 static inline void *alloc_pgtable_page(int node)
500 struct page *page;
501 void *vaddr = NULL;
503 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
504 if (page)
505 vaddr = page_address(page);
506 return vaddr;
509 static inline void free_pgtable_page(void *vaddr)
511 free_page((unsigned long)vaddr);
514 static inline void *alloc_domain_mem(void)
516 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
519 static void free_domain_mem(void *vaddr)
521 kmem_cache_free(iommu_domain_cache, vaddr);
524 static inline void * alloc_devinfo_mem(void)
526 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
529 static inline void free_devinfo_mem(void *vaddr)
531 kmem_cache_free(iommu_devinfo_cache, vaddr);
534 struct iova *alloc_iova_mem(void)
536 return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
539 void free_iova_mem(struct iova *iova)
541 kmem_cache_free(iommu_iova_cache, iova);
545 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
547 unsigned long sagaw;
548 int agaw = -1;
550 sagaw = cap_sagaw(iommu->cap);
551 for (agaw = width_to_agaw(max_gaw);
552 agaw >= 0; agaw--) {
553 if (test_bit(agaw, &sagaw))
554 break;
557 return agaw;
561 * Calculate max SAGAW for each iommu.
563 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
565 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
569 * Calculate agaw for each iommu.
570 * "SAGAW" may be different across iommus; use a default agaw, and
571 * fall back to a smaller supported agaw for iommus that don't support the default.
573 int iommu_calculate_agaw(struct intel_iommu *iommu)
575 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
578 /* This function only returns a single iommu in a domain */
579 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
581 int iommu_id;
583 /* si_domain and vm domain should not get here. */
584 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
585 BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
587 iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
588 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
589 return NULL;
591 return g_iommus[iommu_id];
594 static void domain_update_iommu_coherency(struct dmar_domain *domain)
596 struct dmar_drhd_unit *drhd;
597 struct intel_iommu *iommu;
598 int i, found = 0;
600 domain->iommu_coherency = 1;
602 for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
603 found = 1;
604 if (!ecap_coherent(g_iommus[i]->ecap)) {
605 domain->iommu_coherency = 0;
606 break;
609 if (found)
610 return;
612 /* No hardware attached; use lowest common denominator */
613 rcu_read_lock();
614 for_each_active_iommu(iommu, drhd) {
615 if (!ecap_coherent(iommu->ecap)) {
616 domain->iommu_coherency = 0;
617 break;
620 rcu_read_unlock();
623 static void domain_update_iommu_snooping(struct dmar_domain *domain)
625 int i;
627 domain->iommu_snooping = 1;
629 for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
630 if (!ecap_sc_support(g_iommus[i]->ecap)) {
631 domain->iommu_snooping = 0;
632 break;
637 static void domain_update_iommu_superpage(struct dmar_domain *domain)
639 struct dmar_drhd_unit *drhd;
640 struct intel_iommu *iommu = NULL;
641 int mask = 0xf;
643 if (!intel_iommu_superpage) {
644 domain->iommu_superpage = 0;
645 return;
648 /* set iommu_superpage to the smallest common denominator */
649 rcu_read_lock();
650 for_each_active_iommu(iommu, drhd) {
651 mask &= cap_super_page_val(iommu->cap);
652 if (!mask) {
653 break;
656 rcu_read_unlock();
658 domain->iommu_superpage = fls(mask);
661 /* Some capabilities may be different across iommus */
662 static void domain_update_iommu_cap(struct dmar_domain *domain)
664 domain_update_iommu_coherency(domain);
665 domain_update_iommu_snooping(domain);
666 domain_update_iommu_superpage(domain);
669 static int iommu_dummy(struct device *dev)
671 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
674 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
676 struct dmar_drhd_unit *drhd = NULL;
677 struct intel_iommu *iommu;
678 struct device *tmp;
679 struct pci_dev *ptmp, *pdev = NULL;
680 u16 segment;
681 int i;
683 if (iommu_dummy(dev))
684 return NULL;
686 if (dev_is_pci(dev)) {
687 pdev = to_pci_dev(dev);
688 segment = pci_domain_nr(pdev->bus);
689 } else if (ACPI_COMPANION(dev))
690 dev = &ACPI_COMPANION(dev)->dev;
692 rcu_read_lock();
693 for_each_active_iommu(iommu, drhd) {
694 if (pdev && segment != drhd->segment)
695 continue;
697 for_each_active_dev_scope(drhd->devices,
698 drhd->devices_cnt, i, tmp) {
699 if (tmp == dev) {
700 *bus = drhd->devices[i].bus;
701 *devfn = drhd->devices[i].devfn;
702 goto out;
705 if (!pdev || !dev_is_pci(tmp))
706 continue;
708 ptmp = to_pci_dev(tmp);
709 if (ptmp->subordinate &&
710 ptmp->subordinate->number <= pdev->bus->number &&
711 ptmp->subordinate->busn_res.end >= pdev->bus->number)
712 goto got_pdev;
715 if (pdev && drhd->include_all) {
716 got_pdev:
717 *bus = pdev->bus->number;
718 *devfn = pdev->devfn;
719 goto out;
722 iommu = NULL;
723 out:
724 rcu_read_unlock();
726 return iommu;
729 static void domain_flush_cache(struct dmar_domain *domain,
730 void *addr, int size)
732 if (!domain->iommu_coherency)
733 clflush_cache_range(addr, size);
736 /* Gets context entry for a given bus and devfn */
737 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
738 u8 bus, u8 devfn)
740 struct root_entry *root;
741 struct context_entry *context;
742 unsigned long phy_addr;
743 unsigned long flags;
745 spin_lock_irqsave(&iommu->lock, flags);
746 root = &iommu->root_entry[bus];
747 context = get_context_addr_from_root(root);
748 if (!context) {
749 context = (struct context_entry *)
750 alloc_pgtable_page(iommu->node);
751 if (!context) {
752 spin_unlock_irqrestore(&iommu->lock, flags);
753 return NULL;
755 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
756 phy_addr = virt_to_phys((void *)context);
757 set_root_value(root, phy_addr);
758 set_root_present(root);
759 __iommu_flush_cache(iommu, root, sizeof(*root));
761 spin_unlock_irqrestore(&iommu->lock, flags);
762 return &context[devfn];
765 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
767 struct root_entry *root;
768 struct context_entry *context;
769 int ret;
770 unsigned long flags;
772 spin_lock_irqsave(&iommu->lock, flags);
773 root = &iommu->root_entry[bus];
774 context = get_context_addr_from_root(root);
775 if (!context) {
776 ret = 0;
777 goto out;
779 ret = context_present(&context[devfn]);
780 out:
781 spin_unlock_irqrestore(&iommu->lock, flags);
782 return ret;
785 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
787 struct root_entry *root;
788 struct context_entry *context;
789 unsigned long flags;
791 spin_lock_irqsave(&iommu->lock, flags);
792 root = &iommu->root_entry[bus];
793 context = get_context_addr_from_root(root);
794 if (context) {
795 context_clear_entry(&context[devfn]);
796 __iommu_flush_cache(iommu, &context[devfn], \
797 sizeof(*context));
799 spin_unlock_irqrestore(&iommu->lock, flags);
802 static void free_context_table(struct intel_iommu *iommu)
804 struct root_entry *root;
805 int i;
806 unsigned long flags;
807 struct context_entry *context;
809 spin_lock_irqsave(&iommu->lock, flags);
810 if (!iommu->root_entry) {
811 goto out;
813 for (i = 0; i < ROOT_ENTRY_NR; i++) {
814 root = &iommu->root_entry[i];
815 context = get_context_addr_from_root(root);
816 if (context)
817 free_pgtable_page(context);
819 free_pgtable_page(iommu->root_entry);
820 iommu->root_entry = NULL;
821 out:
822 spin_unlock_irqrestore(&iommu->lock, flags);
825 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
826 unsigned long pfn, int *target_level)
828 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
829 struct dma_pte *parent, *pte = NULL;
830 int level = agaw_to_level(domain->agaw);
831 int offset;
833 BUG_ON(!domain->pgd);
835 if (addr_width < BITS_PER_LONG && pfn >> addr_width)
836 /* Address beyond IOMMU's addressing capabilities. */
837 return NULL;
839 parent = domain->pgd;
841 while (1) {
842 void *tmp_page;
844 offset = pfn_level_offset(pfn, level);
845 pte = &parent[offset];
846 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
847 break;
848 if (level == *target_level)
849 break;
851 if (!dma_pte_present(pte)) {
852 uint64_t pteval;
854 tmp_page = alloc_pgtable_page(domain->nid);
856 if (!tmp_page)
857 return NULL;
859 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
860 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
861 if (cmpxchg64(&pte->val, 0ULL, pteval)) {
862 /* Someone else set it while we were thinking; use theirs. */
863 free_pgtable_page(tmp_page);
864 } else {
865 dma_pte_addr(pte);
866 domain_flush_cache(domain, pte, sizeof(*pte));
869 if (level == 1)
870 break;
872 parent = phys_to_virt(dma_pte_addr(pte));
873 level--;
876 if (!*target_level)
877 *target_level = level;
879 return pte;
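/*
 * To summarise the walk above: starting from the top level implied by
 * domain->agaw, pfn_to_dma_pte() descends towards *target_level,
 * allocating missing intermediate tables on the way. The cmpxchg64()
 * means two CPUs racing to populate the same slot both end up using one
 * table; the loser simply frees its freshly allocated page. With
 * *target_level == 0 the walk stops early at the first superpage or
 * non-present entry, which is the pure-lookup case.
 */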
883 /* return address's pte at specific level */
884 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
885 unsigned long pfn,
886 int level, int *large_page)
888 struct dma_pte *parent, *pte = NULL;
889 int total = agaw_to_level(domain->agaw);
890 int offset;
892 parent = domain->pgd;
893 while (level <= total) {
894 offset = pfn_level_offset(pfn, total);
895 pte = &parent[offset];
896 if (level == total)
897 return pte;
899 if (!dma_pte_present(pte)) {
900 *large_page = total;
901 break;
904 if (pte->val & DMA_PTE_LARGE_PAGE) {
905 *large_page = total;
906 return pte;
909 parent = phys_to_virt(dma_pte_addr(pte));
910 total--;
912 return NULL;
915 /* clear last level pte; a tlb flush should follow */
916 static void dma_pte_clear_range(struct dmar_domain *domain,
917 unsigned long start_pfn,
918 unsigned long last_pfn)
920 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
921 unsigned int large_page = 1;
922 struct dma_pte *first_pte, *pte;
924 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
925 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
926 BUG_ON(start_pfn > last_pfn);
928 /* we don't need lock here; nobody else touches the iova range */
929 do {
930 large_page = 1;
931 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
932 if (!pte) {
933 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
934 continue;
936 do {
937 dma_clear_pte(pte);
938 start_pfn += lvl_to_nr_pages(large_page);
939 pte++;
940 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
942 domain_flush_cache(domain, first_pte,
943 (void *)pte - (void *)first_pte);
945 } while (start_pfn && start_pfn <= last_pfn);
948 static void dma_pte_free_level(struct dmar_domain *domain, int level,
949 struct dma_pte *pte, unsigned long pfn,
950 unsigned long start_pfn, unsigned long last_pfn)
952 pfn = max(start_pfn, pfn);
953 pte = &pte[pfn_level_offset(pfn, level)];
955 do {
956 unsigned long level_pfn;
957 struct dma_pte *level_pte;
959 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
960 goto next;
962 level_pfn = pfn & level_mask(level - 1);
963 level_pte = phys_to_virt(dma_pte_addr(pte));
965 if (level > 2)
966 dma_pte_free_level(domain, level - 1, level_pte,
967 level_pfn, start_pfn, last_pfn);
969 /* If range covers entire pagetable, free it */
970 if (!(start_pfn > level_pfn ||
971 last_pfn < level_pfn + level_size(level) - 1)) {
972 dma_clear_pte(pte);
973 domain_flush_cache(domain, pte, sizeof(*pte));
974 free_pgtable_page(level_pte);
976 next:
977 pfn += level_size(level);
978 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
981 /* free page table pages. last level pte should already be cleared */
982 static void dma_pte_free_pagetable(struct dmar_domain *domain,
983 unsigned long start_pfn,
984 unsigned long last_pfn)
986 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
988 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
989 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
990 BUG_ON(start_pfn > last_pfn);
992 /* We don't need lock here; nobody else touches the iova range */
993 dma_pte_free_level(domain, agaw_to_level(domain->agaw),
994 domain->pgd, 0, start_pfn, last_pfn);
996 /* free pgd */
997 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
998 free_pgtable_page(domain->pgd);
999 domain->pgd = NULL;
1003 /* When a page at a given level is being unlinked from its parent, we don't
1004 need to *modify* it at all. All we need to do is make a list of all the
1005 pages which can be freed just as soon as we've flushed the IOTLB and we
1006 know the hardware page-walk will no longer touch them.
1007 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1008 be freed. */
1009 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1010 int level, struct dma_pte *pte,
1011 struct page *freelist)
1013 struct page *pg;
1015 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1016 pg->freelist = freelist;
1017 freelist = pg;
1019 if (level == 1)
1020 return freelist;
1022 pte = page_address(pg);
1023 do {
1024 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1025 freelist = dma_pte_list_pagetables(domain, level - 1,
1026 pte, freelist);
1027 pte++;
1028 } while (!first_pte_in_page(pte));
1030 return freelist;
1033 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1034 struct dma_pte *pte, unsigned long pfn,
1035 unsigned long start_pfn,
1036 unsigned long last_pfn,
1037 struct page *freelist)
1039 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1041 pfn = max(start_pfn, pfn);
1042 pte = &pte[pfn_level_offset(pfn, level)];
1044 do {
1045 unsigned long level_pfn;
1047 if (!dma_pte_present(pte))
1048 goto next;
1050 level_pfn = pfn & level_mask(level);
1052 /* If range covers entire pagetable, free it */
1053 if (start_pfn <= level_pfn &&
1054 last_pfn >= level_pfn + level_size(level) - 1) {
1055 /* These subordinate page tables are going away entirely. Don't
1056 bother to clear them; we're just going to *free* them. */
1057 if (level > 1 && !dma_pte_superpage(pte))
1058 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1060 dma_clear_pte(pte);
1061 if (!first_pte)
1062 first_pte = pte;
1063 last_pte = pte;
1064 } else if (level > 1) {
1065 /* Recurse down into a level that isn't *entirely* obsolete */
1066 freelist = dma_pte_clear_level(domain, level - 1,
1067 phys_to_virt(dma_pte_addr(pte)),
1068 level_pfn, start_pfn, last_pfn,
1069 freelist);
1071 next:
1072 pfn += level_size(level);
1073 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1075 if (first_pte)
1076 domain_flush_cache(domain, first_pte,
1077 (void *)++last_pte - (void *)first_pte);
1079 return freelist;
1082 /* We can't just free the pages because the IOMMU may still be walking
1083 the page tables, and may have cached the intermediate levels. The
1084 pages can only be freed after the IOTLB flush has been done. */
1085 struct page *domain_unmap(struct dmar_domain *domain,
1086 unsigned long start_pfn,
1087 unsigned long last_pfn)
1089 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1090 struct page *freelist = NULL;
1092 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
1093 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
1094 BUG_ON(start_pfn > last_pfn);
1096 /* we don't need lock here; nobody else touches the iova range */
1097 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1098 domain->pgd, 0, start_pfn, last_pfn, NULL);
1100 /* free pgd */
1101 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1102 struct page *pgd_page = virt_to_page(domain->pgd);
1103 pgd_page->freelist = freelist;
1104 freelist = pgd_page;
1106 domain->pgd = NULL;
1109 return freelist;
1112 void dma_free_pagelist(struct page *freelist)
1114 struct page *pg;
1116 while ((pg = freelist)) {
1117 freelist = pg->freelist;
1118 free_pgtable_page(page_address(pg));
1122 /* iommu handling */
1123 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1125 struct root_entry *root;
1126 unsigned long flags;
1128 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1129 if (!root)
1130 return -ENOMEM;
1132 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1134 spin_lock_irqsave(&iommu->lock, flags);
1135 iommu->root_entry = root;
1136 spin_unlock_irqrestore(&iommu->lock, flags);
1138 return 0;
1141 static void iommu_set_root_entry(struct intel_iommu *iommu)
1143 void *addr;
1144 u32 sts;
1145 unsigned long flag;
1147 addr = iommu->root_entry;
1149 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1150 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
1152 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1154 /* Make sure hardware complete it */
1155 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1156 readl, (sts & DMA_GSTS_RTPS), sts);
1158 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1161 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1163 u32 val;
1164 unsigned long flag;
1166 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1167 return;
1169 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1170 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1172 /* Make sure hardware complete it */
1173 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1174 readl, (!(val & DMA_GSTS_WBFS)), val);
1176 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1179 /* return value determines whether we need a write buffer flush */
1180 static void __iommu_flush_context(struct intel_iommu *iommu,
1181 u16 did, u16 source_id, u8 function_mask,
1182 u64 type)
1184 u64 val = 0;
1185 unsigned long flag;
1187 switch (type) {
1188 case DMA_CCMD_GLOBAL_INVL:
1189 val = DMA_CCMD_GLOBAL_INVL;
1190 break;
1191 case DMA_CCMD_DOMAIN_INVL:
1192 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1193 break;
1194 case DMA_CCMD_DEVICE_INVL:
1195 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1196 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1197 break;
1198 default:
1199 BUG();
1201 val |= DMA_CCMD_ICC;
1203 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1204 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1206 /* Make sure hardware complete it */
1207 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1208 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1210 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1213 /* return value determines whether we need a write buffer flush */
1214 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1215 u64 addr, unsigned int size_order, u64 type)
1217 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1218 u64 val = 0, val_iva = 0;
1219 unsigned long flag;
1221 switch (type) {
1222 case DMA_TLB_GLOBAL_FLUSH:
1223 /* global flush doesn't need to set IVA_REG */
1224 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1225 break;
1226 case DMA_TLB_DSI_FLUSH:
1227 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1228 break;
1229 case DMA_TLB_PSI_FLUSH:
1230 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1231 /* IH bit is passed in as part of address */
1232 val_iva = size_order | addr;
1233 break;
1234 default:
1235 BUG();
1237 /* Note: set drain read/write */
1238 #if 0
1240 * This is probably meant to be extra safe. It looks like we can
1241 * ignore it without any impact.
1243 if (cap_read_drain(iommu->cap))
1244 val |= DMA_TLB_READ_DRAIN;
1245 #endif
1246 if (cap_write_drain(iommu->cap))
1247 val |= DMA_TLB_WRITE_DRAIN;
1249 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1250 /* Note: Only uses first TLB reg currently */
1251 if (val_iva)
1252 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1253 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1255 /* Make sure hardware complete it */
1256 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1257 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1259 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1261 /* check IOTLB invalidation granularity */
1262 if (DMA_TLB_IAIG(val) == 0)
1263 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1264 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1265 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1266 (unsigned long long)DMA_TLB_IIRG(type),
1267 (unsigned long long)DMA_TLB_IAIG(val));
1270 static struct device_domain_info *
1271 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1272 u8 bus, u8 devfn)
1274 int found = 0;
1275 unsigned long flags;
1276 struct device_domain_info *info;
1277 struct pci_dev *pdev;
1279 if (!ecap_dev_iotlb_support(iommu->ecap))
1280 return NULL;
1282 if (!iommu->qi)
1283 return NULL;
1285 spin_lock_irqsave(&device_domain_lock, flags);
1286 list_for_each_entry(info, &domain->devices, link)
1287 if (info->bus == bus && info->devfn == devfn) {
1288 found = 1;
1289 break;
1291 spin_unlock_irqrestore(&device_domain_lock, flags);
1293 if (!found || !info->dev || !dev_is_pci(info->dev))
1294 return NULL;
1296 pdev = to_pci_dev(info->dev);
1298 if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
1299 return NULL;
1301 if (!dmar_find_matched_atsr_unit(pdev))
1302 return NULL;
1304 return info;
1307 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1309 if (!info || !dev_is_pci(info->dev))
1310 return;
1312 pci_enable_ats(to_pci_dev(info->dev), VTD_PAGE_SHIFT);
1315 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1317 if (!info->dev || !dev_is_pci(info->dev) ||
1318 !pci_ats_enabled(to_pci_dev(info->dev)))
1319 return;
1321 pci_disable_ats(to_pci_dev(info->dev));
1324 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1325 u64 addr, unsigned mask)
1327 u16 sid, qdep;
1328 unsigned long flags;
1329 struct device_domain_info *info;
1331 spin_lock_irqsave(&device_domain_lock, flags);
1332 list_for_each_entry(info, &domain->devices, link) {
1333 struct pci_dev *pdev;
1334 if (!info->dev || !dev_is_pci(info->dev))
1335 continue;
1337 pdev = to_pci_dev(info->dev);
1338 if (!pci_ats_enabled(pdev))
1339 continue;
1341 sid = info->bus << 8 | info->devfn;
1342 qdep = pci_ats_queue_depth(pdev);
1343 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1345 spin_unlock_irqrestore(&device_domain_lock, flags);
1348 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1349 unsigned long pfn, unsigned int pages, int ih, int map)
1351 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1352 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1354 BUG_ON(pages == 0);
1356 if (ih)
1357 ih = 1 << 6;
1359 * Fall back to domain-selective flush if there is no PSI support or the
1360 * size is too big.
1361 * PSI requires the page size to be 2 ^ x, and the base address to be
1362 * naturally aligned to the size
1364 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1365 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1366 DMA_TLB_DSI_FLUSH);
1367 else
1368 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1369 DMA_TLB_PSI_FLUSH);
1372 * In caching mode, changes of pages from non-present to present require
1373 * flush. However, device IOTLB doesn't need to be flushed in this case.
1375 if (!cap_caching_mode(iommu->cap) || !map)
1376 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
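/*
 * The mask computed above is the size-order of the invalidation: for
 * example a 5-page range rounds up to 8 pages, giving mask = 3, and the
 * hardware invalidates 2^3 pages starting at the (naturally aligned)
 * address. Ranges larger than cap_max_amask_val() allows degrade to a
 * domain-selective flush instead.
 */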
1379 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1381 u32 pmen;
1382 unsigned long flags;
1384 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1385 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1386 pmen &= ~DMA_PMEN_EPM;
1387 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1389 /* wait for the protected region status bit to clear */
1390 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1391 readl, !(pmen & DMA_PMEN_PRS), pmen);
1393 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1396 static int iommu_enable_translation(struct intel_iommu *iommu)
1398 u32 sts;
1399 unsigned long flags;
1401 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1402 iommu->gcmd |= DMA_GCMD_TE;
1403 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1405 /* Make sure hardware complete it */
1406 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1407 readl, (sts & DMA_GSTS_TES), sts);
1409 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1410 return 0;
1413 static int iommu_disable_translation(struct intel_iommu *iommu)
1415 u32 sts;
1416 unsigned long flag;
1418 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1419 iommu->gcmd &= ~DMA_GCMD_TE;
1420 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1422 /* Make sure hardware complete it */
1423 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1424 readl, (!(sts & DMA_GSTS_TES)), sts);
1426 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1427 return 0;
1431 static int iommu_init_domains(struct intel_iommu *iommu)
1433 unsigned long ndomains;
1434 unsigned long nlongs;
1436 ndomains = cap_ndoms(iommu->cap);
1437 pr_debug("IOMMU%d: Number of Domains supported <%ld>\n",
1438 iommu->seq_id, ndomains);
1439 nlongs = BITS_TO_LONGS(ndomains);
1441 spin_lock_init(&iommu->lock);
1443 /* TBD: there might be 64K domains,
1444 * consider other allocation for future chip
1446 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1447 if (!iommu->domain_ids) {
1448 pr_err("IOMMU%d: allocating domain id array failed\n",
1449 iommu->seq_id);
1450 return -ENOMEM;
1452 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1453 GFP_KERNEL);
1454 if (!iommu->domains) {
1455 pr_err("IOMMU%d: allocating domain array failed\n",
1456 iommu->seq_id);
1457 kfree(iommu->domain_ids);
1458 iommu->domain_ids = NULL;
1459 return -ENOMEM;
1463 * if Caching mode is set, then invalid translations are tagged
1464 * with domainid 0. Hence we need to pre-allocate it.
1466 if (cap_caching_mode(iommu->cap))
1467 set_bit(0, iommu->domain_ids);
1468 return 0;
1471 static void free_dmar_iommu(struct intel_iommu *iommu)
1473 struct dmar_domain *domain;
1474 int i, count;
1475 unsigned long flags;
1477 if ((iommu->domains) && (iommu->domain_ids)) {
1478 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1480 * Domain id 0 is reserved for invalid translation
1481 * if hardware supports caching mode.
1483 if (cap_caching_mode(iommu->cap) && i == 0)
1484 continue;
1486 domain = iommu->domains[i];
1487 clear_bit(i, iommu->domain_ids);
1489 spin_lock_irqsave(&domain->iommu_lock, flags);
1490 count = --domain->iommu_count;
1491 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1492 if (count == 0)
1493 domain_exit(domain);
1497 if (iommu->gcmd & DMA_GCMD_TE)
1498 iommu_disable_translation(iommu);
1500 kfree(iommu->domains);
1501 kfree(iommu->domain_ids);
1502 iommu->domains = NULL;
1503 iommu->domain_ids = NULL;
1505 g_iommus[iommu->seq_id] = NULL;
1507 /* free context mapping */
1508 free_context_table(iommu);
1511 static struct dmar_domain *alloc_domain(bool vm)
1513 /* domain id for virtual machine, it won't be set in context */
1514 static atomic_t vm_domid = ATOMIC_INIT(0);
1515 struct dmar_domain *domain;
1517 domain = alloc_domain_mem();
1518 if (!domain)
1519 return NULL;
1521 domain->nid = -1;
1522 domain->iommu_count = 0;
1523 memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1524 domain->flags = 0;
1525 spin_lock_init(&domain->iommu_lock);
1526 INIT_LIST_HEAD(&domain->devices);
1527 if (vm) {
1528 domain->id = atomic_inc_return(&vm_domid);
1529 domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
1532 return domain;
1535 static int iommu_attach_domain(struct dmar_domain *domain,
1536 struct intel_iommu *iommu)
1538 int num;
1539 unsigned long ndomains;
1540 unsigned long flags;
1542 ndomains = cap_ndoms(iommu->cap);
1544 spin_lock_irqsave(&iommu->lock, flags);
1546 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1547 if (num >= ndomains) {
1548 spin_unlock_irqrestore(&iommu->lock, flags);
1549 printk(KERN_ERR "IOMMU: no free domain ids\n");
1550 return -ENOMEM;
1553 domain->id = num;
1554 domain->iommu_count++;
1555 set_bit(num, iommu->domain_ids);
1556 set_bit(iommu->seq_id, domain->iommu_bmp);
1557 iommu->domains[num] = domain;
1558 spin_unlock_irqrestore(&iommu->lock, flags);
1560 return 0;
1563 static void iommu_detach_domain(struct dmar_domain *domain,
1564 struct intel_iommu *iommu)
1566 unsigned long flags;
1567 int num, ndomains;
1569 spin_lock_irqsave(&iommu->lock, flags);
1570 ndomains = cap_ndoms(iommu->cap);
1571 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1572 if (iommu->domains[num] == domain) {
1573 clear_bit(num, iommu->domain_ids);
1574 iommu->domains[num] = NULL;
1575 break;
1578 spin_unlock_irqrestore(&iommu->lock, flags);
1581 static struct iova_domain reserved_iova_list;
1582 static struct lock_class_key reserved_rbtree_key;
1584 static int dmar_init_reserved_ranges(void)
1586 struct pci_dev *pdev = NULL;
1587 struct iova *iova;
1588 int i;
1590 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1592 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1593 &reserved_rbtree_key);
1595 /* IOAPIC ranges shouldn't be accessed by DMA */
1596 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1597 IOVA_PFN(IOAPIC_RANGE_END));
1598 if (!iova) {
1599 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1600 return -ENODEV;
1603 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1604 for_each_pci_dev(pdev) {
1605 struct resource *r;
1607 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1608 r = &pdev->resource[i];
1609 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1610 continue;
1611 iova = reserve_iova(&reserved_iova_list,
1612 IOVA_PFN(r->start),
1613 IOVA_PFN(r->end));
1614 if (!iova) {
1615 printk(KERN_ERR "Reserve iova failed\n");
1616 return -ENODEV;
1620 return 0;
1623 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1625 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1628 static inline int guestwidth_to_adjustwidth(int gaw)
1630 int agaw;
1631 int r = (gaw - 12) % 9;
1633 if (r == 0)
1634 agaw = gaw;
1635 else
1636 agaw = gaw + 9 - r;
1637 if (agaw > 64)
1638 agaw = 64;
1639 return agaw;
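/*
 * guestwidth_to_adjustwidth() rounds the guest address width up so that
 * (agaw - 12) is a whole number of 9-bit levels, capped at 64. For
 * example a 36-bit guest width becomes 39 (three levels), while 39 and
 * 48 are already level-aligned and pass through unchanged.
 */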
1642 static int domain_init(struct dmar_domain *domain, int guest_width)
1644 struct intel_iommu *iommu;
1645 int adjust_width, agaw;
1646 unsigned long sagaw;
1648 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1649 domain_reserve_special_ranges(domain);
1651 /* calculate AGAW */
1652 iommu = domain_get_iommu(domain);
1653 if (guest_width > cap_mgaw(iommu->cap))
1654 guest_width = cap_mgaw(iommu->cap);
1655 domain->gaw = guest_width;
1656 adjust_width = guestwidth_to_adjustwidth(guest_width);
1657 agaw = width_to_agaw(adjust_width);
1658 sagaw = cap_sagaw(iommu->cap);
1659 if (!test_bit(agaw, &sagaw)) {
1660 /* hardware doesn't support it, choose a bigger one */
1661 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1662 agaw = find_next_bit(&sagaw, 5, agaw);
1663 if (agaw >= 5)
1664 return -ENODEV;
1666 domain->agaw = agaw;
1668 if (ecap_coherent(iommu->ecap))
1669 domain->iommu_coherency = 1;
1670 else
1671 domain->iommu_coherency = 0;
1673 if (ecap_sc_support(iommu->ecap))
1674 domain->iommu_snooping = 1;
1675 else
1676 domain->iommu_snooping = 0;
1678 if (intel_iommu_superpage)
1679 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1680 else
1681 domain->iommu_superpage = 0;
1683 domain->nid = iommu->node;
1685 /* always allocate the top pgd */
1686 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1687 if (!domain->pgd)
1688 return -ENOMEM;
1689 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1690 return 0;
1693 static void domain_exit(struct dmar_domain *domain)
1695 struct dmar_drhd_unit *drhd;
1696 struct intel_iommu *iommu;
1697 struct page *freelist = NULL;
1699 /* Domain 0 is reserved, so don't process it */
1700 if (!domain)
1701 return;
1703 /* Flush any lazy unmaps that may reference this domain */
1704 if (!intel_iommu_strict)
1705 flush_unmaps_timeout(0);
1707 /* remove associated devices */
1708 domain_remove_dev_info(domain);
1710 /* destroy iovas */
1711 put_iova_domain(&domain->iovad);
1713 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1715 /* clear attached or cached domains */
1716 rcu_read_lock();
1717 for_each_active_iommu(iommu, drhd)
1718 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1719 test_bit(iommu->seq_id, domain->iommu_bmp))
1720 iommu_detach_domain(domain, iommu);
1721 rcu_read_unlock();
1723 dma_free_pagelist(freelist);
1725 free_domain_mem(domain);
1728 static int domain_context_mapping_one(struct dmar_domain *domain,
1729 struct intel_iommu *iommu,
1730 u8 bus, u8 devfn, int translation)
1732 struct context_entry *context;
1733 unsigned long flags;
1734 struct dma_pte *pgd;
1735 unsigned long num;
1736 unsigned long ndomains;
1737 int id;
1738 int agaw;
1739 struct device_domain_info *info = NULL;
1741 pr_debug("Set context mapping for %02x:%02x.%d\n",
1742 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1744 BUG_ON(!domain->pgd);
1745 BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1746 translation != CONTEXT_TT_MULTI_LEVEL);
1748 context = device_to_context_entry(iommu, bus, devfn);
1749 if (!context)
1750 return -ENOMEM;
1751 spin_lock_irqsave(&iommu->lock, flags);
1752 if (context_present(context)) {
1753 spin_unlock_irqrestore(&iommu->lock, flags);
1754 return 0;
1757 id = domain->id;
1758 pgd = domain->pgd;
1760 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1761 domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1762 int found = 0;
1764 /* find an available domain id for this device in iommu */
1765 ndomains = cap_ndoms(iommu->cap);
1766 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1767 if (iommu->domains[num] == domain) {
1768 id = num;
1769 found = 1;
1770 break;
1774 if (found == 0) {
1775 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1776 if (num >= ndomains) {
1777 spin_unlock_irqrestore(&iommu->lock, flags);
1778 printk(KERN_ERR "IOMMU: no free domain ids\n");
1779 return -EFAULT;
1782 set_bit(num, iommu->domain_ids);
1783 iommu->domains[num] = domain;
1784 id = num;
1787 /* Skip top levels of page tables for
1788 * iommus which have a smaller agaw than the default.
1789 * Unnecessary for PT mode.
1791 if (translation != CONTEXT_TT_PASS_THROUGH) {
1792 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1793 pgd = phys_to_virt(dma_pte_addr(pgd));
1794 if (!dma_pte_present(pgd)) {
1795 spin_unlock_irqrestore(&iommu->lock, flags);
1796 return -ENOMEM;
1802 context_set_domain_id(context, id);
1804 if (translation != CONTEXT_TT_PASS_THROUGH) {
1805 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1806 translation = info ? CONTEXT_TT_DEV_IOTLB :
1807 CONTEXT_TT_MULTI_LEVEL;
1810 * In pass through mode, AW must be programmed to indicate the largest
1811 * AGAW value supported by hardware. And ASR is ignored by hardware.
1813 if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1814 context_set_address_width(context, iommu->msagaw);
1815 else {
1816 context_set_address_root(context, virt_to_phys(pgd));
1817 context_set_address_width(context, iommu->agaw);
1820 context_set_translation_type(context, translation);
1821 context_set_fault_enable(context);
1822 context_set_present(context);
1823 domain_flush_cache(domain, context, sizeof(*context));
1826 * It's a non-present to present mapping. If hardware doesn't cache
1827 * non-present entries we only need to flush the write-buffer. If it
1828 * _does_ cache non-present entries, then it does so in the special
1829 * domain #0, which we have to flush:
1831 if (cap_caching_mode(iommu->cap)) {
1832 iommu->flush.flush_context(iommu, 0,
1833 (((u16)bus) << 8) | devfn,
1834 DMA_CCMD_MASK_NOBIT,
1835 DMA_CCMD_DEVICE_INVL);
1836 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1837 } else {
1838 iommu_flush_write_buffer(iommu);
1840 iommu_enable_dev_iotlb(info);
1841 spin_unlock_irqrestore(&iommu->lock, flags);
1843 spin_lock_irqsave(&domain->iommu_lock, flags);
1844 if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1845 domain->iommu_count++;
1846 if (domain->iommu_count == 1)
1847 domain->nid = iommu->node;
1848 domain_update_iommu_cap(domain);
1850 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1851 return 0;
1854 static int
1855 domain_context_mapping(struct dmar_domain *domain, struct device *dev,
1856 int translation)
1858 int ret;
1859 struct pci_dev *pdev, *tmp, *parent;
1860 struct intel_iommu *iommu;
1861 u8 bus, devfn;
1863 iommu = device_to_iommu(dev, &bus, &devfn);
1864 if (!iommu)
1865 return -ENODEV;
1867 ret = domain_context_mapping_one(domain, iommu, bus, devfn,
1868 translation);
1869 if (ret || !dev_is_pci(dev))
1870 return ret;
1872 /* dependent device mapping */
1873 pdev = to_pci_dev(dev);
1874 tmp = pci_find_upstream_pcie_bridge(pdev);
1875 if (!tmp)
1876 return 0;
1877 /* Secondary interface's bus number and devfn 0 */
1878 parent = pdev->bus->self;
1879 while (parent != tmp) {
1880 ret = domain_context_mapping_one(domain, iommu,
1881 parent->bus->number,
1882 parent->devfn, translation);
1883 if (ret)
1884 return ret;
1885 parent = parent->bus->self;
1887 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1888 return domain_context_mapping_one(domain, iommu,
1889 tmp->subordinate->number, 0,
1890 translation);
1891 else /* this is a legacy PCI bridge */
1892 return domain_context_mapping_one(domain, iommu,
1893 tmp->bus->number,
1894 tmp->devfn,
1895 translation);
1898 static int domain_context_mapped(struct device *dev)
1900 int ret;
1901 struct pci_dev *pdev, *tmp, *parent;
1902 struct intel_iommu *iommu;
1903 u8 bus, devfn;
1905 iommu = device_to_iommu(dev, &bus, &devfn);
1906 if (!iommu)
1907 return -ENODEV;
1909 ret = device_context_mapped(iommu, bus, devfn);
1910 if (!ret || !dev_is_pci(dev))
1911 return ret;
1913 /* dependent device mapping */
1914 pdev = to_pci_dev(dev);
1915 tmp = pci_find_upstream_pcie_bridge(pdev);
1916 if (!tmp)
1917 return ret;
1918 /* Secondary interface's bus number and devfn 0 */
1919 parent = pdev->bus->self;
1920 while (parent != tmp) {
1921 ret = device_context_mapped(iommu, parent->bus->number,
1922 parent->devfn);
1923 if (!ret)
1924 return ret;
1925 parent = parent->bus->self;
1927 if (pci_is_pcie(tmp))
1928 return device_context_mapped(iommu, tmp->subordinate->number,
1930 else
1931 return device_context_mapped(iommu, tmp->bus->number,
1932 tmp->devfn);
1935 /* Returns a number of VTD pages, but aligned to MM page size */
1936 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1937 size_t size)
1939 host_addr &= ~PAGE_MASK;
1940 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
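/*
 * aligned_nrpages() counts 4KiB VT-d pages but rounds the end of the
 * buffer out to an MM page boundary. For example, with 4KiB kernel
 * pages, an 0x20-byte buffer at offset 0xff0 within its page spans a
 * page boundary and therefore maps as two pages:
 * PAGE_ALIGN(0xff0 + 0x20) >> VTD_PAGE_SHIFT == 2.
 */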
1943 /* Return largest possible superpage level for a given mapping */
1944 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1945 unsigned long iov_pfn,
1946 unsigned long phy_pfn,
1947 unsigned long pages)
1949 int support, level = 1;
1950 unsigned long pfnmerge;
1952 support = domain->iommu_superpage;
1954 /* To use a large page, the virtual *and* physical addresses
1955 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1956 of them will mean we have to use smaller pages. So just
1957 merge them and check both at once. */
1958 pfnmerge = iov_pfn | phy_pfn;
1960 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1961 pages >>= VTD_STRIDE_SHIFT;
1962 if (!pages)
1963 break;
1964 pfnmerge >>= VTD_STRIDE_SHIFT;
1965 level++;
1966 support--;
1968 return level;
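/*
 * The returned level is 1 for 4KiB mappings, 2 for 2MiB, 3 for 1GiB and
 * so on, never exceeding what domain->iommu_superpage allows. ORing the
 * two pfns first is a shortcut: a large page is only usable when the
 * IOVA and the physical address share the same alignment, so any low
 * bit set in either one forces a smaller page size.
 */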
1971 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1972 struct scatterlist *sg, unsigned long phys_pfn,
1973 unsigned long nr_pages, int prot)
1975 struct dma_pte *first_pte = NULL, *pte = NULL;
1976 phys_addr_t uninitialized_var(pteval);
1977 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1978 unsigned long sg_res = 0;
1979 unsigned int largepage_lvl = 0;
1980 unsigned long lvl_pages = 0;
1982 BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1984 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1985 return -EINVAL;
1987 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1989 if (!sg) {
1990 sg_res = nr_pages;
1991 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1994 while (nr_pages > 0) {
1995 uint64_t tmp;
1997 if (!sg_res) {
1998 sg_res = aligned_nrpages(sg->offset, sg->length);
1999 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
2000 sg->dma_length = sg->length;
2001 pteval = page_to_phys(sg_page(sg)) | prot;
2002 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2005 if (!pte) {
2006 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2008 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2009 if (!pte)
2010 return -ENOMEM;
2011 /* It is a large page */
2012 if (largepage_lvl > 1) {
2013 pteval |= DMA_PTE_LARGE_PAGE;
2014 /* Ensure that old small page tables are removed to make room
2015 for superpage, if they exist. */
2016 dma_pte_clear_range(domain, iov_pfn,
2017 iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
2018 dma_pte_free_pagetable(domain, iov_pfn,
2019 iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
2020 } else {
2021 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2025 /* We don't need lock here, nobody else
2026 * touches the iova range
2028 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2029 if (tmp) {
2030 static int dumps = 5;
2031 printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2032 iov_pfn, tmp, (unsigned long long)pteval);
2033 if (dumps) {
2034 dumps--;
2035 debug_dma_dump_mappings(NULL);
2037 WARN_ON(1);
2040 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2042 BUG_ON(nr_pages < lvl_pages);
2043 BUG_ON(sg_res < lvl_pages);
2045 nr_pages -= lvl_pages;
2046 iov_pfn += lvl_pages;
2047 phys_pfn += lvl_pages;
2048 pteval += lvl_pages * VTD_PAGE_SIZE;
2049 sg_res -= lvl_pages;
2051 /* If the next PTE would be the first in a new page, then we
2052 need to flush the cache on the entries we've just written.
2053 And then we'll need to recalculate 'pte', so clear it and
2054 let it get set again in the if (!pte) block above.
2056 If we're done (!nr_pages) we need to flush the cache too.
2058 Also if we've been setting superpages, we may need to
2059 recalculate 'pte' and switch back to smaller pages for the
2060 end of the mapping, if the trailing size is not enough to
2061 use another superpage (i.e. sg_res < lvl_pages). */
2062 pte++;
2063 if (!nr_pages || first_pte_in_page(pte) ||
2064 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2065 domain_flush_cache(domain, first_pte,
2066 (void *)pte - (void *)first_pte);
2067 pte = NULL;
2070 if (!sg_res && nr_pages)
2071 sg = sg_next(sg);
2073 return 0;
2076 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2077 struct scatterlist *sg, unsigned long nr_pages,
2078 int prot)
2080 return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2083 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2084 unsigned long phys_pfn, unsigned long nr_pages,
2085 int prot)
2087 return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
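/*
 * A minimal usage sketch, mirroring the identity-map paths further
 * down in this file: mapping the first 16MiB of memory 1:1 with
 * read/write permission amounts to
 * domain_pfn_mapping(domain, 0, 0, 4096, DMA_PTE_READ | DMA_PTE_WRITE),
 * i.e. iov_pfn == phys_pfn and nr_pages counted in 4KiB VT-d pages.
 */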
2090 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
2092 if (!iommu)
2093 return;
2095 clear_context_table(iommu, bus, devfn);
2096 iommu->flush.flush_context(iommu, 0, 0, 0,
2097 DMA_CCMD_GLOBAL_INVL);
2098 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2101 static inline void unlink_domain_info(struct device_domain_info *info)
2103 assert_spin_locked(&device_domain_lock);
2104 list_del(&info->link);
2105 list_del(&info->global);
2106 if (info->dev)
2107 info->dev->archdata.iommu = NULL;
2110 static void domain_remove_dev_info(struct dmar_domain *domain)
2112 struct device_domain_info *info;
2113 unsigned long flags, flags2;
2115 spin_lock_irqsave(&device_domain_lock, flags);
2116 while (!list_empty(&domain->devices)) {
2117 info = list_entry(domain->devices.next,
2118 struct device_domain_info, link);
2119 unlink_domain_info(info);
2120 spin_unlock_irqrestore(&device_domain_lock, flags);
2122 iommu_disable_dev_iotlb(info);
2123 iommu_detach_dev(info->iommu, info->bus, info->devfn);
2125 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
2126 iommu_detach_dependent_devices(info->iommu, info->dev);
2127 /* clear this iommu in iommu_bmp, update iommu count
2128 * and capabilities */
2130 spin_lock_irqsave(&domain->iommu_lock, flags2);
2131 if (test_and_clear_bit(info->iommu->seq_id,
2132 domain->iommu_bmp)) {
2133 domain->iommu_count--;
2134 domain_update_iommu_cap(domain);
2136 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2139 free_devinfo_mem(info);
2140 spin_lock_irqsave(&device_domain_lock, flags);
2142 spin_unlock_irqrestore(&device_domain_lock, flags);
2146 /* find_domain
2147 * Note: we use struct device->archdata.iommu to store the info */
2149 static struct dmar_domain *find_domain(struct device *dev)
2151 struct device_domain_info *info;
2153 /* No lock here, assumes no domain exit in normal case */
2154 info = dev->archdata.iommu;
2155 if (info)
2156 return info->domain;
2157 return NULL;
2160 static inline struct device_domain_info *
2161 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2163 struct device_domain_info *info;
2165 list_for_each_entry(info, &device_domain_list, global)
2166 if (info->iommu->segment == segment && info->bus == bus &&
2167 info->devfn == devfn)
2168 return info;
2170 return NULL;
2173 static struct dmar_domain *dmar_insert_dev_info(struct intel_iommu *iommu,
2174 int bus, int devfn,
2175 struct device *dev,
2176 struct dmar_domain *domain)
2178 struct dmar_domain *found = NULL;
2179 struct device_domain_info *info;
2180 unsigned long flags;
2182 info = alloc_devinfo_mem();
2183 if (!info)
2184 return NULL;
2186 info->bus = bus;
2187 info->devfn = devfn;
2188 info->dev = dev;
2189 info->domain = domain;
2190 info->iommu = iommu;
2191 if (!dev)
2192 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2194 spin_lock_irqsave(&device_domain_lock, flags);
2195 if (dev)
2196 found = find_domain(dev);
2197 else {
2198 struct device_domain_info *info2;
2199 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2200 if (info2)
2201 found = info2->domain;
2203 if (found) {
2204 spin_unlock_irqrestore(&device_domain_lock, flags);
2205 free_devinfo_mem(info);
2206 /* Caller must free the original domain */
2207 return found;
2210 list_add(&info->link, &domain->devices);
2211 list_add(&info->global, &device_domain_list);
2212 if (dev)
2213 dev->archdata.iommu = info;
2214 spin_unlock_irqrestore(&device_domain_lock, flags);
2216 return domain;
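/*
 * Sketch of the expected caller pattern (hypothetical variable names),
 * following the "Caller must free the original domain" note above and
 * the way get_domain_for_dev() below uses this helper:
 *
 *	tmp = alloc_domain(false);
 *	...
 *	domain = dmar_insert_dev_info(iommu, bus, devfn, dev, tmp);
 *	if (domain != tmp)
 *		domain_exit(tmp);	// lost the race, free our copy
 */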
2219 /* domain is initialized */
2220 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2222 struct dmar_domain *domain, *free = NULL;
2223 struct intel_iommu *iommu = NULL;
2224 struct device_domain_info *info;
2225 struct pci_dev *dev_tmp = NULL;
2226 unsigned long flags;
2227 u8 bus, devfn, bridge_bus, bridge_devfn;
2229 domain = find_domain(dev);
2230 if (domain)
2231 return domain;
2233 if (dev_is_pci(dev)) {
2234 struct pci_dev *pdev = to_pci_dev(dev);
2235 u16 segment;
2237 segment = pci_domain_nr(pdev->bus);
2238 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
2239 if (dev_tmp) {
2240 if (pci_is_pcie(dev_tmp)) {
2241 bridge_bus = dev_tmp->subordinate->number;
2242 bridge_devfn = 0;
2243 } else {
2244 bridge_bus = dev_tmp->bus->number;
2245 bridge_devfn = dev_tmp->devfn;
2247 spin_lock_irqsave(&device_domain_lock, flags);
2248 info = dmar_search_domain_by_dev_info(segment,
2249 bridge_bus,
2250 bridge_devfn);
2251 if (info) {
2252 iommu = info->iommu;
2253 domain = info->domain;
2255 spin_unlock_irqrestore(&device_domain_lock, flags);
2256 /* pcie-pci bridge already has a domain, use it */
2257 if (info)
2258 goto found_domain;
2262 iommu = device_to_iommu(dev, &bus, &devfn);
2263 if (!iommu)
2264 goto error;
2266 /* Allocate and initialize new domain for the device */
2267 domain = alloc_domain(false);
2268 if (!domain)
2269 goto error;
2270 if (iommu_attach_domain(domain, iommu)) {
2271 free_domain_mem(domain);
2272 domain = NULL;
2273 goto error;
2275 free = domain;
2276 if (domain_init(domain, gaw))
2277 goto error;
2279 /* register pcie-to-pci device */
2280 if (dev_tmp) {
2281 domain = dmar_insert_dev_info(iommu, bridge_bus, bridge_devfn,
2282 NULL, domain);
2283 if (!domain)
2284 goto error;
2287 found_domain:
2288 domain = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2289 error:
2290 if (free != domain)
2291 domain_exit(free);
2293 return domain;
2296 static int iommu_identity_mapping;
2297 #define IDENTMAP_ALL 1
2298 #define IDENTMAP_GFX 2
2299 #define IDENTMAP_AZALIA 4
2301 static int iommu_domain_identity_map(struct dmar_domain *domain,
2302 unsigned long long start,
2303 unsigned long long end)
2305 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2306 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2308 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2309 dma_to_mm_pfn(last_vpfn))) {
2310 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2311 return -ENOMEM;
2314 pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2315 start, end, domain->id);
2317 /* RMRR ranges might overlap with physical memory ranges,
2318 * so clear them first */
2320 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2322 return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2323 last_vpfn - first_vpfn + 1,
2324 DMA_PTE_READ|DMA_PTE_WRITE);
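/*
 * Worked example: the ISA/LPC workaround below calls this with
 * start == 0 and end == 16MiB - 1.  With 4KiB VT-d pages that gives
 * first_vpfn == 0 and last_vpfn == 4095, so the reserved IOVA range
 * and the 1:1 mapping both cover exactly 4096 pages.
 */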
2327 static int iommu_prepare_identity_map(struct device *dev,
2328 unsigned long long start,
2329 unsigned long long end)
2331 struct dmar_domain *domain;
2332 int ret;
2334 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2335 if (!domain)
2336 return -ENOMEM;
2338 /* For _hardware_ passthrough, don't bother. But for software
2339 passthrough, we do it anyway -- it may indicate a memory
2340 range which is reserved in E820 and so never got set up
2341 in si_domain to begin with */
2342 if (domain == si_domain && hw_pass_through) {
2343 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2344 dev_name(dev), start, end);
2345 return 0;
2348 printk(KERN_INFO
2349 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2350 dev_name(dev), start, end);
2352 if (end < start) {
2353 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2354 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2355 dmi_get_system_info(DMI_BIOS_VENDOR),
2356 dmi_get_system_info(DMI_BIOS_VERSION),
2357 dmi_get_system_info(DMI_PRODUCT_VERSION));
2358 ret = -EIO;
2359 goto error;
2362 if (end >> agaw_to_width(domain->agaw)) {
2363 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2364 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2365 agaw_to_width(domain->agaw),
2366 dmi_get_system_info(DMI_BIOS_VENDOR),
2367 dmi_get_system_info(DMI_BIOS_VERSION),
2368 dmi_get_system_info(DMI_PRODUCT_VERSION));
2369 ret = -EIO;
2370 goto error;
2373 ret = iommu_domain_identity_map(domain, start, end);
2374 if (ret)
2375 goto error;
2377 /* context entry init */
2378 ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2379 if (ret)
2380 goto error;
2382 return 0;
2384 error:
2385 domain_exit(domain);
2386 return ret;
2389 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2390 struct device *dev)
2392 if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2393 return 0;
2394 return iommu_prepare_identity_map(dev, rmrr->base_address,
2395 rmrr->end_address);
2398 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2399 static inline void iommu_prepare_isa(void)
2401 struct pci_dev *pdev;
2402 int ret;
2404 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2405 if (!pdev)
2406 return;
2408 printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2409 ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2411 if (ret)
2412 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2413 "floppy might not work\n");
2416 #else
2417 static inline void iommu_prepare_isa(void)
2419 return;
2421 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2423 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2425 static int __init si_domain_init(int hw)
2427 struct dmar_drhd_unit *drhd;
2428 struct intel_iommu *iommu;
2429 int nid, ret = 0;
2431 si_domain = alloc_domain(false);
2432 if (!si_domain)
2433 return -EFAULT;
2435 si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2437 for_each_active_iommu(iommu, drhd) {
2438 ret = iommu_attach_domain(si_domain, iommu);
2439 if (ret) {
2440 domain_exit(si_domain);
2441 return -EFAULT;
2445 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2446 domain_exit(si_domain);
2447 return -EFAULT;
2450 pr_debug("IOMMU: identity mapping domain is domain %d\n",
2451 si_domain->id);
2453 if (hw)
2454 return 0;
2456 for_each_online_node(nid) {
2457 unsigned long start_pfn, end_pfn;
2458 int i;
2460 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2461 ret = iommu_domain_identity_map(si_domain,
2462 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2463 if (ret)
2464 return ret;
2468 return 0;
2471 static int identity_mapping(struct device *dev)
2473 struct device_domain_info *info;
2475 if (likely(!iommu_identity_mapping))
2476 return 0;
2478 info = dev->archdata.iommu;
2479 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2480 return (info->domain == si_domain);
2482 return 0;
2485 static int domain_add_dev_info(struct dmar_domain *domain,
2486 struct device *dev, int translation)
2488 struct dmar_domain *ndomain;
2489 struct intel_iommu *iommu;
2490 u8 bus, devfn;
2491 int ret;
2493 iommu = device_to_iommu(dev, &bus, &devfn);
2494 if (!iommu)
2495 return -ENODEV;
2497 ndomain = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2498 if (ndomain != domain)
2499 return -EBUSY;
2501 ret = domain_context_mapping(domain, dev, translation);
2502 if (ret) {
2503 domain_remove_one_dev_info(domain, dev);
2504 return ret;
2507 return 0;
2510 static bool device_has_rmrr(struct device *dev)
2512 struct dmar_rmrr_unit *rmrr;
2513 struct device *tmp;
2514 int i;
2516 rcu_read_lock();
2517 for_each_rmrr_units(rmrr) {
2519 /* Return TRUE if this RMRR contains the device that
2520 * is passed in. */
2522 for_each_active_dev_scope(rmrr->devices,
2523 rmrr->devices_cnt, i, tmp)
2524 if (tmp == dev) {
2525 rcu_read_unlock();
2526 return true;
2529 rcu_read_unlock();
2530 return false;
2534 * There are a couple cases where we need to restrict the functionality of
2535 * devices associated with RMRRs. The first is when evaluating a device for
2536 * identity mapping because problems exist when devices are moved in and out
2537 * of domains and their respective RMRR information is lost. This means that
2538 * a device with associated RMRRs will never be in a "passthrough" domain.
2539 * The second is use of the device through the IOMMU API. This interface
2540 * expects to have full control of the IOVA space for the device. We cannot
2541 * satisfy both the requirement that RMRR access is maintained and have an
2542 * unencumbered IOVA space. We also have no ability to quiesce the device's
2543 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2544 * We therefore prevent devices associated with an RMRR from participating in
2545 * the IOMMU API, which eliminates them from device assignment.
2547 * In both cases we assume that PCI USB devices with RMRRs have them largely
2548 * for historical reasons and that the RMRR space is not actively used post
2549 * boot. This exclusion may change if vendors begin to abuse it.
2551 * The same exception is made for graphics devices, with the requirement that
2552 * any use of the RMRR regions will be torn down before assigning the device
2553 * to a guest.
2555 static bool device_is_rmrr_locked(struct device *dev)
2557 if (!device_has_rmrr(dev))
2558 return false;
2560 if (dev_is_pci(dev)) {
2561 struct pci_dev *pdev = to_pci_dev(dev);
2563 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2564 return false;
2567 return true;
2570 static int iommu_should_identity_map(struct device *dev, int startup)
2573 if (dev_is_pci(dev)) {
2574 struct pci_dev *pdev = to_pci_dev(dev);
2576 if (device_is_rmrr_locked(dev))
2577 return 0;
2579 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2580 return 1;
2582 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2583 return 1;
2585 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2586 return 0;
2589 * We want to start off with all devices in the 1:1 domain, and
2590 * take them out later if we find they can't access all of memory.
2592 * However, we can't do this for PCI devices behind bridges,
2593 * because all PCI devices behind the same bridge will end up
2594 * with the same source-id on their transactions.
2596 * Practically speaking, we can't change things around for these
2597 * devices at run-time, because we can't be sure there'll be no
2598 * DMA transactions in flight for any of their siblings.
2600 * So PCI devices (unless they're on the root bus) as well as
2601 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2602 * the 1:1 domain, just in _case_ one of their siblings turns out
2603 * not to be able to map all of memory.
2605 if (!pci_is_pcie(pdev)) {
2606 if (!pci_is_root_bus(pdev->bus))
2607 return 0;
2608 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2609 return 0;
2610 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2611 return 0;
2612 } else {
2613 if (device_has_rmrr(dev))
2614 return 0;
2618 * At boot time, we don't yet know if devices will be 64-bit capable.
2619 * Assume that they will — if they turn out not to be, then we can
2620 * take them out of the 1:1 domain later.
2622 if (!startup) {
2624 * If the device's dma_mask is less than the system's memory
2625 * size then this is not a candidate for identity mapping.
2627 u64 dma_mask = *dev->dma_mask;
2629 if (dev->coherent_dma_mask &&
2630 dev->coherent_dma_mask < dma_mask)
2631 dma_mask = dev->coherent_dma_mask;
2633 return dma_mask >= dma_get_required_mask(dev);
2636 return 1;
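/*
 * Example of the runtime (startup == 0) check above: a device whose
 * dma_mask is DMA_BIT_MASK(32) on a machine with 8GiB of RAM sees
 * dma_get_required_mask() report roughly a 33-bit mask, so
 * "dma_mask >= required mask" is false and the device is taken out of
 * the 1:1 domain instead of being handed physical addresses above
 * 4GiB that it could not reach.
 */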
2639 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2641 int ret;
2643 if (!iommu_should_identity_map(dev, 1))
2644 return 0;
2646 ret = domain_add_dev_info(si_domain, dev,
2647 hw ? CONTEXT_TT_PASS_THROUGH :
2648 CONTEXT_TT_MULTI_LEVEL);
2649 if (!ret)
2650 pr_info("IOMMU: %s identity mapping for device %s\n",
2651 hw ? "hardware" : "software", dev_name(dev));
2652 else if (ret == -ENODEV)
2653 /* device not associated with an iommu */
2654 ret = 0;
2656 return ret;
2660 static int __init iommu_prepare_static_identity_mapping(int hw)
2662 struct pci_dev *pdev = NULL;
2663 struct dmar_drhd_unit *drhd;
2664 struct intel_iommu *iommu;
2665 struct device *dev;
2666 int i;
2667 int ret = 0;
2669 ret = si_domain_init(hw);
2670 if (ret)
2671 return -EFAULT;
2673 for_each_pci_dev(pdev) {
2674 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2675 if (ret)
2676 return ret;
2679 for_each_active_iommu(iommu, drhd)
2680 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2681 struct acpi_device_physical_node *pn;
2682 struct acpi_device *adev;
2684 if (dev->bus != &acpi_bus_type)
2685 continue;
2687 adev = to_acpi_device(dev);
2688 mutex_lock(&adev->physical_node_lock);
2689 list_for_each_entry(pn, &adev->physical_node_list, node) {
2690 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2691 if (ret)
2692 break;
2694 mutex_unlock(&adev->physical_node_lock);
2695 if (ret)
2696 return ret;
2699 return 0;
2702 static int __init init_dmars(void)
2704 struct dmar_drhd_unit *drhd;
2705 struct dmar_rmrr_unit *rmrr;
2706 struct device *dev;
2707 struct intel_iommu *iommu;
2708 int i, ret;
2711 * for each drhd
2712 * allocate root
2713 * initialize and program root entry to not present
2714 * endfor
2716 for_each_drhd_unit(drhd) {
2718 /* lock not needed as this is only incremented in the single-
2719 * threaded kernel __init code path; all other accesses are
2720 * read-only */
2722 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2723 g_num_of_iommus++;
2724 continue;
2726 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2727 IOMMU_UNITS_SUPPORTED);
2730 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2731 GFP_KERNEL);
2732 if (!g_iommus) {
2733 printk(KERN_ERR "Allocating global iommu array failed\n");
2734 ret = -ENOMEM;
2735 goto error;
2738 deferred_flush = kzalloc(g_num_of_iommus *
2739 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2740 if (!deferred_flush) {
2741 ret = -ENOMEM;
2742 goto free_g_iommus;
2745 for_each_active_iommu(iommu, drhd) {
2746 g_iommus[iommu->seq_id] = iommu;
2748 ret = iommu_init_domains(iommu);
2749 if (ret)
2750 goto free_iommu;
2753 /* TBD:
2754 * we could share the same root & context tables
2755 * among all IOMMUs; need to split them out later. */
2757 ret = iommu_alloc_root_entry(iommu);
2758 if (ret) {
2759 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2760 goto free_iommu;
2762 if (!ecap_pass_through(iommu->ecap))
2763 hw_pass_through = 0;
2767 /* Start from a sane iommu hardware state. */
2769 for_each_active_iommu(iommu, drhd) {
2771 /* If queued invalidation was already initialized by us
2772 * (for example, while enabling interrupt remapping) then
2773 * things are already rolling from a sane state. */
2775 if (iommu->qi)
2776 continue;
2779 /* Clear any previous faults. */
2781 dmar_fault(-1, iommu);
2783 /* Disable queued invalidation if supported and already enabled
2784 * before OS handover. */
2786 dmar_disable_qi(iommu);
2789 for_each_active_iommu(iommu, drhd) {
2790 if (dmar_enable_qi(iommu)) {
2792 /* Queued invalidation not enabled, use register-based
2793 * invalidation */
2795 iommu->flush.flush_context = __iommu_flush_context;
2796 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2797 printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2798 "invalidation\n",
2799 iommu->seq_id,
2800 (unsigned long long)drhd->reg_base_addr);
2801 } else {
2802 iommu->flush.flush_context = qi_flush_context;
2803 iommu->flush.flush_iotlb = qi_flush_iotlb;
2804 printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2805 "invalidation\n",
2806 iommu->seq_id,
2807 (unsigned long long)drhd->reg_base_addr);
2811 if (iommu_pass_through)
2812 iommu_identity_mapping |= IDENTMAP_ALL;
2814 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2815 iommu_identity_mapping |= IDENTMAP_GFX;
2816 #endif
2818 check_tylersburg_isoch();
2821 /* If pass-through is not set or not enabled, set up context entries
2822 * for identity mappings for rmrr, gfx and isa, possibly falling back
2823 * to static identity mapping if iommu_identity_mapping is set. */
2825 if (iommu_identity_mapping) {
2826 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2827 if (ret) {
2828 printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2829 goto free_iommu;
2833 * For each rmrr
2834 * for each dev attached to rmrr
2835 * do
2836 * locate drhd for dev, alloc domain for dev
2837 * allocate free domain
2838 * allocate page table entries for rmrr
2839 * if context not allocated for bus
2840 * allocate and init context
2841 * set present in root table for this bus
2842 * init context with domain, translation etc
2843 * endfor
2844 * endfor
2846 printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2847 for_each_rmrr_units(rmrr) {
2848 /* some BIOSes list non-existent devices in the DMAR table. */
2849 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2850 i, dev) {
2851 ret = iommu_prepare_rmrr_dev(rmrr, dev);
2852 if (ret)
2853 printk(KERN_ERR
2854 "IOMMU: mapping reserved region failed\n");
2858 iommu_prepare_isa();
2861 * for each drhd
2862 * enable fault log
2863 * global invalidate context cache
2864 * global invalidate iotlb
2865 * enable translation
2867 for_each_iommu(iommu, drhd) {
2868 if (drhd->ignored) {
2870 * we always have to disable PMRs or DMA may fail on
2871 * this device
2873 if (force_on)
2874 iommu_disable_protect_mem_regions(iommu);
2875 continue;
2878 iommu_flush_write_buffer(iommu);
2880 ret = dmar_set_interrupt(iommu);
2881 if (ret)
2882 goto free_iommu;
2884 iommu_set_root_entry(iommu);
2886 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2887 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2889 ret = iommu_enable_translation(iommu);
2890 if (ret)
2891 goto free_iommu;
2893 iommu_disable_protect_mem_regions(iommu);
2896 return 0;
2898 free_iommu:
2899 for_each_active_iommu(iommu, drhd)
2900 free_dmar_iommu(iommu);
2901 kfree(deferred_flush);
2902 free_g_iommus:
2903 kfree(g_iommus);
2904 error:
2905 return ret;
2908 /* This takes a number of _MM_ pages, not VTD pages */
2909 static struct iova *intel_alloc_iova(struct device *dev,
2910 struct dmar_domain *domain,
2911 unsigned long nrpages, uint64_t dma_mask)
2913 struct iova *iova = NULL;
2915 /* Restrict dma_mask to the width that the iommu can handle */
2916 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2918 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2920 /* First try to allocate an io virtual address in
2921 * DMA_BIT_MASK(32) and if that fails then try allocating
2922 * from higher range */
2924 iova = alloc_iova(&domain->iovad, nrpages,
2925 IOVA_PFN(DMA_BIT_MASK(32)), 1);
2926 if (iova)
2927 return iova;
2929 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2930 if (unlikely(!iova)) {
2931 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2932 nrpages, dev_name(dev));
2933 return NULL;
2936 return iova;
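/*
 * Behavioural note on the helper above: for a 64-bit-capable device
 * (with dmar_forcedac clear) the IOVA is first taken from the space
 * below 4GiB, and only when that range is exhausted does the
 * allocation retry against the device's full DMA mask.
 */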
2939 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
2941 struct dmar_domain *domain;
2942 int ret;
2944 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2945 if (!domain) {
2946 printk(KERN_ERR "Allocating domain for %s failed",
2947 dev_name(dev));
2948 return NULL;
2951 /* make sure context mapping is ok */
2952 if (unlikely(!domain_context_mapped(dev))) {
2953 ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2954 if (ret) {
2955 printk(KERN_ERR "Domain context map for %s failed",
2956 dev_name(dev));
2957 return NULL;
2961 return domain;
2964 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
2966 struct device_domain_info *info;
2968 /* No lock here, assumes no domain exit in normal case */
2969 info = dev->archdata.iommu;
2970 if (likely(info))
2971 return info->domain;
2973 return __get_valid_domain_for_dev(dev);
2976 /* Check if the dev needs to go through non-identity map and unmap process.*/
2977 static int iommu_no_mapping(struct device *dev)
2979 int found;
2981 if (iommu_dummy(dev))
2982 return 1;
2984 if (!iommu_identity_mapping)
2985 return 0;
2987 found = identity_mapping(dev);
2988 if (found) {
2989 if (iommu_should_identity_map(dev, 0))
2990 return 1;
2991 else {
2993 /* 32 bit DMA device is removed from si_domain and falls back
2994 * to non-identity mapping. */
2996 domain_remove_one_dev_info(si_domain, dev);
2997 printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2998 dev_name(dev));
2999 return 0;
3001 } else {
3003 /* When a 64 bit DMA device is detached from a VM, the device
3004 * is put back into si_domain for identity mapping. */
3006 if (iommu_should_identity_map(dev, 0)) {
3007 int ret;
3008 ret = domain_add_dev_info(si_domain, dev,
3009 hw_pass_through ?
3010 CONTEXT_TT_PASS_THROUGH :
3011 CONTEXT_TT_MULTI_LEVEL);
3012 if (!ret) {
3013 printk(KERN_INFO "64bit %s uses identity mapping\n",
3014 dev_name(dev));
3015 return 1;
3020 return 0;
3023 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3024 size_t size, int dir, u64 dma_mask)
3026 struct dmar_domain *domain;
3027 phys_addr_t start_paddr;
3028 struct iova *iova;
3029 int prot = 0;
3030 int ret;
3031 struct intel_iommu *iommu;
3032 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3034 BUG_ON(dir == DMA_NONE);
3036 if (iommu_no_mapping(dev))
3037 return paddr;
3039 domain = get_valid_domain_for_dev(dev);
3040 if (!domain)
3041 return 0;
3043 iommu = domain_get_iommu(domain);
3044 size = aligned_nrpages(paddr, size);
3046 iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3047 if (!iova)
3048 goto error;
3051 /* Check if DMAR supports zero-length reads on write-only
3052 * mappings. */
3054 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3055 !cap_zlr(iommu->cap))
3056 prot |= DMA_PTE_READ;
3057 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3058 prot |= DMA_PTE_WRITE;
3060 /* paddr to (paddr + size) might span a partial page, so map the whole
3061 * page. Note: if two parts of one page are mapped separately, we
3062 * might have two guest addresses mapping to the same host paddr, but
3063 * this is not a big problem. */
3065 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
3066 mm_to_dma_pfn(paddr_pfn), size, prot);
3067 if (ret)
3068 goto error;
3070 /* it's a non-present to present mapping. Only flush if caching mode */
3071 if (cap_caching_mode(iommu->cap))
3072 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 0, 1);
3073 else
3074 iommu_flush_write_buffer(iommu);
3076 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
3077 start_paddr += paddr & ~PAGE_MASK;
3078 return start_paddr;
3080 error:
3081 if (iova)
3082 __free_iova(&domain->iovad, iova);
3083 printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
3084 dev_name(dev), size, (unsigned long long)paddr, dir);
3085 return 0;
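/*
 * Worked example of the success path above: for a small buffer at
 * paddr == 0x12345678, aligned_nrpages() covers it with one VT-d
 * page; if the allocated IOVA is pfn 0xffff0, the caller gets back
 * 0xffff0000 + 0x678 == 0xffff0678, preserving the buffer's offset
 * within its page.
 */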
3088 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3089 unsigned long offset, size_t size,
3090 enum dma_data_direction dir,
3091 struct dma_attrs *attrs)
3093 return __intel_map_single(dev, page_to_phys(page) + offset, size,
3094 dir, *dev->dma_mask);
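/*
 * intel_map_page() above is normally reached through the generic DMA
 * API once dma_ops is pointed at intel_dma_ops (see intel_iommu_init()
 * below).  A hypothetical driver-side caller, for illustration only;
 * the function and variable names below are not part of this file.
 */
#if 0
#include <linux/dma-mapping.h>

static int example_map_buffer(struct device *dev, void *buf, size_t len)
{
	dma_addr_t handle;

	/* Dispatches to intel_map_page() through dev's dma_map_ops. */
	handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
	if (dma_mapping_error(dev, handle))
		return -ENOMEM;

	/* ... program 'handle' into the device's DMA engine ... */

	dma_unmap_single(dev, handle, len, DMA_TO_DEVICE);
	return 0;
}
#endif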
3097 static void flush_unmaps(void)
3099 int i, j;
3101 timer_on = 0;
3103 /* just flush them all */
3104 for (i = 0; i < g_num_of_iommus; i++) {
3105 struct intel_iommu *iommu = g_iommus[i];
3106 if (!iommu)
3107 continue;
3109 if (!deferred_flush[i].next)
3110 continue;
3112 /* In caching mode, global flushes make emulation expensive */
3113 if (!cap_caching_mode(iommu->cap))
3114 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3115 DMA_TLB_GLOBAL_FLUSH);
3116 for (j = 0; j < deferred_flush[i].next; j++) {
3117 unsigned long mask;
3118 struct iova *iova = deferred_flush[i].iova[j];
3119 struct dmar_domain *domain = deferred_flush[i].domain[j];
3121 /* On real hardware multiple invalidations are expensive */
3122 if (cap_caching_mode(iommu->cap))
3123 iommu_flush_iotlb_psi(iommu, domain->id,
3124 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1,
3125 !deferred_flush[i].freelist[j], 0);
3126 else {
3127 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
3128 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3129 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3131 __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3132 if (deferred_flush[i].freelist[j])
3133 dma_free_pagelist(deferred_flush[i].freelist[j]);
3135 deferred_flush[i].next = 0;
3138 list_size = 0;
3141 static void flush_unmaps_timeout(unsigned long data)
3143 unsigned long flags;
3145 spin_lock_irqsave(&async_umap_flush_lock, flags);
3146 flush_unmaps();
3147 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3150 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3152 unsigned long flags;
3153 int next, iommu_id;
3154 struct intel_iommu *iommu;
3156 spin_lock_irqsave(&async_umap_flush_lock, flags);
3157 if (list_size == HIGH_WATER_MARK)
3158 flush_unmaps();
3160 iommu = domain_get_iommu(dom);
3161 iommu_id = iommu->seq_id;
3163 next = deferred_flush[iommu_id].next;
3164 deferred_flush[iommu_id].domain[next] = dom;
3165 deferred_flush[iommu_id].iova[next] = iova;
3166 deferred_flush[iommu_id].freelist[next] = freelist;
3167 deferred_flush[iommu_id].next++;
3169 if (!timer_on) {
3170 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3171 timer_on = 1;
3173 list_size++;
3174 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
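/*
 * Timing sketch for the deferred path above: the first queued unmap
 * arms unmap_timer roughly 10ms out; later unmaps just append to the
 * per-iommu deferred_flush[] tables.  Either the timer firing or
 * hitting HIGH_WATER_MARK runs flush_unmaps(), which (outside caching
 * mode) issues one global IOTLB flush per iommu instead of one flush
 * per unmapped buffer.
 */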
3177 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3178 size_t size, enum dma_data_direction dir,
3179 struct dma_attrs *attrs)
3181 struct dmar_domain *domain;
3182 unsigned long start_pfn, last_pfn;
3183 struct iova *iova;
3184 struct intel_iommu *iommu;
3185 struct page *freelist;
3187 if (iommu_no_mapping(dev))
3188 return;
3190 domain = find_domain(dev);
3191 BUG_ON(!domain);
3193 iommu = domain_get_iommu(domain);
3195 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3196 if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3197 (unsigned long long)dev_addr))
3198 return;
3200 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3201 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3203 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3204 dev_name(dev), start_pfn, last_pfn);
3206 freelist = domain_unmap(domain, start_pfn, last_pfn);
3208 if (intel_iommu_strict) {
3209 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3210 last_pfn - start_pfn + 1, !freelist, 0);
3211 /* free iova */
3212 __free_iova(&domain->iovad, iova);
3213 dma_free_pagelist(freelist);
3214 } else {
3215 add_unmap(domain, iova, freelist);
3217 /* queue up the release of the unmap to save the 1/6th of the
3218 * cpu used up by the iotlb flush operation... */
3223 static void *intel_alloc_coherent(struct device *dev, size_t size,
3224 dma_addr_t *dma_handle, gfp_t flags,
3225 struct dma_attrs *attrs)
3227 struct page *page = NULL;
3228 int order;
3230 size = PAGE_ALIGN(size);
3231 order = get_order(size);
3233 if (!iommu_no_mapping(dev))
3234 flags &= ~(GFP_DMA | GFP_DMA32);
3235 else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3236 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3237 flags |= GFP_DMA;
3238 else
3239 flags |= GFP_DMA32;
3242 if (flags & __GFP_WAIT) {
3243 unsigned int count = size >> PAGE_SHIFT;
3245 page = dma_alloc_from_contiguous(dev, count, order);
3246 if (page && iommu_no_mapping(dev) &&
3247 page_to_phys(page) + size > dev->coherent_dma_mask) {
3248 dma_release_from_contiguous(dev, page, count);
3249 page = NULL;
3253 if (!page)
3254 page = alloc_pages(flags, order);
3255 if (!page)
3256 return NULL;
3257 memset(page_address(page), 0, size);
3259 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3260 DMA_BIDIRECTIONAL,
3261 dev->coherent_dma_mask);
3262 if (*dma_handle)
3263 return page_address(page);
3264 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3265 __free_pages(page, order);
3267 return NULL;
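/*
 * intel_alloc_coherent()/intel_free_coherent() above back the
 * dma_alloc_coherent()/dma_free_coherent() API for devices behind the
 * IOMMU.  A hypothetical caller, for illustration only; the names
 * below are not part of this file.
 */
#if 0
#include <linux/dma-mapping.h>

static int example_use_ring(struct device *dev, size_t len)
{
	dma_addr_t dma;
	void *ring;

	/* Dispatches to intel_alloc_coherent() through dev's dma_map_ops. */
	ring = dma_alloc_coherent(dev, len, &dma, GFP_KERNEL);
	if (!ring)
		return -ENOMEM;

	/* ... hand 'dma' to the hardware, use 'ring' from the CPU ... */

	dma_free_coherent(dev, len, ring, dma);	/* -> intel_free_coherent() */
	return 0;
}
#endif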
3270 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3271 dma_addr_t dma_handle, struct dma_attrs *attrs)
3273 int order;
3274 struct page *page = virt_to_page(vaddr);
3276 size = PAGE_ALIGN(size);
3277 order = get_order(size);
3279 intel_unmap_page(dev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3280 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3281 __free_pages(page, order);
3284 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3285 int nelems, enum dma_data_direction dir,
3286 struct dma_attrs *attrs)
3288 struct dmar_domain *domain;
3289 unsigned long start_pfn, last_pfn;
3290 struct iova *iova;
3291 struct intel_iommu *iommu;
3292 struct page *freelist;
3294 if (iommu_no_mapping(dev))
3295 return;
3297 domain = find_domain(dev);
3298 BUG_ON(!domain);
3300 iommu = domain_get_iommu(domain);
3302 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3303 if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3304 (unsigned long long)sglist[0].dma_address))
3305 return;
3307 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3308 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3310 freelist = domain_unmap(domain, start_pfn, last_pfn);
3312 if (intel_iommu_strict) {
3313 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3314 last_pfn - start_pfn + 1, !freelist, 0);
3315 /* free iova */
3316 __free_iova(&domain->iovad, iova);
3317 dma_free_pagelist(freelist);
3318 } else {
3319 add_unmap(domain, iova, freelist);
3321 /* queue up the release of the unmap to save the 1/6th of the
3322 * cpu used up by the iotlb flush operation... */
3327 static int intel_nontranslate_map_sg(struct device *hddev,
3328 struct scatterlist *sglist, int nelems, int dir)
3330 int i;
3331 struct scatterlist *sg;
3333 for_each_sg(sglist, sg, nelems, i) {
3334 BUG_ON(!sg_page(sg));
3335 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3336 sg->dma_length = sg->length;
3338 return nelems;
3341 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3342 enum dma_data_direction dir, struct dma_attrs *attrs)
3344 int i;
3345 struct dmar_domain *domain;
3346 size_t size = 0;
3347 int prot = 0;
3348 struct iova *iova = NULL;
3349 int ret;
3350 struct scatterlist *sg;
3351 unsigned long start_vpfn;
3352 struct intel_iommu *iommu;
3354 BUG_ON(dir == DMA_NONE);
3355 if (iommu_no_mapping(dev))
3356 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3358 domain = get_valid_domain_for_dev(dev);
3359 if (!domain)
3360 return 0;
3362 iommu = domain_get_iommu(domain);
3364 for_each_sg(sglist, sg, nelems, i)
3365 size += aligned_nrpages(sg->offset, sg->length);
3367 iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3368 *dev->dma_mask);
3369 if (!iova) {
3370 sglist->dma_length = 0;
3371 return 0;
3375 /* Check if DMAR supports zero-length reads on write-only
3376 * mappings. */
3378 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3379 !cap_zlr(iommu->cap))
3380 prot |= DMA_PTE_READ;
3381 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3382 prot |= DMA_PTE_WRITE;
3384 start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3386 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3387 if (unlikely(ret)) {
3388 /* clear the page */
3389 dma_pte_clear_range(domain, start_vpfn,
3390 start_vpfn + size - 1);
3391 /* free page tables */
3392 dma_pte_free_pagetable(domain, start_vpfn,
3393 start_vpfn + size - 1);
3394 /* free iova */
3395 __free_iova(&domain->iovad, iova);
3396 return 0;
3399 /* it's a non-present to present mapping. Only flush if caching mode */
3400 if (cap_caching_mode(iommu->cap))
3401 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 0, 1);
3402 else
3403 iommu_flush_write_buffer(iommu);
3405 return nelems;
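/*
 * intel_map_sg()/intel_unmap_sg() above implement the scatterlist
 * mapping side of the DMA API.  A hypothetical caller, for
 * illustration only; the names below are not part of this file.
 */
#if 0
#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>

static int example_map_sg(struct device *dev, struct scatterlist *sgl,
			  int nents)
{
	struct scatterlist *sg;
	int i, mapped;

	/* Dispatches to intel_map_sg() through dev's dma_map_ops. */
	mapped = dma_map_sg(dev, sgl, nents, DMA_FROM_DEVICE);
	if (!mapped)
		return -ENOMEM;

	for_each_sg(sgl, sg, mapped, i)
		pr_debug("seg %d: 0x%llx + %u\n", i,
			 (unsigned long long)sg_dma_address(sg),
			 sg_dma_len(sg));

	dma_unmap_sg(dev, sgl, nents, DMA_FROM_DEVICE);
	return 0;
}
#endif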
3408 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3410 return !dma_addr;
3413 struct dma_map_ops intel_dma_ops = {
3414 .alloc = intel_alloc_coherent,
3415 .free = intel_free_coherent,
3416 .map_sg = intel_map_sg,
3417 .unmap_sg = intel_unmap_sg,
3418 .map_page = intel_map_page,
3419 .unmap_page = intel_unmap_page,
3420 .mapping_error = intel_mapping_error,
3423 static inline int iommu_domain_cache_init(void)
3425 int ret = 0;
3427 iommu_domain_cache = kmem_cache_create("iommu_domain",
3428 sizeof(struct dmar_domain),
3430 SLAB_HWCACHE_ALIGN,
3432 NULL);
3433 if (!iommu_domain_cache) {
3434 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3435 ret = -ENOMEM;
3438 return ret;
3441 static inline int iommu_devinfo_cache_init(void)
3443 int ret = 0;
3445 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3446 sizeof(struct device_domain_info),
3448 SLAB_HWCACHE_ALIGN,
3449 NULL);
3450 if (!iommu_devinfo_cache) {
3451 printk(KERN_ERR "Couldn't create devinfo cache\n");
3452 ret = -ENOMEM;
3455 return ret;
3458 static inline int iommu_iova_cache_init(void)
3460 int ret = 0;
3462 iommu_iova_cache = kmem_cache_create("iommu_iova",
3463 sizeof(struct iova),
3465 SLAB_HWCACHE_ALIGN,
3466 NULL);
3467 if (!iommu_iova_cache) {
3468 printk(KERN_ERR "Couldn't create iova cache\n");
3469 ret = -ENOMEM;
3472 return ret;
3475 static int __init iommu_init_mempool(void)
3477 int ret;
3478 ret = iommu_iova_cache_init();
3479 if (ret)
3480 return ret;
3482 ret = iommu_domain_cache_init();
3483 if (ret)
3484 goto domain_error;
3486 ret = iommu_devinfo_cache_init();
3487 if (!ret)
3488 return ret;
3490 kmem_cache_destroy(iommu_domain_cache);
3491 domain_error:
3492 kmem_cache_destroy(iommu_iova_cache);
3494 return -ENOMEM;
3497 static void __init iommu_exit_mempool(void)
3499 kmem_cache_destroy(iommu_devinfo_cache);
3500 kmem_cache_destroy(iommu_domain_cache);
3501 kmem_cache_destroy(iommu_iova_cache);
3505 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3507 struct dmar_drhd_unit *drhd;
3508 u32 vtbar;
3509 int rc;
3511 /* We know that this device on this chipset has its own IOMMU.
3512 * If we find it under a different IOMMU, then the BIOS is lying
3513 * to us. Hope that the IOMMU for this device is actually
3514 * disabled, and it needs no translation... */
3516 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3517 if (rc) {
3518 /* "can't" happen */
3519 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3520 return;
3522 vtbar &= 0xffff0000;
3524 /* we know that this iommu should be at offset 0xa000 from vtbar */
3525 drhd = dmar_find_matched_drhd_unit(pdev);
3526 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3527 TAINT_FIRMWARE_WORKAROUND,
3528 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3529 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3531 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3533 static void __init init_no_remapping_devices(void)
3535 struct dmar_drhd_unit *drhd;
3536 struct device *dev;
3537 int i;
3539 for_each_drhd_unit(drhd) {
3540 if (!drhd->include_all) {
3541 for_each_active_dev_scope(drhd->devices,
3542 drhd->devices_cnt, i, dev)
3543 break;
3544 /* ignore DMAR unit if no devices exist */
3545 if (i == drhd->devices_cnt)
3546 drhd->ignored = 1;
3550 for_each_active_drhd_unit(drhd) {
3551 if (drhd->include_all)
3552 continue;
3554 for_each_active_dev_scope(drhd->devices,
3555 drhd->devices_cnt, i, dev)
3556 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3557 break;
3558 if (i < drhd->devices_cnt)
3559 continue;
3561 /* This IOMMU has *only* gfx devices. Either bypass it or
3562 set the gfx_mapped flag, as appropriate */
3563 if (dmar_map_gfx) {
3564 intel_iommu_gfx_mapped = 1;
3565 } else {
3566 drhd->ignored = 1;
3567 for_each_active_dev_scope(drhd->devices,
3568 drhd->devices_cnt, i, dev)
3569 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3574 #ifdef CONFIG_SUSPEND
3575 static int init_iommu_hw(void)
3577 struct dmar_drhd_unit *drhd;
3578 struct intel_iommu *iommu = NULL;
3580 for_each_active_iommu(iommu, drhd)
3581 if (iommu->qi)
3582 dmar_reenable_qi(iommu);
3584 for_each_iommu(iommu, drhd) {
3585 if (drhd->ignored) {
3587 * we always have to disable PMRs or DMA may fail on
3588 * this device
3590 if (force_on)
3591 iommu_disable_protect_mem_regions(iommu);
3592 continue;
3595 iommu_flush_write_buffer(iommu);
3597 iommu_set_root_entry(iommu);
3599 iommu->flush.flush_context(iommu, 0, 0, 0,
3600 DMA_CCMD_GLOBAL_INVL);
3601 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3602 DMA_TLB_GLOBAL_FLUSH);
3603 if (iommu_enable_translation(iommu))
3604 return 1;
3605 iommu_disable_protect_mem_regions(iommu);
3608 return 0;
3611 static void iommu_flush_all(void)
3613 struct dmar_drhd_unit *drhd;
3614 struct intel_iommu *iommu;
3616 for_each_active_iommu(iommu, drhd) {
3617 iommu->flush.flush_context(iommu, 0, 0, 0,
3618 DMA_CCMD_GLOBAL_INVL);
3619 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3620 DMA_TLB_GLOBAL_FLUSH);
3624 static int iommu_suspend(void)
3626 struct dmar_drhd_unit *drhd;
3627 struct intel_iommu *iommu = NULL;
3628 unsigned long flag;
3630 for_each_active_iommu(iommu, drhd) {
3631 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3632 GFP_ATOMIC);
3633 if (!iommu->iommu_state)
3634 goto nomem;
3637 iommu_flush_all();
3639 for_each_active_iommu(iommu, drhd) {
3640 iommu_disable_translation(iommu);
3642 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3644 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3645 readl(iommu->reg + DMAR_FECTL_REG);
3646 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3647 readl(iommu->reg + DMAR_FEDATA_REG);
3648 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3649 readl(iommu->reg + DMAR_FEADDR_REG);
3650 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3651 readl(iommu->reg + DMAR_FEUADDR_REG);
3653 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3655 return 0;
3657 nomem:
3658 for_each_active_iommu(iommu, drhd)
3659 kfree(iommu->iommu_state);
3661 return -ENOMEM;
3664 static void iommu_resume(void)
3666 struct dmar_drhd_unit *drhd;
3667 struct intel_iommu *iommu = NULL;
3668 unsigned long flag;
3670 if (init_iommu_hw()) {
3671 if (force_on)
3672 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3673 else
3674 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3675 return;
3678 for_each_active_iommu(iommu, drhd) {
3680 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3682 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3683 iommu->reg + DMAR_FECTL_REG);
3684 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3685 iommu->reg + DMAR_FEDATA_REG);
3686 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3687 iommu->reg + DMAR_FEADDR_REG);
3688 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3689 iommu->reg + DMAR_FEUADDR_REG);
3691 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3694 for_each_active_iommu(iommu, drhd)
3695 kfree(iommu->iommu_state);
3698 static struct syscore_ops iommu_syscore_ops = {
3699 .resume = iommu_resume,
3700 .suspend = iommu_suspend,
3703 static void __init init_iommu_pm_ops(void)
3705 register_syscore_ops(&iommu_syscore_ops);
3708 #else
3709 static inline void init_iommu_pm_ops(void) {}
3710 #endif /* CONFIG_SUSPEND */
3713 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3715 struct acpi_dmar_reserved_memory *rmrr;
3716 struct dmar_rmrr_unit *rmrru;
3718 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3719 if (!rmrru)
3720 return -ENOMEM;
3722 rmrru->hdr = header;
3723 rmrr = (struct acpi_dmar_reserved_memory *)header;
3724 rmrru->base_address = rmrr->base_address;
3725 rmrru->end_address = rmrr->end_address;
3726 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3727 ((void *)rmrr) + rmrr->header.length,
3728 &rmrru->devices_cnt);
3729 if (rmrru->devices_cnt && rmrru->devices == NULL) {
3730 kfree(rmrru);
3731 return -ENOMEM;
3734 list_add(&rmrru->list, &dmar_rmrr_units);
3736 return 0;
3739 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3741 struct acpi_dmar_atsr *atsr;
3742 struct dmar_atsr_unit *atsru;
3744 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3745 atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3746 if (!atsru)
3747 return -ENOMEM;
3749 atsru->hdr = hdr;
3750 atsru->include_all = atsr->flags & 0x1;
3751 if (!atsru->include_all) {
3752 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3753 (void *)atsr + atsr->header.length,
3754 &atsru->devices_cnt);
3755 if (atsru->devices_cnt && atsru->devices == NULL) {
3756 kfree(atsru);
3757 return -ENOMEM;
3761 list_add_rcu(&atsru->list, &dmar_atsr_units);
3763 return 0;
3766 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3768 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3769 kfree(atsru);
3772 static void intel_iommu_free_dmars(void)
3774 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3775 struct dmar_atsr_unit *atsru, *atsr_n;
3777 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3778 list_del(&rmrru->list);
3779 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3780 kfree(rmrru);
3783 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3784 list_del(&atsru->list);
3785 intel_iommu_free_atsr(atsru);
3789 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3791 int i, ret = 1;
3792 struct pci_bus *bus;
3793 struct pci_dev *bridge = NULL;
3794 struct device *tmp;
3795 struct acpi_dmar_atsr *atsr;
3796 struct dmar_atsr_unit *atsru;
3798 dev = pci_physfn(dev);
3799 for (bus = dev->bus; bus; bus = bus->parent) {
3800 bridge = bus->self;
3801 /* If it's an integrated device, allow ATS */
3802 if (!bridge)
3803 return 1;
3804 /* Connected via non-PCIe: no ATS */
3805 if (!pci_is_pcie(bridge) ||
3806 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3807 return 0;
3808 /* If we found the root port, look it up in the ATSR */
3809 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3810 break;
3813 rcu_read_lock();
3814 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3815 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3816 if (atsr->segment != pci_domain_nr(dev->bus))
3817 continue;
3819 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3820 if (tmp == &bridge->dev)
3821 goto out;
3823 if (atsru->include_all)
3824 goto out;
3826 ret = 0;
3827 out:
3828 rcu_read_unlock();
3830 return ret;
3833 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3835 int ret = 0;
3836 struct dmar_rmrr_unit *rmrru;
3837 struct dmar_atsr_unit *atsru;
3838 struct acpi_dmar_atsr *atsr;
3839 struct acpi_dmar_reserved_memory *rmrr;
3841 if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
3842 return 0;
3844 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3845 rmrr = container_of(rmrru->hdr,
3846 struct acpi_dmar_reserved_memory, header);
3847 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3848 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3849 ((void *)rmrr) + rmrr->header.length,
3850 rmrr->segment, rmrru->devices,
3851 rmrru->devices_cnt);
3852 if (ret < 0)
3853 return ret;
3854 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3855 dmar_remove_dev_scope(info, rmrr->segment,
3856 rmrru->devices, rmrru->devices_cnt);
3860 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3861 if (atsru->include_all)
3862 continue;
3864 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3865 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3866 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3867 (void *)atsr + atsr->header.length,
3868 atsr->segment, atsru->devices,
3869 atsru->devices_cnt);
3870 if (ret > 0)
3871 break;
3872 else if (ret < 0)
3873 return ret;
3874 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3875 if (dmar_remove_dev_scope(info, atsr->segment,
3876 atsru->devices, atsru->devices_cnt))
3877 break;
3881 return 0;
3885 /* Here we only respond to the action of unbinding a device from its driver.
3887 * A newly added device is not attached to its DMAR domain here yet; that
3888 * happens when the device is mapped to an iova. */
3890 static int device_notifier(struct notifier_block *nb,
3891 unsigned long action, void *data)
3893 struct device *dev = data;
3894 struct dmar_domain *domain;
3896 if (iommu_dummy(dev))
3897 return 0;
3899 if (action != BUS_NOTIFY_UNBOUND_DRIVER &&
3900 action != BUS_NOTIFY_DEL_DEVICE)
3901 return 0;
3904 * If the device is still attached to a device driver we can't
3905 * tear down the domain yet as DMA mappings may still be in use.
3906 * Wait for the BUS_NOTIFY_UNBOUND_DRIVER event to do that.
3908 if (action == BUS_NOTIFY_DEL_DEVICE && dev->driver != NULL)
3909 return 0;
3911 domain = find_domain(dev);
3912 if (!domain)
3913 return 0;
3915 down_read(&dmar_global_lock);
3916 domain_remove_one_dev_info(domain, dev);
3917 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3918 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3919 list_empty(&domain->devices))
3920 domain_exit(domain);
3921 up_read(&dmar_global_lock);
3923 return 0;
3926 static struct notifier_block device_nb = {
3927 .notifier_call = device_notifier,
3930 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3931 unsigned long val, void *v)
3933 struct memory_notify *mhp = v;
3934 unsigned long long start, end;
3935 unsigned long start_vpfn, last_vpfn;
3937 switch (val) {
3938 case MEM_GOING_ONLINE:
3939 start = mhp->start_pfn << PAGE_SHIFT;
3940 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
3941 if (iommu_domain_identity_map(si_domain, start, end)) {
3942 pr_warn("dmar: failed to build identity map for [%llx-%llx]\n",
3943 start, end);
3944 return NOTIFY_BAD;
3946 break;
3948 case MEM_OFFLINE:
3949 case MEM_CANCEL_ONLINE:
3950 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3951 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
3952 while (start_vpfn <= last_vpfn) {
3953 struct iova *iova;
3954 struct dmar_drhd_unit *drhd;
3955 struct intel_iommu *iommu;
3956 struct page *freelist;
3958 iova = find_iova(&si_domain->iovad, start_vpfn);
3959 if (iova == NULL) {
3960 pr_debug("dmar: failed get IOVA for PFN %lx\n",
3961 start_vpfn);
3962 break;
3965 iova = split_and_remove_iova(&si_domain->iovad, iova,
3966 start_vpfn, last_vpfn);
3967 if (iova == NULL) {
3968 pr_warn("dmar: failed to split IOVA PFN [%lx-%lx]\n",
3969 start_vpfn, last_vpfn);
3970 return NOTIFY_BAD;
3973 freelist = domain_unmap(si_domain, iova->pfn_lo,
3974 iova->pfn_hi);
3976 rcu_read_lock();
3977 for_each_active_iommu(iommu, drhd)
3978 iommu_flush_iotlb_psi(iommu, si_domain->id,
3979 iova->pfn_lo,
3980 iova->pfn_hi - iova->pfn_lo + 1,
3981 !freelist, 0);
3982 rcu_read_unlock();
3983 dma_free_pagelist(freelist);
3985 start_vpfn = iova->pfn_hi + 1;
3986 free_iova_mem(iova);
3988 break;
3991 return NOTIFY_OK;
3994 static struct notifier_block intel_iommu_memory_nb = {
3995 .notifier_call = intel_iommu_memory_notifier,
3996 .priority = 0
3999 int __init intel_iommu_init(void)
4001 int ret = -ENODEV;
4002 struct dmar_drhd_unit *drhd;
4003 struct intel_iommu *iommu;
4005 /* VT-d is required for a TXT/tboot launch, so enforce that */
4006 force_on = tboot_force_iommu();
4008 if (iommu_init_mempool()) {
4009 if (force_on)
4010 panic("tboot: Failed to initialize iommu memory\n");
4011 return -ENOMEM;
4014 down_write(&dmar_global_lock);
4015 if (dmar_table_init()) {
4016 if (force_on)
4017 panic("tboot: Failed to initialize DMAR table\n");
4018 goto out_free_dmar;
4022 * Disable translation if already enabled prior to OS handover.
4024 for_each_active_iommu(iommu, drhd)
4025 if (iommu->gcmd & DMA_GCMD_TE)
4026 iommu_disable_translation(iommu);
4028 if (dmar_dev_scope_init() < 0) {
4029 if (force_on)
4030 panic("tboot: Failed to initialize DMAR device scope\n");
4031 goto out_free_dmar;
4034 if (no_iommu || dmar_disabled)
4035 goto out_free_dmar;
4037 if (list_empty(&dmar_rmrr_units))
4038 printk(KERN_INFO "DMAR: No RMRR found\n");
4040 if (list_empty(&dmar_atsr_units))
4041 printk(KERN_INFO "DMAR: No ATSR found\n");
4043 if (dmar_init_reserved_ranges()) {
4044 if (force_on)
4045 panic("tboot: Failed to reserve iommu ranges\n");
4046 goto out_free_reserved_range;
4049 init_no_remapping_devices();
4051 ret = init_dmars();
4052 if (ret) {
4053 if (force_on)
4054 panic("tboot: Failed to initialize DMARs\n");
4055 printk(KERN_ERR "IOMMU: dmar init failed\n");
4056 goto out_free_reserved_range;
4058 up_write(&dmar_global_lock);
4059 printk(KERN_INFO
4060 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
4062 init_timer(&unmap_timer);
4063 #ifdef CONFIG_SWIOTLB
4064 swiotlb = 0;
4065 #endif
4066 dma_ops = &intel_dma_ops;
4068 init_iommu_pm_ops();
4070 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4071 bus_register_notifier(&pci_bus_type, &device_nb);
4072 if (si_domain && !hw_pass_through)
4073 register_memory_notifier(&intel_iommu_memory_nb);
4075 intel_iommu_enabled = 1;
4077 return 0;
4079 out_free_reserved_range:
4080 put_iova_domain(&reserved_iova_list);
4081 out_free_dmar:
4082 intel_iommu_free_dmars();
4083 up_write(&dmar_global_lock);
4084 iommu_exit_mempool();
4085 return ret;
4088 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
4089 struct device *dev)
4091 struct pci_dev *tmp, *parent, *pdev;
4093 if (!iommu || !dev || !dev_is_pci(dev))
4094 return;
4096 pdev = to_pci_dev(dev);
4098 /* dependent device detach */
4099 tmp = pci_find_upstream_pcie_bridge(pdev);
4100 /* Secondary interface's bus number and devfn 0 */
4101 if (tmp) {
4102 parent = pdev->bus->self;
4103 while (parent != tmp) {
4104 iommu_detach_dev(iommu, parent->bus->number,
4105 parent->devfn);
4106 parent = parent->bus->self;
4108 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
4109 iommu_detach_dev(iommu,
4110 tmp->subordinate->number, 0);
4111 else /* this is a legacy PCI bridge */
4112 iommu_detach_dev(iommu, tmp->bus->number,
4113 tmp->devfn);
4117 static void domain_remove_one_dev_info(struct dmar_domain *domain,
4118 struct device *dev)
4120 struct device_domain_info *info, *tmp;
4121 struct intel_iommu *iommu;
4122 unsigned long flags;
4123 int found = 0;
4124 u8 bus, devfn;
4126 iommu = device_to_iommu(dev, &bus, &devfn);
4127 if (!iommu)
4128 return;
4130 spin_lock_irqsave(&device_domain_lock, flags);
4131 list_for_each_entry_safe(info, tmp, &domain->devices, link) {
4132 if (info->iommu == iommu && info->bus == bus &&
4133 info->devfn == devfn) {
4134 unlink_domain_info(info);
4135 spin_unlock_irqrestore(&device_domain_lock, flags);
4137 iommu_disable_dev_iotlb(info);
4138 iommu_detach_dev(iommu, info->bus, info->devfn);
4139 iommu_detach_dependent_devices(iommu, dev);
4140 free_devinfo_mem(info);
4142 spin_lock_irqsave(&device_domain_lock, flags);
4144 if (found)
4145 break;
4146 else
4147 continue;
4150 /* if there are no other devices under the same iommu
4151 * owned by this domain, clear this iommu in iommu_bmp,
4152 * update iommu count and coherency */
4154 if (info->iommu == iommu)
4155 found = 1;
4158 spin_unlock_irqrestore(&device_domain_lock, flags);
4160 if (found == 0) {
4161 unsigned long tmp_flags;
4162 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
4163 clear_bit(iommu->seq_id, domain->iommu_bmp);
4164 domain->iommu_count--;
4165 domain_update_iommu_cap(domain);
4166 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
4168 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
4169 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
4170 spin_lock_irqsave(&iommu->lock, tmp_flags);
4171 clear_bit(domain->id, iommu->domain_ids);
4172 iommu->domains[domain->id] = NULL;
4173 spin_unlock_irqrestore(&iommu->lock, tmp_flags);
4178 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4180 int adjust_width;
4182 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
4183 domain_reserve_special_ranges(domain);
4185 /* calculate AGAW */
4186 domain->gaw = guest_width;
4187 adjust_width = guestwidth_to_adjustwidth(guest_width);
4188 domain->agaw = width_to_agaw(adjust_width);
4190 domain->iommu_coherency = 0;
4191 domain->iommu_snooping = 0;
4192 domain->iommu_superpage = 0;
4193 domain->max_addr = 0;
4194 domain->nid = -1;
4196 /* always allocate the top pgd */
4197 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4198 if (!domain->pgd)
4199 return -ENOMEM;
4200 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4201 return 0;
4204 static int intel_iommu_domain_init(struct iommu_domain *domain)
4206 struct dmar_domain *dmar_domain;
4208 dmar_domain = alloc_domain(true);
4209 if (!dmar_domain) {
4210 printk(KERN_ERR
4211 "intel_iommu_domain_init: dmar_domain == NULL\n");
4212 return -ENOMEM;
4214 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4215 printk(KERN_ERR
4216 "intel_iommu_domain_init() failed\n");
4217 domain_exit(dmar_domain);
4218 return -ENOMEM;
4220 domain_update_iommu_cap(dmar_domain);
4221 domain->priv = dmar_domain;
4223 domain->geometry.aperture_start = 0;
4224 domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4225 domain->geometry.force_aperture = true;
4227 return 0;
4228 }
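/*
 * Editorial note, not part of the original source: with the default 48-bit
 * width, the aperture set above spans IOVAs 0 through
 * __DOMAIN_MAX_ADDR(48) = (1ULL << 48) - 1, and force_aperture advertises
 * to users of the IOMMU API (e.g. VFIO) that this window is a hard limit
 * for the domain, not merely a hint.
 */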
4230 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
4231 {
4232 struct dmar_domain *dmar_domain = domain->priv;
4234 domain->priv = NULL;
4235 domain_exit(dmar_domain);
4236 }
4238 static int intel_iommu_attach_device(struct iommu_domain *domain,
4239 struct device *dev)
4240 {
4241 struct dmar_domain *dmar_domain = domain->priv;
4242 struct intel_iommu *iommu;
4243 int addr_width;
4244 u8 bus, devfn;
4246 if (device_is_rmrr_locked(dev)) {
4247 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
4248 return -EPERM;
4249 }
4251 /* normally dev is not mapped */
4252 if (unlikely(domain_context_mapped(dev))) {
4253 struct dmar_domain *old_domain;
4255 old_domain = find_domain(dev);
4256 if (old_domain) {
4257 if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
4258 dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
4259 domain_remove_one_dev_info(old_domain, dev);
4260 else
4261 domain_remove_dev_info(old_domain);
4262 }
4263 }
4265 iommu = device_to_iommu(dev, &bus, &devfn);
4266 if (!iommu)
4267 return -ENODEV;
4269 /* check if this iommu agaw is sufficient for max mapped address */
4270 addr_width = agaw_to_width(iommu->agaw);
4271 if (addr_width > cap_mgaw(iommu->cap))
4272 addr_width = cap_mgaw(iommu->cap);
4274 if (dmar_domain->max_addr > (1LL << addr_width)) {
4275 printk(KERN_ERR "%s: iommu width (%d) is not "
4276 "sufficient for the mapped address (%llx)\n",
4277 __func__, addr_width, dmar_domain->max_addr);
4278 return -EFAULT;
4279 }
4280 dmar_domain->gaw = addr_width;
4282 /*
4283 * Knock out extra levels of page tables if necessary
4284 */
4285 while (iommu->agaw < dmar_domain->agaw) {
4286 struct dma_pte *pte;
4288 pte = dmar_domain->pgd;
4289 if (dma_pte_present(pte)) {
4290 dmar_domain->pgd = (struct dma_pte *)
4291 phys_to_virt(dma_pte_addr(pte));
4292 free_pgtable_page(pte);
4293 }
4294 dmar_domain->agaw--;
4295 }
4297 return domain_add_dev_info(dmar_domain, dev, CONTEXT_TT_MULTI_LEVEL);
4298 }
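/*
 * Editorial note, not part of the original source: a concrete example of
 * the level-trimming loop above.  A domain created with the default 48-bit
 * width uses AGAW 2 (four page-table levels); if it is then attached
 * through an IOMMU that only supports AGAW 1 (39 bits, three levels), the
 * loop runs once, promotes the table referenced by the first present
 * top-level entry to be the new pgd and frees the old top-level table, so
 * domain and hardware agree on the page-table depth.
 */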
4300 static void intel_iommu_detach_device(struct iommu_domain *domain,
4301 struct device *dev)
4302 {
4303 struct dmar_domain *dmar_domain = domain->priv;
4305 domain_remove_one_dev_info(dmar_domain, dev);
4306 }
4308 static int intel_iommu_map(struct iommu_domain *domain,
4309 unsigned long iova, phys_addr_t hpa,
4310 size_t size, int iommu_prot)
4311 {
4312 struct dmar_domain *dmar_domain = domain->priv;
4313 u64 max_addr;
4314 int prot = 0;
4315 int ret;
4317 if (iommu_prot & IOMMU_READ)
4318 prot |= DMA_PTE_READ;
4319 if (iommu_prot & IOMMU_WRITE)
4320 prot |= DMA_PTE_WRITE;
4321 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4322 prot |= DMA_PTE_SNP;
4324 max_addr = iova + size;
4325 if (dmar_domain->max_addr < max_addr) {
4326 u64 end;
4328 /* check if minimum agaw is sufficient for mapped address */
4329 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4330 if (end < max_addr) {
4331 printk(KERN_ERR "%s: iommu width (%d) is not "
4332 "sufficient for the mapped address (%llx)\n",
4333 __func__, dmar_domain->gaw, max_addr);
4334 return -EFAULT;
4335 }
4336 dmar_domain->max_addr = max_addr;
4337 }
4338 /* Round up size to next multiple of PAGE_SIZE, if it and
4339 the low bits of hpa would take us onto the next page */
4340 size = aligned_nrpages(hpa, size);
4341 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4342 hpa >> VTD_PAGE_SHIFT, size, prot);
4343 return ret;
4344 }
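/*
 * Editorial illustration, not part of the original driver:
 * intel_iommu_map() is not called directly; consumers go through the
 * generic IOMMU API, which uses pgsize_bitmap to split requests into
 * chunks this callback accepts.  A minimal sketch of such a caller (the
 * helper name is hypothetical and the domain is assumed to have been
 * allocated through the IOMMU core):
 */
#if 0
static int example_map_one_page(struct iommu_domain *domain,
				unsigned long iova, phys_addr_t paddr)
{
	/* map a single 4KiB VT-d page, read/write */
	return iommu_map(domain, iova, paddr, VTD_PAGE_SIZE,
			 IOMMU_READ | IOMMU_WRITE);
}
#endif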
4346 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4347 unsigned long iova, size_t size)
4348 {
4349 struct dmar_domain *dmar_domain = domain->priv;
4350 struct page *freelist = NULL;
4351 struct intel_iommu *iommu;
4352 unsigned long start_pfn, last_pfn;
4353 unsigned int npages;
4354 int iommu_id, num, ndomains, level = 0;
4356 /* Cope with horrid API which requires us to unmap more than the
4357 size argument if it happens to be a large-page mapping. */
4358 if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
4359 BUG();
4361 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4362 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4364 start_pfn = iova >> VTD_PAGE_SHIFT;
4365 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4367 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4369 npages = last_pfn - start_pfn + 1;
4371 for_each_set_bit(iommu_id, dmar_domain->iommu_bmp, g_num_of_iommus) {
4372 iommu = g_iommus[iommu_id];
4374 /*
4375 * find bit position of dmar_domain
4376 */
4377 ndomains = cap_ndoms(iommu->cap);
4378 for_each_set_bit(num, iommu->domain_ids, ndomains) {
4379 if (iommu->domains[num] == dmar_domain)
4380 iommu_flush_iotlb_psi(iommu, num, start_pfn,
4381 npages, !freelist, 0);
4382 }
4383 }
4386 dma_free_pagelist(freelist);
4388 if (dmar_domain->max_addr == iova + size)
4389 dmar_domain->max_addr = iova;
4391 return size;
4392 }
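/*
 * Editorial note, not part of the original source: an example of the
 * large-page rounding above.  For an IOVA at the start of a 2MiB
 * superpage, pfn_to_dma_pte() reports level 2, so size is raised to
 * VTD_PAGE_SIZE << level_to_offset_bits(2) = 4KiB << 9 = 2MiB; the whole
 * superpage (512 PFNs) is unmapped and flushed, and that rounded size is
 * what gets returned to the caller.
 */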
4394 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4395 dma_addr_t iova)
4396 {
4397 struct dmar_domain *dmar_domain = domain->priv;
4398 struct dma_pte *pte;
4399 int level = 0;
4400 u64 phys = 0;
4402 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4403 if (pte)
4404 phys = dma_pte_addr(pte);
4406 return phys;
4407 }
4409 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4410 unsigned long cap)
4411 {
4412 struct dmar_domain *dmar_domain = domain->priv;
4414 if (cap == IOMMU_CAP_CACHE_COHERENCY)
4415 return dmar_domain->iommu_snooping;
4416 if (cap == IOMMU_CAP_INTR_REMAP)
4417 return irq_remapping_enabled;
4419 return 0;
4420 }
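/*
 * Editorial illustration, not part of the original driver: consumers such
 * as KVM query these capabilities through the generic helper rather than
 * calling this routine directly, e.g. to decide whether guest memory needs
 * explicit cache maintenance because the IOMMU cannot enforce snooping
 * (the helper below is a hypothetical sketch):
 */
#if 0
static bool example_needs_cache_flush(struct iommu_domain *domain)
{
	return !iommu_domain_has_cap(domain, IOMMU_CAP_CACHE_COHERENCY);
}
#endif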
4422 #define REQ_ACS_FLAGS (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4424 static int intel_iommu_add_device(struct device *dev)
4425 {
4426 struct pci_dev *pdev = to_pci_dev(dev);
4427 struct pci_dev *bridge, *dma_pdev = NULL;
4428 struct iommu_group *group;
4429 int ret;
4430 u8 bus, devfn;
4432 if (!device_to_iommu(dev, &bus, &devfn))
4433 return -ENODEV;
4435 bridge = pci_find_upstream_pcie_bridge(pdev);
4436 if (bridge) {
4437 if (pci_is_pcie(bridge))
4438 dma_pdev = pci_get_domain_bus_and_slot(
4439 pci_domain_nr(pdev->bus),
4440 bridge->subordinate->number, 0);
4441 if (!dma_pdev)
4442 dma_pdev = pci_dev_get(bridge);
4443 } else
4444 dma_pdev = pci_dev_get(pdev);
4446 /* Account for quirked devices */
4447 swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4449 /*
4450 * If it's a multifunction device that does not support our
4451 * required ACS flags, add to the same group as the lowest numbered
4452 * function that also does not support the required ACS flags.
4453 */
4454 if (dma_pdev->multifunction &&
4455 !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS)) {
4456 u8 i, slot = PCI_SLOT(dma_pdev->devfn);
4458 for (i = 0; i < 8; i++) {
4459 struct pci_dev *tmp;
4461 tmp = pci_get_slot(dma_pdev->bus, PCI_DEVFN(slot, i));
4462 if (!tmp)
4463 continue;
4465 if (!pci_acs_enabled(tmp, REQ_ACS_FLAGS)) {
4466 swap_pci_ref(&dma_pdev, tmp);
4467 break;
4468 }
4469 pci_dev_put(tmp);
4470 }
4471 }
4473 /*
4474 * Devices on the root bus go through the iommu. If that's not us,
4475 * find the next upstream device and test ACS up to the root bus.
4476 * Finding the next device may require skipping virtual buses.
4477 */
4478 while (!pci_is_root_bus(dma_pdev->bus)) {
4479 struct pci_bus *bus = dma_pdev->bus;
4481 while (!bus->self) {
4482 if (!pci_is_root_bus(bus))
4483 bus = bus->parent;
4484 else
4485 goto root_bus;
4486 }
4488 if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4489 break;
4491 swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4492 }
4494 root_bus:
4495 group = iommu_group_get(&dma_pdev->dev);
4496 pci_dev_put(dma_pdev);
4497 if (!group) {
4498 group = iommu_group_alloc();
4499 if (IS_ERR(group))
4500 return PTR_ERR(group);
4501 }
4503 ret = iommu_group_add_device(group, dev);
4505 iommu_group_put(group);
4506 return ret;
4507 }
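/*
 * Editorial note, not part of the original source: REQ_ACS_FLAGS above
 * requires ACS Source Validation, P2P Request Redirect, P2P Completion
 * Redirect and Upstream Forwarding.  Only when every bridge on the path to
 * the root complex enforces all four (what pci_acs_path_enabled() tests)
 * is peer-to-peer traffic guaranteed to pass through the IOMMU; a device
 * that fails the test is therefore grouped with the devices it could reach
 * directly, and that whole group must be handled as one isolation unit.
 */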
4509 static void intel_iommu_remove_device(struct device *dev)
4510 {
4511 iommu_group_remove_device(dev);
4512 }
4514 static struct iommu_ops intel_iommu_ops = {
4515 .domain_init = intel_iommu_domain_init,
4516 .domain_destroy = intel_iommu_domain_destroy,
4517 .attach_dev = intel_iommu_attach_device,
4518 .detach_dev = intel_iommu_detach_device,
4519 .map = intel_iommu_map,
4520 .unmap = intel_iommu_unmap,
4521 .iova_to_phys = intel_iommu_iova_to_phys,
4522 .domain_has_cap = intel_iommu_domain_has_cap,
4523 .add_device = intel_iommu_add_device,
4524 .remove_device = intel_iommu_remove_device,
4525 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
4526 };
4528 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4529 {
4530 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4531 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4532 dmar_map_gfx = 0;
4533 }
4535 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4536 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4537 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4538 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4539 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4540 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4541 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4543 static void quirk_iommu_rwbf(struct pci_dev *dev)
4544 {
4545 /*
4546 * Mobile 4 Series Chipset neglects to set RWBF capability,
4547 * but needs it. Same seems to hold for the desktop versions.
4548 */
4549 printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4550 rwbf_quirk = 1;
4551 }
4553 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4554 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4555 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4556 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4557 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4558 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4559 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
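/*
 * Editorial note, not part of the original source: setting rwbf_quirk
 * makes the driver behave as if the capability register advertised RWBF,
 * i.e. it issues an explicit write-buffer flush before expecting updates
 * to the in-memory translation structures to be visible to the hardware,
 * even though these chipsets do not set the capability bit themselves.
 */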
4561 #define GGC 0x52
4562 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4563 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4564 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4565 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4566 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4567 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4568 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4569 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
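/*
 * Editorial illustration, not part of the original driver: GGC is a 16-bit
 * register at offset 0x52 in the config space of the device this quirk
 * binds to; per the definitions above, bits 11:8 encode the graphics
 * translation table allocation, and the *_VT_* values indicate a layout
 * with room for a shadow GTT.  A hypothetical decode matching the check in
 * quirk_calpella_no_shadow_gtt() below:
 */
#if 0
static bool example_bios_allocated_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return false;	/* unreadable: assume no shadow GTT */
	return !!(ggc & GGC_MEMORY_VT_ENABLED);
}
#endif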
4571 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4572 {
4573 unsigned short ggc;
4575 if (pci_read_config_word(dev, GGC, &ggc))
4576 return;
4578 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4579 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4580 dmar_map_gfx = 0;
4581 } else if (dmar_map_gfx) {
4582 /* we have to ensure the gfx device is idle before we flush */
4583 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4584 intel_iommu_strict = 1;
4585 }
4586 }
4587 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4588 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4589 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4590 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4592 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4593 ISOCH DMAR unit for the Azalia sound device, but not give it any
4594 TLB entries, which causes it to deadlock. Check for that. We do
4595 this in a function called from init_dmars(), instead of in a PCI
4596 quirk, because we don't want to print the obnoxious "BIOS broken"
4597 message if VT-d is actually disabled.
4598 */
4599 static void __init check_tylersburg_isoch(void)
4600 {
4601 struct pci_dev *pdev;
4602 uint32_t vtisochctrl;
4604 /* If there's no Azalia in the system anyway, forget it. */
4605 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4606 if (!pdev)
4607 return;
4608 pci_dev_put(pdev);
4610 /* System Management Registers. Might be hidden, in which case
4611 we can't do the sanity check. But that's OK, because the
4612 known-broken BIOSes _don't_ actually hide it, so far. */
4613 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4614 if (!pdev)
4615 return;
4617 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4618 pci_dev_put(pdev);
4619 return;
4620 }
4622 pci_dev_put(pdev);
4624 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4625 if (vtisochctrl & 1)
4626 return;
4628 /* Drop all bits other than the number of TLB entries */
4629 vtisochctrl &= 0x1c;
4631 /* If we have the recommended number of TLB entries (16), fine. */
4632 if (vtisochctrl == 0x10)
4633 return;
4635 /* Zero TLB entries? You get to ride the short bus to school. */
4636 if (!vtisochctrl) {
4637 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4638 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4639 dmi_get_system_info(DMI_BIOS_VENDOR),
4640 dmi_get_system_info(DMI_BIOS_VERSION),
4641 dmi_get_system_info(DMI_PRODUCT_VERSION));
4642 iommu_identity_mapping |= IDENTMAP_AZALIA;
4643 return;
4644 }
4646 printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4647 vtisochctrl);