Linux 3.18.139
[linux/fpc-iii.git] / drivers/iommu/intel-iommu.c
blob b1966269f26a45d7257c5a42111eb91776b859c5
1 /*
2 * Copyright © 2006-2014 Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
13 * Authors: David Woodhouse <dwmw2@infradead.org>,
14 * Ashok Raj <ashok.raj@intel.com>,
15 * Shaohua Li <shaohua.li@intel.com>,
16 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17 * Fenghua Yu <fenghua.yu@intel.com>
20 #include <linux/init.h>
21 #include <linux/bitmap.h>
22 #include <linux/debugfs.h>
23 #include <linux/export.h>
24 #include <linux/slab.h>
25 #include <linux/irq.h>
26 #include <linux/interrupt.h>
27 #include <linux/spinlock.h>
28 #include <linux/pci.h>
29 #include <linux/dmar.h>
30 #include <linux/dma-mapping.h>
31 #include <linux/mempool.h>
32 #include <linux/memory.h>
33 #include <linux/timer.h>
34 #include <linux/iova.h>
35 #include <linux/iommu.h>
36 #include <linux/intel-iommu.h>
37 #include <linux/syscore_ops.h>
38 #include <linux/tboot.h>
39 #include <linux/dmi.h>
40 #include <linux/pci-ats.h>
41 #include <linux/memblock.h>
42 #include <linux/dma-contiguous.h>
43 #include <asm/irq_remapping.h>
44 #include <asm/cacheflush.h>
45 #include <asm/iommu.h>
47 #include "irq_remapping.h"
49 #define ROOT_SIZE VTD_PAGE_SIZE
50 #define CONTEXT_SIZE VTD_PAGE_SIZE
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
54 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
55 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
57 #define IOAPIC_RANGE_START (0xfee00000)
58 #define IOAPIC_RANGE_END (0xfeefffff)
59 #define IOVA_START_ADDR (0x1000)
61 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
63 #define MAX_AGAW_WIDTH 64
64 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
66 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
67 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
69 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
70 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
71 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
72 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
73 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
75 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
76 #define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
77 #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64))
79 /* page table handling */
80 #define LEVEL_STRIDE (9)
81 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
84 * This bitmap is used to advertise the page sizes our hardware supports
85 * to the IOMMU core, which will then use this information to split
86 * physically contiguous memory regions it is mapping into page sizes
87 * that we support.
89 * Traditionally the IOMMU core just handed us the mappings directly,
90 * after making sure the size is an order of a 4KiB page and that the
91 * mapping has natural alignment.
93 * To retain this behavior, we currently advertise that we support
94 * all page sizes that are an order of 4KiB.
96 * If at some point we'd like to utilize the IOMMU core's new behavior,
97 * we could change this to advertise the real page sizes we support.
99 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
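/*
 * A worked reading of the bitmap above: the IOMMU core treats a set bit n
 * as "page size 1 << n is supported", so ~0xFFFUL (every bit above bit 11)
 * advertises all power-of-two sizes from 4KiB upward:
 *
 *   bit 12 -> 4KiB, bit 21 -> 2MiB, bit 30 -> 1GiB, ...
 */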
101 static inline int agaw_to_level(int agaw)
103 return agaw + 2;
106 static inline int agaw_to_width(int agaw)
108 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
111 static inline int width_to_agaw(int width)
113 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
116 static inline unsigned int level_to_offset_bits(int level)
118 return (level - 1) * LEVEL_STRIDE;
121 static inline int pfn_level_offset(unsigned long pfn, int level)
123 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
126 static inline unsigned long level_mask(int level)
128 return -1UL << level_to_offset_bits(level);
131 static inline unsigned long level_size(int level)
133 return 1UL << level_to_offset_bits(level);
136 static inline unsigned long align_to_level(unsigned long pfn, int level)
138 return (pfn + level_size(level) - 1) & level_mask(level);
141 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
143 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
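/*
 * Worked example of the helpers above, for the default 48-bit address
 * width: width_to_agaw(48) = DIV_ROUND_UP(48 - 30, 9) = 2 and
 * agaw_to_level(2) = 4, i.e. a four-level page table.  A 48-bit address
 * yields a 36-bit VT-d PFN, which pfn_level_offset() splits into four
 * 9-bit indexes (bits 27-35, 18-26, 9-17 and 0-8 for levels 4..1), each
 * selecting one of the 512 entries in a 4KiB table page.
 */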
146 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
147 are never going to work. */
148 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
150 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
153 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
155 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
157 static inline unsigned long page_to_dma_pfn(struct page *pg)
159 return mm_to_dma_pfn(page_to_pfn(pg));
161 static inline unsigned long virt_to_dma_pfn(void *p)
163 return page_to_dma_pfn(virt_to_page(p));
166 /* global iommu list, set NULL for ignored DMAR units */
167 static struct intel_iommu **g_iommus;
169 static void __init check_tylersburg_isoch(void);
170 static int rwbf_quirk;
173 * set to 1 to panic the kernel if we can't successfully enable VT-d
174 * (used when kernel is launched w/ TXT)
176 static int force_on = 0;
179 * 0: Present
180 * 1-11: Reserved
181 * 12-63: Context Ptr (12 - (haw-1))
182 * 64-127: Reserved
184 struct root_entry {
185 u64 val;
186 u64 rsvd1;
188 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
189 static inline bool root_present(struct root_entry *root)
191 return (root->val & 1);
193 static inline void set_root_present(struct root_entry *root)
195 root->val |= 1;
197 static inline void set_root_value(struct root_entry *root, unsigned long value)
199 root->val |= value & VTD_PAGE_MASK;
202 static inline struct context_entry *
203 get_context_addr_from_root(struct root_entry *root)
205 return (struct context_entry *)
206 (root_present(root)?phys_to_virt(
207 root->val & VTD_PAGE_MASK) :
208 NULL);
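/*
 * Example root entry built with the helpers above (illustrative physical
 * address): a context table allocated at 0x3fe12000, marked present,
 * gives root->val = 0x3fe12000 | 1 = 0x3fe12001, and
 * get_context_addr_from_root() recovers the table by masking off the low
 * 12 bits and converting back with phys_to_virt().
 */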
212 * low 64 bits:
213 * 0: present
214 * 1: fault processing disable
215 * 2-3: translation type
216 * 12-63: address space root
217 * high 64 bits:
218 * 0-2: address width
219 * 3-6: aval
220 * 8-23: domain id
222 struct context_entry {
223 u64 lo;
224 u64 hi;
227 static inline bool context_present(struct context_entry *context)
229 return (context->lo & 1);
231 static inline void context_set_present(struct context_entry *context)
233 context->lo |= 1;
236 static inline void context_set_fault_enable(struct context_entry *context)
238 context->lo &= (((u64)-1) << 2) | 1;
241 static inline void context_set_translation_type(struct context_entry *context,
242 unsigned long value)
244 context->lo &= (((u64)-1) << 4) | 3;
245 context->lo |= (value & 3) << 2;
248 static inline void context_set_address_root(struct context_entry *context,
249 unsigned long value)
251 context->lo |= value & VTD_PAGE_MASK;
254 static inline void context_set_address_width(struct context_entry *context,
255 unsigned long value)
257 context->hi |= value & 7;
260 static inline void context_set_domain_id(struct context_entry *context,
261 unsigned long value)
263 context->hi |= (value & ((1 << 16) - 1)) << 8;
266 static inline void context_clear_entry(struct context_entry *context)
268 context->lo = 0;
269 context->hi = 0;
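/*
 * Example of how domain_context_mapping_one() below composes an entry
 * with these helpers (illustrative values): for domain id 5, a 4-level
 * page table at physical 0x3fe13000, translation type
 * CONTEXT_TT_MULTI_LEVEL (0) and address width value 2, the result is
 *
 *   context->lo = 0x3fe13000 | (0 << 2) | 1 = 0x3fe13001
 *   context->hi = (5 << 8) | 2 = 0x502
 */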
273 * 0: readable
274 * 1: writable
275 * 2-6: reserved
276 * 7: super page
277 * 8-10: available
278 * 11: snoop behavior
279 * 12-63: Host physical address
281 struct dma_pte {
282 u64 val;
285 static inline void dma_clear_pte(struct dma_pte *pte)
287 pte->val = 0;
290 static inline u64 dma_pte_addr(struct dma_pte *pte)
292 #ifdef CONFIG_64BIT
293 return pte->val & VTD_PAGE_MASK;
294 #else
295 /* Must have a full atomic 64-bit read */
296 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
297 #endif
300 static inline bool dma_pte_present(struct dma_pte *pte)
302 return (pte->val & 3) != 0;
305 static inline bool dma_pte_superpage(struct dma_pte *pte)
307 return (pte->val & DMA_PTE_LARGE_PAGE);
310 static inline int first_pte_in_page(struct dma_pte *pte)
312 return !((unsigned long)pte & ~VTD_PAGE_MASK);
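/*
 * Example leaf PTE following the bit layout above (illustrative physical
 * address): a readable, writable 4KiB mapping of 0x12345000 is
 *
 *   pte->val = 0x12345000 | DMA_PTE_READ | DMA_PTE_WRITE = 0x12345003
 *
 * A 2MiB superpage mapping additionally sets DMA_PTE_LARGE_PAGE (bit 7).
 */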
316 * This domain is a statically identity mapping domain.
317 * 1. This domain creates a static 1:1 mapping to all usable memory.
318 * 2. It maps to each iommu if successful.
319 * 3. Each iommu maps to this domain if successful.
321 static struct dmar_domain *si_domain;
322 static int hw_pass_through = 1;
324 /* domain represents a virtual machine; more than one device
325 * across iommus may be owned by one domain, e.g. a kvm guest.
327 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 0)
329 /* si_domain contains multiple devices */
330 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 1)
332 /* define the limit of IOMMUs supported in each domain */
333 #ifdef CONFIG_X86
334 # define IOMMU_UNITS_SUPPORTED MAX_IO_APICS
335 #else
336 # define IOMMU_UNITS_SUPPORTED 64
337 #endif
339 struct dmar_domain {
340 int id; /* domain id */
341 int nid; /* node id */
342 DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
343 /* bitmap of iommus this domain uses*/
345 struct list_head devices; /* all devices' list */
346 struct iova_domain iovad; /* iova's that belong to this domain */
348 struct dma_pte *pgd; /* virtual address */
349 int gaw; /* max guest address width */
351 /* adjusted guest address width, 0 is level 2 30-bit */
352 int agaw;
354 int flags; /* flags to find out type of domain */
356 int iommu_coherency;/* indicate coherency of iommu access */
357 int iommu_snooping; /* indicate snooping control feature*/
358 int iommu_count; /* reference count of iommu */
359 int iommu_superpage;/* Level of superpages supported:
360 0 == 4KiB (no superpages), 1 == 2MiB,
361 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
362 spinlock_t iommu_lock; /* protect iommu set in domain */
363 u64 max_addr; /* maximum mapped address */
366 /* PCI domain-device relationship */
367 struct device_domain_info {
368 struct list_head link; /* link to domain siblings */
369 struct list_head global; /* link to global list */
370 u8 bus; /* PCI bus number */
371 u8 devfn; /* PCI devfn number */
372 struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
373 struct intel_iommu *iommu; /* IOMMU used by this device */
374 struct dmar_domain *domain; /* pointer to domain */
377 struct dmar_rmrr_unit {
378 struct list_head list; /* list of rmrr units */
379 struct acpi_dmar_header *hdr; /* ACPI header */
380 u64 base_address; /* reserved base address*/
381 u64 end_address; /* reserved end address */
382 struct dmar_dev_scope *devices; /* target devices */
383 int devices_cnt; /* target device count */
386 struct dmar_atsr_unit {
387 struct list_head list; /* list of ATSR units */
388 struct acpi_dmar_header *hdr; /* ACPI header */
389 struct dmar_dev_scope *devices; /* target devices */
390 int devices_cnt; /* target device count */
391 u8 include_all:1; /* include all ports */
394 static LIST_HEAD(dmar_atsr_units);
395 static LIST_HEAD(dmar_rmrr_units);
397 #define for_each_rmrr_units(rmrr) \
398 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
400 static void flush_unmaps_timeout(unsigned long data);
402 static DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
404 #define HIGH_WATER_MARK 250
405 struct deferred_flush_tables {
406 int next;
407 struct iova *iova[HIGH_WATER_MARK];
408 struct dmar_domain *domain[HIGH_WATER_MARK];
409 struct page *freelist[HIGH_WATER_MARK];
412 static struct deferred_flush_tables *deferred_flush;
414 /* bitmap for indexing intel_iommus */
415 static int g_num_of_iommus;
417 static DEFINE_SPINLOCK(async_umap_flush_lock);
418 static LIST_HEAD(unmaps_to_do);
420 static int timer_on;
421 static long list_size;
423 static void domain_exit(struct dmar_domain *domain);
424 static void domain_remove_dev_info(struct dmar_domain *domain);
425 static void domain_remove_one_dev_info(struct dmar_domain *domain,
426 struct device *dev);
427 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
428 struct device *dev);
429 static int domain_detach_iommu(struct dmar_domain *domain,
430 struct intel_iommu *iommu);
432 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
433 int dmar_disabled = 0;
434 #else
435 int dmar_disabled = 1;
436 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
438 int intel_iommu_enabled = 0;
439 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
441 static int dmar_map_gfx = 1;
442 static int dmar_forcedac;
443 static int intel_iommu_strict;
444 static int intel_iommu_superpage = 1;
446 int intel_iommu_gfx_mapped;
447 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
449 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
450 static DEFINE_SPINLOCK(device_domain_lock);
451 static LIST_HEAD(device_domain_list);
453 static const struct iommu_ops intel_iommu_ops;
455 static int __init intel_iommu_setup(char *str)
457 if (!str)
458 return -EINVAL;
459 while (*str) {
460 if (!strncmp(str, "on", 2)) {
461 dmar_disabled = 0;
462 printk(KERN_INFO "Intel-IOMMU: enabled\n");
463 } else if (!strncmp(str, "off", 3)) {
464 dmar_disabled = 1;
465 printk(KERN_INFO "Intel-IOMMU: disabled\n");
466 } else if (!strncmp(str, "igfx_off", 8)) {
467 dmar_map_gfx = 0;
468 printk(KERN_INFO
469 "Intel-IOMMU: disable GFX device mapping\n");
470 } else if (!strncmp(str, "forcedac", 8)) {
471 printk(KERN_INFO
472 "Intel-IOMMU: Forcing DAC for PCI devices\n");
473 dmar_forcedac = 1;
474 } else if (!strncmp(str, "strict", 6)) {
475 printk(KERN_INFO
476 "Intel-IOMMU: disable batched IOTLB flush\n");
477 intel_iommu_strict = 1;
478 } else if (!strncmp(str, "sp_off", 6)) {
479 printk(KERN_INFO
480 "Intel-IOMMU: disable supported super page\n");
481 intel_iommu_superpage = 0;
484 str += strcspn(str, ",");
485 while (*str == ',')
486 str++;
488 return 0;
490 __setup("intel_iommu=", intel_iommu_setup);
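/*
 * Usage of the boot parameter parsed above: options are comma separated,
 * e.g.
 *
 *   intel_iommu=on,strict,sp_off
 *
 * which enables the IOMMU, disables batched IOTLB flushing and disables
 * superpage support, per the handlers in intel_iommu_setup().
 */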
492 static struct kmem_cache *iommu_domain_cache;
493 static struct kmem_cache *iommu_devinfo_cache;
494 static struct kmem_cache *iommu_iova_cache;
496 static inline void *alloc_pgtable_page(int node)
498 struct page *page;
499 void *vaddr = NULL;
501 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
502 if (page)
503 vaddr = page_address(page);
504 return vaddr;
507 static inline void free_pgtable_page(void *vaddr)
509 free_page((unsigned long)vaddr);
512 static inline void *alloc_domain_mem(void)
514 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
517 static void free_domain_mem(void *vaddr)
519 kmem_cache_free(iommu_domain_cache, vaddr);
522 static inline void * alloc_devinfo_mem(void)
524 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
527 static inline void free_devinfo_mem(void *vaddr)
529 kmem_cache_free(iommu_devinfo_cache, vaddr);
532 struct iova *alloc_iova_mem(void)
534 return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
537 void free_iova_mem(struct iova *iova)
539 kmem_cache_free(iommu_iova_cache, iova);
542 static inline int domain_type_is_vm(struct dmar_domain *domain)
544 return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
547 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
549 return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
550 DOMAIN_FLAG_STATIC_IDENTITY);
553 static inline int domain_pfn_supported(struct dmar_domain *domain,
554 unsigned long pfn)
556 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
558 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
561 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
563 unsigned long sagaw;
564 int agaw = -1;
566 sagaw = cap_sagaw(iommu->cap);
567 for (agaw = width_to_agaw(max_gaw);
568 agaw >= 0; agaw--) {
569 if (test_bit(agaw, &sagaw))
570 break;
573 return agaw;
577 * Calculate max SAGAW for each iommu.
579 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
581 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
585 * calculate agaw for each iommu.
586 * "SAGAW" may be different across iommus, use a default agaw, and
587 * get a supported, smaller agaw for iommus that don't support the default agaw.
589 int iommu_calculate_agaw(struct intel_iommu *iommu)
591 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
594 /* This function only returns a single iommu in a domain */
595 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
597 int iommu_id;
599 /* si_domain and vm domain should not get here. */
600 BUG_ON(domain_type_is_vm_or_si(domain));
601 iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
602 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
603 return NULL;
605 return g_iommus[iommu_id];
608 static void domain_update_iommu_coherency(struct dmar_domain *domain)
610 struct dmar_drhd_unit *drhd;
611 struct intel_iommu *iommu;
612 int i, found = 0;
614 domain->iommu_coherency = 1;
616 for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
617 found = 1;
618 if (!ecap_coherent(g_iommus[i]->ecap)) {
619 domain->iommu_coherency = 0;
620 break;
623 if (found)
624 return;
626 /* No hardware attached; use lowest common denominator */
627 rcu_read_lock();
628 for_each_active_iommu(iommu, drhd) {
629 if (!ecap_coherent(iommu->ecap)) {
630 domain->iommu_coherency = 0;
631 break;
634 rcu_read_unlock();
637 static int domain_update_iommu_snooping(struct intel_iommu *skip)
639 struct dmar_drhd_unit *drhd;
640 struct intel_iommu *iommu;
641 int ret = 1;
643 rcu_read_lock();
644 for_each_active_iommu(iommu, drhd) {
645 if (iommu != skip) {
646 if (!ecap_sc_support(iommu->ecap)) {
647 ret = 0;
648 break;
652 rcu_read_unlock();
654 return ret;
657 static int domain_update_iommu_superpage(struct intel_iommu *skip)
659 struct dmar_drhd_unit *drhd;
660 struct intel_iommu *iommu;
661 int mask = 0xf;
663 if (!intel_iommu_superpage) {
664 return 0;
667 /* set iommu_superpage to the smallest common denominator */
668 rcu_read_lock();
669 for_each_active_iommu(iommu, drhd) {
670 if (iommu != skip) {
671 mask &= cap_super_page_val(iommu->cap);
672 if (!mask)
673 break;
676 rcu_read_unlock();
678 return fls(mask);
681 /* Some capabilities may be different across iommus */
682 static void domain_update_iommu_cap(struct dmar_domain *domain)
684 domain_update_iommu_coherency(domain);
685 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
686 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
689 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
691 struct dmar_drhd_unit *drhd = NULL;
692 struct intel_iommu *iommu;
693 struct device *tmp;
694 struct pci_dev *ptmp, *pdev = NULL;
695 u16 segment = 0;
696 int i;
698 if (dev_is_pci(dev)) {
699 struct pci_dev *pf_pdev;
701 pdev = to_pci_dev(dev);
702 /* VFs aren't listed in scope tables; we need to look up
703 * the PF instead to find the IOMMU. */
704 pf_pdev = pci_physfn(pdev);
705 dev = &pf_pdev->dev;
706 segment = pci_domain_nr(pdev->bus);
707 } else if (ACPI_COMPANION(dev))
708 dev = &ACPI_COMPANION(dev)->dev;
710 rcu_read_lock();
711 for_each_active_iommu(iommu, drhd) {
712 if (pdev && segment != drhd->segment)
713 continue;
715 for_each_active_dev_scope(drhd->devices,
716 drhd->devices_cnt, i, tmp) {
717 if (tmp == dev) {
718 /* For a VF use its original BDF# not that of the PF
719 * which we used for the IOMMU lookup. Strictly speaking
720 * we could do this for all PCI devices; we only need to
721 * get the BDF# from the scope table for ACPI matches. */
722 if (pdev && pdev->is_virtfn)
723 goto got_pdev;
725 *bus = drhd->devices[i].bus;
726 *devfn = drhd->devices[i].devfn;
727 goto out;
730 if (!pdev || !dev_is_pci(tmp))
731 continue;
733 ptmp = to_pci_dev(tmp);
734 if (ptmp->subordinate &&
735 ptmp->subordinate->number <= pdev->bus->number &&
736 ptmp->subordinate->busn_res.end >= pdev->bus->number)
737 goto got_pdev;
740 if (pdev && drhd->include_all) {
741 got_pdev:
742 *bus = pdev->bus->number;
743 *devfn = pdev->devfn;
744 goto out;
747 iommu = NULL;
748 out:
749 rcu_read_unlock();
751 return iommu;
754 static void domain_flush_cache(struct dmar_domain *domain,
755 void *addr, int size)
757 if (!domain->iommu_coherency)
758 clflush_cache_range(addr, size);
761 /* Gets context entry for a given bus and devfn */
762 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
763 u8 bus, u8 devfn)
765 struct root_entry *root;
766 struct context_entry *context;
767 unsigned long phy_addr;
768 unsigned long flags;
770 spin_lock_irqsave(&iommu->lock, flags);
771 root = &iommu->root_entry[bus];
772 context = get_context_addr_from_root(root);
773 if (!context) {
774 context = (struct context_entry *)
775 alloc_pgtable_page(iommu->node);
776 if (!context) {
777 spin_unlock_irqrestore(&iommu->lock, flags);
778 return NULL;
780 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
781 phy_addr = virt_to_phys((void *)context);
782 set_root_value(root, phy_addr);
783 set_root_present(root);
784 __iommu_flush_cache(iommu, root, sizeof(*root));
786 spin_unlock_irqrestore(&iommu->lock, flags);
787 return &context[devfn];
790 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
792 struct root_entry *root;
793 struct context_entry *context;
794 int ret;
795 unsigned long flags;
797 spin_lock_irqsave(&iommu->lock, flags);
798 root = &iommu->root_entry[bus];
799 context = get_context_addr_from_root(root);
800 if (!context) {
801 ret = 0;
802 goto out;
804 ret = context_present(&context[devfn]);
805 out:
806 spin_unlock_irqrestore(&iommu->lock, flags);
807 return ret;
810 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
812 struct root_entry *root;
813 struct context_entry *context;
814 unsigned long flags;
816 spin_lock_irqsave(&iommu->lock, flags);
817 root = &iommu->root_entry[bus];
818 context = get_context_addr_from_root(root);
819 if (context) {
820 context_clear_entry(&context[devfn]);
821 __iommu_flush_cache(iommu, &context[devfn], \
822 sizeof(*context));
824 spin_unlock_irqrestore(&iommu->lock, flags);
827 static void free_context_table(struct intel_iommu *iommu)
829 struct root_entry *root;
830 int i;
831 unsigned long flags;
832 struct context_entry *context;
834 spin_lock_irqsave(&iommu->lock, flags);
835 if (!iommu->root_entry) {
836 goto out;
838 for (i = 0; i < ROOT_ENTRY_NR; i++) {
839 root = &iommu->root_entry[i];
840 context = get_context_addr_from_root(root);
841 if (context)
842 free_pgtable_page(context);
844 free_pgtable_page(iommu->root_entry);
845 iommu->root_entry = NULL;
846 out:
847 spin_unlock_irqrestore(&iommu->lock, flags);
850 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
851 unsigned long pfn, int *target_level)
853 struct dma_pte *parent, *pte = NULL;
854 int level = agaw_to_level(domain->agaw);
855 int offset;
857 BUG_ON(!domain->pgd);
859 if (!domain_pfn_supported(domain, pfn))
860 /* Address beyond IOMMU's addressing capabilities. */
861 return NULL;
863 parent = domain->pgd;
865 while (1) {
866 void *tmp_page;
868 offset = pfn_level_offset(pfn, level);
869 pte = &parent[offset];
870 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
871 break;
872 if (level == *target_level)
873 break;
875 if (!dma_pte_present(pte)) {
876 uint64_t pteval;
878 tmp_page = alloc_pgtable_page(domain->nid);
880 if (!tmp_page)
881 return NULL;
883 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
884 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
885 if (cmpxchg64(&pte->val, 0ULL, pteval))
886 /* Someone else set it while we were thinking; use theirs. */
887 free_pgtable_page(tmp_page);
888 else
889 domain_flush_cache(domain, pte, sizeof(*pte));
891 if (level == 1)
892 break;
894 parent = phys_to_virt(dma_pte_addr(pte));
895 level--;
898 if (!*target_level)
899 *target_level = level;
901 return pte;
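/*
 * In short, pfn_to_dma_pte() walks from the top level down toward
 * *target_level, allocating missing table pages on the way; the
 * cmpxchg64() lets two CPUs race to populate the same slot, and the loser
 * simply frees its page.  Called with *target_level == 0 it never
 * allocates: it descends through present entries and returns the first
 * superpage or non-present PTE it meets, reporting that level back
 * through *target_level.
 */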
905 /* return address's pte at specific level */
906 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
907 unsigned long pfn,
908 int level, int *large_page)
910 struct dma_pte *parent, *pte = NULL;
911 int total = agaw_to_level(domain->agaw);
912 int offset;
914 parent = domain->pgd;
915 while (level <= total) {
916 offset = pfn_level_offset(pfn, total);
917 pte = &parent[offset];
918 if (level == total)
919 return pte;
921 if (!dma_pte_present(pte)) {
922 *large_page = total;
923 break;
926 if (dma_pte_superpage(pte)) {
927 *large_page = total;
928 return pte;
931 parent = phys_to_virt(dma_pte_addr(pte));
932 total--;
934 return NULL;
937 /* clear last level pte; a tlb flush should follow */
938 static void dma_pte_clear_range(struct dmar_domain *domain,
939 unsigned long start_pfn,
940 unsigned long last_pfn)
942 unsigned int large_page = 1;
943 struct dma_pte *first_pte, *pte;
945 BUG_ON(!domain_pfn_supported(domain, start_pfn));
946 BUG_ON(!domain_pfn_supported(domain, last_pfn));
947 BUG_ON(start_pfn > last_pfn);
949 /* we don't need lock here; nobody else touches the iova range */
950 do {
951 large_page = 1;
952 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
953 if (!pte) {
954 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
955 continue;
957 do {
958 dma_clear_pte(pte);
959 start_pfn += lvl_to_nr_pages(large_page);
960 pte++;
961 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
963 domain_flush_cache(domain, first_pte,
964 (void *)pte - (void *)first_pte);
966 } while (start_pfn && start_pfn <= last_pfn);
969 static void dma_pte_free_level(struct dmar_domain *domain, int level,
970 struct dma_pte *pte, unsigned long pfn,
971 unsigned long start_pfn, unsigned long last_pfn)
973 pfn = max(start_pfn, pfn);
974 pte = &pte[pfn_level_offset(pfn, level)];
976 do {
977 unsigned long level_pfn;
978 struct dma_pte *level_pte;
980 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
981 goto next;
983 level_pfn = pfn & level_mask(level);
984 level_pte = phys_to_virt(dma_pte_addr(pte));
986 if (level > 2)
987 dma_pte_free_level(domain, level - 1, level_pte,
988 level_pfn, start_pfn, last_pfn);
990 /* If range covers entire pagetable, free it */
991 if (!(start_pfn > level_pfn ||
992 last_pfn < level_pfn + level_size(level) - 1)) {
993 dma_clear_pte(pte);
994 domain_flush_cache(domain, pte, sizeof(*pte));
995 free_pgtable_page(level_pte);
997 next:
998 pfn += level_size(level);
999 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1002 /* free page table pages. last level pte should already be cleared */
1003 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1004 unsigned long start_pfn,
1005 unsigned long last_pfn)
1007 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1008 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1009 BUG_ON(start_pfn > last_pfn);
1011 dma_pte_clear_range(domain, start_pfn, last_pfn);
1013 /* We don't need lock here; nobody else touches the iova range */
1014 dma_pte_free_level(domain, agaw_to_level(domain->agaw),
1015 domain->pgd, 0, start_pfn, last_pfn);
1017 /* free pgd */
1018 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1019 free_pgtable_page(domain->pgd);
1020 domain->pgd = NULL;
1024 /* When a page at a given level is being unlinked from its parent, we don't
1025 need to *modify* it at all. All we need to do is make a list of all the
1026 pages which can be freed just as soon as we've flushed the IOTLB and we
1027 know the hardware page-walk will no longer touch them.
1028 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1029 be freed. */
1030 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1031 int level, struct dma_pte *pte,
1032 struct page *freelist)
1034 struct page *pg;
1036 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1037 pg->freelist = freelist;
1038 freelist = pg;
1040 if (level == 1)
1041 return freelist;
1043 pte = page_address(pg);
1044 do {
1045 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1046 freelist = dma_pte_list_pagetables(domain, level - 1,
1047 pte, freelist);
1048 pte++;
1049 } while (!first_pte_in_page(pte));
1051 return freelist;
1054 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1055 struct dma_pte *pte, unsigned long pfn,
1056 unsigned long start_pfn,
1057 unsigned long last_pfn,
1058 struct page *freelist)
1060 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1062 pfn = max(start_pfn, pfn);
1063 pte = &pte[pfn_level_offset(pfn, level)];
1065 do {
1066 unsigned long level_pfn;
1068 if (!dma_pte_present(pte))
1069 goto next;
1071 level_pfn = pfn & level_mask(level);
1073 /* If range covers entire pagetable, free it */
1074 if (start_pfn <= level_pfn &&
1075 last_pfn >= level_pfn + level_size(level) - 1) {
1076 /* These subordinate page tables are going away entirely. Don't
1077 bother to clear them; we're just going to *free* them. */
1078 if (level > 1 && !dma_pte_superpage(pte))
1079 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1081 dma_clear_pte(pte);
1082 if (!first_pte)
1083 first_pte = pte;
1084 last_pte = pte;
1085 } else if (level > 1) {
1086 /* Recurse down into a level that isn't *entirely* obsolete */
1087 freelist = dma_pte_clear_level(domain, level - 1,
1088 phys_to_virt(dma_pte_addr(pte)),
1089 level_pfn, start_pfn, last_pfn,
1090 freelist);
1092 next:
1093 pfn += level_size(level);
1094 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1096 if (first_pte)
1097 domain_flush_cache(domain, first_pte,
1098 (void *)++last_pte - (void *)first_pte);
1100 return freelist;
1103 /* We can't just free the pages because the IOMMU may still be walking
1104 the page tables, and may have cached the intermediate levels. The
1105 pages can only be freed after the IOTLB flush has been done. */
1106 struct page *domain_unmap(struct dmar_domain *domain,
1107 unsigned long start_pfn,
1108 unsigned long last_pfn)
1110 struct page *freelist = NULL;
1112 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1113 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1114 BUG_ON(start_pfn > last_pfn);
1116 /* we don't need lock here; nobody else touches the iova range */
1117 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1118 domain->pgd, 0, start_pfn, last_pfn, NULL);
1120 /* free pgd */
1121 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1122 struct page *pgd_page = virt_to_page(domain->pgd);
1123 pgd_page->freelist = freelist;
1124 freelist = pgd_page;
1126 domain->pgd = NULL;
1129 return freelist;
1132 void dma_free_pagelist(struct page *freelist)
1134 struct page *pg;
1136 while ((pg = freelist)) {
1137 freelist = pg->freelist;
1138 free_pgtable_page(page_address(pg));
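/*
 * Typical use of the two functions above: a caller first collects the
 * obsolete page-table pages with domain_unmap() (chained through
 * page->freelist), then performs the IOTLB flush, and only then releases
 * the pages with dma_free_pagelist(), so the hardware can never walk a
 * freed page.
 */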
1142 /* iommu handling */
1143 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1145 struct root_entry *root;
1146 unsigned long flags;
1148 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1149 if (!root)
1150 return -ENOMEM;
1152 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1154 spin_lock_irqsave(&iommu->lock, flags);
1155 iommu->root_entry = root;
1156 spin_unlock_irqrestore(&iommu->lock, flags);
1158 return 0;
1161 static void iommu_set_root_entry(struct intel_iommu *iommu)
1163 void *addr;
1164 u32 sts;
1165 unsigned long flag;
1167 addr = iommu->root_entry;
1169 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1170 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
1172 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1174 /* Make sure hardware complete it */
1175 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1176 readl, (sts & DMA_GSTS_RTPS), sts);
1178 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1181 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1183 u32 val;
1184 unsigned long flag;
1186 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1187 return;
1189 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1190 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1192 /* Make sure hardware complete it */
1193 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1194 readl, (!(val & DMA_GSTS_WBFS)), val);
1196 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1199 /* return value determines if we need a write buffer flush */
1200 static void __iommu_flush_context(struct intel_iommu *iommu,
1201 u16 did, u16 source_id, u8 function_mask,
1202 u64 type)
1204 u64 val = 0;
1205 unsigned long flag;
1207 switch (type) {
1208 case DMA_CCMD_GLOBAL_INVL:
1209 val = DMA_CCMD_GLOBAL_INVL;
1210 break;
1211 case DMA_CCMD_DOMAIN_INVL:
1212 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1213 break;
1214 case DMA_CCMD_DEVICE_INVL:
1215 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1216 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1217 break;
1218 default:
1219 BUG();
1221 val |= DMA_CCMD_ICC;
1223 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1224 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1226 /* Make sure hardware complete it */
1227 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1228 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1230 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1233 /* return value determines if we need a write buffer flush */
1234 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1235 u64 addr, unsigned int size_order, u64 type)
1237 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1238 u64 val = 0, val_iva = 0;
1239 unsigned long flag;
1241 switch (type) {
1242 case DMA_TLB_GLOBAL_FLUSH:
1243 /* global flush doesn't need to set IVA_REG */
1244 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1245 break;
1246 case DMA_TLB_DSI_FLUSH:
1247 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1248 break;
1249 case DMA_TLB_PSI_FLUSH:
1250 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1251 /* IH bit is passed in as part of address */
1252 val_iva = size_order | addr;
1253 break;
1254 default:
1255 BUG();
1257 /* Note: set drain read/write */
1258 #if 0
1260 * This is probably to be super secure.. Looks like we can
1261 * ignore it without any impact.
1263 if (cap_read_drain(iommu->cap))
1264 val |= DMA_TLB_READ_DRAIN;
1265 #endif
1266 if (cap_write_drain(iommu->cap))
1267 val |= DMA_TLB_WRITE_DRAIN;
1269 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1270 /* Note: Only uses first TLB reg currently */
1271 if (val_iva)
1272 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1273 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1275 /* Make sure hardware complete it */
1276 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1277 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1279 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1281 /* check IOTLB invalidation granularity */
1282 if (DMA_TLB_IAIG(val) == 0)
1283 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1284 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1285 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1286 (unsigned long long)DMA_TLB_IIRG(type),
1287 (unsigned long long)DMA_TLB_IAIG(val));
1290 static struct device_domain_info *
1291 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1292 u8 bus, u8 devfn)
1294 int found = 0;
1295 unsigned long flags;
1296 struct device_domain_info *info;
1297 struct pci_dev *pdev;
1299 if (!ecap_dev_iotlb_support(iommu->ecap))
1300 return NULL;
1302 if (!iommu->qi)
1303 return NULL;
1305 spin_lock_irqsave(&device_domain_lock, flags);
1306 list_for_each_entry(info, &domain->devices, link)
1307 if (info->iommu == iommu && info->bus == bus &&
1308 info->devfn == devfn) {
1309 found = 1;
1310 break;
1312 spin_unlock_irqrestore(&device_domain_lock, flags);
1314 if (!found || !info->dev || !dev_is_pci(info->dev))
1315 return NULL;
1317 pdev = to_pci_dev(info->dev);
1319 if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
1320 return NULL;
1322 if (!dmar_find_matched_atsr_unit(pdev))
1323 return NULL;
1325 return info;
1328 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1330 if (!info || !dev_is_pci(info->dev))
1331 return;
1333 pci_enable_ats(to_pci_dev(info->dev), VTD_PAGE_SHIFT);
1336 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1338 if (!info->dev || !dev_is_pci(info->dev) ||
1339 !pci_ats_enabled(to_pci_dev(info->dev)))
1340 return;
1342 pci_disable_ats(to_pci_dev(info->dev));
1345 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1346 u64 addr, unsigned mask)
1348 u16 sid, qdep;
1349 unsigned long flags;
1350 struct device_domain_info *info;
1352 spin_lock_irqsave(&device_domain_lock, flags);
1353 list_for_each_entry(info, &domain->devices, link) {
1354 struct pci_dev *pdev;
1355 if (!info->dev || !dev_is_pci(info->dev))
1356 continue;
1358 pdev = to_pci_dev(info->dev);
1359 if (!pci_ats_enabled(pdev))
1360 continue;
1362 sid = info->bus << 8 | info->devfn;
1363 qdep = pci_ats_queue_depth(pdev);
1364 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1366 spin_unlock_irqrestore(&device_domain_lock, flags);
1369 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1370 unsigned long pfn, unsigned int pages, int ih, int map)
1372 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1373 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1375 BUG_ON(pages == 0);
1377 if (ih)
1378 ih = 1 << 6;
1380 * Fall back to domain selective flush if no PSI support or the size is
1381 * too big.
1382 * PSI requires page size to be 2 ^ x, and the base address is naturally
1383 * aligned to the size
1385 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1386 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1387 DMA_TLB_DSI_FLUSH);
1388 else
1389 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1390 DMA_TLB_PSI_FLUSH);
1393 * In caching mode, changes of pages from non-present to present require
1394 * flush. However, device IOTLB doesn't need to be flushed in this case.
1396 if (!cap_caching_mode(iommu->cap) || !map)
1397 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
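/*
 * Worked example for the mask computation above: flushing pages == 3
 * gives mask = ilog2(__roundup_pow_of_two(3)) = ilog2(4) = 2, i.e. a
 * page-selective invalidation covering 2^2 = 4 pages, naturally aligned.
 * If that mask exceeds cap_max_amask_val() (or PSI is unsupported), the
 * code falls back to a domain-selective flush instead.
 */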
1400 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1402 u32 pmen;
1403 unsigned long flags;
1405 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1406 return;
1408 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1409 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1410 pmen &= ~DMA_PMEN_EPM;
1411 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1413 /* wait for the protected region status bit to clear */
1414 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1415 readl, !(pmen & DMA_PMEN_PRS), pmen);
1417 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1420 static void iommu_enable_translation(struct intel_iommu *iommu)
1422 u32 sts;
1423 unsigned long flags;
1425 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1426 iommu->gcmd |= DMA_GCMD_TE;
1427 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1429 /* Make sure hardware complete it */
1430 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1431 readl, (sts & DMA_GSTS_TES), sts);
1433 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1436 static void iommu_disable_translation(struct intel_iommu *iommu)
1438 u32 sts;
1439 unsigned long flag;
1441 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1442 iommu->gcmd &= ~DMA_GCMD_TE;
1443 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1445 /* Make sure hardware complete it */
1446 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1447 readl, (!(sts & DMA_GSTS_TES)), sts);
1449 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1453 static int iommu_init_domains(struct intel_iommu *iommu)
1455 unsigned long ndomains;
1456 unsigned long nlongs;
1458 ndomains = cap_ndoms(iommu->cap);
1459 pr_debug("IOMMU%d: Number of Domains supported <%ld>\n",
1460 iommu->seq_id, ndomains);
1461 nlongs = BITS_TO_LONGS(ndomains);
1463 spin_lock_init(&iommu->lock);
1465 /* TBD: there might be 64K domains,
1466 * consider other allocation for future chip
1468 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1469 if (!iommu->domain_ids) {
1470 pr_err("IOMMU%d: allocating domain id array failed\n",
1471 iommu->seq_id);
1472 return -ENOMEM;
1474 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1475 GFP_KERNEL);
1476 if (!iommu->domains) {
1477 pr_err("IOMMU%d: allocating domain array failed\n",
1478 iommu->seq_id);
1479 kfree(iommu->domain_ids);
1480 iommu->domain_ids = NULL;
1481 return -ENOMEM;
1485 * if Caching mode is set, then invalid translations are tagged
1486 * with domainid 0. Hence we need to pre-allocate it.
1488 if (cap_caching_mode(iommu->cap))
1489 set_bit(0, iommu->domain_ids);
1490 return 0;
1493 static void free_dmar_iommu(struct intel_iommu *iommu)
1495 struct dmar_domain *domain;
1496 int i;
1498 if ((iommu->domains) && (iommu->domain_ids)) {
1499 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1501 * Domain id 0 is reserved for invalid translation
1502 * if hardware supports caching mode.
1504 if (cap_caching_mode(iommu->cap) && i == 0)
1505 continue;
1507 domain = iommu->domains[i];
1508 clear_bit(i, iommu->domain_ids);
1509 if (domain_detach_iommu(domain, iommu) == 0 &&
1510 !domain_type_is_vm(domain))
1511 domain_exit(domain);
1515 if (iommu->gcmd & DMA_GCMD_TE)
1516 iommu_disable_translation(iommu);
1518 kfree(iommu->domains);
1519 kfree(iommu->domain_ids);
1520 iommu->domains = NULL;
1521 iommu->domain_ids = NULL;
1523 g_iommus[iommu->seq_id] = NULL;
1525 /* free context mapping */
1526 free_context_table(iommu);
1529 static struct dmar_domain *alloc_domain(int flags)
1531 /* domain id for virtual machine, it won't be set in context */
1532 static atomic_t vm_domid = ATOMIC_INIT(0);
1533 struct dmar_domain *domain;
1535 domain = alloc_domain_mem();
1536 if (!domain)
1537 return NULL;
1539 memset(domain, 0, sizeof(*domain));
1540 domain->nid = -1;
1541 domain->flags = flags;
1542 spin_lock_init(&domain->iommu_lock);
1543 INIT_LIST_HEAD(&domain->devices);
1544 if (flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1545 domain->id = atomic_inc_return(&vm_domid);
1547 return domain;
1550 static int __iommu_attach_domain(struct dmar_domain *domain,
1551 struct intel_iommu *iommu)
1553 int num;
1554 unsigned long ndomains;
1556 ndomains = cap_ndoms(iommu->cap);
1557 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1558 if (num < ndomains) {
1559 set_bit(num, iommu->domain_ids);
1560 iommu->domains[num] = domain;
1561 } else {
1562 num = -ENOSPC;
1565 return num;
1568 static int iommu_attach_domain(struct dmar_domain *domain,
1569 struct intel_iommu *iommu)
1571 int num;
1572 unsigned long flags;
1574 spin_lock_irqsave(&iommu->lock, flags);
1575 num = __iommu_attach_domain(domain, iommu);
1576 spin_unlock_irqrestore(&iommu->lock, flags);
1577 if (num < 0)
1578 pr_err("IOMMU: no free domain ids\n");
1580 return num;
1583 static int iommu_attach_vm_domain(struct dmar_domain *domain,
1584 struct intel_iommu *iommu)
1586 int num;
1587 unsigned long ndomains;
1589 ndomains = cap_ndoms(iommu->cap);
1590 for_each_set_bit(num, iommu->domain_ids, ndomains)
1591 if (iommu->domains[num] == domain)
1592 return num;
1594 return __iommu_attach_domain(domain, iommu);
1597 static void iommu_detach_domain(struct dmar_domain *domain,
1598 struct intel_iommu *iommu)
1600 unsigned long flags;
1601 int num, ndomains;
1603 spin_lock_irqsave(&iommu->lock, flags);
1604 if (domain_type_is_vm_or_si(domain)) {
1605 ndomains = cap_ndoms(iommu->cap);
1606 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1607 if (iommu->domains[num] == domain) {
1608 clear_bit(num, iommu->domain_ids);
1609 iommu->domains[num] = NULL;
1610 break;
1613 } else {
1614 clear_bit(domain->id, iommu->domain_ids);
1615 iommu->domains[domain->id] = NULL;
1617 spin_unlock_irqrestore(&iommu->lock, flags);
1620 static void domain_attach_iommu(struct dmar_domain *domain,
1621 struct intel_iommu *iommu)
1623 unsigned long flags;
1625 spin_lock_irqsave(&domain->iommu_lock, flags);
1626 if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1627 domain->iommu_count++;
1628 if (domain->iommu_count == 1)
1629 domain->nid = iommu->node;
1630 domain_update_iommu_cap(domain);
1632 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1635 static int domain_detach_iommu(struct dmar_domain *domain,
1636 struct intel_iommu *iommu)
1638 unsigned long flags;
1639 int count = INT_MAX;
1641 spin_lock_irqsave(&domain->iommu_lock, flags);
1642 if (test_and_clear_bit(iommu->seq_id, domain->iommu_bmp)) {
1643 count = --domain->iommu_count;
1644 domain_update_iommu_cap(domain);
1646 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1648 return count;
1651 static struct iova_domain reserved_iova_list;
1652 static struct lock_class_key reserved_rbtree_key;
1654 static int dmar_init_reserved_ranges(void)
1656 struct pci_dev *pdev = NULL;
1657 struct iova *iova;
1658 int i;
1660 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1662 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1663 &reserved_rbtree_key);
1665 /* IOAPIC ranges shouldn't be accessed by DMA */
1666 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1667 IOVA_PFN(IOAPIC_RANGE_END));
1668 if (!iova) {
1669 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1670 return -ENODEV;
1673 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1674 for_each_pci_dev(pdev) {
1675 struct resource *r;
1677 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1678 r = &pdev->resource[i];
1679 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1680 continue;
1681 iova = reserve_iova(&reserved_iova_list,
1682 IOVA_PFN(r->start),
1683 IOVA_PFN(r->end));
1684 if (!iova) {
1685 printk(KERN_ERR "Reserve iova failed\n");
1686 return -ENODEV;
1690 return 0;
1693 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1695 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1698 static inline int guestwidth_to_adjustwidth(int gaw)
1700 int agaw;
1701 int r = (gaw - 12) % 9;
1703 if (r == 0)
1704 agaw = gaw;
1705 else
1706 agaw = gaw + 9 - r;
1707 if (agaw > 64)
1708 agaw = 64;
1709 return agaw;
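/*
 * Worked example: guestwidth_to_adjustwidth() rounds a guest address
 * width up to the next 12 + 9*n boundary (a whole number of page-table
 * levels), capped at 64.  gaw = 48 gives r = (48 - 12) % 9 = 0, so
 * agaw = 48; gaw = 36 gives r = 6, so agaw = 36 + 9 - 6 = 39.
 */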
1712 static int domain_init(struct dmar_domain *domain, int guest_width)
1714 struct intel_iommu *iommu;
1715 int adjust_width, agaw;
1716 unsigned long sagaw;
1718 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1719 domain_reserve_special_ranges(domain);
1721 /* calculate AGAW */
1722 iommu = domain_get_iommu(domain);
1723 if (guest_width > cap_mgaw(iommu->cap))
1724 guest_width = cap_mgaw(iommu->cap);
1725 domain->gaw = guest_width;
1726 adjust_width = guestwidth_to_adjustwidth(guest_width);
1727 agaw = width_to_agaw(adjust_width);
1728 sagaw = cap_sagaw(iommu->cap);
1729 if (!test_bit(agaw, &sagaw)) {
1730 /* hardware doesn't support it, choose a bigger one */
1731 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1732 agaw = find_next_bit(&sagaw, 5, agaw);
1733 if (agaw >= 5)
1734 return -ENODEV;
1736 domain->agaw = agaw;
1738 if (ecap_coherent(iommu->ecap))
1739 domain->iommu_coherency = 1;
1740 else
1741 domain->iommu_coherency = 0;
1743 if (ecap_sc_support(iommu->ecap))
1744 domain->iommu_snooping = 1;
1745 else
1746 domain->iommu_snooping = 0;
1748 if (intel_iommu_superpage)
1749 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1750 else
1751 domain->iommu_superpage = 0;
1753 domain->nid = iommu->node;
1755 /* always allocate the top pgd */
1756 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1757 if (!domain->pgd)
1758 return -ENOMEM;
1759 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1760 return 0;
1763 static void domain_exit(struct dmar_domain *domain)
1765 struct dmar_drhd_unit *drhd;
1766 struct intel_iommu *iommu;
1767 struct page *freelist = NULL;
1769 /* Domain 0 is reserved, so don't process it */
1770 if (!domain)
1771 return;
1773 /* Flush any lazy unmaps that may reference this domain */
1774 if (!intel_iommu_strict)
1775 flush_unmaps_timeout(0);
1777 /* remove associated devices */
1778 domain_remove_dev_info(domain);
1780 /* destroy iovas */
1781 put_iova_domain(&domain->iovad);
1783 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1785 /* clear attached or cached domains */
1786 rcu_read_lock();
1787 for_each_active_iommu(iommu, drhd)
1788 if (domain_type_is_vm(domain) ||
1789 test_bit(iommu->seq_id, domain->iommu_bmp))
1790 iommu_detach_domain(domain, iommu);
1791 rcu_read_unlock();
1793 dma_free_pagelist(freelist);
1795 free_domain_mem(domain);
1798 static int domain_context_mapping_one(struct dmar_domain *domain,
1799 struct intel_iommu *iommu,
1800 u8 bus, u8 devfn, int translation)
1802 struct context_entry *context;
1803 unsigned long flags;
1804 struct dma_pte *pgd;
1805 int id;
1806 int agaw;
1807 struct device_domain_info *info = NULL;
1809 pr_debug("Set context mapping for %02x:%02x.%d\n",
1810 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1812 BUG_ON(!domain->pgd);
1813 BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1814 translation != CONTEXT_TT_MULTI_LEVEL);
1816 context = device_to_context_entry(iommu, bus, devfn);
1817 if (!context)
1818 return -ENOMEM;
1819 spin_lock_irqsave(&iommu->lock, flags);
1820 if (context_present(context)) {
1821 spin_unlock_irqrestore(&iommu->lock, flags);
1822 return 0;
1825 id = domain->id;
1826 pgd = domain->pgd;
1828 if (domain_type_is_vm_or_si(domain)) {
1829 if (domain_type_is_vm(domain)) {
1830 id = iommu_attach_vm_domain(domain, iommu);
1831 if (id < 0) {
1832 spin_unlock_irqrestore(&iommu->lock, flags);
1833 pr_err("IOMMU: no free domain ids\n");
1834 return -EFAULT;
1838 /* Skip top levels of page tables for
1839 * iommu which has less agaw than default.
1840 * Unnecessary for PT mode.
1842 if (translation != CONTEXT_TT_PASS_THROUGH) {
1843 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1844 pgd = phys_to_virt(dma_pte_addr(pgd));
1845 if (!dma_pte_present(pgd)) {
1846 spin_unlock_irqrestore(&iommu->lock, flags);
1847 return -ENOMEM;
1853 context_set_domain_id(context, id);
1855 if (translation != CONTEXT_TT_PASS_THROUGH) {
1856 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1857 translation = info ? CONTEXT_TT_DEV_IOTLB :
1858 CONTEXT_TT_MULTI_LEVEL;
1861 * In pass through mode, AW must be programmed to indicate the largest
1862 * AGAW value supported by hardware. And ASR is ignored by hardware.
1864 if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1865 context_set_address_width(context, iommu->msagaw);
1866 else {
1867 context_set_address_root(context, virt_to_phys(pgd));
1868 context_set_address_width(context, iommu->agaw);
1871 context_set_translation_type(context, translation);
1872 context_set_fault_enable(context);
1873 context_set_present(context);
1874 domain_flush_cache(domain, context, sizeof(*context));
1877 * It's a non-present to present mapping. If hardware doesn't cache
1878 * non-present entries we only need to flush the write-buffer. If it
1879 * _does_ cache non-present entries, then it does so in the special
1880 * domain #0, which we have to flush:
1882 if (cap_caching_mode(iommu->cap)) {
1883 iommu->flush.flush_context(iommu, 0,
1884 (((u16)bus) << 8) | devfn,
1885 DMA_CCMD_MASK_NOBIT,
1886 DMA_CCMD_DEVICE_INVL);
1887 iommu->flush.flush_iotlb(iommu, id, 0, 0, DMA_TLB_DSI_FLUSH);
1888 } else {
1889 iommu_flush_write_buffer(iommu);
1891 iommu_enable_dev_iotlb(info);
1892 spin_unlock_irqrestore(&iommu->lock, flags);
1894 domain_attach_iommu(domain, iommu);
1896 return 0;
1899 struct domain_context_mapping_data {
1900 struct dmar_domain *domain;
1901 struct intel_iommu *iommu;
1902 int translation;
1905 static int domain_context_mapping_cb(struct pci_dev *pdev,
1906 u16 alias, void *opaque)
1908 struct domain_context_mapping_data *data = opaque;
1910 return domain_context_mapping_one(data->domain, data->iommu,
1911 PCI_BUS_NUM(alias), alias & 0xff,
1912 data->translation);
1915 static int
1916 domain_context_mapping(struct dmar_domain *domain, struct device *dev,
1917 int translation)
1919 struct intel_iommu *iommu;
1920 u8 bus, devfn;
1921 struct domain_context_mapping_data data;
1923 iommu = device_to_iommu(dev, &bus, &devfn);
1924 if (!iommu)
1925 return -ENODEV;
1927 if (!dev_is_pci(dev))
1928 return domain_context_mapping_one(domain, iommu, bus, devfn,
1929 translation);
1931 data.domain = domain;
1932 data.iommu = iommu;
1933 data.translation = translation;
1935 return pci_for_each_dma_alias(to_pci_dev(dev),
1936 &domain_context_mapping_cb, &data);
1939 static int domain_context_mapped_cb(struct pci_dev *pdev,
1940 u16 alias, void *opaque)
1942 struct intel_iommu *iommu = opaque;
1944 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
1947 static int domain_context_mapped(struct device *dev)
1949 struct intel_iommu *iommu;
1950 u8 bus, devfn;
1952 iommu = device_to_iommu(dev, &bus, &devfn);
1953 if (!iommu)
1954 return -ENODEV;
1956 if (!dev_is_pci(dev))
1957 return device_context_mapped(iommu, bus, devfn);
1959 return !pci_for_each_dma_alias(to_pci_dev(dev),
1960 domain_context_mapped_cb, iommu);
1963 /* Returns a number of VTD pages, but aligned to MM page size */
1964 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1965 size_t size)
1967 host_addr &= ~PAGE_MASK;
1968 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
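/*
 * Worked example (assuming 4KiB MM pages, so PAGE_SHIFT == VTD_PAGE_SHIFT):
 * host_addr = 0x1800 and size = 0x1000 keep the in-page offset 0x800,
 * PAGE_ALIGN(0x800 + 0x1000) = 0x2000, and 0x2000 >> 12 = 2 VT-d pages.
 */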
1971 /* Return largest possible superpage level for a given mapping */
1972 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1973 unsigned long iov_pfn,
1974 unsigned long phy_pfn,
1975 unsigned long pages)
1977 int support, level = 1;
1978 unsigned long pfnmerge;
1980 support = domain->iommu_superpage;
1982 /* To use a large page, the virtual *and* physical addresses
1983 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1984 of them will mean we have to use smaller pages. So just
1985 merge them and check both at once. */
1986 pfnmerge = iov_pfn | phy_pfn;
1988 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1989 pages >>= VTD_STRIDE_SHIFT;
1990 if (!pages)
1991 break;
1992 pfnmerge >>= VTD_STRIDE_SHIFT;
1993 level++;
1994 support--;
1996 return level;
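/*
 * Worked example: with domain->iommu_superpage >= 1 and both iov_pfn and
 * phy_pfn multiples of 512 (i.e. 2MiB aligned), a request for pages >= 512
 * keeps the low nine bits of pfnmerge clear, so the loop promotes level
 * to 2 and __domain_mapping() can use a 2MiB superpage PTE for that chunk
 * (unless an even larger page also fits).
 */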
1999 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2000 struct scatterlist *sg, unsigned long phys_pfn,
2001 unsigned long nr_pages, int prot)
2003 struct dma_pte *first_pte = NULL, *pte = NULL;
2004 phys_addr_t uninitialized_var(pteval);
2005 unsigned long sg_res = 0;
2006 unsigned int largepage_lvl = 0;
2007 unsigned long lvl_pages = 0;
2009 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2011 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2012 return -EINVAL;
2014 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2016 if (!sg) {
2017 sg_res = nr_pages;
2018 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2021 while (nr_pages > 0) {
2022 uint64_t tmp;
2024 if (!sg_res) {
2025 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2027 sg_res = aligned_nrpages(sg->offset, sg->length);
2028 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2029 sg->dma_length = sg->length;
2030 pteval = (sg_phys(sg) - pgoff) | prot;
2031 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2034 if (!pte) {
2035 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2037 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2038 if (!pte)
2039 return -ENOMEM;
2040 /* It is a large page */
2041 if (largepage_lvl > 1) {
2042 pteval |= DMA_PTE_LARGE_PAGE;
2043 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2045 * Ensure that old small page tables are
2046 * removed to make room for superpage,
2047 * if they exist.
2049 dma_pte_free_pagetable(domain, iov_pfn,
2050 iov_pfn + lvl_pages - 1);
2051 } else {
2052 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2056 /* We don't need lock here, nobody else
2057 * touches the iova range
2059 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2060 if (tmp) {
2061 static int dumps = 5;
2062 printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2063 iov_pfn, tmp, (unsigned long long)pteval);
2064 if (dumps) {
2065 dumps--;
2066 debug_dma_dump_mappings(NULL);
2068 WARN_ON(1);
2071 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2073 BUG_ON(nr_pages < lvl_pages);
2074 BUG_ON(sg_res < lvl_pages);
2076 nr_pages -= lvl_pages;
2077 iov_pfn += lvl_pages;
2078 phys_pfn += lvl_pages;
2079 pteval += lvl_pages * VTD_PAGE_SIZE;
2080 sg_res -= lvl_pages;
2082 /* If the next PTE would be the first in a new page, then we
2083 need to flush the cache on the entries we've just written.
2084 And then we'll need to recalculate 'pte', so clear it and
2085 let it get set again in the if (!pte) block above.
2087 If we're done (!nr_pages) we need to flush the cache too.
2089 Also if we've been setting superpages, we may need to
2090 recalculate 'pte' and switch back to smaller pages for the
2091 end of the mapping, if the trailing size is not enough to
2092 use another superpage (i.e. sg_res < lvl_pages). */
2093 pte++;
2094 if (!nr_pages || first_pte_in_page(pte) ||
2095 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2096 domain_flush_cache(domain, first_pte,
2097 (void *)pte - (void *)first_pte);
2098 pte = NULL;
2101 if (!sg_res && nr_pages)
2102 sg = sg_next(sg);
2104 return 0;
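/*
 * __domain_mapping() is driven in one of two modes, selected by the two thin
 * wrappers below: domain_sg_mapping() walks a scatterlist and computes the
 * physical address of each chunk as it goes, while domain_pfn_mapping() maps
 * a single physically contiguous range starting at phys_pfn.  In both cases
 * the IOVA range [iov_pfn, iov_pfn + nr_pages) must already have been
 * reserved by the caller.
 */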
2107 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2108 struct scatterlist *sg, unsigned long nr_pages,
2109 int prot)
2111 return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2114 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2115 unsigned long phys_pfn, unsigned long nr_pages,
2116 int prot)
2118 return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2121 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
2123 if (!iommu)
2124 return;
2126 clear_context_table(iommu, bus, devfn);
2127 iommu->flush.flush_context(iommu, 0, 0, 0,
2128 DMA_CCMD_GLOBAL_INVL);
2129 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2132 static inline void unlink_domain_info(struct device_domain_info *info)
2134 assert_spin_locked(&device_domain_lock);
2135 list_del(&info->link);
2136 list_del(&info->global);
2137 if (info->dev)
2138 info->dev->archdata.iommu = NULL;
2141 static void domain_remove_dev_info(struct dmar_domain *domain)
2143 struct device_domain_info *info, *tmp;
2144 unsigned long flags;
2146 spin_lock_irqsave(&device_domain_lock, flags);
2147 list_for_each_entry_safe(info, tmp, &domain->devices, link) {
2148 unlink_domain_info(info);
2149 spin_unlock_irqrestore(&device_domain_lock, flags);
2151 iommu_disable_dev_iotlb(info);
2152 iommu_detach_dev(info->iommu, info->bus, info->devfn);
2154 if (domain_type_is_vm(domain)) {
2155 iommu_detach_dependent_devices(info->iommu, info->dev);
2156 domain_detach_iommu(domain, info->iommu);
2159 free_devinfo_mem(info);
2160 spin_lock_irqsave(&device_domain_lock, flags);
2162 spin_unlock_irqrestore(&device_domain_lock, flags);
2165 /*
2166 * find_domain
2167 * Note: struct device->archdata.iommu stores the device_domain_info
2168 */
2169 static struct dmar_domain *find_domain(struct device *dev)
2171 struct device_domain_info *info;
2173 /* No lock here, assumes no domain exit in normal case */
2174 info = dev->archdata.iommu;
2175 if (info)
2176 return info->domain;
2177 return NULL;
2180 static inline struct device_domain_info *
2181 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2183 struct device_domain_info *info;
2185 list_for_each_entry(info, &device_domain_list, global)
2186 if (info->iommu->segment == segment && info->bus == bus &&
2187 info->devfn == devfn)
2188 return info;
2190 return NULL;
2193 static struct dmar_domain *dmar_insert_dev_info(struct intel_iommu *iommu,
2194 int bus, int devfn,
2195 struct device *dev,
2196 struct dmar_domain *domain)
2198 struct dmar_domain *found = NULL;
2199 struct device_domain_info *info;
2200 unsigned long flags;
2202 info = alloc_devinfo_mem();
2203 if (!info)
2204 return NULL;
2206 info->bus = bus;
2207 info->devfn = devfn;
2208 info->dev = dev;
2209 info->domain = domain;
2210 info->iommu = iommu;
2212 spin_lock_irqsave(&device_domain_lock, flags);
2213 if (dev)
2214 found = find_domain(dev);
2215 else {
2216 struct device_domain_info *info2;
2217 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2218 if (info2)
2219 found = info2->domain;
2221 if (found) {
2222 spin_unlock_irqrestore(&device_domain_lock, flags);
2223 free_devinfo_mem(info);
2224 /* Caller must free the original domain */
2225 return found;
2228 list_add(&info->link, &domain->devices);
2229 list_add(&info->global, &device_domain_list);
2230 if (dev)
2231 dev->archdata.iommu = info;
2232 spin_unlock_irqrestore(&device_domain_lock, flags);
2234 return domain;
2237 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2239 *(u16 *)opaque = alias;
2240 return 0;
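/*
 * For PCI devices, the domain lookup and creation below is keyed on the last
 * DMA alias reported by pci_for_each_dma_alias() (the value that
 * get_last_alias() above leaves in *opaque), so that a device and the bridge
 * or quirk alias it DMAs through end up sharing one dmar_domain.  The alias
 * entry is registered with a NULL struct device, which is why
 * dmar_insert_dev_info() also supports lookup purely by segment/bus/devfn.
 */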
2243 /* domain is initialized */
2244 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2246 struct dmar_domain *domain, *tmp;
2247 struct intel_iommu *iommu;
2248 struct device_domain_info *info;
2249 u16 dma_alias;
2250 unsigned long flags;
2251 u8 bus, devfn;
2253 domain = find_domain(dev);
2254 if (domain)
2255 return domain;
2257 iommu = device_to_iommu(dev, &bus, &devfn);
2258 if (!iommu)
2259 return NULL;
2261 if (dev_is_pci(dev)) {
2262 struct pci_dev *pdev = to_pci_dev(dev);
2264 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2266 spin_lock_irqsave(&device_domain_lock, flags);
2267 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2268 PCI_BUS_NUM(dma_alias),
2269 dma_alias & 0xff);
2270 if (info) {
2271 iommu = info->iommu;
2272 domain = info->domain;
2274 spin_unlock_irqrestore(&device_domain_lock, flags);
2276 /* DMA alias already has a domain, use it */
2277 if (info)
2278 goto found_domain;
2281 /* Allocate and initialize new domain for the device */
2282 domain = alloc_domain(0);
2283 if (!domain)
2284 return NULL;
2285 domain->id = iommu_attach_domain(domain, iommu);
2286 if (domain->id < 0) {
2287 free_domain_mem(domain);
2288 return NULL;
2290 domain_attach_iommu(domain, iommu);
2291 if (domain_init(domain, gaw)) {
2292 domain_exit(domain);
2293 return NULL;
2296 /* register PCI DMA alias device */
2297 if (dev_is_pci(dev)) {
2298 tmp = dmar_insert_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2299 dma_alias & 0xff, NULL, domain);
2301 if (!tmp || tmp != domain) {
2302 domain_exit(domain);
2303 domain = tmp;
2306 if (!domain)
2307 return NULL;
2310 found_domain:
2311 tmp = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2313 if (!tmp || tmp != domain) {
2314 domain_exit(domain);
2315 domain = tmp;
2318 return domain;
2321 static int iommu_identity_mapping;
2322 #define IDENTMAP_ALL 1
2323 #define IDENTMAP_GFX 2
2324 #define IDENTMAP_AZALIA 4
2326 static int iommu_domain_identity_map(struct dmar_domain *domain,
2327 unsigned long long start,
2328 unsigned long long end)
2330 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2331 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2333 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2334 dma_to_mm_pfn(last_vpfn))) {
2335 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2336 return -ENOMEM;
2339 pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2340 start, end, domain->id);
2341 /*
2342 * The RMRR range might overlap with a physical memory range,
2343 * so clear it first.
2344 */
2345 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2347 return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2348 last_vpfn - first_vpfn + 1,
2349 DMA_PTE_READ|DMA_PTE_WRITE);
2352 static int iommu_prepare_identity_map(struct device *dev,
2353 unsigned long long start,
2354 unsigned long long end)
2356 struct dmar_domain *domain;
2357 int ret;
2359 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2360 if (!domain)
2361 return -ENOMEM;
2363 /* For _hardware_ passthrough, don't bother. But for software
2364 passthrough, we do it anyway -- it may indicate a memory
2365 range which is reserved in E820 and so didn't get set
2366 up to start with in si_domain */
2367 if (domain == si_domain && hw_pass_through) {
2368 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2369 dev_name(dev), start, end);
2370 return 0;
2373 printk(KERN_INFO
2374 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2375 dev_name(dev), start, end);
2377 if (end < start) {
2378 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2379 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2380 dmi_get_system_info(DMI_BIOS_VENDOR),
2381 dmi_get_system_info(DMI_BIOS_VERSION),
2382 dmi_get_system_info(DMI_PRODUCT_VERSION));
2383 ret = -EIO;
2384 goto error;
2387 if (end >> agaw_to_width(domain->agaw)) {
2388 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2389 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2390 agaw_to_width(domain->agaw),
2391 dmi_get_system_info(DMI_BIOS_VENDOR),
2392 dmi_get_system_info(DMI_BIOS_VERSION),
2393 dmi_get_system_info(DMI_PRODUCT_VERSION));
2394 ret = -EIO;
2395 goto error;
2398 ret = iommu_domain_identity_map(domain, start, end);
2399 if (ret)
2400 goto error;
2402 /* context entry init */
2403 ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2404 if (ret)
2405 goto error;
2407 return 0;
2409 error:
2410 domain_exit(domain);
2411 return ret;
2414 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2415 struct device *dev)
2417 if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2418 return 0;
2419 return iommu_prepare_identity_map(dev, rmrr->base_address,
2420 rmrr->end_address);
2423 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2424 static inline void iommu_prepare_isa(void)
2426 struct pci_dev *pdev;
2427 int ret;
2429 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2430 if (!pdev)
2431 return;
2433 printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2434 ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2436 if (ret)
2437 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2438 "floppy might not work\n");
2440 pci_dev_put(pdev);
2442 #else
2443 static inline void iommu_prepare_isa(void)
2445 return;
2447 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
2449 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2451 static int __init si_domain_init(int hw)
2453 struct dmar_drhd_unit *drhd;
2454 struct intel_iommu *iommu;
2455 int nid, ret = 0;
2456 bool first = true;
2458 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2459 if (!si_domain)
2460 return -EFAULT;
2462 for_each_active_iommu(iommu, drhd) {
2463 ret = iommu_attach_domain(si_domain, iommu);
2464 if (ret < 0) {
2465 domain_exit(si_domain);
2466 return -EFAULT;
2467 } else if (first) {
2468 si_domain->id = ret;
2469 first = false;
2470 } else if (si_domain->id != ret) {
2471 domain_exit(si_domain);
2472 return -EFAULT;
2474 domain_attach_iommu(si_domain, iommu);
2477 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2478 domain_exit(si_domain);
2479 return -EFAULT;
2482 pr_debug("IOMMU: identity mapping domain is domain %d\n",
2483 si_domain->id);
2485 if (hw)
2486 return 0;
2488 for_each_online_node(nid) {
2489 unsigned long start_pfn, end_pfn;
2490 int i;
2492 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2493 ret = iommu_domain_identity_map(si_domain,
2494 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2495 if (ret)
2496 return ret;
2500 return 0;
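/*
 * si_domain is the single "static identity" domain shared by every device
 * that runs in passthrough/identity mode.  With hardware passthrough (hw=1)
 * no page tables are needed at all; otherwise the loop above builds a 1:1
 * map of every usable RAM range, and the memory-hotplug notifier further
 * down keeps that map in sync as memory is onlined and offlined.
 */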
2503 static int identity_mapping(struct device *dev)
2505 struct device_domain_info *info;
2507 if (likely(!iommu_identity_mapping))
2508 return 0;
2510 info = dev->archdata.iommu;
2511 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2512 return (info->domain == si_domain);
2514 return 0;
2517 static int domain_add_dev_info(struct dmar_domain *domain,
2518 struct device *dev, int translation)
2520 struct dmar_domain *ndomain;
2521 struct intel_iommu *iommu;
2522 u8 bus, devfn;
2523 int ret;
2525 iommu = device_to_iommu(dev, &bus, &devfn);
2526 if (!iommu)
2527 return -ENODEV;
2529 ndomain = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2530 if (ndomain != domain)
2531 return -EBUSY;
2533 ret = domain_context_mapping(domain, dev, translation);
2534 if (ret) {
2535 domain_remove_one_dev_info(domain, dev);
2536 return ret;
2539 return 0;
2542 static bool device_has_rmrr(struct device *dev)
2544 struct dmar_rmrr_unit *rmrr;
2545 struct device *tmp;
2546 int i;
2548 rcu_read_lock();
2549 for_each_rmrr_units(rmrr) {
2550 /*
2551 * Return TRUE if this RMRR contains the device that
2552 * is passed in.
2553 */
2554 for_each_active_dev_scope(rmrr->devices,
2555 rmrr->devices_cnt, i, tmp)
2556 if (tmp == dev) {
2557 rcu_read_unlock();
2558 return true;
2561 rcu_read_unlock();
2562 return false;
2565 /*
2566 * There are a couple cases where we need to restrict the functionality of
2567 * devices associated with RMRRs. The first is when evaluating a device for
2568 * identity mapping because problems exist when devices are moved in and out
2569 * of domains and their respective RMRR information is lost. This means that
2570 * a device with associated RMRRs will never be in a "passthrough" domain.
2571 * The second is use of the device through the IOMMU API. This interface
2572 * expects to have full control of the IOVA space for the device. We cannot
2573 * satisfy both the requirement that RMRR access is maintained and have an
2574 * unencumbered IOVA space. We also have no ability to quiesce the device's
2575 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2576 * We therefore prevent devices associated with an RMRR from participating in
2577 * the IOMMU API, which eliminates them from device assignment.
2579 * In both cases we assume that PCI USB devices with RMRRs have them largely
2580 * for historical reasons and that the RMRR space is not actively used post
2581 * boot. This exclusion may change if vendors begin to abuse it.
2583 * The same exception is made for graphics devices, with the requirement that
2584 * any use of the RMRR regions will be torn down before assigning the device
2585 * to a guest.
2586 */
2587 static bool device_is_rmrr_locked(struct device *dev)
2589 if (!device_has_rmrr(dev))
2590 return false;
2592 if (dev_is_pci(dev)) {
2593 struct pci_dev *pdev = to_pci_dev(dev);
2595 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2596 return false;
2599 return true;
2602 static int iommu_should_identity_map(struct device *dev, int startup)
2605 if (dev_is_pci(dev)) {
2606 struct pci_dev *pdev = to_pci_dev(dev);
2608 if (device_is_rmrr_locked(dev))
2609 return 0;
2611 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2612 return 1;
2614 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2615 return 1;
2617 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2618 return 0;
2620 /*
2621 * We want to start off with all devices in the 1:1 domain, and
2622 * take them out later if we find they can't access all of memory.
2624 * However, we can't do this for PCI devices behind bridges,
2625 * because all PCI devices behind the same bridge will end up
2626 * with the same source-id on their transactions.
2628 * Practically speaking, we can't change things around for these
2629 * devices at run-time, because we can't be sure there'll be no
2630 * DMA transactions in flight for any of their siblings.
2632 * So PCI devices (unless they're on the root bus) as well as
2633 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2634 * the 1:1 domain, just in _case_ one of their siblings turns out
2635 * not to be able to map all of memory.
2636 */
2637 if (!pci_is_pcie(pdev)) {
2638 if (!pci_is_root_bus(pdev->bus))
2639 return 0;
2640 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2641 return 0;
2642 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2643 return 0;
2644 } else {
2645 if (device_has_rmrr(dev))
2646 return 0;
2649 /*
2650 * At boot time, we don't yet know if devices will be 64-bit capable.
2651 * Assume that they will — if they turn out not to be, then we can
2652 * take them out of the 1:1 domain later.
2653 */
2654 if (!startup) {
2655 /*
2656 * If the device's dma_mask is less than the system's memory
2657 * size then this is not a candidate for identity mapping.
2658 */
2659 u64 dma_mask = *dev->dma_mask;
2661 if (dev->coherent_dma_mask &&
2662 dev->coherent_dma_mask < dma_mask)
2663 dma_mask = dev->coherent_dma_mask;
2665 return dma_mask >= dma_get_required_mask(dev);
2668 return 1;
2671 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2673 int ret;
2675 if (!iommu_should_identity_map(dev, 1))
2676 return 0;
2678 ret = domain_add_dev_info(si_domain, dev,
2679 hw ? CONTEXT_TT_PASS_THROUGH :
2680 CONTEXT_TT_MULTI_LEVEL);
2681 if (!ret)
2682 pr_info("IOMMU: %s identity mapping for device %s\n",
2683 hw ? "hardware" : "software", dev_name(dev));
2684 else if (ret == -ENODEV)
2685 /* device not associated with an iommu */
2686 ret = 0;
2688 return ret;
2692 static int __init iommu_prepare_static_identity_mapping(int hw)
2694 struct pci_dev *pdev = NULL;
2695 struct dmar_drhd_unit *drhd;
2696 struct intel_iommu *iommu;
2697 struct device *dev;
2698 int i;
2699 int ret = 0;
2701 ret = si_domain_init(hw);
2702 if (ret)
2703 return -EFAULT;
2705 for_each_pci_dev(pdev) {
2706 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2707 if (ret)
2708 return ret;
2711 for_each_active_iommu(iommu, drhd)
2712 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2713 struct acpi_device_physical_node *pn;
2714 struct acpi_device *adev;
2716 if (dev->bus != &acpi_bus_type)
2717 continue;
2719 adev = to_acpi_device(dev);
2720 mutex_lock(&adev->physical_node_lock);
2721 list_for_each_entry(pn, &adev->physical_node_list, node) {
2722 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2723 if (ret)
2724 break;
2726 mutex_unlock(&adev->physical_node_lock);
2727 if (ret)
2728 return ret;
2731 return 0;
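/*
 * init_dmars() does the heavy lifting behind intel_iommu_init(): it allocates
 * the per-IOMMU root and context tables, selects queued versus register-based
 * invalidation, sets up the static identity and RMRR/ISA unity maps, and
 * finally enables translation on every active IOMMU.
 */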
2734 static int __init init_dmars(void)
2736 struct dmar_drhd_unit *drhd;
2737 struct dmar_rmrr_unit *rmrr;
2738 struct device *dev;
2739 struct intel_iommu *iommu;
2740 int i, ret;
2742 /*
2743 * for each drhd
2744 * allocate root
2745 * initialize and program root entry to not present
2746 * endfor
2747 */
2748 for_each_drhd_unit(drhd) {
2749 /*
2750 * lock not needed as this is only incremented in the single
2751 * threaded kernel __init code path; all other accesses are
2752 * read only
2753 */
2754 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2755 g_num_of_iommus++;
2756 continue;
2758 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2759 IOMMU_UNITS_SUPPORTED);
2762 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2763 GFP_KERNEL);
2764 if (!g_iommus) {
2765 printk(KERN_ERR "Allocating global iommu array failed\n");
2766 ret = -ENOMEM;
2767 goto error;
2770 deferred_flush = kzalloc(g_num_of_iommus *
2771 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2772 if (!deferred_flush) {
2773 ret = -ENOMEM;
2774 goto free_g_iommus;
2777 for_each_active_iommu(iommu, drhd) {
2778 g_iommus[iommu->seq_id] = iommu;
2780 ret = iommu_init_domains(iommu);
2781 if (ret)
2782 goto free_iommu;
2784 /*
2785 * TBD:
2786 * we could share the same root & context tables
2787 * among all IOMMUs. Need to split it later.
2788 */
2789 ret = iommu_alloc_root_entry(iommu);
2790 if (ret) {
2791 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2792 goto free_iommu;
2794 if (!ecap_pass_through(iommu->ecap))
2795 hw_pass_through = 0;
2798 /*
2799 * Start from the sane iommu hardware state.
2800 */
2801 for_each_active_iommu(iommu, drhd) {
2802 /*
2803 * If the queued invalidation is already initialized by us
2804 * (for example, while enabling interrupt-remapping) then
2805 * we got the things already rolling from a sane state.
2806 */
2807 if (iommu->qi)
2808 continue;
2810 /*
2811 * Clear any previous faults.
2812 */
2813 dmar_fault(-1, iommu);
2814 /*
2815 * Disable queued invalidation if supported and already enabled
2816 * before OS handover.
2817 */
2818 dmar_disable_qi(iommu);
2821 for_each_active_iommu(iommu, drhd) {
2822 if (dmar_enable_qi(iommu)) {
2823 /*
2824 * Queued Invalidate not enabled, use Register Based
2825 * Invalidate
2826 */
2827 iommu->flush.flush_context = __iommu_flush_context;
2828 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2829 printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2830 "invalidation\n",
2831 iommu->seq_id,
2832 (unsigned long long)drhd->reg_base_addr);
2833 } else {
2834 iommu->flush.flush_context = qi_flush_context;
2835 iommu->flush.flush_iotlb = qi_flush_iotlb;
2836 printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2837 "invalidation\n",
2838 iommu->seq_id,
2839 (unsigned long long)drhd->reg_base_addr);
2843 if (iommu_pass_through)
2844 iommu_identity_mapping |= IDENTMAP_ALL;
2846 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2847 iommu_identity_mapping |= IDENTMAP_GFX;
2848 #endif
2850 check_tylersburg_isoch();
2852 /*
2853 * If pass through is not set or not enabled, set up context entries for
2854 * identity mappings for rmrr, gfx, and isa, and may fall back to static
2855 * identity mapping if iommu_identity_mapping is set.
2856 */
2857 if (iommu_identity_mapping) {
2858 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2859 if (ret) {
2860 printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2861 goto free_iommu;
2864 /*
2865 * For each rmrr
2866 * for each dev attached to rmrr
2867 * do
2868 * locate drhd for dev, alloc domain for dev
2869 * allocate free domain
2870 * allocate page table entries for rmrr
2871 * if context not allocated for bus
2872 * allocate and init context
2873 * set present in root table for this bus
2874 * init context with domain, translation etc
2875 * endfor
2876 * endfor
2877 */
2878 printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2879 for_each_rmrr_units(rmrr) {
2880 /* some BIOSes list non-existent devices in the DMAR table. */
2881 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2882 i, dev) {
2883 ret = iommu_prepare_rmrr_dev(rmrr, dev);
2884 if (ret)
2885 printk(KERN_ERR
2886 "IOMMU: mapping reserved region failed\n");
2890 iommu_prepare_isa();
2892 /*
2893 * for each drhd
2894 * enable fault log
2895 * global invalidate context cache
2896 * global invalidate iotlb
2897 * enable translation
2898 */
2899 for_each_iommu(iommu, drhd) {
2900 if (drhd->ignored) {
2901 /*
2902 * we always have to disable PMRs or DMA may fail on
2903 * this device
2904 */
2905 if (force_on)
2906 iommu_disable_protect_mem_regions(iommu);
2907 continue;
2910 iommu_flush_write_buffer(iommu);
2912 ret = dmar_set_interrupt(iommu);
2913 if (ret)
2914 goto free_iommu;
2916 iommu_set_root_entry(iommu);
2918 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2919 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2920 iommu_enable_translation(iommu);
2921 iommu_disable_protect_mem_regions(iommu);
2924 return 0;
2926 free_iommu:
2927 for_each_active_iommu(iommu, drhd)
2928 free_dmar_iommu(iommu);
2929 kfree(deferred_flush);
2930 free_g_iommus:
2931 kfree(g_iommus);
2932 error:
2933 return ret;
2936 /* This takes a number of _MM_ pages, not VTD pages */
2937 static struct iova *intel_alloc_iova(struct device *dev,
2938 struct dmar_domain *domain,
2939 unsigned long nrpages, uint64_t dma_mask)
2941 struct iova *iova = NULL;
2943 /* Restrict dma_mask to the width that the iommu can handle */
2944 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2946 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2947 /*
2948 * First try to allocate an io virtual address in
2949 * DMA_BIT_MASK(32) and if that fails then try allocating
2950 * from higher range
2951 */
2952 iova = alloc_iova(&domain->iovad, nrpages,
2953 IOVA_PFN(DMA_BIT_MASK(32)), 1);
2954 if (iova)
2955 return iova;
2957 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2958 if (unlikely(!iova)) {
2959 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2960 nrpages, dev_name(dev));
2961 return NULL;
2964 return iova;
2967 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
2969 struct dmar_domain *domain;
2970 int ret;
2972 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2973 if (!domain) {
2974 printk(KERN_ERR "Allocating domain for %s failed",
2975 dev_name(dev));
2976 return NULL;
2979 /* make sure context mapping is ok */
2980 if (unlikely(!domain_context_mapped(dev))) {
2981 ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2982 if (ret) {
2983 printk(KERN_ERR "Domain context map for %s failed",
2984 dev_name(dev));
2985 return NULL;
2989 return domain;
2992 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
2994 struct device_domain_info *info;
2996 /* No lock here, assumes no domain exit in normal case */
2997 info = dev->archdata.iommu;
2998 if (likely(info))
2999 return info->domain;
3001 return __get_valid_domain_for_dev(dev);
3004 static int iommu_dummy(struct device *dev)
3006 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
3009 /* Check if the dev needs to go through the non-identity map and unmap process. */
3010 static int iommu_no_mapping(struct device *dev)
3012 int found;
3014 if (iommu_dummy(dev))
3015 return 1;
3017 if (!iommu_identity_mapping)
3018 return 0;
3020 found = identity_mapping(dev);
3021 if (found) {
3022 if (iommu_should_identity_map(dev, 0))
3023 return 1;
3024 else {
3025 /*
3026 * 32 bit DMA device is removed from si_domain and falls back
3027 * to non-identity mapping.
3028 */
3029 domain_remove_one_dev_info(si_domain, dev);
3030 printk(KERN_INFO "32bit %s uses non-identity mapping\n",
3031 dev_name(dev));
3032 return 0;
3034 } else {
3035 /*
3036 * In case of a detached 64 bit DMA device from vm, the device
3037 * is put into si_domain for identity mapping.
3038 */
3039 if (iommu_should_identity_map(dev, 0)) {
3040 int ret;
3041 ret = domain_add_dev_info(si_domain, dev,
3042 hw_pass_through ?
3043 CONTEXT_TT_PASS_THROUGH :
3044 CONTEXT_TT_MULTI_LEVEL);
3045 if (!ret) {
3046 printk(KERN_INFO "64bit %s uses identity mapping\n",
3047 dev_name(dev));
3048 return 1;
3053 return 0;
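/*
 * Note on the return value above: 1 means the IOMMU is bypassed for this
 * device (it carries DUMMY_DEVICE_DOMAIN_INFO, or it is kept in or newly
 * added to si_domain); 0 means the caller must go through the normal
 * map/unmap path.  The function may migrate a device into or out of
 * si_domain as a side effect, based on its DMA mask.
 */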
3056 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3057 size_t size, int dir, u64 dma_mask)
3059 struct dmar_domain *domain;
3060 phys_addr_t start_paddr;
3061 struct iova *iova;
3062 int prot = 0;
3063 int ret;
3064 struct intel_iommu *iommu;
3065 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3067 BUG_ON(dir == DMA_NONE);
3069 if (iommu_no_mapping(dev))
3070 return paddr;
3072 domain = get_valid_domain_for_dev(dev);
3073 if (!domain)
3074 return 0;
3076 iommu = domain_get_iommu(domain);
3077 size = aligned_nrpages(paddr, size);
3079 iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3080 if (!iova)
3081 goto error;
3083 /*
3084 * Check if DMAR supports zero-length reads on write only
3085 * mappings.
3086 */
3087 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3088 !cap_zlr(iommu->cap))
3089 prot |= DMA_PTE_READ;
3090 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3091 prot |= DMA_PTE_WRITE;
3092 /*
3093 * paddr - (paddr + size) might be a partial page; we should map the
3094 * whole page. Note: if two parts of one page are separately mapped, we
3095 * might have two guest_addr mappings to the same host paddr, but this
3096 * is not a big problem
3097 */
3098 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
3099 mm_to_dma_pfn(paddr_pfn), size, prot);
3100 if (ret)
3101 goto error;
3103 /* it's a non-present to present mapping. Only flush if caching mode */
3104 if (cap_caching_mode(iommu->cap))
3105 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 0, 1);
3106 else
3107 iommu_flush_write_buffer(iommu);
3109 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
3110 start_paddr += paddr & ~PAGE_MASK;
3111 return start_paddr;
3113 error:
3114 if (iova)
3115 __free_iova(&domain->iovad, iova);
3116 printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
3117 dev_name(dev), size, (unsigned long long)paddr, dir);
3118 return 0;
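/*
 * On success __intel_map_single() returns the bus address the device should
 * use: the allocated IOVA page range plus the sub-page offset of paddr.  A
 * return value of 0 signals failure, which is what intel_mapping_error()
 * further down checks for.
 */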
3121 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3122 unsigned long offset, size_t size,
3123 enum dma_data_direction dir,
3124 struct dma_attrs *attrs)
3126 return __intel_map_single(dev, page_to_phys(page) + offset, size,
3127 dir, *dev->dma_mask);
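/*
 * Unmaps are not flushed synchronously by default.  flush_unmaps() below
 * drains a small per-IOMMU batch of deferred IOTLB invalidations and IOVA
 * frees; add_unmap() queues entries into that batch and arms a 10ms timer,
 * or flushes immediately once HIGH_WATER_MARK entries have accumulated.
 * Booting with intel_iommu=strict bypasses this batching.
 */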
3130 static void flush_unmaps(void)
3132 int i, j;
3134 timer_on = 0;
3136 /* just flush them all */
3137 for (i = 0; i < g_num_of_iommus; i++) {
3138 struct intel_iommu *iommu = g_iommus[i];
3139 if (!iommu)
3140 continue;
3142 if (!deferred_flush[i].next)
3143 continue;
3145 /* In caching mode, global flushes turn emulation expensive */
3146 if (!cap_caching_mode(iommu->cap))
3147 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3148 DMA_TLB_GLOBAL_FLUSH);
3149 for (j = 0; j < deferred_flush[i].next; j++) {
3150 unsigned long mask;
3151 struct iova *iova = deferred_flush[i].iova[j];
3152 struct dmar_domain *domain = deferred_flush[i].domain[j];
3154 /* On real hardware multiple invalidations are expensive */
3155 if (cap_caching_mode(iommu->cap))
3156 iommu_flush_iotlb_psi(iommu, domain->id,
3157 iova->pfn_lo, iova_size(iova),
3158 !deferred_flush[i].freelist[j], 0);
3159 else {
3160 mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
3161 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3162 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3164 __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3165 if (deferred_flush[i].freelist[j])
3166 dma_free_pagelist(deferred_flush[i].freelist[j]);
3168 deferred_flush[i].next = 0;
3171 list_size = 0;
3174 static void flush_unmaps_timeout(unsigned long data)
3176 unsigned long flags;
3178 spin_lock_irqsave(&async_umap_flush_lock, flags);
3179 flush_unmaps();
3180 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3183 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3185 unsigned long flags;
3186 int next, iommu_id;
3187 struct intel_iommu *iommu;
3189 spin_lock_irqsave(&async_umap_flush_lock, flags);
3190 if (list_size == HIGH_WATER_MARK)
3191 flush_unmaps();
3193 iommu = domain_get_iommu(dom);
3194 iommu_id = iommu->seq_id;
3196 next = deferred_flush[iommu_id].next;
3197 deferred_flush[iommu_id].domain[next] = dom;
3198 deferred_flush[iommu_id].iova[next] = iova;
3199 deferred_flush[iommu_id].freelist[next] = freelist;
3200 deferred_flush[iommu_id].next++;
3202 if (!timer_on) {
3203 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3204 timer_on = 1;
3206 list_size++;
3207 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3210 static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
3212 struct dmar_domain *domain;
3213 unsigned long start_pfn, last_pfn;
3214 struct iova *iova;
3215 struct intel_iommu *iommu;
3216 struct page *freelist;
3218 if (iommu_no_mapping(dev))
3219 return;
3221 domain = find_domain(dev);
3222 BUG_ON(!domain);
3224 iommu = domain_get_iommu(domain);
3226 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3227 if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3228 (unsigned long long)dev_addr))
3229 return;
3231 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3232 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3234 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3235 dev_name(dev), start_pfn, last_pfn);
3237 freelist = domain_unmap(domain, start_pfn, last_pfn);
3239 if (intel_iommu_strict) {
3240 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3241 last_pfn - start_pfn + 1, !freelist, 0);
3242 /* free iova */
3243 __free_iova(&domain->iovad, iova);
3244 dma_free_pagelist(freelist);
3245 } else {
3246 add_unmap(domain, iova, freelist);
3247 /*
3248 * queue up the release of the unmap to save the 1/6th of the
3249 * cpu used up by the iotlb flush operation...
3250 */
3254 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3255 size_t size, enum dma_data_direction dir,
3256 struct dma_attrs *attrs)
3258 intel_unmap(dev, dev_addr);
3261 static void *intel_alloc_coherent(struct device *dev, size_t size,
3262 dma_addr_t *dma_handle, gfp_t flags,
3263 struct dma_attrs *attrs)
3265 struct page *page = NULL;
3266 int order;
3268 size = PAGE_ALIGN(size);
3269 order = get_order(size);
3271 if (!iommu_no_mapping(dev))
3272 flags &= ~(GFP_DMA | GFP_DMA32);
3273 else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3274 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3275 flags |= GFP_DMA;
3276 else
3277 flags |= GFP_DMA32;
3280 if (flags & __GFP_WAIT) {
3281 unsigned int count = size >> PAGE_SHIFT;
3283 page = dma_alloc_from_contiguous(dev, count, order);
3284 if (page && iommu_no_mapping(dev) &&
3285 page_to_phys(page) + size > dev->coherent_dma_mask) {
3286 dma_release_from_contiguous(dev, page, count);
3287 page = NULL;
3291 if (!page)
3292 page = alloc_pages(flags, order);
3293 if (!page)
3294 return NULL;
3295 memset(page_address(page), 0, size);
3297 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3298 DMA_BIDIRECTIONAL,
3299 dev->coherent_dma_mask);
3300 if (*dma_handle)
3301 return page_address(page);
3302 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3303 __free_pages(page, order);
3305 return NULL;
3308 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3309 dma_addr_t dma_handle, struct dma_attrs *attrs)
3311 int order;
3312 struct page *page = virt_to_page(vaddr);
3314 size = PAGE_ALIGN(size);
3315 order = get_order(size);
3317 intel_unmap(dev, dma_handle);
3318 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3319 __free_pages(page, order);
3322 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3323 int nelems, enum dma_data_direction dir,
3324 struct dma_attrs *attrs)
3326 intel_unmap(dev, sglist[0].dma_address);
3329 static int intel_nontranslate_map_sg(struct device *hddev,
3330 struct scatterlist *sglist, int nelems, int dir)
3332 int i;
3333 struct scatterlist *sg;
3335 for_each_sg(sglist, sg, nelems, i) {
3336 BUG_ON(!sg_page(sg));
3337 sg->dma_address = sg_phys(sg);
3338 sg->dma_length = sg->length;
3340 return nelems;
3343 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3344 enum dma_data_direction dir, struct dma_attrs *attrs)
3346 int i;
3347 struct dmar_domain *domain;
3348 size_t size = 0;
3349 int prot = 0;
3350 struct iova *iova = NULL;
3351 int ret;
3352 struct scatterlist *sg;
3353 unsigned long start_vpfn;
3354 struct intel_iommu *iommu;
3356 BUG_ON(dir == DMA_NONE);
3357 if (iommu_no_mapping(dev))
3358 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3360 domain = get_valid_domain_for_dev(dev);
3361 if (!domain)
3362 return 0;
3364 iommu = domain_get_iommu(domain);
3366 for_each_sg(sglist, sg, nelems, i)
3367 size += aligned_nrpages(sg->offset, sg->length);
3369 iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3370 *dev->dma_mask);
3371 if (!iova) {
3372 sglist->dma_length = 0;
3373 return 0;
3376 /*
3377 * Check if DMAR supports zero-length reads on write only
3378 * mappings.
3379 */
3380 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3381 !cap_zlr(iommu->cap))
3382 prot |= DMA_PTE_READ;
3383 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3384 prot |= DMA_PTE_WRITE;
3386 start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3388 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3389 if (unlikely(ret)) {
3390 dma_pte_free_pagetable(domain, start_vpfn,
3391 start_vpfn + size - 1);
3392 __free_iova(&domain->iovad, iova);
3393 return 0;
3396 /* it's a non-present to present mapping. Only flush if caching mode */
3397 if (cap_caching_mode(iommu->cap))
3398 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 0, 1);
3399 else
3400 iommu_flush_write_buffer(iommu);
3402 return nelems;
3405 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3407 return !dma_addr;
3410 struct dma_map_ops intel_dma_ops = {
3411 .alloc = intel_alloc_coherent,
3412 .free = intel_free_coherent,
3413 .map_sg = intel_map_sg,
3414 .unmap_sg = intel_unmap_sg,
3415 .map_page = intel_map_page,
3416 .unmap_page = intel_unmap_page,
3417 .mapping_error = intel_mapping_error,
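/*
 * intel_dma_ops is installed as the global dma_ops in intel_iommu_init(), so
 * ordinary drivers reach these callbacks through the generic DMA API rather
 * than calling them directly.  A minimal, hypothetical driver-side sequence
 * that ends up in intel_map_page()/intel_unmap_page() would look like:
 *
 *	dma_addr_t handle = dma_map_page(dev, page, 0, PAGE_SIZE,
 *					 DMA_TO_DEVICE);
 *	if (dma_mapping_error(dev, handle))
 *		return -EIO;
 *	... program the device with 'handle' ...
 *	dma_unmap_page(dev, handle, PAGE_SIZE, DMA_TO_DEVICE);
 */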
3420 static inline int iommu_domain_cache_init(void)
3422 int ret = 0;
3424 iommu_domain_cache = kmem_cache_create("iommu_domain",
3425 sizeof(struct dmar_domain),
3427 SLAB_HWCACHE_ALIGN,
3429 NULL);
3430 if (!iommu_domain_cache) {
3431 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3432 ret = -ENOMEM;
3435 return ret;
3438 static inline int iommu_devinfo_cache_init(void)
3440 int ret = 0;
3442 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3443 sizeof(struct device_domain_info),
3445 SLAB_HWCACHE_ALIGN,
3446 NULL);
3447 if (!iommu_devinfo_cache) {
3448 printk(KERN_ERR "Couldn't create devinfo cache\n");
3449 ret = -ENOMEM;
3452 return ret;
3455 static inline int iommu_iova_cache_init(void)
3457 int ret = 0;
3459 iommu_iova_cache = kmem_cache_create("iommu_iova",
3460 sizeof(struct iova),
3462 SLAB_HWCACHE_ALIGN,
3463 NULL);
3464 if (!iommu_iova_cache) {
3465 printk(KERN_ERR "Couldn't create iova cache\n");
3466 ret = -ENOMEM;
3469 return ret;
3472 static int __init iommu_init_mempool(void)
3474 int ret;
3475 ret = iommu_iova_cache_init();
3476 if (ret)
3477 return ret;
3479 ret = iommu_domain_cache_init();
3480 if (ret)
3481 goto domain_error;
3483 ret = iommu_devinfo_cache_init();
3484 if (!ret)
3485 return ret;
3487 kmem_cache_destroy(iommu_domain_cache);
3488 domain_error:
3489 kmem_cache_destroy(iommu_iova_cache);
3491 return -ENOMEM;
3494 static void __init iommu_exit_mempool(void)
3496 kmem_cache_destroy(iommu_devinfo_cache);
3497 kmem_cache_destroy(iommu_domain_cache);
3498 kmem_cache_destroy(iommu_iova_cache);
3502 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3504 struct dmar_drhd_unit *drhd;
3505 u32 vtbar;
3506 int rc;
3508 /* We know that this device on this chipset has its own IOMMU.
3509 * If we find it under a different IOMMU, then the BIOS is lying
3510 * to us. Hope that the IOMMU for this device is actually
3511 * disabled, and it needs no translation...
3512 */
3513 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3514 if (rc) {
3515 /* "can't" happen */
3516 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3517 return;
3519 vtbar &= 0xffff0000;
3521 /* we know that this iommu should be at offset 0xa000 from vtbar */
3522 drhd = dmar_find_matched_drhd_unit(pdev);
3523 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3524 TAINT_FIRMWARE_WORKAROUND,
3525 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3526 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3528 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3530 static void __init init_no_remapping_devices(void)
3532 struct dmar_drhd_unit *drhd;
3533 struct device *dev;
3534 int i;
3536 for_each_drhd_unit(drhd) {
3537 if (!drhd->include_all) {
3538 for_each_active_dev_scope(drhd->devices,
3539 drhd->devices_cnt, i, dev)
3540 break;
3541 /* ignore DMAR unit if no devices exist */
3542 if (i == drhd->devices_cnt)
3543 drhd->ignored = 1;
3547 for_each_active_drhd_unit(drhd) {
3548 if (drhd->include_all)
3549 continue;
3551 for_each_active_dev_scope(drhd->devices,
3552 drhd->devices_cnt, i, dev)
3553 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3554 break;
3555 if (i < drhd->devices_cnt)
3556 continue;
3558 /* This IOMMU has *only* gfx devices. Either bypass it or
3559 set the gfx_mapped flag, as appropriate */
3560 if (dmar_map_gfx) {
3561 intel_iommu_gfx_mapped = 1;
3562 } else {
3563 drhd->ignored = 1;
3564 for_each_active_dev_scope(drhd->devices,
3565 drhd->devices_cnt, i, dev)
3566 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3571 #ifdef CONFIG_SUSPEND
3572 static int init_iommu_hw(void)
3574 struct dmar_drhd_unit *drhd;
3575 struct intel_iommu *iommu = NULL;
3577 for_each_active_iommu(iommu, drhd)
3578 if (iommu->qi)
3579 dmar_reenable_qi(iommu);
3581 for_each_iommu(iommu, drhd) {
3582 if (drhd->ignored) {
3583 /*
3584 * we always have to disable PMRs or DMA may fail on
3585 * this device
3586 */
3587 if (force_on)
3588 iommu_disable_protect_mem_regions(iommu);
3589 continue;
3592 iommu_flush_write_buffer(iommu);
3594 iommu_set_root_entry(iommu);
3596 iommu->flush.flush_context(iommu, 0, 0, 0,
3597 DMA_CCMD_GLOBAL_INVL);
3598 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3599 iommu_enable_translation(iommu);
3600 iommu_disable_protect_mem_regions(iommu);
3603 return 0;
3606 static void iommu_flush_all(void)
3608 struct dmar_drhd_unit *drhd;
3609 struct intel_iommu *iommu;
3611 for_each_active_iommu(iommu, drhd) {
3612 iommu->flush.flush_context(iommu, 0, 0, 0,
3613 DMA_CCMD_GLOBAL_INVL);
3614 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3615 DMA_TLB_GLOBAL_FLUSH);
3619 static int iommu_suspend(void)
3621 struct dmar_drhd_unit *drhd;
3622 struct intel_iommu *iommu = NULL;
3623 unsigned long flag;
3625 for_each_active_iommu(iommu, drhd) {
3626 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3627 GFP_ATOMIC);
3628 if (!iommu->iommu_state)
3629 goto nomem;
3632 iommu_flush_all();
3634 for_each_active_iommu(iommu, drhd) {
3635 iommu_disable_translation(iommu);
3637 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3639 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3640 readl(iommu->reg + DMAR_FECTL_REG);
3641 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3642 readl(iommu->reg + DMAR_FEDATA_REG);
3643 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3644 readl(iommu->reg + DMAR_FEADDR_REG);
3645 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3646 readl(iommu->reg + DMAR_FEUADDR_REG);
3648 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3650 return 0;
3652 nomem:
3653 for_each_active_iommu(iommu, drhd)
3654 kfree(iommu->iommu_state);
3656 return -ENOMEM;
3659 static void iommu_resume(void)
3661 struct dmar_drhd_unit *drhd;
3662 struct intel_iommu *iommu = NULL;
3663 unsigned long flag;
3665 if (init_iommu_hw()) {
3666 if (force_on)
3667 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3668 else
3669 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3670 return;
3673 for_each_active_iommu(iommu, drhd) {
3675 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3677 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3678 iommu->reg + DMAR_FECTL_REG);
3679 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3680 iommu->reg + DMAR_FEDATA_REG);
3681 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3682 iommu->reg + DMAR_FEADDR_REG);
3683 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3684 iommu->reg + DMAR_FEUADDR_REG);
3686 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3689 for_each_active_iommu(iommu, drhd)
3690 kfree(iommu->iommu_state);
3693 static struct syscore_ops iommu_syscore_ops = {
3694 .resume = iommu_resume,
3695 .suspend = iommu_suspend,
3698 static void __init init_iommu_pm_ops(void)
3700 register_syscore_ops(&iommu_syscore_ops);
3703 #else
3704 static inline void init_iommu_pm_ops(void) {}
3705 #endif /* CONFIG_SUSPEND */
3708 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3710 struct acpi_dmar_reserved_memory *rmrr;
3711 struct dmar_rmrr_unit *rmrru;
3713 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3714 if (!rmrru)
3715 return -ENOMEM;
3717 rmrru->hdr = header;
3718 rmrr = (struct acpi_dmar_reserved_memory *)header;
3719 rmrru->base_address = rmrr->base_address;
3720 rmrru->end_address = rmrr->end_address;
3721 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3722 ((void *)rmrr) + rmrr->header.length,
3723 &rmrru->devices_cnt);
3724 if (rmrru->devices_cnt && rmrru->devices == NULL) {
3725 kfree(rmrru);
3726 return -ENOMEM;
3729 list_add(&rmrru->list, &dmar_rmrr_units);
3731 return 0;
3734 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3736 struct acpi_dmar_atsr *atsr;
3737 struct dmar_atsr_unit *atsru;
3739 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3740 atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3741 if (!atsru)
3742 return -ENOMEM;
3744 atsru->hdr = hdr;
3745 atsru->include_all = atsr->flags & 0x1;
3746 if (!atsru->include_all) {
3747 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3748 (void *)atsr + atsr->header.length,
3749 &atsru->devices_cnt);
3750 if (atsru->devices_cnt && atsru->devices == NULL) {
3751 kfree(atsru);
3752 return -ENOMEM;
3756 list_add_rcu(&atsru->list, &dmar_atsr_units);
3758 return 0;
3761 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3763 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3764 kfree(atsru);
3767 static void intel_iommu_free_dmars(void)
3769 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3770 struct dmar_atsr_unit *atsru, *atsr_n;
3772 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3773 list_del(&rmrru->list);
3774 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3775 kfree(rmrru);
3778 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3779 list_del(&atsru->list);
3780 intel_iommu_free_atsr(atsru);
3784 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3786 int i, ret = 1;
3787 struct pci_bus *bus;
3788 struct pci_dev *bridge = NULL;
3789 struct device *tmp;
3790 struct acpi_dmar_atsr *atsr;
3791 struct dmar_atsr_unit *atsru;
3793 dev = pci_physfn(dev);
3794 for (bus = dev->bus; bus; bus = bus->parent) {
3795 bridge = bus->self;
3796 if (!bridge || !pci_is_pcie(bridge) ||
3797 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3798 return 0;
3799 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3800 break;
3802 if (!bridge)
3803 return 0;
3805 rcu_read_lock();
3806 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3807 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3808 if (atsr->segment != pci_domain_nr(dev->bus))
3809 continue;
3811 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3812 if (tmp == &bridge->dev)
3813 goto out;
3815 if (atsru->include_all)
3816 goto out;
3818 ret = 0;
3819 out:
3820 rcu_read_unlock();
3822 return ret;
3825 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3827 int ret = 0;
3828 struct dmar_rmrr_unit *rmrru;
3829 struct dmar_atsr_unit *atsru;
3830 struct acpi_dmar_atsr *atsr;
3831 struct acpi_dmar_reserved_memory *rmrr;
3833 if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
3834 return 0;
3836 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3837 rmrr = container_of(rmrru->hdr,
3838 struct acpi_dmar_reserved_memory, header);
3839 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3840 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3841 ((void *)rmrr) + rmrr->header.length,
3842 rmrr->segment, rmrru->devices,
3843 rmrru->devices_cnt);
3844 if (ret < 0)
3845 return ret;
3846 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3847 dmar_remove_dev_scope(info, rmrr->segment,
3848 rmrru->devices, rmrru->devices_cnt);
3852 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3853 if (atsru->include_all)
3854 continue;
3856 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3857 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3858 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3859 (void *)atsr + atsr->header.length,
3860 atsr->segment, atsru->devices,
3861 atsru->devices_cnt);
3862 if (ret > 0)
3863 break;
3865 else if (ret < 0)
3865 return ret;
3866 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3867 if (dmar_remove_dev_scope(info, atsr->segment,
3868 atsru->devices, atsru->devices_cnt))
3869 break;
3873 return 0;
3876 /*
3877 * Here we only respond to the action of unbinding a device from its driver.
3878 *
3879 * Added device is not attached to its DMAR domain here yet. That will happen
3880 * when mapping the device to iova.
3881 */
3882 static int device_notifier(struct notifier_block *nb,
3883 unsigned long action, void *data)
3885 struct device *dev = data;
3886 struct dmar_domain *domain;
3888 if (iommu_dummy(dev))
3889 return 0;
3891 if (action != BUS_NOTIFY_REMOVED_DEVICE)
3892 return 0;
3894 /*
3895 * If the device is still attached to a device driver we can't
3896 * tear down the domain yet as DMA mappings may still be in use.
3897 * Wait for the BUS_NOTIFY_UNBOUND_DRIVER event to do that.
3898 */
3899 if (action == BUS_NOTIFY_DEL_DEVICE && dev->driver != NULL)
3900 return 0;
3902 domain = find_domain(dev);
3903 if (!domain)
3904 return 0;
3906 down_read(&dmar_global_lock);
3907 domain_remove_one_dev_info(domain, dev);
3908 if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
3909 domain_exit(domain);
3910 up_read(&dmar_global_lock);
3912 return 0;
3915 static struct notifier_block device_nb = {
3916 .notifier_call = device_notifier,
3919 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3920 unsigned long val, void *v)
3922 struct memory_notify *mhp = v;
3923 unsigned long long start, end;
3924 unsigned long start_vpfn, last_vpfn;
3926 switch (val) {
3927 case MEM_GOING_ONLINE:
3928 start = mhp->start_pfn << PAGE_SHIFT;
3929 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
3930 if (iommu_domain_identity_map(si_domain, start, end)) {
3931 pr_warn("dmar: failed to build identity map for [%llx-%llx]\n",
3932 start, end);
3933 return NOTIFY_BAD;
3935 break;
3937 case MEM_OFFLINE:
3938 case MEM_CANCEL_ONLINE:
3939 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3940 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
3941 while (start_vpfn <= last_vpfn) {
3942 struct iova *iova;
3943 struct dmar_drhd_unit *drhd;
3944 struct intel_iommu *iommu;
3945 struct page *freelist;
3947 iova = find_iova(&si_domain->iovad, start_vpfn);
3948 if (iova == NULL) {
3949 pr_debug("dmar: failed get IOVA for PFN %lx\n",
3950 start_vpfn);
3951 break;
3954 iova = split_and_remove_iova(&si_domain->iovad, iova,
3955 start_vpfn, last_vpfn);
3956 if (iova == NULL) {
3957 pr_warn("dmar: failed to split IOVA PFN [%lx-%lx]\n",
3958 start_vpfn, last_vpfn);
3959 return NOTIFY_BAD;
3962 freelist = domain_unmap(si_domain, iova->pfn_lo,
3963 iova->pfn_hi);
3965 rcu_read_lock();
3966 for_each_active_iommu(iommu, drhd)
3967 iommu_flush_iotlb_psi(iommu, si_domain->id,
3968 iova->pfn_lo, iova_size(iova),
3969 !freelist, 0);
3970 rcu_read_unlock();
3971 dma_free_pagelist(freelist);
3973 start_vpfn = iova->pfn_hi + 1;
3974 free_iova_mem(iova);
3976 break;
3979 return NOTIFY_OK;
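/*
 * The notifier above only needs to track si_domain: devices in passthrough
 * mode rely on the 1:1 map covering all of RAM, so newly onlined memory must
 * be added to it, and offlined memory must be unmapped and its IOVA space
 * returned.  Domains built through the DMA API or the IOMMU API are not
 * affected by memory hotplug.
 */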
3982 static struct notifier_block intel_iommu_memory_nb = {
3983 .notifier_call = intel_iommu_memory_notifier,
3984 .priority = 0
3988 static ssize_t intel_iommu_show_version(struct device *dev,
3989 struct device_attribute *attr,
3990 char *buf)
3992 struct intel_iommu *iommu = dev_get_drvdata(dev);
3993 u32 ver = readl(iommu->reg + DMAR_VER_REG);
3994 return sprintf(buf, "%d:%d\n",
3995 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3997 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
3999 static ssize_t intel_iommu_show_address(struct device *dev,
4000 struct device_attribute *attr,
4001 char *buf)
4003 struct intel_iommu *iommu = dev_get_drvdata(dev);
4004 return sprintf(buf, "%llx\n", iommu->reg_phys);
4006 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4008 static ssize_t intel_iommu_show_cap(struct device *dev,
4009 struct device_attribute *attr,
4010 char *buf)
4012 struct intel_iommu *iommu = dev_get_drvdata(dev);
4013 return sprintf(buf, "%llx\n", iommu->cap);
4015 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4017 static ssize_t intel_iommu_show_ecap(struct device *dev,
4018 struct device_attribute *attr,
4019 char *buf)
4021 struct intel_iommu *iommu = dev_get_drvdata(dev);
4022 return sprintf(buf, "%llx\n", iommu->ecap);
4024 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4026 static struct attribute *intel_iommu_attrs[] = {
4027 &dev_attr_version.attr,
4028 &dev_attr_address.attr,
4029 &dev_attr_cap.attr,
4030 &dev_attr_ecap.attr,
4031 NULL,
4034 static struct attribute_group intel_iommu_group = {
4035 .name = "intel-iommu",
4036 .attrs = intel_iommu_attrs,
4039 const struct attribute_group *intel_iommu_groups[] = {
4040 &intel_iommu_group,
4041 NULL,
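/*
 * These attribute groups are handed to iommu_device_create() in
 * intel_iommu_init() below, so each DMAR unit typically shows up in sysfs
 * as /sys/class/iommu/dmar<N>/intel-iommu/{version,address,cap,ecap}, which
 * is handy for inspecting capability bits from userspace, e.g.
 * "cat /sys/class/iommu/dmar0/intel-iommu/cap".
 */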
4044 int __init intel_iommu_init(void)
4046 int ret = -ENODEV;
4047 struct dmar_drhd_unit *drhd;
4048 struct intel_iommu *iommu;
4050 /* VT-d is required for a TXT/tboot launch, so enforce that */
4051 force_on = tboot_force_iommu();
4053 if (iommu_init_mempool()) {
4054 if (force_on)
4055 panic("tboot: Failed to initialize iommu memory\n");
4056 return -ENOMEM;
4059 down_write(&dmar_global_lock);
4060 if (dmar_table_init()) {
4061 if (force_on)
4062 panic("tboot: Failed to initialize DMAR table\n");
4063 goto out_free_dmar;
4066 /*
4067 * Disable translation if already enabled prior to OS handover.
4068 */
4069 for_each_active_iommu(iommu, drhd)
4070 if (iommu->gcmd & DMA_GCMD_TE)
4071 iommu_disable_translation(iommu);
4073 if (dmar_dev_scope_init() < 0) {
4074 if (force_on)
4075 panic("tboot: Failed to initialize DMAR device scope\n");
4076 goto out_free_dmar;
4079 if (no_iommu || dmar_disabled)
4080 goto out_free_dmar;
4082 if (list_empty(&dmar_rmrr_units))
4083 printk(KERN_INFO "DMAR: No RMRR found\n");
4085 if (list_empty(&dmar_atsr_units))
4086 printk(KERN_INFO "DMAR: No ATSR found\n");
4088 if (dmar_init_reserved_ranges()) {
4089 if (force_on)
4090 panic("tboot: Failed to reserve iommu ranges\n");
4091 goto out_free_reserved_range;
4094 init_no_remapping_devices();
4096 ret = init_dmars();
4097 if (ret) {
4098 if (force_on)
4099 panic("tboot: Failed to initialize DMARs\n");
4100 printk(KERN_ERR "IOMMU: dmar init failed\n");
4101 goto out_free_reserved_range;
4103 up_write(&dmar_global_lock);
4104 printk(KERN_INFO
4105 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
4107 init_timer(&unmap_timer);
4108 #ifdef CONFIG_SWIOTLB
4109 swiotlb = 0;
4110 #endif
4111 dma_ops = &intel_dma_ops;
4113 init_iommu_pm_ops();
4115 for_each_active_iommu(iommu, drhd)
4116 iommu->iommu_dev = iommu_device_create(NULL, iommu,
4117 intel_iommu_groups,
4118 iommu->name);
4120 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4121 bus_register_notifier(&pci_bus_type, &device_nb);
4122 if (si_domain && !hw_pass_through)
4123 register_memory_notifier(&intel_iommu_memory_nb);
4125 intel_iommu_enabled = 1;
4127 return 0;
4129 out_free_reserved_range:
4130 put_iova_domain(&reserved_iova_list);
4131 out_free_dmar:
4132 intel_iommu_free_dmars();
4133 up_write(&dmar_global_lock);
4134 iommu_exit_mempool();
4135 return ret;
4138 static int iommu_detach_dev_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4140 struct intel_iommu *iommu = opaque;
4142 iommu_detach_dev(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4143 return 0;
4146 /*
4147 * NB - intel-iommu lacks any sort of reference counting for the users of
4148 * dependent devices. If multiple endpoints have intersecting dependent
4149 * devices, unbinding the driver from any one of them will possibly leave
4150 * the others unable to operate.
4151 */
4152 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
4153 struct device *dev)
4155 if (!iommu || !dev || !dev_is_pci(dev))
4156 return;
4158 pci_for_each_dma_alias(to_pci_dev(dev), &iommu_detach_dev_cb, iommu);
4161 static void domain_remove_one_dev_info(struct dmar_domain *domain,
4162 struct device *dev)
4164 struct device_domain_info *info, *tmp;
4165 struct intel_iommu *iommu;
4166 unsigned long flags;
4167 int found = 0;
4168 u8 bus, devfn;
4170 iommu = device_to_iommu(dev, &bus, &devfn);
4171 if (!iommu)
4172 return;
4174 spin_lock_irqsave(&device_domain_lock, flags);
4175 list_for_each_entry_safe(info, tmp, &domain->devices, link) {
4176 if (info->iommu == iommu && info->bus == bus &&
4177 info->devfn == devfn) {
4178 unlink_domain_info(info);
4179 spin_unlock_irqrestore(&device_domain_lock, flags);
4181 iommu_disable_dev_iotlb(info);
4182 iommu_detach_dev(iommu, info->bus, info->devfn);
4183 iommu_detach_dependent_devices(iommu, dev);
4184 free_devinfo_mem(info);
4186 spin_lock_irqsave(&device_domain_lock, flags);
4188 if (found)
4189 break;
4190 else
4191 continue;
4194 /* if there are no other devices under the same iommu
4195 * owned by this domain, clear this iommu in iommu_bmp and
4196 * update the iommu count and coherency
4197 */
4198 if (info->iommu == iommu)
4199 found = 1;
4202 spin_unlock_irqrestore(&device_domain_lock, flags);
4204 if (found == 0) {
4205 domain_detach_iommu(domain, iommu);
4206 if (!domain_type_is_vm_or_si(domain))
4207 iommu_detach_domain(domain, iommu);
4211 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4213 int adjust_width;
4215 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
4216 domain_reserve_special_ranges(domain);
4218 /* calculate AGAW */
4219 domain->gaw = guest_width;
4220 adjust_width = guestwidth_to_adjustwidth(guest_width);
4221 domain->agaw = width_to_agaw(adjust_width);
4223 domain->iommu_coherency = 0;
4224 domain->iommu_snooping = 0;
4225 domain->iommu_superpage = 0;
4226 domain->max_addr = 0;
4228 /* always allocate the top pgd */
4229 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4230 if (!domain->pgd)
4231 return -ENOMEM;
4232 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4233 return 0;
4236 static int intel_iommu_domain_init(struct iommu_domain *domain)
4238 struct dmar_domain *dmar_domain;
4240 dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4241 if (!dmar_domain) {
4242 printk(KERN_ERR
4243 "intel_iommu_domain_init: dmar_domain == NULL\n");
4244 return -ENOMEM;
4246 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4247 printk(KERN_ERR
4248 "intel_iommu_domain_init() failed\n");
4249 domain_exit(dmar_domain);
4250 return -ENOMEM;
4252 domain_update_iommu_cap(dmar_domain);
4253 domain->priv = dmar_domain;
4255 domain->geometry.aperture_start = 0;
4256 domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4257 domain->geometry.force_aperture = true;
4259 return 0;
static void intel_iommu_domain_destroy(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain = domain->priv;

	domain->priv = NULL;
	domain_exit(dmar_domain);
}

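/*
 * Attach a device to an externally managed domain: refuse devices with
 * platform RMRR requirements, tear down any mapping the device already has,
 * clamp the domain to the address width this IOMMU can handle (dropping
 * page table levels if necessary), and finally install the context mapping.
 */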
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct intel_iommu *iommu;
	int addr_width;
	u8 bus, devfn;

	if (device_is_rmrr_locked(dev)) {
		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
		return -EPERM;
	}

	/* normally dev is not mapped */
	if (unlikely(domain_context_mapped(dev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(dev);
		if (old_domain) {
			if (domain_type_is_vm_or_si(dmar_domain))
				domain_remove_one_dev_info(old_domain, dev);
			else
				domain_remove_dev_info(old_domain);

			if (!domain_type_is_vm_or_si(old_domain) &&
			    list_empty(&old_domain->devices))
				domain_exit(old_domain);
		}
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->max_addr > (1LL << addr_width)) {
		printk(KERN_ERR "%s: iommu width (%d) is not "
		       "sufficient for the mapped address (%llx)\n",
		       __func__, addr_width, dmar_domain->max_addr);
		return -EFAULT;
	}
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		if (dma_pte_present(pte)) {
			dmar_domain->pgd = (struct dma_pte *)
				phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return domain_add_dev_info(dmar_domain, dev, CONTEXT_TT_MULTI_LEVEL);
}

static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	struct dmar_domain *dmar_domain = domain->priv;

	domain_remove_one_dev_info(dmar_domain, dev);
}

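/*
 * intel_iommu_map() translates the generic IOMMU_READ/IOMMU_WRITE/IOMMU_CACHE
 * flags into DMA_PTE_* bits, grows the domain's max_addr (failing with
 * -EFAULT if the range does not fit the domain's address width) and then
 * installs the page table entries for the range.
 */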
static int intel_iommu_map(struct iommu_domain *domain,
			   unsigned long iova, phys_addr_t hpa,
			   size_t size, int iommu_prot)
{
	struct dmar_domain *dmar_domain = domain->priv;
	u64 max_addr;
	int prot = 0;
	int ret;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
		if (end < max_addr) {
			printk(KERN_ERR "%s: iommu width (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}
	/* Round up size to next multiple of PAGE_SIZE, if it and
	   the low bits of hpa would take us onto the next page */
	size = aligned_nrpages(hpa, size);
	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
				 hpa >> VTD_PAGE_SHIFT, size, prot);
	return ret;
}

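/*
 * Illustrative only: a user of the generic IOMMU API (a VFIO-style driver,
 * say, with a hypothetical pdev/iova/phys/size) would reach intel_iommu_map()
 * above and intel_iommu_unmap() below roughly like this, error handling
 * omitted:
 *
 *	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *
 *	iommu_attach_device(dom, &pdev->dev);
 *	iommu_map(dom, iova, phys, size, IOMMU_READ | IOMMU_WRITE);
 *	...
 *	iommu_unmap(dom, iova, size);
 *	iommu_detach_device(dom, &pdev->dev);
 *	iommu_domain_free(dom);
 */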
static size_t intel_iommu_unmap(struct iommu_domain *domain,
				unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct page *freelist = NULL;
	struct intel_iommu *iommu;
	unsigned long start_pfn, last_pfn;
	unsigned int npages;
	int iommu_id, num, ndomains, level = 0;

	/* Cope with horrid API which requires us to unmap more than the
	   size argument if it happens to be a large-page mapping. */
	if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
		BUG();

	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
		size = VTD_PAGE_SIZE << level_to_offset_bits(level);

	start_pfn = iova >> VTD_PAGE_SHIFT;
	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;

	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);

	npages = last_pfn - start_pfn + 1;

	for_each_set_bit(iommu_id, dmar_domain->iommu_bmp, g_num_of_iommus) {
		iommu = g_iommus[iommu_id];

		/*
		 * find bit position of dmar_domain
		 */
		ndomains = cap_ndoms(iommu->cap);
		for_each_set_bit(num, iommu->domain_ids, ndomains) {
			if (iommu->domains[num] == dmar_domain)
				iommu_flush_iotlb_psi(iommu, num, start_pfn,
						      npages, !freelist, 0);
		}
	}

	dma_free_pagelist(freelist);

	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	return size;
}

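/*
 * Walk the domain's page tables and return the physical address backing a
 * single IOVA; this is what iommu_iova_to_phys() ends up calling for
 * devices behind a VT-d unit.
 */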
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct dma_pte *pte;
	int level = 0;
	u64 phys = 0;

	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
	if (pte)
		phys = dma_pte_addr(pte);

	return phys;
}

static bool intel_iommu_capable(enum iommu_cap cap)
{
	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return domain_update_iommu_snooping(NULL) == 1;
	if (cap == IOMMU_CAP_INTR_REMAP)
		return irq_remapping_enabled == 1;

	return false;
}

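/*
 * add_device/remove_device are called by the IOMMU core as devices are
 * added to and removed from the bus: they link the device to its IOMMU in
 * sysfs and place it in (or remove it from) its IOMMU group.
 */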
static int intel_iommu_add_device(struct device *dev)
{
	struct intel_iommu *iommu;
	struct iommu_group *group;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	iommu_device_link(iommu->iommu_dev, dev);

	group = iommu_group_get_for_dev(dev);

	if (IS_ERR(group))
		return PTR_ERR(group);

	iommu_group_put(group);
	return 0;
}

static void intel_iommu_remove_device(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return;

	iommu_group_remove_device(dev);

	iommu_device_unlink(iommu->iommu_dev, dev);
}

static const struct iommu_ops intel_iommu_ops = {
	.capable	= intel_iommu_capable,
	.domain_init	= intel_iommu_domain_init,
	.domain_destroy	= intel_iommu_domain_destroy,
	.attach_dev	= intel_iommu_attach_device,
	.detach_dev	= intel_iommu_detach_device,
	.map		= intel_iommu_map,
	.unmap		= intel_iommu_unmap,
	.iova_to_phys	= intel_iommu_iova_to_phys,
	.add_device	= intel_iommu_add_device,
	.remove_device	= intel_iommu_remove_device,
	.pgsize_bitmap	= INTEL_IOMMU_PGSIZES,
};

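/*
 * These callbacks implement the generic IOMMU API on top of VT-d.  The ops
 * table is registered for the PCI bus from intel_iommu_init() (not shown
 * in this part of the file), roughly:
 *
 *	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
 *
 * after which iommu_domain_alloc(), iommu_attach_device(), iommu_map() and
 * friends on PCI devices are routed to the functions above.
 */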
static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
{
	/* G4x/GM45 integrated gfx dmar support is totally busted. */
	printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);

static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);

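/*
 * GGC appears to be the GMCH graphics control register in the config space
 * of the host bridges matched below (assumed register layout); the fields
 * used here describe how much graphics stolen memory the BIOS reserved and
 * whether a VT-enabled (shadow GTT) variant was selected.
 */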
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)

static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);

/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
	       vtisochctrl);