Linux 2.6.33-rc6
[cris-mirror.git] / drivers / pci / intel-iommu.c
blob417312528ddff29ab4ead4fd5bc7d89e9a718121
1 /*
2 * Copyright (c) 2006, Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 * Author: Fenghua Yu <fenghua.yu@intel.com>
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/sysdev.h>
40 #include <linux/tboot.h>
41 #include <linux/dmi.h>
42 #include <asm/cacheflush.h>
43 #include <asm/iommu.h>
44 #include "pci.h"
46 #define ROOT_SIZE VTD_PAGE_SIZE
47 #define CONTEXT_SIZE VTD_PAGE_SIZE
/*
 * PCI device classification helpers.  The argument is parenthesized in
 * every expansion so that any pointer-valued expression (not just a plain
 * identifier) can be passed safely.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
53 #define IOAPIC_RANGE_START (0xfee00000)
54 #define IOAPIC_RANGE_END (0xfeefffff)
55 #define IOVA_START_ADDR (0x1000)
57 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
59 #define MAX_AGAW_WIDTH 64
/*
 * Highest page frame number / highest address representable with a guest
 * address width of 'gaw' bits.  'gaw' is parenthesized so that compound
 * expressions expand with the intended precedence.
 */
#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
70 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
71 #define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
72 #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64))
75 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
76 are never going to work. */
77 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
79 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
82 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
84 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
/* Return the VT-d (DMA) pfn of the given struct page. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	unsigned long mm_pfn = page_to_pfn(pg);

	return mm_to_dma_pfn(mm_pfn);
}
/* Return the VT-d (DMA) pfn of the page backing kernel virtual address p. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
	struct page *pg = virt_to_page(p);

	return page_to_dma_pfn(pg);
}
95 /* global iommu list, set NULL for ignored DMAR units */
96 static struct intel_iommu **g_iommus;
98 static void __init check_tylersburg_isoch(void);
99 static int rwbf_quirk;
102 * 0: Present
103 * 1-11: Reserved
104 * 12-63: Context Ptr (12 - (haw-1))
105 * 64-127: Reserved
107 struct root_entry {
108 u64 val;
109 u64 rsvd1;
111 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
112 static inline bool root_present(struct root_entry *root)
114 return (root->val & 1);
116 static inline void set_root_present(struct root_entry *root)
118 root->val |= 1;
120 static inline void set_root_value(struct root_entry *root, unsigned long value)
122 root->val |= value & VTD_PAGE_MASK;
125 static inline struct context_entry *
126 get_context_addr_from_root(struct root_entry *root)
128 return (struct context_entry *)
129 (root_present(root)?phys_to_virt(
130 root->val & VTD_PAGE_MASK) :
131 NULL);
135 * low 64 bits:
136 * 0: present
137 * 1: fault processing disable
138 * 2-3: translation type
139 * 12-63: address space root
140 * high 64 bits:
141 * 0-2: address width
142 * 3-6: aval
143 * 8-23: domain id
145 struct context_entry {
146 u64 lo;
147 u64 hi;
150 static inline bool context_present(struct context_entry *context)
152 return (context->lo & 1);
154 static inline void context_set_present(struct context_entry *context)
156 context->lo |= 1;
159 static inline void context_set_fault_enable(struct context_entry *context)
161 context->lo &= (((u64)-1) << 2) | 1;
164 static inline void context_set_translation_type(struct context_entry *context,
165 unsigned long value)
167 context->lo &= (((u64)-1) << 4) | 3;
168 context->lo |= (value & 3) << 2;
171 static inline void context_set_address_root(struct context_entry *context,
172 unsigned long value)
174 context->lo |= value & VTD_PAGE_MASK;
177 static inline void context_set_address_width(struct context_entry *context,
178 unsigned long value)
180 context->hi |= value & 7;
183 static inline void context_set_domain_id(struct context_entry *context,
184 unsigned long value)
186 context->hi |= (value & ((1 << 16) - 1)) << 8;
189 static inline void context_clear_entry(struct context_entry *context)
191 context->lo = 0;
192 context->hi = 0;
196 * 0: readable
197 * 1: writable
198 * 2-6: reserved
199 * 7: super page
200 * 8-10: available
201 * 11: snoop behavior
202 * 12-63: Host physcial address
204 struct dma_pte {
205 u64 val;
208 static inline void dma_clear_pte(struct dma_pte *pte)
210 pte->val = 0;
213 static inline void dma_set_pte_readable(struct dma_pte *pte)
215 pte->val |= DMA_PTE_READ;
218 static inline void dma_set_pte_writable(struct dma_pte *pte)
220 pte->val |= DMA_PTE_WRITE;
223 static inline void dma_set_pte_snp(struct dma_pte *pte)
225 pte->val |= DMA_PTE_SNP;
228 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
230 pte->val = (pte->val & ~3) | (prot & 3);
233 static inline u64 dma_pte_addr(struct dma_pte *pte)
235 #ifdef CONFIG_64BIT
236 return pte->val & VTD_PAGE_MASK;
237 #else
238 /* Must have a full atomic 64-bit read */
239 return __cmpxchg64(pte, 0ULL, 0ULL) & VTD_PAGE_MASK;
240 #endif
243 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
245 pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
248 static inline bool dma_pte_present(struct dma_pte *pte)
250 return (pte->val & 3) != 0;
253 static inline int first_pte_in_page(struct dma_pte *pte)
255 return !((unsigned long)pte & ~VTD_PAGE_MASK);
259 * This domain is a statically identity mapping domain.
260 * 1. This domain creates a static 1:1 mapping to all usable memory.
261 * 2. It maps to each iommu if successful.
262 * 3. Each iommu maps to this domain if successful.
264 static struct dmar_domain *si_domain;
265 static int hw_pass_through = 1;
267 /* devices under the same p2p bridge are owned in one domain */
268 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
270 /* domain represents a virtual machine, more than one devices
271 * across iommus may be owned in one domain, e.g. kvm guest.
273 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
275 /* si_domain contains multiple devices */
276 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 2)
278 struct dmar_domain {
279 int id; /* domain id */
280 int nid; /* node id */
281 unsigned long iommu_bmp; /* bitmap of iommus this domain uses*/
283 struct list_head devices; /* all devices' list */
284 struct iova_domain iovad; /* iova's that belong to this domain */
286 struct dma_pte *pgd; /* virtual address */
287 int gaw; /* max guest address width */
289 /* adjusted guest address width, 0 is level 2 30-bit */
290 int agaw;
292 int flags; /* flags to find out type of domain */
294 int iommu_coherency;/* indicate coherency of iommu access */
295 int iommu_snooping; /* indicate snooping control feature*/
296 int iommu_count; /* reference count of iommu */
297 spinlock_t iommu_lock; /* protect iommu set in domain */
298 u64 max_addr; /* maximum mapped address */
301 /* PCI domain-device relationship */
302 struct device_domain_info {
303 struct list_head link; /* link to domain siblings */
304 struct list_head global; /* link to global list */
305 int segment; /* PCI domain */
306 u8 bus; /* PCI bus number */
307 u8 devfn; /* PCI devfn number */
308 struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
309 struct intel_iommu *iommu; /* IOMMU used by this device */
310 struct dmar_domain *domain; /* pointer to domain */
313 static void flush_unmaps_timeout(unsigned long data);
315 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
317 #define HIGH_WATER_MARK 250
318 struct deferred_flush_tables {
319 int next;
320 struct iova *iova[HIGH_WATER_MARK];
321 struct dmar_domain *domain[HIGH_WATER_MARK];
324 static struct deferred_flush_tables *deferred_flush;
326 /* bitmap for indexing intel_iommus */
327 static int g_num_of_iommus;
329 static DEFINE_SPINLOCK(async_umap_flush_lock);
330 static LIST_HEAD(unmaps_to_do);
332 static int timer_on;
333 static long list_size;
335 static void domain_remove_dev_info(struct dmar_domain *domain);
337 #ifdef CONFIG_DMAR_DEFAULT_ON
338 int dmar_disabled = 0;
339 #else
340 int dmar_disabled = 1;
341 #endif /*CONFIG_DMAR_DEFAULT_ON*/
343 static int __initdata dmar_map_gfx = 1;
344 static int dmar_forcedac;
345 static int intel_iommu_strict;
347 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
348 static DEFINE_SPINLOCK(device_domain_lock);
349 static LIST_HEAD(device_domain_list);
351 static struct iommu_ops intel_iommu_ops;
353 static int __init intel_iommu_setup(char *str)
355 if (!str)
356 return -EINVAL;
357 while (*str) {
358 if (!strncmp(str, "on", 2)) {
359 dmar_disabled = 0;
360 printk(KERN_INFO "Intel-IOMMU: enabled\n");
361 } else if (!strncmp(str, "off", 3)) {
362 dmar_disabled = 1;
363 printk(KERN_INFO "Intel-IOMMU: disabled\n");
364 } else if (!strncmp(str, "igfx_off", 8)) {
365 dmar_map_gfx = 0;
366 printk(KERN_INFO
367 "Intel-IOMMU: disable GFX device mapping\n");
368 } else if (!strncmp(str, "forcedac", 8)) {
369 printk(KERN_INFO
370 "Intel-IOMMU: Forcing DAC for PCI devices\n");
371 dmar_forcedac = 1;
372 } else if (!strncmp(str, "strict", 6)) {
373 printk(KERN_INFO
374 "Intel-IOMMU: disable batched IOTLB flush\n");
375 intel_iommu_strict = 1;
378 str += strcspn(str, ",");
379 while (*str == ',')
380 str++;
382 return 0;
384 __setup("intel_iommu=", intel_iommu_setup);
386 static struct kmem_cache *iommu_domain_cache;
387 static struct kmem_cache *iommu_devinfo_cache;
388 static struct kmem_cache *iommu_iova_cache;
390 static inline void *alloc_pgtable_page(int node)
392 struct page *page;
393 void *vaddr = NULL;
395 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
396 if (page)
397 vaddr = page_address(page);
398 return vaddr;
/* Release a page previously obtained from alloc_pgtable_page(). */
static inline void free_pgtable_page(void *vaddr)
{
	unsigned long addr = (unsigned long)vaddr;

	free_page(addr);
}
406 static inline void *alloc_domain_mem(void)
408 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
411 static void free_domain_mem(void *vaddr)
413 kmem_cache_free(iommu_domain_cache, vaddr);
416 static inline void * alloc_devinfo_mem(void)
418 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
421 static inline void free_devinfo_mem(void *vaddr)
423 kmem_cache_free(iommu_devinfo_cache, vaddr);
426 struct iova *alloc_iova_mem(void)
428 return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
431 void free_iova_mem(struct iova *iova)
433 kmem_cache_free(iommu_iova_cache, iova);
437 static inline int width_to_agaw(int width);
439 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
441 unsigned long sagaw;
442 int agaw = -1;
444 sagaw = cap_sagaw(iommu->cap);
445 for (agaw = width_to_agaw(max_gaw);
446 agaw >= 0; agaw--) {
447 if (test_bit(agaw, &sagaw))
448 break;
451 return agaw;
455 * Calculate max SAGAW for each iommu.
457 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
459 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
463 * calculate agaw for each iommu.
464 * "SAGAW" may be different across iommus, use a default agaw, and
465 * get a supported less agaw for iommus that don't support the default agaw.
467 int iommu_calculate_agaw(struct intel_iommu *iommu)
469 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
472 /* This functionin only returns single iommu in a domain */
473 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
475 int iommu_id;
477 /* si_domain and vm domain should not get here. */
478 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
479 BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
481 iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
482 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
483 return NULL;
485 return g_iommus[iommu_id];
488 static void domain_update_iommu_coherency(struct dmar_domain *domain)
490 int i;
492 domain->iommu_coherency = 1;
494 i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
495 for (; i < g_num_of_iommus; ) {
496 if (!ecap_coherent(g_iommus[i]->ecap)) {
497 domain->iommu_coherency = 0;
498 break;
500 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
504 static void domain_update_iommu_snooping(struct dmar_domain *domain)
506 int i;
508 domain->iommu_snooping = 1;
510 i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
511 for (; i < g_num_of_iommus; ) {
512 if (!ecap_sc_support(g_iommus[i]->ecap)) {
513 domain->iommu_snooping = 0;
514 break;
516 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
/*
 * Some capabilities may be different across iommus: refresh the domain's
 * cached coherency and snooping flags, in that order.
 */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
	domain_update_iommu_coherency(domain);
	domain_update_iommu_snooping(domain);
}
527 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
529 struct dmar_drhd_unit *drhd = NULL;
530 int i;
532 for_each_drhd_unit(drhd) {
533 if (drhd->ignored)
534 continue;
535 if (segment != drhd->segment)
536 continue;
538 for (i = 0; i < drhd->devices_cnt; i++) {
539 if (drhd->devices[i] &&
540 drhd->devices[i]->bus->number == bus &&
541 drhd->devices[i]->devfn == devfn)
542 return drhd->iommu;
543 if (drhd->devices[i] &&
544 drhd->devices[i]->subordinate &&
545 drhd->devices[i]->subordinate->number <= bus &&
546 drhd->devices[i]->subordinate->subordinate >= bus)
547 return drhd->iommu;
550 if (drhd->include_all)
551 return drhd->iommu;
554 return NULL;
557 static void domain_flush_cache(struct dmar_domain *domain,
558 void *addr, int size)
560 if (!domain->iommu_coherency)
561 clflush_cache_range(addr, size);
564 /* Gets context entry for a given bus and devfn */
565 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
566 u8 bus, u8 devfn)
568 struct root_entry *root;
569 struct context_entry *context;
570 unsigned long phy_addr;
571 unsigned long flags;
573 spin_lock_irqsave(&iommu->lock, flags);
574 root = &iommu->root_entry[bus];
575 context = get_context_addr_from_root(root);
576 if (!context) {
577 context = (struct context_entry *)
578 alloc_pgtable_page(iommu->node);
579 if (!context) {
580 spin_unlock_irqrestore(&iommu->lock, flags);
581 return NULL;
583 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
584 phy_addr = virt_to_phys((void *)context);
585 set_root_value(root, phy_addr);
586 set_root_present(root);
587 __iommu_flush_cache(iommu, root, sizeof(*root));
589 spin_unlock_irqrestore(&iommu->lock, flags);
590 return &context[devfn];
593 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
595 struct root_entry *root;
596 struct context_entry *context;
597 int ret;
598 unsigned long flags;
600 spin_lock_irqsave(&iommu->lock, flags);
601 root = &iommu->root_entry[bus];
602 context = get_context_addr_from_root(root);
603 if (!context) {
604 ret = 0;
605 goto out;
607 ret = context_present(&context[devfn]);
608 out:
609 spin_unlock_irqrestore(&iommu->lock, flags);
610 return ret;
613 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
615 struct root_entry *root;
616 struct context_entry *context;
617 unsigned long flags;
619 spin_lock_irqsave(&iommu->lock, flags);
620 root = &iommu->root_entry[bus];
621 context = get_context_addr_from_root(root);
622 if (context) {
623 context_clear_entry(&context[devfn]);
624 __iommu_flush_cache(iommu, &context[devfn], \
625 sizeof(*context));
627 spin_unlock_irqrestore(&iommu->lock, flags);
630 static void free_context_table(struct intel_iommu *iommu)
632 struct root_entry *root;
633 int i;
634 unsigned long flags;
635 struct context_entry *context;
637 spin_lock_irqsave(&iommu->lock, flags);
638 if (!iommu->root_entry) {
639 goto out;
641 for (i = 0; i < ROOT_ENTRY_NR; i++) {
642 root = &iommu->root_entry[i];
643 context = get_context_addr_from_root(root);
644 if (context)
645 free_pgtable_page(context);
647 free_pgtable_page(iommu->root_entry);
648 iommu->root_entry = NULL;
649 out:
650 spin_unlock_irqrestore(&iommu->lock, flags);
653 /* page table handling */
654 #define LEVEL_STRIDE (9)
655 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
/* Number of page table levels for a given agaw: agaw 0 means 2 levels. */
static inline int agaw_to_level(int agaw)
{
	const int base_levels = 2;

	return agaw + base_levels;
}
662 static inline int agaw_to_width(int agaw)
664 return 30 + agaw * LEVEL_STRIDE;
668 static inline int width_to_agaw(int width)
670 return (width - 30) / LEVEL_STRIDE;
673 static inline unsigned int level_to_offset_bits(int level)
675 return (level - 1) * LEVEL_STRIDE;
678 static inline int pfn_level_offset(unsigned long pfn, int level)
680 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
/* Mask that clears the pfn bits below the index field of 'level'. */
static inline unsigned long level_mask(int level)
{
	unsigned int shift = level_to_offset_bits(level);

	return -1UL << shift;
}
/* Number of VT-d pages covered by one entry at 'level'. */
static inline unsigned long level_size(int level)
{
	unsigned int shift = level_to_offset_bits(level);

	return 1UL << shift;
}
/* Round 'pfn' up to the next multiple of level_size(level). */
static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	unsigned long span = level_size(level);

	return (pfn + span - 1) & level_mask(level);
}
698 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
699 unsigned long pfn)
701 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
702 struct dma_pte *parent, *pte = NULL;
703 int level = agaw_to_level(domain->agaw);
704 int offset;
706 BUG_ON(!domain->pgd);
707 BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
708 parent = domain->pgd;
710 while (level > 0) {
711 void *tmp_page;
713 offset = pfn_level_offset(pfn, level);
714 pte = &parent[offset];
715 if (level == 1)
716 break;
718 if (!dma_pte_present(pte)) {
719 uint64_t pteval;
721 tmp_page = alloc_pgtable_page(domain->nid);
723 if (!tmp_page)
724 return NULL;
726 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
727 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
728 if (cmpxchg64(&pte->val, 0ULL, pteval)) {
729 /* Someone else set it while we were thinking; use theirs. */
730 free_pgtable_page(tmp_page);
731 } else {
732 dma_pte_addr(pte);
733 domain_flush_cache(domain, pte, sizeof(*pte));
736 parent = phys_to_virt(dma_pte_addr(pte));
737 level--;
740 return pte;
743 /* return address's pte at specific level */
744 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
745 unsigned long pfn,
746 int level)
748 struct dma_pte *parent, *pte = NULL;
749 int total = agaw_to_level(domain->agaw);
750 int offset;
752 parent = domain->pgd;
753 while (level <= total) {
754 offset = pfn_level_offset(pfn, total);
755 pte = &parent[offset];
756 if (level == total)
757 return pte;
759 if (!dma_pte_present(pte))
760 break;
761 parent = phys_to_virt(dma_pte_addr(pte));
762 total--;
764 return NULL;
767 /* clear last level pte, a tlb flush should be followed */
768 static void dma_pte_clear_range(struct dmar_domain *domain,
769 unsigned long start_pfn,
770 unsigned long last_pfn)
772 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
773 struct dma_pte *first_pte, *pte;
775 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
776 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
777 BUG_ON(start_pfn > last_pfn);
779 /* we don't need lock here; nobody else touches the iova range */
780 do {
781 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
782 if (!pte) {
783 start_pfn = align_to_level(start_pfn + 1, 2);
784 continue;
786 do {
787 dma_clear_pte(pte);
788 start_pfn++;
789 pte++;
790 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
792 domain_flush_cache(domain, first_pte,
793 (void *)pte - (void *)first_pte);
795 } while (start_pfn && start_pfn <= last_pfn);
798 /* free page table pages. last level pte should already be cleared */
799 static void dma_pte_free_pagetable(struct dmar_domain *domain,
800 unsigned long start_pfn,
801 unsigned long last_pfn)
803 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
804 struct dma_pte *first_pte, *pte;
805 int total = agaw_to_level(domain->agaw);
806 int level;
807 unsigned long tmp;
809 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
810 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
811 BUG_ON(start_pfn > last_pfn);
813 /* We don't need lock here; nobody else touches the iova range */
814 level = 2;
815 while (level <= total) {
816 tmp = align_to_level(start_pfn, level);
818 /* If we can't even clear one PTE at this level, we're done */
819 if (tmp + level_size(level) - 1 > last_pfn)
820 return;
822 do {
823 first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
824 if (!pte) {
825 tmp = align_to_level(tmp + 1, level + 1);
826 continue;
828 do {
829 if (dma_pte_present(pte)) {
830 free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
831 dma_clear_pte(pte);
833 pte++;
834 tmp += level_size(level);
835 } while (!first_pte_in_page(pte) &&
836 tmp + level_size(level) - 1 <= last_pfn);
838 domain_flush_cache(domain, first_pte,
839 (void *)pte - (void *)first_pte);
841 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
842 level++;
844 /* free pgd */
845 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
846 free_pgtable_page(domain->pgd);
847 domain->pgd = NULL;
851 /* iommu handling */
852 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
854 struct root_entry *root;
855 unsigned long flags;
857 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
858 if (!root)
859 return -ENOMEM;
861 __iommu_flush_cache(iommu, root, ROOT_SIZE);
863 spin_lock_irqsave(&iommu->lock, flags);
864 iommu->root_entry = root;
865 spin_unlock_irqrestore(&iommu->lock, flags);
867 return 0;
870 static void iommu_set_root_entry(struct intel_iommu *iommu)
872 void *addr;
873 u32 sts;
874 unsigned long flag;
876 addr = iommu->root_entry;
878 spin_lock_irqsave(&iommu->register_lock, flag);
879 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
881 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
883 /* Make sure hardware complete it */
884 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
885 readl, (sts & DMA_GSTS_RTPS), sts);
887 spin_unlock_irqrestore(&iommu->register_lock, flag);
890 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
892 u32 val;
893 unsigned long flag;
895 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
896 return;
898 spin_lock_irqsave(&iommu->register_lock, flag);
899 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
901 /* Make sure hardware complete it */
902 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
903 readl, (!(val & DMA_GSTS_WBFS)), val);
905 spin_unlock_irqrestore(&iommu->register_lock, flag);
908 /* return value determine if we need a write buffer flush */
909 static void __iommu_flush_context(struct intel_iommu *iommu,
910 u16 did, u16 source_id, u8 function_mask,
911 u64 type)
913 u64 val = 0;
914 unsigned long flag;
916 switch (type) {
917 case DMA_CCMD_GLOBAL_INVL:
918 val = DMA_CCMD_GLOBAL_INVL;
919 break;
920 case DMA_CCMD_DOMAIN_INVL:
921 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
922 break;
923 case DMA_CCMD_DEVICE_INVL:
924 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
925 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
926 break;
927 default:
928 BUG();
930 val |= DMA_CCMD_ICC;
932 spin_lock_irqsave(&iommu->register_lock, flag);
933 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
935 /* Make sure hardware complete it */
936 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
937 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
939 spin_unlock_irqrestore(&iommu->register_lock, flag);
942 /* return value determine if we need a write buffer flush */
943 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
944 u64 addr, unsigned int size_order, u64 type)
946 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
947 u64 val = 0, val_iva = 0;
948 unsigned long flag;
950 switch (type) {
951 case DMA_TLB_GLOBAL_FLUSH:
952 /* global flush doesn't need set IVA_REG */
953 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
954 break;
955 case DMA_TLB_DSI_FLUSH:
956 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
957 break;
958 case DMA_TLB_PSI_FLUSH:
959 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
960 /* Note: always flush non-leaf currently */
961 val_iva = size_order | addr;
962 break;
963 default:
964 BUG();
966 /* Note: set drain read/write */
967 #if 0
969 * This is probably to be super secure.. Looks like we can
970 * ignore it without any impact.
972 if (cap_read_drain(iommu->cap))
973 val |= DMA_TLB_READ_DRAIN;
974 #endif
975 if (cap_write_drain(iommu->cap))
976 val |= DMA_TLB_WRITE_DRAIN;
978 spin_lock_irqsave(&iommu->register_lock, flag);
979 /* Note: Only uses first TLB reg currently */
980 if (val_iva)
981 dmar_writeq(iommu->reg + tlb_offset, val_iva);
982 dmar_writeq(iommu->reg + tlb_offset + 8, val);
984 /* Make sure hardware complete it */
985 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
986 dmar_readq, (!(val & DMA_TLB_IVT)), val);
988 spin_unlock_irqrestore(&iommu->register_lock, flag);
990 /* check IOTLB invalidation granularity */
991 if (DMA_TLB_IAIG(val) == 0)
992 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
993 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
994 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
995 (unsigned long long)DMA_TLB_IIRG(type),
996 (unsigned long long)DMA_TLB_IAIG(val));
999 static struct device_domain_info *iommu_support_dev_iotlb(
1000 struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1002 int found = 0;
1003 unsigned long flags;
1004 struct device_domain_info *info;
1005 struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1007 if (!ecap_dev_iotlb_support(iommu->ecap))
1008 return NULL;
1010 if (!iommu->qi)
1011 return NULL;
1013 spin_lock_irqsave(&device_domain_lock, flags);
1014 list_for_each_entry(info, &domain->devices, link)
1015 if (info->bus == bus && info->devfn == devfn) {
1016 found = 1;
1017 break;
1019 spin_unlock_irqrestore(&device_domain_lock, flags);
1021 if (!found || !info->dev)
1022 return NULL;
1024 if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1025 return NULL;
1027 if (!dmar_find_matched_atsr_unit(info->dev))
1028 return NULL;
1030 info->iommu = iommu;
1032 return info;
1035 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1037 if (!info)
1038 return;
1040 pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1043 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1045 if (!info->dev || !pci_ats_enabled(info->dev))
1046 return;
1048 pci_disable_ats(info->dev);
1051 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1052 u64 addr, unsigned mask)
1054 u16 sid, qdep;
1055 unsigned long flags;
1056 struct device_domain_info *info;
1058 spin_lock_irqsave(&device_domain_lock, flags);
1059 list_for_each_entry(info, &domain->devices, link) {
1060 if (!info->dev || !pci_ats_enabled(info->dev))
1061 continue;
1063 sid = info->bus << 8 | info->devfn;
1064 qdep = pci_ats_queue_depth(info->dev);
1065 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1067 spin_unlock_irqrestore(&device_domain_lock, flags);
1070 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1071 unsigned long pfn, unsigned int pages)
1073 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1074 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1076 BUG_ON(pages == 0);
1079 * Fallback to domain selective flush if no PSI support or the size is
1080 * too big.
1081 * PSI requires page size to be 2 ^ x, and the base address is naturally
1082 * aligned to the size
1084 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1085 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1086 DMA_TLB_DSI_FLUSH);
1087 else
1088 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1089 DMA_TLB_PSI_FLUSH);
1092 * In caching mode, domain ID 0 is reserved for non-present to present
1093 * mapping flush. Device IOTLB doesn't need to be flushed in this case.
1095 if (!cap_caching_mode(iommu->cap) || did)
1096 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1099 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1101 u32 pmen;
1102 unsigned long flags;
1104 spin_lock_irqsave(&iommu->register_lock, flags);
1105 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1106 pmen &= ~DMA_PMEN_EPM;
1107 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1109 /* wait for the protected region status bit to clear */
1110 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1111 readl, !(pmen & DMA_PMEN_PRS), pmen);
1113 spin_unlock_irqrestore(&iommu->register_lock, flags);
1116 static int iommu_enable_translation(struct intel_iommu *iommu)
1118 u32 sts;
1119 unsigned long flags;
1121 spin_lock_irqsave(&iommu->register_lock, flags);
1122 iommu->gcmd |= DMA_GCMD_TE;
1123 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1125 /* Make sure hardware complete it */
1126 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1127 readl, (sts & DMA_GSTS_TES), sts);
1129 spin_unlock_irqrestore(&iommu->register_lock, flags);
1130 return 0;
1133 static int iommu_disable_translation(struct intel_iommu *iommu)
1135 u32 sts;
1136 unsigned long flag;
1138 spin_lock_irqsave(&iommu->register_lock, flag);
1139 iommu->gcmd &= ~DMA_GCMD_TE;
1140 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1142 /* Make sure hardware complete it */
1143 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1144 readl, (!(sts & DMA_GSTS_TES)), sts);
1146 spin_unlock_irqrestore(&iommu->register_lock, flag);
1147 return 0;
1151 static int iommu_init_domains(struct intel_iommu *iommu)
1153 unsigned long ndomains;
1154 unsigned long nlongs;
1156 ndomains = cap_ndoms(iommu->cap);
1157 pr_debug("Number of Domains supportd <%ld>\n", ndomains);
1158 nlongs = BITS_TO_LONGS(ndomains);
1160 spin_lock_init(&iommu->lock);
1162 /* TBD: there might be 64K domains,
1163 * consider other allocation for future chip
1165 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1166 if (!iommu->domain_ids) {
1167 printk(KERN_ERR "Allocating domain id array failed\n");
1168 return -ENOMEM;
1170 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1171 GFP_KERNEL);
1172 if (!iommu->domains) {
1173 printk(KERN_ERR "Allocating domain array failed\n");
1174 return -ENOMEM;
1178 * if Caching mode is set, then invalid translations are tagged
1179 * with domainid 0. Hence we need to pre-allocate it.
1181 if (cap_caching_mode(iommu->cap))
1182 set_bit(0, iommu->domain_ids);
1183 return 0;
1187 static void domain_exit(struct dmar_domain *domain);
1188 static void vm_domain_exit(struct dmar_domain *domain);
1190 void free_dmar_iommu(struct intel_iommu *iommu)
1192 struct dmar_domain *domain;
1193 int i;
1194 unsigned long flags;
1196 if ((iommu->domains) && (iommu->domain_ids)) {
1197 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1198 for (; i < cap_ndoms(iommu->cap); ) {
1199 domain = iommu->domains[i];
1200 clear_bit(i, iommu->domain_ids);
1202 spin_lock_irqsave(&domain->iommu_lock, flags);
1203 if (--domain->iommu_count == 0) {
1204 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1205 vm_domain_exit(domain);
1206 else
1207 domain_exit(domain);
1209 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1211 i = find_next_bit(iommu->domain_ids,
1212 cap_ndoms(iommu->cap), i+1);
1216 if (iommu->gcmd & DMA_GCMD_TE)
1217 iommu_disable_translation(iommu);
1219 if (iommu->irq) {
1220 set_irq_data(iommu->irq, NULL);
1221 /* This will mask the irq */
1222 free_irq(iommu->irq, iommu);
1223 destroy_irq(iommu->irq);
1226 kfree(iommu->domains);
1227 kfree(iommu->domain_ids);
1229 g_iommus[iommu->seq_id] = NULL;
1231 /* if all iommus are freed, free g_iommus */
1232 for (i = 0; i < g_num_of_iommus; i++) {
1233 if (g_iommus[i])
1234 break;
1237 if (i == g_num_of_iommus)
1238 kfree(g_iommus);
1240 /* free context mapping */
1241 free_context_table(iommu);
1244 static struct dmar_domain *alloc_domain(void)
1246 struct dmar_domain *domain;
1248 domain = alloc_domain_mem();
1249 if (!domain)
1250 return NULL;
1252 domain->nid = -1;
1253 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1254 domain->flags = 0;
1256 return domain;
1259 static int iommu_attach_domain(struct dmar_domain *domain,
1260 struct intel_iommu *iommu)
1262 int num;
1263 unsigned long ndomains;
1264 unsigned long flags;
1266 ndomains = cap_ndoms(iommu->cap);
1268 spin_lock_irqsave(&iommu->lock, flags);
1270 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1271 if (num >= ndomains) {
1272 spin_unlock_irqrestore(&iommu->lock, flags);
1273 printk(KERN_ERR "IOMMU: no free domain ids\n");
1274 return -ENOMEM;
1277 domain->id = num;
1278 set_bit(num, iommu->domain_ids);
1279 set_bit(iommu->seq_id, &domain->iommu_bmp);
1280 iommu->domains[num] = domain;
1281 spin_unlock_irqrestore(&iommu->lock, flags);
1283 return 0;
1286 static void iommu_detach_domain(struct dmar_domain *domain,
1287 struct intel_iommu *iommu)
1289 unsigned long flags;
1290 int num, ndomains;
1291 int found = 0;
1293 spin_lock_irqsave(&iommu->lock, flags);
1294 ndomains = cap_ndoms(iommu->cap);
1295 num = find_first_bit(iommu->domain_ids, ndomains);
1296 for (; num < ndomains; ) {
1297 if (iommu->domains[num] == domain) {
1298 found = 1;
1299 break;
1301 num = find_next_bit(iommu->domain_ids,
1302 cap_ndoms(iommu->cap), num+1);
1305 if (found) {
1306 clear_bit(num, iommu->domain_ids);
1307 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1308 iommu->domains[num] = NULL;
1310 spin_unlock_irqrestore(&iommu->lock, flags);
1313 static struct iova_domain reserved_iova_list;
1314 static struct lock_class_key reserved_rbtree_key;
1316 static void dmar_init_reserved_ranges(void)
1318 struct pci_dev *pdev = NULL;
1319 struct iova *iova;
1320 int i;
1322 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1324 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1325 &reserved_rbtree_key);
1327 /* IOAPIC ranges shouldn't be accessed by DMA */
1328 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1329 IOVA_PFN(IOAPIC_RANGE_END));
1330 if (!iova)
1331 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1333 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1334 for_each_pci_dev(pdev) {
1335 struct resource *r;
1337 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1338 r = &pdev->resource[i];
1339 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1340 continue;
1341 iova = reserve_iova(&reserved_iova_list,
1342 IOVA_PFN(r->start),
1343 IOVA_PFN(r->end));
1344 if (!iova)
1345 printk(KERN_ERR "Reserve iova failed\n");
1351 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1353 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
/*
 * Round a guest address width up to the next value of the form
 * 12 + 9 * k (a 12-bit page offset plus whole 9-bit table levels),
 * capped at 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = gaw;

	if (rem != 0)
		agaw += 9 - rem;

	return agaw > 64 ? 64 : agaw;
}
1370 static int domain_init(struct dmar_domain *domain, int guest_width)
1372 struct intel_iommu *iommu;
1373 int adjust_width, agaw;
1374 unsigned long sagaw;
1376 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1377 spin_lock_init(&domain->iommu_lock);
1379 domain_reserve_special_ranges(domain);
1381 /* calculate AGAW */
1382 iommu = domain_get_iommu(domain);
1383 if (guest_width > cap_mgaw(iommu->cap))
1384 guest_width = cap_mgaw(iommu->cap);
1385 domain->gaw = guest_width;
1386 adjust_width = guestwidth_to_adjustwidth(guest_width);
1387 agaw = width_to_agaw(adjust_width);
1388 sagaw = cap_sagaw(iommu->cap);
1389 if (!test_bit(agaw, &sagaw)) {
1390 /* hardware doesn't support it, choose a bigger one */
1391 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1392 agaw = find_next_bit(&sagaw, 5, agaw);
1393 if (agaw >= 5)
1394 return -ENODEV;
1396 domain->agaw = agaw;
1397 INIT_LIST_HEAD(&domain->devices);
1399 if (ecap_coherent(iommu->ecap))
1400 domain->iommu_coherency = 1;
1401 else
1402 domain->iommu_coherency = 0;
1404 if (ecap_sc_support(iommu->ecap))
1405 domain->iommu_snooping = 1;
1406 else
1407 domain->iommu_snooping = 0;
1409 domain->iommu_count = 1;
1410 domain->nid = iommu->node;
1412 /* always allocate the top pgd */
1413 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1414 if (!domain->pgd)
1415 return -ENOMEM;
1416 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1417 return 0;
1420 static void domain_exit(struct dmar_domain *domain)
1422 struct dmar_drhd_unit *drhd;
1423 struct intel_iommu *iommu;
1425 /* Domain 0 is reserved, so dont process it */
1426 if (!domain)
1427 return;
1429 domain_remove_dev_info(domain);
1430 /* destroy iovas */
1431 put_iova_domain(&domain->iovad);
1433 /* clear ptes */
1434 dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1436 /* free page tables */
1437 dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1439 for_each_active_iommu(iommu, drhd)
1440 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1441 iommu_detach_domain(domain, iommu);
1443 free_domain_mem(domain);
1446 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1447 u8 bus, u8 devfn, int translation)
1449 struct context_entry *context;
1450 unsigned long flags;
1451 struct intel_iommu *iommu;
1452 struct dma_pte *pgd;
1453 unsigned long num;
1454 unsigned long ndomains;
1455 int id;
1456 int agaw;
1457 struct device_domain_info *info = NULL;
1459 pr_debug("Set context mapping for %02x:%02x.%d\n",
1460 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1462 BUG_ON(!domain->pgd);
1463 BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1464 translation != CONTEXT_TT_MULTI_LEVEL);
1466 iommu = device_to_iommu(segment, bus, devfn);
1467 if (!iommu)
1468 return -ENODEV;
1470 context = device_to_context_entry(iommu, bus, devfn);
1471 if (!context)
1472 return -ENOMEM;
1473 spin_lock_irqsave(&iommu->lock, flags);
1474 if (context_present(context)) {
1475 spin_unlock_irqrestore(&iommu->lock, flags);
1476 return 0;
1479 id = domain->id;
1480 pgd = domain->pgd;
1482 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1483 domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1484 int found = 0;
1486 /* find an available domain id for this device in iommu */
1487 ndomains = cap_ndoms(iommu->cap);
1488 num = find_first_bit(iommu->domain_ids, ndomains);
1489 for (; num < ndomains; ) {
1490 if (iommu->domains[num] == domain) {
1491 id = num;
1492 found = 1;
1493 break;
1495 num = find_next_bit(iommu->domain_ids,
1496 cap_ndoms(iommu->cap), num+1);
1499 if (found == 0) {
1500 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1501 if (num >= ndomains) {
1502 spin_unlock_irqrestore(&iommu->lock, flags);
1503 printk(KERN_ERR "IOMMU: no free domain ids\n");
1504 return -EFAULT;
1507 set_bit(num, iommu->domain_ids);
1508 iommu->domains[num] = domain;
1509 id = num;
1512 /* Skip top levels of page tables for
1513 * iommu which has less agaw than default.
1514 * Unnecessary for PT mode.
1516 if (translation != CONTEXT_TT_PASS_THROUGH) {
1517 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1518 pgd = phys_to_virt(dma_pte_addr(pgd));
1519 if (!dma_pte_present(pgd)) {
1520 spin_unlock_irqrestore(&iommu->lock, flags);
1521 return -ENOMEM;
1527 context_set_domain_id(context, id);
1529 if (translation != CONTEXT_TT_PASS_THROUGH) {
1530 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1531 translation = info ? CONTEXT_TT_DEV_IOTLB :
1532 CONTEXT_TT_MULTI_LEVEL;
1535 * In pass through mode, AW must be programmed to indicate the largest
1536 * AGAW value supported by hardware. And ASR is ignored by hardware.
1538 if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1539 context_set_address_width(context, iommu->msagaw);
1540 else {
1541 context_set_address_root(context, virt_to_phys(pgd));
1542 context_set_address_width(context, iommu->agaw);
1545 context_set_translation_type(context, translation);
1546 context_set_fault_enable(context);
1547 context_set_present(context);
1548 domain_flush_cache(domain, context, sizeof(*context));
1551 * It's a non-present to present mapping. If hardware doesn't cache
1552 * non-present entry we only need to flush the write-buffer. If the
1553 * _does_ cache non-present entries, then it does so in the special
1554 * domain #0, which we have to flush:
1556 if (cap_caching_mode(iommu->cap)) {
1557 iommu->flush.flush_context(iommu, 0,
1558 (((u16)bus) << 8) | devfn,
1559 DMA_CCMD_MASK_NOBIT,
1560 DMA_CCMD_DEVICE_INVL);
1561 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH);
1562 } else {
1563 iommu_flush_write_buffer(iommu);
1565 iommu_enable_dev_iotlb(info);
1566 spin_unlock_irqrestore(&iommu->lock, flags);
1568 spin_lock_irqsave(&domain->iommu_lock, flags);
1569 if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1570 domain->iommu_count++;
1571 if (domain->iommu_count == 1)
1572 domain->nid = iommu->node;
1573 domain_update_iommu_cap(domain);
1575 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1576 return 0;
1579 static int
1580 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1581 int translation)
1583 int ret;
1584 struct pci_dev *tmp, *parent;
1586 ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1587 pdev->bus->number, pdev->devfn,
1588 translation);
1589 if (ret)
1590 return ret;
1592 /* dependent device mapping */
1593 tmp = pci_find_upstream_pcie_bridge(pdev);
1594 if (!tmp)
1595 return 0;
1596 /* Secondary interface's bus number and devfn 0 */
1597 parent = pdev->bus->self;
1598 while (parent != tmp) {
1599 ret = domain_context_mapping_one(domain,
1600 pci_domain_nr(parent->bus),
1601 parent->bus->number,
1602 parent->devfn, translation);
1603 if (ret)
1604 return ret;
1605 parent = parent->bus->self;
1607 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1608 return domain_context_mapping_one(domain,
1609 pci_domain_nr(tmp->subordinate),
1610 tmp->subordinate->number, 0,
1611 translation);
1612 else /* this is a legacy PCI bridge */
1613 return domain_context_mapping_one(domain,
1614 pci_domain_nr(tmp->bus),
1615 tmp->bus->number,
1616 tmp->devfn,
1617 translation);
1620 static int domain_context_mapped(struct pci_dev *pdev)
1622 int ret;
1623 struct pci_dev *tmp, *parent;
1624 struct intel_iommu *iommu;
1626 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1627 pdev->devfn);
1628 if (!iommu)
1629 return -ENODEV;
1631 ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1632 if (!ret)
1633 return ret;
1634 /* dependent device mapping */
1635 tmp = pci_find_upstream_pcie_bridge(pdev);
1636 if (!tmp)
1637 return ret;
1638 /* Secondary interface's bus number and devfn 0 */
1639 parent = pdev->bus->self;
1640 while (parent != tmp) {
1641 ret = device_context_mapped(iommu, parent->bus->number,
1642 parent->devfn);
1643 if (!ret)
1644 return ret;
1645 parent = parent->bus->self;
1647 if (pci_is_pcie(tmp))
1648 return device_context_mapped(iommu, tmp->subordinate->number,
1650 else
1651 return device_context_mapped(iommu, tmp->bus->number,
1652 tmp->devfn);
1655 /* Returns a number of VTD pages, but aligned to MM page size */
1656 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1657 size_t size)
1659 host_addr &= ~PAGE_MASK;
1660 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1663 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1664 struct scatterlist *sg, unsigned long phys_pfn,
1665 unsigned long nr_pages, int prot)
1667 struct dma_pte *first_pte = NULL, *pte = NULL;
1668 phys_addr_t uninitialized_var(pteval);
1669 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1670 unsigned long sg_res;
1672 BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1674 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1675 return -EINVAL;
1677 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1679 if (sg)
1680 sg_res = 0;
1681 else {
1682 sg_res = nr_pages + 1;
1683 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1686 while (nr_pages--) {
1687 uint64_t tmp;
1689 if (!sg_res) {
1690 sg_res = aligned_nrpages(sg->offset, sg->length);
1691 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1692 sg->dma_length = sg->length;
1693 pteval = page_to_phys(sg_page(sg)) | prot;
1695 if (!pte) {
1696 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
1697 if (!pte)
1698 return -ENOMEM;
1700 /* We don't need lock here, nobody else
1701 * touches the iova range
1703 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1704 if (tmp) {
1705 static int dumps = 5;
1706 printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1707 iov_pfn, tmp, (unsigned long long)pteval);
1708 if (dumps) {
1709 dumps--;
1710 debug_dma_dump_mappings(NULL);
1712 WARN_ON(1);
1714 pte++;
1715 if (!nr_pages || first_pte_in_page(pte)) {
1716 domain_flush_cache(domain, first_pte,
1717 (void *)pte - (void *)first_pte);
1718 pte = NULL;
1720 iov_pfn++;
1721 pteval += VTD_PAGE_SIZE;
1722 sg_res--;
1723 if (!sg_res)
1724 sg = sg_next(sg);
1726 return 0;
/* Map the pages described by scatterlist @sg at @iov_pfn in @domain. */
static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				    struct scatterlist *sg, unsigned long nr_pages,
				    int prot)
{
	int ret;

	/* phys_pfn is unused when a scatterlist is supplied */
	ret = __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
	return ret;
}
1736 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1737 unsigned long phys_pfn, unsigned long nr_pages,
1738 int prot)
1740 return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1743 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1745 if (!iommu)
1746 return;
1748 clear_context_table(iommu, bus, devfn);
1749 iommu->flush.flush_context(iommu, 0, 0, 0,
1750 DMA_CCMD_GLOBAL_INVL);
1751 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1754 static void domain_remove_dev_info(struct dmar_domain *domain)
1756 struct device_domain_info *info;
1757 unsigned long flags;
1758 struct intel_iommu *iommu;
1760 spin_lock_irqsave(&device_domain_lock, flags);
1761 while (!list_empty(&domain->devices)) {
1762 info = list_entry(domain->devices.next,
1763 struct device_domain_info, link);
1764 list_del(&info->link);
1765 list_del(&info->global);
1766 if (info->dev)
1767 info->dev->dev.archdata.iommu = NULL;
1768 spin_unlock_irqrestore(&device_domain_lock, flags);
1770 iommu_disable_dev_iotlb(info);
1771 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1772 iommu_detach_dev(iommu, info->bus, info->devfn);
1773 free_devinfo_mem(info);
1775 spin_lock_irqsave(&device_domain_lock, flags);
1777 spin_unlock_irqrestore(&device_domain_lock, flags);
1781 * find_domain
1782 * Note: we use struct pci_dev->dev.archdata.iommu stores the info
1784 static struct dmar_domain *
1785 find_domain(struct pci_dev *pdev)
1787 struct device_domain_info *info;
1789 /* No lock here, assumes no domain exit in normal case */
1790 info = pdev->dev.archdata.iommu;
1791 if (info)
1792 return info->domain;
1793 return NULL;
1796 /* domain is initialized */
1797 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1799 struct dmar_domain *domain, *found = NULL;
1800 struct intel_iommu *iommu;
1801 struct dmar_drhd_unit *drhd;
1802 struct device_domain_info *info, *tmp;
1803 struct pci_dev *dev_tmp;
1804 unsigned long flags;
1805 int bus = 0, devfn = 0;
1806 int segment;
1807 int ret;
1809 domain = find_domain(pdev);
1810 if (domain)
1811 return domain;
1813 segment = pci_domain_nr(pdev->bus);
1815 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1816 if (dev_tmp) {
1817 if (pci_is_pcie(dev_tmp)) {
1818 bus = dev_tmp->subordinate->number;
1819 devfn = 0;
1820 } else {
1821 bus = dev_tmp->bus->number;
1822 devfn = dev_tmp->devfn;
1824 spin_lock_irqsave(&device_domain_lock, flags);
1825 list_for_each_entry(info, &device_domain_list, global) {
1826 if (info->segment == segment &&
1827 info->bus == bus && info->devfn == devfn) {
1828 found = info->domain;
1829 break;
1832 spin_unlock_irqrestore(&device_domain_lock, flags);
1833 /* pcie-pci bridge already has a domain, uses it */
1834 if (found) {
1835 domain = found;
1836 goto found_domain;
1840 domain = alloc_domain();
1841 if (!domain)
1842 goto error;
1844 /* Allocate new domain for the device */
1845 drhd = dmar_find_matched_drhd_unit(pdev);
1846 if (!drhd) {
1847 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1848 pci_name(pdev));
1849 return NULL;
1851 iommu = drhd->iommu;
1853 ret = iommu_attach_domain(domain, iommu);
1854 if (ret) {
1855 domain_exit(domain);
1856 goto error;
1859 if (domain_init(domain, gaw)) {
1860 domain_exit(domain);
1861 goto error;
1864 /* register pcie-to-pci device */
1865 if (dev_tmp) {
1866 info = alloc_devinfo_mem();
1867 if (!info) {
1868 domain_exit(domain);
1869 goto error;
1871 info->segment = segment;
1872 info->bus = bus;
1873 info->devfn = devfn;
1874 info->dev = NULL;
1875 info->domain = domain;
1876 /* This domain is shared by devices under p2p bridge */
1877 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1879 /* pcie-to-pci bridge already has a domain, uses it */
1880 found = NULL;
1881 spin_lock_irqsave(&device_domain_lock, flags);
1882 list_for_each_entry(tmp, &device_domain_list, global) {
1883 if (tmp->segment == segment &&
1884 tmp->bus == bus && tmp->devfn == devfn) {
1885 found = tmp->domain;
1886 break;
1889 if (found) {
1890 free_devinfo_mem(info);
1891 domain_exit(domain);
1892 domain = found;
1893 } else {
1894 list_add(&info->link, &domain->devices);
1895 list_add(&info->global, &device_domain_list);
1897 spin_unlock_irqrestore(&device_domain_lock, flags);
1900 found_domain:
1901 info = alloc_devinfo_mem();
1902 if (!info)
1903 goto error;
1904 info->segment = segment;
1905 info->bus = pdev->bus->number;
1906 info->devfn = pdev->devfn;
1907 info->dev = pdev;
1908 info->domain = domain;
1909 spin_lock_irqsave(&device_domain_lock, flags);
1910 /* somebody is fast */
1911 found = find_domain(pdev);
1912 if (found != NULL) {
1913 spin_unlock_irqrestore(&device_domain_lock, flags);
1914 if (found != domain) {
1915 domain_exit(domain);
1916 domain = found;
1918 free_devinfo_mem(info);
1919 return domain;
1921 list_add(&info->link, &domain->devices);
1922 list_add(&info->global, &device_domain_list);
1923 pdev->dev.archdata.iommu = info;
1924 spin_unlock_irqrestore(&device_domain_lock, flags);
1925 return domain;
1926 error:
1927 /* recheck it here, maybe others set it */
1928 return find_domain(pdev);
1931 static int iommu_identity_mapping;
1932 #define IDENTMAP_ALL 1
1933 #define IDENTMAP_GFX 2
1934 #define IDENTMAP_AZALIA 4
1936 static int iommu_domain_identity_map(struct dmar_domain *domain,
1937 unsigned long long start,
1938 unsigned long long end)
1940 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
1941 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
1943 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
1944 dma_to_mm_pfn(last_vpfn))) {
1945 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1946 return -ENOMEM;
1949 pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
1950 start, end, domain->id);
1952 * RMRR range might have overlap with physical memory range,
1953 * clear it first
1955 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
1957 return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
1958 last_vpfn - first_vpfn + 1,
1959 DMA_PTE_READ|DMA_PTE_WRITE);
1962 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1963 unsigned long long start,
1964 unsigned long long end)
1966 struct dmar_domain *domain;
1967 int ret;
1969 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1970 if (!domain)
1971 return -ENOMEM;
1973 /* For _hardware_ passthrough, don't bother. But for software
1974 passthrough, we do it anyway -- it may indicate a memory
1975 range which is reserved in E820, so which didn't get set
1976 up to start with in si_domain */
1977 if (domain == si_domain && hw_pass_through) {
1978 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
1979 pci_name(pdev), start, end);
1980 return 0;
1983 printk(KERN_INFO
1984 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1985 pci_name(pdev), start, end);
1987 if (end < start) {
1988 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
1989 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1990 dmi_get_system_info(DMI_BIOS_VENDOR),
1991 dmi_get_system_info(DMI_BIOS_VERSION),
1992 dmi_get_system_info(DMI_PRODUCT_VERSION));
1993 ret = -EIO;
1994 goto error;
1997 if (end >> agaw_to_width(domain->agaw)) {
1998 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
1999 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2000 agaw_to_width(domain->agaw),
2001 dmi_get_system_info(DMI_BIOS_VENDOR),
2002 dmi_get_system_info(DMI_BIOS_VERSION),
2003 dmi_get_system_info(DMI_PRODUCT_VERSION));
2004 ret = -EIO;
2005 goto error;
2008 ret = iommu_domain_identity_map(domain, start, end);
2009 if (ret)
2010 goto error;
2012 /* context entry init */
2013 ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2014 if (ret)
2015 goto error;
2017 return 0;
2019 error:
2020 domain_exit(domain);
2021 return ret;
2024 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2025 struct pci_dev *pdev)
2027 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2028 return 0;
2029 return iommu_prepare_identity_map(pdev, rmrr->base_address,
2030 rmrr->end_address + 1);
#ifdef CONFIG_DMAR_FLOPPY_WA
/*
 * Floppy workaround: give the ISA/LPC bridge an identity mapping of the
 * first 16MiB. Failure is logged but not fatal.
 */
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *pdev;
	int ret;

	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (!pdev)
		return;

	printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
	/* the end address is inclusive, hence the -1 */
	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);

	if (ret)
		printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
		       "floppy might not work\n");

	/* drop the reference taken by pci_get_class() */
	pci_dev_put(pdev);
}
#else
static inline void iommu_prepare_isa(void)
{
	return;
}
#endif /* !CONFIG_DMAR_FLOPPY_WA */
2058 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2060 static int __init si_domain_work_fn(unsigned long start_pfn,
2061 unsigned long end_pfn, void *datax)
2063 int *ret = datax;
2065 *ret = iommu_domain_identity_map(si_domain,
2066 (uint64_t)start_pfn << PAGE_SHIFT,
2067 (uint64_t)end_pfn << PAGE_SHIFT);
2068 return *ret;
2072 static int __init si_domain_init(int hw)
2074 struct dmar_drhd_unit *drhd;
2075 struct intel_iommu *iommu;
2076 int nid, ret = 0;
2078 si_domain = alloc_domain();
2079 if (!si_domain)
2080 return -EFAULT;
2082 pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2084 for_each_active_iommu(iommu, drhd) {
2085 ret = iommu_attach_domain(si_domain, iommu);
2086 if (ret) {
2087 domain_exit(si_domain);
2088 return -EFAULT;
2092 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2093 domain_exit(si_domain);
2094 return -EFAULT;
2097 si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2099 if (hw)
2100 return 0;
2102 for_each_online_node(nid) {
2103 work_with_active_regions(nid, si_domain_work_fn, &ret);
2104 if (ret)
2105 return ret;
2108 return 0;
2111 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2112 struct pci_dev *pdev);
2113 static int identity_mapping(struct pci_dev *pdev)
2115 struct device_domain_info *info;
2117 if (likely(!iommu_identity_mapping))
2118 return 0;
2121 list_for_each_entry(info, &si_domain->devices, link)
2122 if (info->dev == pdev)
2123 return 1;
2124 return 0;
2127 static int domain_add_dev_info(struct dmar_domain *domain,
2128 struct pci_dev *pdev,
2129 int translation)
2131 struct device_domain_info *info;
2132 unsigned long flags;
2133 int ret;
2135 info = alloc_devinfo_mem();
2136 if (!info)
2137 return -ENOMEM;
2139 ret = domain_context_mapping(domain, pdev, translation);
2140 if (ret) {
2141 free_devinfo_mem(info);
2142 return ret;
2145 info->segment = pci_domain_nr(pdev->bus);
2146 info->bus = pdev->bus->number;
2147 info->devfn = pdev->devfn;
2148 info->dev = pdev;
2149 info->domain = domain;
2151 spin_lock_irqsave(&device_domain_lock, flags);
2152 list_add(&info->link, &domain->devices);
2153 list_add(&info->global, &device_domain_list);
2154 pdev->dev.archdata.iommu = info;
2155 spin_unlock_irqrestore(&device_domain_lock, flags);
2157 return 0;
2160 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2162 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2163 return 1;
2165 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2166 return 1;
2168 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2169 return 0;
2172 * We want to start off with all devices in the 1:1 domain, and
2173 * take them out later if we find they can't access all of memory.
2175 * However, we can't do this for PCI devices behind bridges,
2176 * because all PCI devices behind the same bridge will end up
2177 * with the same source-id on their transactions.
2179 * Practically speaking, we can't change things around for these
2180 * devices at run-time, because we can't be sure there'll be no
2181 * DMA transactions in flight for any of their siblings.
2183 * So PCI devices (unless they're on the root bus) as well as
2184 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2185 * the 1:1 domain, just in _case_ one of their siblings turns out
2186 * not to be able to map all of memory.
2188 if (!pci_is_pcie(pdev)) {
2189 if (!pci_is_root_bus(pdev->bus))
2190 return 0;
2191 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2192 return 0;
2193 } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2194 return 0;
2197 * At boot time, we don't yet know if devices will be 64-bit capable.
2198 * Assume that they will -- if they turn out not to be, then we can
2199 * take them out of the 1:1 domain later.
2201 if (!startup)
2202 return pdev->dma_mask > DMA_BIT_MASK(32);
2204 return 1;
2207 static int __init iommu_prepare_static_identity_mapping(int hw)
2209 struct pci_dev *pdev = NULL;
2210 int ret;
2212 ret = si_domain_init(hw);
2213 if (ret)
2214 return -EFAULT;
2216 for_each_pci_dev(pdev) {
2217 if (iommu_should_identity_map(pdev, 1)) {
2218 printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2219 hw ? "hardware" : "software", pci_name(pdev));
2221 ret = domain_add_dev_info(si_domain, pdev,
2222 hw ? CONTEXT_TT_PASS_THROUGH :
2223 CONTEXT_TT_MULTI_LEVEL);
2224 if (ret)
2225 return ret;
2229 return 0;
2232 int __init init_dmars(void)
2234 struct dmar_drhd_unit *drhd;
2235 struct dmar_rmrr_unit *rmrr;
2236 struct pci_dev *pdev;
2237 struct intel_iommu *iommu;
2238 int i, ret;
2241 * for each drhd
2242 * allocate root
2243 * initialize and program root entry to not present
2244 * endfor
2246 for_each_drhd_unit(drhd) {
2247 g_num_of_iommus++;
2249 * lock not needed as this is only incremented in the single
2250 * threaded kernel __init code path all other access are read
2251 * only
2255 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2256 GFP_KERNEL);
2257 if (!g_iommus) {
2258 printk(KERN_ERR "Allocating global iommu array failed\n");
2259 ret = -ENOMEM;
2260 goto error;
2263 deferred_flush = kzalloc(g_num_of_iommus *
2264 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2265 if (!deferred_flush) {
2266 ret = -ENOMEM;
2267 goto error;
2270 for_each_drhd_unit(drhd) {
2271 if (drhd->ignored)
2272 continue;
2274 iommu = drhd->iommu;
2275 g_iommus[iommu->seq_id] = iommu;
2277 ret = iommu_init_domains(iommu);
2278 if (ret)
2279 goto error;
2282 * TBD:
2283 * we could share the same root & context tables
2284 * amoung all IOMMU's. Need to Split it later.
2286 ret = iommu_alloc_root_entry(iommu);
2287 if (ret) {
2288 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2289 goto error;
2291 if (!ecap_pass_through(iommu->ecap))
2292 hw_pass_through = 0;
2296 * Start from the sane iommu hardware state.
2298 for_each_drhd_unit(drhd) {
2299 if (drhd->ignored)
2300 continue;
2302 iommu = drhd->iommu;
2305 * If the queued invalidation is already initialized by us
2306 * (for example, while enabling interrupt-remapping) then
2307 * we got the things already rolling from a sane state.
2309 if (iommu->qi)
2310 continue;
2313 * Clear any previous faults.
2315 dmar_fault(-1, iommu);
2317 * Disable queued invalidation if supported and already enabled
2318 * before OS handover.
2320 dmar_disable_qi(iommu);
2323 for_each_drhd_unit(drhd) {
2324 if (drhd->ignored)
2325 continue;
2327 iommu = drhd->iommu;
2329 if (dmar_enable_qi(iommu)) {
2331 * Queued Invalidate not enabled, use Register Based
2332 * Invalidate
2334 iommu->flush.flush_context = __iommu_flush_context;
2335 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2336 printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2337 "invalidation\n",
2338 (unsigned long long)drhd->reg_base_addr);
2339 } else {
2340 iommu->flush.flush_context = qi_flush_context;
2341 iommu->flush.flush_iotlb = qi_flush_iotlb;
2342 printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2343 "invalidation\n",
2344 (unsigned long long)drhd->reg_base_addr);
2348 if (iommu_pass_through)
2349 iommu_identity_mapping |= IDENTMAP_ALL;
2351 #ifdef CONFIG_DMAR_BROKEN_GFX_WA
2352 iommu_identity_mapping |= IDENTMAP_GFX;
2353 #endif
2355 check_tylersburg_isoch();
2358 * If pass through is not set or not enabled, setup context entries for
2359 * identity mappings for rmrr, gfx, and isa and may fall back to static
2360 * identity mapping if iommu_identity_mapping is set.
2362 if (iommu_identity_mapping) {
2363 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2364 if (ret) {
2365 printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2366 goto error;
2370 * For each rmrr
2371 * for each dev attached to rmrr
2372 * do
2373 * locate drhd for dev, alloc domain for dev
2374 * allocate free domain
2375 * allocate page table entries for rmrr
2376 * if context not allocated for bus
2377 * allocate and init context
2378 * set present in root table for this bus
2379 * init context with domain, translation etc
2380 * endfor
2381 * endfor
2383 printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2384 for_each_rmrr_units(rmrr) {
2385 for (i = 0; i < rmrr->devices_cnt; i++) {
2386 pdev = rmrr->devices[i];
2388 * some BIOS lists non-exist devices in DMAR
2389 * table.
2391 if (!pdev)
2392 continue;
2393 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2394 if (ret)
2395 printk(KERN_ERR
2396 "IOMMU: mapping reserved region failed\n");
2400 iommu_prepare_isa();
2403 * for each drhd
2404 * enable fault log
2405 * global invalidate context cache
2406 * global invalidate iotlb
2407 * enable translation
2409 for_each_drhd_unit(drhd) {
2410 if (drhd->ignored)
2411 continue;
2412 iommu = drhd->iommu;
2414 iommu_flush_write_buffer(iommu);
2416 ret = dmar_set_interrupt(iommu);
2417 if (ret)
2418 goto error;
2420 iommu_set_root_entry(iommu);
2422 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2423 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2425 ret = iommu_enable_translation(iommu);
2426 if (ret)
2427 goto error;
2429 iommu_disable_protect_mem_regions(iommu);
2432 return 0;
2433 error:
2434 for_each_drhd_unit(drhd) {
2435 if (drhd->ignored)
2436 continue;
2437 iommu = drhd->iommu;
2438 free_iommu(iommu);
2440 kfree(g_iommus);
2441 return ret;
2444 /* This takes a number of _MM_ pages, not VTD pages */
2445 static struct iova *intel_alloc_iova(struct device *dev,
2446 struct dmar_domain *domain,
2447 unsigned long nrpages, uint64_t dma_mask)
2449 struct pci_dev *pdev = to_pci_dev(dev);
2450 struct iova *iova = NULL;
2452 /* Restrict dma_mask to the width that the iommu can handle */
2453 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2455 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2457 * First try to allocate an io virtual address in
2458 * DMA_BIT_MASK(32) and if that fails then try allocating
2459 * from higher range
2461 iova = alloc_iova(&domain->iovad, nrpages,
2462 IOVA_PFN(DMA_BIT_MASK(32)), 1);
2463 if (iova)
2464 return iova;
2466 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2467 if (unlikely(!iova)) {
2468 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2469 nrpages, pci_name(pdev));
2470 return NULL;
2473 return iova;
2476 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2478 struct dmar_domain *domain;
2479 int ret;
2481 domain = get_domain_for_dev(pdev,
2482 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2483 if (!domain) {
2484 printk(KERN_ERR
2485 "Allocating domain for %s failed", pci_name(pdev));
2486 return NULL;
2489 /* make sure context mapping is ok */
2490 if (unlikely(!domain_context_mapped(pdev))) {
2491 ret = domain_context_mapping(domain, pdev,
2492 CONTEXT_TT_MULTI_LEVEL);
2493 if (ret) {
2494 printk(KERN_ERR
2495 "Domain context map for %s failed",
2496 pci_name(pdev));
2497 return NULL;
2501 return domain;
2504 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2506 struct device_domain_info *info;
2508 /* No lock here, assumes no domain exit in normal case */
2509 info = dev->dev.archdata.iommu;
2510 if (likely(info))
2511 return info->domain;
2513 return __get_valid_domain_for_dev(dev);
2516 static int iommu_dummy(struct pci_dev *pdev)
2518 return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2521 /* Check if the pdev needs to go through non-identity map and unmap process.*/
2522 static int iommu_no_mapping(struct device *dev)
2524 struct pci_dev *pdev;
2525 int found;
2527 if (unlikely(dev->bus != &pci_bus_type))
2528 return 1;
2530 pdev = to_pci_dev(dev);
2531 if (iommu_dummy(pdev))
2532 return 1;
2534 if (!iommu_identity_mapping)
2535 return 0;
2537 found = identity_mapping(pdev);
2538 if (found) {
2539 if (iommu_should_identity_map(pdev, 0))
2540 return 1;
2541 else {
2543 * 32 bit DMA is removed from si_domain and fall back
2544 * to non-identity mapping.
2546 domain_remove_one_dev_info(si_domain, pdev);
2547 printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2548 pci_name(pdev));
2549 return 0;
2551 } else {
2553 * In case of a detached 64 bit DMA device from vm, the device
2554 * is put into si_domain for identity mapping.
2556 if (iommu_should_identity_map(pdev, 0)) {
2557 int ret;
2558 ret = domain_add_dev_info(si_domain, pdev,
2559 hw_pass_through ?
2560 CONTEXT_TT_PASS_THROUGH :
2561 CONTEXT_TT_MULTI_LEVEL);
2562 if (!ret) {
2563 printk(KERN_INFO "64bit %s uses identity mapping\n",
2564 pci_name(pdev));
2565 return 1;
2570 return 0;
2573 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2574 size_t size, int dir, u64 dma_mask)
2576 struct pci_dev *pdev = to_pci_dev(hwdev);
2577 struct dmar_domain *domain;
2578 phys_addr_t start_paddr;
2579 struct iova *iova;
2580 int prot = 0;
2581 int ret;
2582 struct intel_iommu *iommu;
2583 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2585 BUG_ON(dir == DMA_NONE);
2587 if (iommu_no_mapping(hwdev))
2588 return paddr;
2590 domain = get_valid_domain_for_dev(pdev);
2591 if (!domain)
2592 return 0;
2594 iommu = domain_get_iommu(domain);
2595 size = aligned_nrpages(paddr, size);
2597 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2598 pdev->dma_mask);
2599 if (!iova)
2600 goto error;
2603 * Check if DMAR supports zero-length reads on write only
2604 * mappings..
2606 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2607 !cap_zlr(iommu->cap))
2608 prot |= DMA_PTE_READ;
2609 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2610 prot |= DMA_PTE_WRITE;
2612 * paddr - (paddr + size) might be partial page, we should map the whole
2613 * page. Note: if two part of one page are separately mapped, we
2614 * might have two guest_addr mapping to the same host paddr, but this
2615 * is not a big problem
2617 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2618 mm_to_dma_pfn(paddr_pfn), size, prot);
2619 if (ret)
2620 goto error;
2622 /* it's a non-present to present mapping. Only flush if caching mode */
2623 if (cap_caching_mode(iommu->cap))
2624 iommu_flush_iotlb_psi(iommu, 0, mm_to_dma_pfn(iova->pfn_lo), size);
2625 else
2626 iommu_flush_write_buffer(iommu);
2628 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2629 start_paddr += paddr & ~PAGE_MASK;
2630 return start_paddr;
2632 error:
2633 if (iova)
2634 __free_iova(&domain->iovad, iova);
2635 printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
2636 pci_name(pdev), size, (unsigned long long)paddr, dir);
2637 return 0;
2640 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2641 unsigned long offset, size_t size,
2642 enum dma_data_direction dir,
2643 struct dma_attrs *attrs)
2645 return __intel_map_single(dev, page_to_phys(page) + offset, size,
2646 dir, to_pci_dev(dev)->dma_mask);
2649 static void flush_unmaps(void)
2651 int i, j;
2653 timer_on = 0;
2655 /* just flush them all */
2656 for (i = 0; i < g_num_of_iommus; i++) {
2657 struct intel_iommu *iommu = g_iommus[i];
2658 if (!iommu)
2659 continue;
2661 if (!deferred_flush[i].next)
2662 continue;
2664 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2665 DMA_TLB_GLOBAL_FLUSH);
2666 for (j = 0; j < deferred_flush[i].next; j++) {
2667 unsigned long mask;
2668 struct iova *iova = deferred_flush[i].iova[j];
2670 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2671 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2672 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2673 __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2675 deferred_flush[i].next = 0;
2678 list_size = 0;
2681 static void flush_unmaps_timeout(unsigned long data)
2683 unsigned long flags;
2685 spin_lock_irqsave(&async_umap_flush_lock, flags);
2686 flush_unmaps();
2687 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2690 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2692 unsigned long flags;
2693 int next, iommu_id;
2694 struct intel_iommu *iommu;
2696 spin_lock_irqsave(&async_umap_flush_lock, flags);
2697 if (list_size == HIGH_WATER_MARK)
2698 flush_unmaps();
2700 iommu = domain_get_iommu(dom);
2701 iommu_id = iommu->seq_id;
2703 next = deferred_flush[iommu_id].next;
2704 deferred_flush[iommu_id].domain[next] = dom;
2705 deferred_flush[iommu_id].iova[next] = iova;
2706 deferred_flush[iommu_id].next++;
2708 if (!timer_on) {
2709 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2710 timer_on = 1;
2712 list_size++;
2713 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2716 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2717 size_t size, enum dma_data_direction dir,
2718 struct dma_attrs *attrs)
2720 struct pci_dev *pdev = to_pci_dev(dev);
2721 struct dmar_domain *domain;
2722 unsigned long start_pfn, last_pfn;
2723 struct iova *iova;
2724 struct intel_iommu *iommu;
2726 if (iommu_no_mapping(dev))
2727 return;
2729 domain = find_domain(pdev);
2730 BUG_ON(!domain);
2732 iommu = domain_get_iommu(domain);
2734 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2735 if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2736 (unsigned long long)dev_addr))
2737 return;
2739 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2740 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2742 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2743 pci_name(pdev), start_pfn, last_pfn);
2745 /* clear the whole page */
2746 dma_pte_clear_range(domain, start_pfn, last_pfn);
2748 /* free page tables */
2749 dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2751 if (intel_iommu_strict) {
2752 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2753 last_pfn - start_pfn + 1);
2754 /* free iova */
2755 __free_iova(&domain->iovad, iova);
2756 } else {
2757 add_unmap(domain, iova);
2759 * queue up the release of the unmap to save the 1/6th of the
2760 * cpu used up by the iotlb flush operation...
2765 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2766 dma_addr_t *dma_handle, gfp_t flags)
2768 void *vaddr;
2769 int order;
2771 size = PAGE_ALIGN(size);
2772 order = get_order(size);
2774 if (!iommu_no_mapping(hwdev))
2775 flags &= ~(GFP_DMA | GFP_DMA32);
2776 else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2777 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2778 flags |= GFP_DMA;
2779 else
2780 flags |= GFP_DMA32;
2783 vaddr = (void *)__get_free_pages(flags, order);
2784 if (!vaddr)
2785 return NULL;
2786 memset(vaddr, 0, size);
2788 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2789 DMA_BIDIRECTIONAL,
2790 hwdev->coherent_dma_mask);
2791 if (*dma_handle)
2792 return vaddr;
2793 free_pages((unsigned long)vaddr, order);
2794 return NULL;
2797 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2798 dma_addr_t dma_handle)
2800 int order;
2802 size = PAGE_ALIGN(size);
2803 order = get_order(size);
2805 intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2806 free_pages((unsigned long)vaddr, order);
2809 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2810 int nelems, enum dma_data_direction dir,
2811 struct dma_attrs *attrs)
2813 struct pci_dev *pdev = to_pci_dev(hwdev);
2814 struct dmar_domain *domain;
2815 unsigned long start_pfn, last_pfn;
2816 struct iova *iova;
2817 struct intel_iommu *iommu;
2819 if (iommu_no_mapping(hwdev))
2820 return;
2822 domain = find_domain(pdev);
2823 BUG_ON(!domain);
2825 iommu = domain_get_iommu(domain);
2827 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2828 if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2829 (unsigned long long)sglist[0].dma_address))
2830 return;
2832 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2833 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2835 /* clear the whole page */
2836 dma_pte_clear_range(domain, start_pfn, last_pfn);
2838 /* free page tables */
2839 dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2841 if (intel_iommu_strict) {
2842 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2843 last_pfn - start_pfn + 1);
2844 /* free iova */
2845 __free_iova(&domain->iovad, iova);
2846 } else {
2847 add_unmap(domain, iova);
2849 * queue up the release of the unmap to save the 1/6th of the
2850 * cpu used up by the iotlb flush operation...
2855 static int intel_nontranslate_map_sg(struct device *hddev,
2856 struct scatterlist *sglist, int nelems, int dir)
2858 int i;
2859 struct scatterlist *sg;
2861 for_each_sg(sglist, sg, nelems, i) {
2862 BUG_ON(!sg_page(sg));
2863 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2864 sg->dma_length = sg->length;
2866 return nelems;
2869 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2870 enum dma_data_direction dir, struct dma_attrs *attrs)
2872 int i;
2873 struct pci_dev *pdev = to_pci_dev(hwdev);
2874 struct dmar_domain *domain;
2875 size_t size = 0;
2876 int prot = 0;
2877 size_t offset_pfn = 0;
2878 struct iova *iova = NULL;
2879 int ret;
2880 struct scatterlist *sg;
2881 unsigned long start_vpfn;
2882 struct intel_iommu *iommu;
2884 BUG_ON(dir == DMA_NONE);
2885 if (iommu_no_mapping(hwdev))
2886 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2888 domain = get_valid_domain_for_dev(pdev);
2889 if (!domain)
2890 return 0;
2892 iommu = domain_get_iommu(domain);
2894 for_each_sg(sglist, sg, nelems, i)
2895 size += aligned_nrpages(sg->offset, sg->length);
2897 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2898 pdev->dma_mask);
2899 if (!iova) {
2900 sglist->dma_length = 0;
2901 return 0;
2905 * Check if DMAR supports zero-length reads on write only
2906 * mappings..
2908 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2909 !cap_zlr(iommu->cap))
2910 prot |= DMA_PTE_READ;
2911 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2912 prot |= DMA_PTE_WRITE;
2914 start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
2916 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
2917 if (unlikely(ret)) {
2918 /* clear the page */
2919 dma_pte_clear_range(domain, start_vpfn,
2920 start_vpfn + size - 1);
2921 /* free page tables */
2922 dma_pte_free_pagetable(domain, start_vpfn,
2923 start_vpfn + size - 1);
2924 /* free iova */
2925 __free_iova(&domain->iovad, iova);
2926 return 0;
2929 /* it's a non-present to present mapping. Only flush if caching mode */
2930 if (cap_caching_mode(iommu->cap))
2931 iommu_flush_iotlb_psi(iommu, 0, start_vpfn, offset_pfn);
2932 else
2933 iommu_flush_write_buffer(iommu);
2935 return nelems;
2938 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2940 return !dma_addr;
2943 struct dma_map_ops intel_dma_ops = {
2944 .alloc_coherent = intel_alloc_coherent,
2945 .free_coherent = intel_free_coherent,
2946 .map_sg = intel_map_sg,
2947 .unmap_sg = intel_unmap_sg,
2948 .map_page = intel_map_page,
2949 .unmap_page = intel_unmap_page,
2950 .mapping_error = intel_mapping_error,
2953 static inline int iommu_domain_cache_init(void)
2955 int ret = 0;
2957 iommu_domain_cache = kmem_cache_create("iommu_domain",
2958 sizeof(struct dmar_domain),
2960 SLAB_HWCACHE_ALIGN,
2962 NULL);
2963 if (!iommu_domain_cache) {
2964 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2965 ret = -ENOMEM;
2968 return ret;
2971 static inline int iommu_devinfo_cache_init(void)
2973 int ret = 0;
2975 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2976 sizeof(struct device_domain_info),
2978 SLAB_HWCACHE_ALIGN,
2979 NULL);
2980 if (!iommu_devinfo_cache) {
2981 printk(KERN_ERR "Couldn't create devinfo cache\n");
2982 ret = -ENOMEM;
2985 return ret;
2988 static inline int iommu_iova_cache_init(void)
2990 int ret = 0;
2992 iommu_iova_cache = kmem_cache_create("iommu_iova",
2993 sizeof(struct iova),
2995 SLAB_HWCACHE_ALIGN,
2996 NULL);
2997 if (!iommu_iova_cache) {
2998 printk(KERN_ERR "Couldn't create iova cache\n");
2999 ret = -ENOMEM;
3002 return ret;
3005 static int __init iommu_init_mempool(void)
3007 int ret;
3008 ret = iommu_iova_cache_init();
3009 if (ret)
3010 return ret;
3012 ret = iommu_domain_cache_init();
3013 if (ret)
3014 goto domain_error;
3016 ret = iommu_devinfo_cache_init();
3017 if (!ret)
3018 return ret;
3020 kmem_cache_destroy(iommu_domain_cache);
3021 domain_error:
3022 kmem_cache_destroy(iommu_iova_cache);
3024 return -ENOMEM;
3027 static void __init iommu_exit_mempool(void)
3029 kmem_cache_destroy(iommu_devinfo_cache);
3030 kmem_cache_destroy(iommu_domain_cache);
3031 kmem_cache_destroy(iommu_iova_cache);
3035 static void __init init_no_remapping_devices(void)
3037 struct dmar_drhd_unit *drhd;
3039 for_each_drhd_unit(drhd) {
3040 if (!drhd->include_all) {
3041 int i;
3042 for (i = 0; i < drhd->devices_cnt; i++)
3043 if (drhd->devices[i] != NULL)
3044 break;
3045 /* ignore DMAR unit if no pci devices exist */
3046 if (i == drhd->devices_cnt)
3047 drhd->ignored = 1;
3051 if (dmar_map_gfx)
3052 return;
3054 for_each_drhd_unit(drhd) {
3055 int i;
3056 if (drhd->ignored || drhd->include_all)
3057 continue;
3059 for (i = 0; i < drhd->devices_cnt; i++)
3060 if (drhd->devices[i] &&
3061 !IS_GFX_DEVICE(drhd->devices[i]))
3062 break;
3064 if (i < drhd->devices_cnt)
3065 continue;
3067 /* bypass IOMMU if it is just for gfx devices */
3068 drhd->ignored = 1;
3069 for (i = 0; i < drhd->devices_cnt; i++) {
3070 if (!drhd->devices[i])
3071 continue;
3072 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3077 #ifdef CONFIG_SUSPEND
3078 static int init_iommu_hw(void)
3080 struct dmar_drhd_unit *drhd;
3081 struct intel_iommu *iommu = NULL;
3083 for_each_active_iommu(iommu, drhd)
3084 if (iommu->qi)
3085 dmar_reenable_qi(iommu);
3087 for_each_active_iommu(iommu, drhd) {
3088 iommu_flush_write_buffer(iommu);
3090 iommu_set_root_entry(iommu);
3092 iommu->flush.flush_context(iommu, 0, 0, 0,
3093 DMA_CCMD_GLOBAL_INVL);
3094 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3095 DMA_TLB_GLOBAL_FLUSH);
3096 iommu_enable_translation(iommu);
3097 iommu_disable_protect_mem_regions(iommu);
3100 return 0;
3103 static void iommu_flush_all(void)
3105 struct dmar_drhd_unit *drhd;
3106 struct intel_iommu *iommu;
3108 for_each_active_iommu(iommu, drhd) {
3109 iommu->flush.flush_context(iommu, 0, 0, 0,
3110 DMA_CCMD_GLOBAL_INVL);
3111 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3112 DMA_TLB_GLOBAL_FLUSH);
3116 static int iommu_suspend(struct sys_device *dev, pm_message_t state)
3118 struct dmar_drhd_unit *drhd;
3119 struct intel_iommu *iommu = NULL;
3120 unsigned long flag;
3122 for_each_active_iommu(iommu, drhd) {
3123 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3124 GFP_ATOMIC);
3125 if (!iommu->iommu_state)
3126 goto nomem;
3129 iommu_flush_all();
3131 for_each_active_iommu(iommu, drhd) {
3132 iommu_disable_translation(iommu);
3134 spin_lock_irqsave(&iommu->register_lock, flag);
3136 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3137 readl(iommu->reg + DMAR_FECTL_REG);
3138 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3139 readl(iommu->reg + DMAR_FEDATA_REG);
3140 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3141 readl(iommu->reg + DMAR_FEADDR_REG);
3142 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3143 readl(iommu->reg + DMAR_FEUADDR_REG);
3145 spin_unlock_irqrestore(&iommu->register_lock, flag);
3147 return 0;
3149 nomem:
3150 for_each_active_iommu(iommu, drhd)
3151 kfree(iommu->iommu_state);
3153 return -ENOMEM;
3156 static int iommu_resume(struct sys_device *dev)
3158 struct dmar_drhd_unit *drhd;
3159 struct intel_iommu *iommu = NULL;
3160 unsigned long flag;
3162 if (init_iommu_hw()) {
3163 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3164 return -EIO;
3167 for_each_active_iommu(iommu, drhd) {
3169 spin_lock_irqsave(&iommu->register_lock, flag);
3171 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3172 iommu->reg + DMAR_FECTL_REG);
3173 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3174 iommu->reg + DMAR_FEDATA_REG);
3175 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3176 iommu->reg + DMAR_FEADDR_REG);
3177 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3178 iommu->reg + DMAR_FEUADDR_REG);
3180 spin_unlock_irqrestore(&iommu->register_lock, flag);
3183 for_each_active_iommu(iommu, drhd)
3184 kfree(iommu->iommu_state);
3186 return 0;
3189 static struct sysdev_class iommu_sysclass = {
3190 .name = "iommu",
3191 .resume = iommu_resume,
3192 .suspend = iommu_suspend,
3195 static struct sys_device device_iommu = {
3196 .cls = &iommu_sysclass,
3199 static int __init init_iommu_sysfs(void)
3201 int error;
3203 error = sysdev_class_register(&iommu_sysclass);
3204 if (error)
3205 return error;
3207 error = sysdev_register(&device_iommu);
3208 if (error)
3209 sysdev_class_unregister(&iommu_sysclass);
3211 return error;
3214 #else
3215 static int __init init_iommu_sysfs(void)
3217 return 0;
#endif /* CONFIG_SUSPEND */
3222 * Here we only respond to action of unbound device from driver.
3224 * Added device is not attached to its DMAR domain here yet. That will happen
3225 * when mapping the device to iova.
3227 static int device_notifier(struct notifier_block *nb,
3228 unsigned long action, void *data)
3230 struct device *dev = data;
3231 struct pci_dev *pdev = to_pci_dev(dev);
3232 struct dmar_domain *domain;
3234 if (iommu_no_mapping(dev))
3235 return 0;
3237 domain = find_domain(pdev);
3238 if (!domain)
3239 return 0;
3241 if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through)
3242 domain_remove_one_dev_info(domain, pdev);
3244 return 0;
3247 static struct notifier_block device_nb = {
3248 .notifier_call = device_notifier,
3251 int __init intel_iommu_init(void)
3253 int ret = 0;
3254 int force_on = 0;
3256 /* VT-d is required for a TXT/tboot launch, so enforce that */
3257 force_on = tboot_force_iommu();
3259 if (dmar_table_init()) {
3260 if (force_on)
3261 panic("tboot: Failed to initialize DMAR table\n");
3262 return -ENODEV;
3265 if (dmar_dev_scope_init()) {
3266 if (force_on)
3267 panic("tboot: Failed to initialize DMAR device scope\n");
3268 return -ENODEV;
3272 * Check the need for DMA-remapping initialization now.
3273 * Above initialization will also be used by Interrupt-remapping.
3275 if (no_iommu || dmar_disabled)
3276 return -ENODEV;
3278 iommu_init_mempool();
3279 dmar_init_reserved_ranges();
3281 init_no_remapping_devices();
3283 ret = init_dmars();
3284 if (ret) {
3285 if (force_on)
3286 panic("tboot: Failed to initialize DMARs\n");
3287 printk(KERN_ERR "IOMMU: dmar init failed\n");
3288 put_iova_domain(&reserved_iova_list);
3289 iommu_exit_mempool();
3290 return ret;
3292 printk(KERN_INFO
3293 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3295 init_timer(&unmap_timer);
3296 #ifdef CONFIG_SWIOTLB
3297 swiotlb = 0;
3298 #endif
3299 dma_ops = &intel_dma_ops;
3301 init_iommu_sysfs();
3303 register_iommu(&intel_iommu_ops);
3305 bus_register_notifier(&pci_bus_type, &device_nb);
3307 return 0;
3310 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3311 struct pci_dev *pdev)
3313 struct pci_dev *tmp, *parent;
3315 if (!iommu || !pdev)
3316 return;
3318 /* dependent device detach */
3319 tmp = pci_find_upstream_pcie_bridge(pdev);
3320 /* Secondary interface's bus number and devfn 0 */
3321 if (tmp) {
3322 parent = pdev->bus->self;
3323 while (parent != tmp) {
3324 iommu_detach_dev(iommu, parent->bus->number,
3325 parent->devfn);
3326 parent = parent->bus->self;
3328 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3329 iommu_detach_dev(iommu,
3330 tmp->subordinate->number, 0);
3331 else /* this is a legacy PCI bridge */
3332 iommu_detach_dev(iommu, tmp->bus->number,
3333 tmp->devfn);
3337 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3338 struct pci_dev *pdev)
3340 struct device_domain_info *info;
3341 struct intel_iommu *iommu;
3342 unsigned long flags;
3343 int found = 0;
3344 struct list_head *entry, *tmp;
3346 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3347 pdev->devfn);
3348 if (!iommu)
3349 return;
3351 spin_lock_irqsave(&device_domain_lock, flags);
3352 list_for_each_safe(entry, tmp, &domain->devices) {
3353 info = list_entry(entry, struct device_domain_info, link);
3354 /* No need to compare PCI domain; it has to be the same */
3355 if (info->bus == pdev->bus->number &&
3356 info->devfn == pdev->devfn) {
3357 list_del(&info->link);
3358 list_del(&info->global);
3359 if (info->dev)
3360 info->dev->dev.archdata.iommu = NULL;
3361 spin_unlock_irqrestore(&device_domain_lock, flags);
3363 iommu_disable_dev_iotlb(info);
3364 iommu_detach_dev(iommu, info->bus, info->devfn);
3365 iommu_detach_dependent_devices(iommu, pdev);
3366 free_devinfo_mem(info);
3368 spin_lock_irqsave(&device_domain_lock, flags);
3370 if (found)
3371 break;
3372 else
3373 continue;
3376 /* if there is no other devices under the same iommu
3377 * owned by this domain, clear this iommu in iommu_bmp
3378 * update iommu count and coherency
3380 if (iommu == device_to_iommu(info->segment, info->bus,
3381 info->devfn))
3382 found = 1;
3385 if (found == 0) {
3386 unsigned long tmp_flags;
3387 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3388 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3389 domain->iommu_count--;
3390 domain_update_iommu_cap(domain);
3391 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3394 spin_unlock_irqrestore(&device_domain_lock, flags);
3397 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3399 struct device_domain_info *info;
3400 struct intel_iommu *iommu;
3401 unsigned long flags1, flags2;
3403 spin_lock_irqsave(&device_domain_lock, flags1);
3404 while (!list_empty(&domain->devices)) {
3405 info = list_entry(domain->devices.next,
3406 struct device_domain_info, link);
3407 list_del(&info->link);
3408 list_del(&info->global);
3409 if (info->dev)
3410 info->dev->dev.archdata.iommu = NULL;
3412 spin_unlock_irqrestore(&device_domain_lock, flags1);
3414 iommu_disable_dev_iotlb(info);
3415 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3416 iommu_detach_dev(iommu, info->bus, info->devfn);
3417 iommu_detach_dependent_devices(iommu, info->dev);
3419 /* clear this iommu in iommu_bmp, update iommu count
3420 * and capabilities
3422 spin_lock_irqsave(&domain->iommu_lock, flags2);
3423 if (test_and_clear_bit(iommu->seq_id,
3424 &domain->iommu_bmp)) {
3425 domain->iommu_count--;
3426 domain_update_iommu_cap(domain);
3428 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3430 free_devinfo_mem(info);
3431 spin_lock_irqsave(&device_domain_lock, flags1);
3433 spin_unlock_irqrestore(&device_domain_lock, flags1);
/*
 * Next id handed out to a virtual machine domain by
 * iommu_alloc_vm_domain(); it won't be set in context.
 */
static unsigned long vm_domid;
3439 static int vm_domain_min_agaw(struct dmar_domain *domain)
3441 int i;
3442 int min_agaw = domain->agaw;
3444 i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
3445 for (; i < g_num_of_iommus; ) {
3446 if (min_agaw > g_iommus[i]->agaw)
3447 min_agaw = g_iommus[i]->agaw;
3449 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
3452 return min_agaw;
3455 static struct dmar_domain *iommu_alloc_vm_domain(void)
3457 struct dmar_domain *domain;
3459 domain = alloc_domain_mem();
3460 if (!domain)
3461 return NULL;
3463 domain->id = vm_domid++;
3464 domain->nid = -1;
3465 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3466 domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3468 return domain;
3471 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3473 int adjust_width;
3475 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3476 spin_lock_init(&domain->iommu_lock);
3478 domain_reserve_special_ranges(domain);
3480 /* calculate AGAW */
3481 domain->gaw = guest_width;
3482 adjust_width = guestwidth_to_adjustwidth(guest_width);
3483 domain->agaw = width_to_agaw(adjust_width);
3485 INIT_LIST_HEAD(&domain->devices);
3487 domain->iommu_count = 0;
3488 domain->iommu_coherency = 0;
3489 domain->iommu_snooping = 0;
3490 domain->max_addr = 0;
3491 domain->nid = -1;
3493 /* always allocate the top pgd */
3494 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3495 if (!domain->pgd)
3496 return -ENOMEM;
3497 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3498 return 0;
3501 static void iommu_free_vm_domain(struct dmar_domain *domain)
3503 unsigned long flags;
3504 struct dmar_drhd_unit *drhd;
3505 struct intel_iommu *iommu;
3506 unsigned long i;
3507 unsigned long ndomains;
3509 for_each_drhd_unit(drhd) {
3510 if (drhd->ignored)
3511 continue;
3512 iommu = drhd->iommu;
3514 ndomains = cap_ndoms(iommu->cap);
3515 i = find_first_bit(iommu->domain_ids, ndomains);
3516 for (; i < ndomains; ) {
3517 if (iommu->domains[i] == domain) {
3518 spin_lock_irqsave(&iommu->lock, flags);
3519 clear_bit(i, iommu->domain_ids);
3520 iommu->domains[i] = NULL;
3521 spin_unlock_irqrestore(&iommu->lock, flags);
3522 break;
3524 i = find_next_bit(iommu->domain_ids, ndomains, i+1);
3529 static void vm_domain_exit(struct dmar_domain *domain)
3531 /* Domain 0 is reserved, so dont process it */
3532 if (!domain)
3533 return;
3535 vm_domain_remove_all_dev_info(domain);
3536 /* destroy iovas */
3537 put_iova_domain(&domain->iovad);
3539 /* clear ptes */
3540 dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3542 /* free page tables */
3543 dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3545 iommu_free_vm_domain(domain);
3546 free_domain_mem(domain);
3549 static int intel_iommu_domain_init(struct iommu_domain *domain)
3551 struct dmar_domain *dmar_domain;
3553 dmar_domain = iommu_alloc_vm_domain();
3554 if (!dmar_domain) {
3555 printk(KERN_ERR
3556 "intel_iommu_domain_init: dmar_domain == NULL\n");
3557 return -ENOMEM;
3559 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3560 printk(KERN_ERR
3561 "intel_iommu_domain_init() failed\n");
3562 vm_domain_exit(dmar_domain);
3563 return -ENOMEM;
3565 domain->priv = dmar_domain;
3567 return 0;
3570 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3572 struct dmar_domain *dmar_domain = domain->priv;
3574 domain->priv = NULL;
3575 vm_domain_exit(dmar_domain);
3578 static int intel_iommu_attach_device(struct iommu_domain *domain,
3579 struct device *dev)
3581 struct dmar_domain *dmar_domain = domain->priv;
3582 struct pci_dev *pdev = to_pci_dev(dev);
3583 struct intel_iommu *iommu;
3584 int addr_width;
3585 u64 end;
3587 /* normally pdev is not mapped */
3588 if (unlikely(domain_context_mapped(pdev))) {
3589 struct dmar_domain *old_domain;
3591 old_domain = find_domain(pdev);
3592 if (old_domain) {
3593 if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3594 dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3595 domain_remove_one_dev_info(old_domain, pdev);
3596 else
3597 domain_remove_dev_info(old_domain);
3601 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3602 pdev->devfn);
3603 if (!iommu)
3604 return -ENODEV;
3606 /* check if this iommu agaw is sufficient for max mapped address */
3607 addr_width = agaw_to_width(iommu->agaw);
3608 end = DOMAIN_MAX_ADDR(addr_width);
3609 end = end & VTD_PAGE_MASK;
3610 if (end < dmar_domain->max_addr) {
3611 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3612 "sufficient for the mapped address (%llx)\n",
3613 __func__, iommu->agaw, dmar_domain->max_addr);
3614 return -EFAULT;
3617 return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3620 static void intel_iommu_detach_device(struct iommu_domain *domain,
3621 struct device *dev)
3623 struct dmar_domain *dmar_domain = domain->priv;
3624 struct pci_dev *pdev = to_pci_dev(dev);
3626 domain_remove_one_dev_info(dmar_domain, pdev);
3629 static int intel_iommu_map_range(struct iommu_domain *domain,
3630 unsigned long iova, phys_addr_t hpa,
3631 size_t size, int iommu_prot)
3633 struct dmar_domain *dmar_domain = domain->priv;
3634 u64 max_addr;
3635 int addr_width;
3636 int prot = 0;
3637 int ret;
3639 if (iommu_prot & IOMMU_READ)
3640 prot |= DMA_PTE_READ;
3641 if (iommu_prot & IOMMU_WRITE)
3642 prot |= DMA_PTE_WRITE;
3643 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3644 prot |= DMA_PTE_SNP;
3646 max_addr = iova + size;
3647 if (dmar_domain->max_addr < max_addr) {
3648 int min_agaw;
3649 u64 end;
3651 /* check if minimum agaw is sufficient for mapped address */
3652 min_agaw = vm_domain_min_agaw(dmar_domain);
3653 addr_width = agaw_to_width(min_agaw);
3654 end = DOMAIN_MAX_ADDR(addr_width);
3655 end = end & VTD_PAGE_MASK;
3656 if (end < max_addr) {
3657 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3658 "sufficient for the mapped address (%llx)\n",
3659 __func__, min_agaw, max_addr);
3660 return -EFAULT;
3662 dmar_domain->max_addr = max_addr;
3664 /* Round up size to next multiple of PAGE_SIZE, if it and
3665 the low bits of hpa would take us onto the next page */
3666 size = aligned_nrpages(hpa, size);
3667 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3668 hpa >> VTD_PAGE_SHIFT, size, prot);
3669 return ret;
3672 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3673 unsigned long iova, size_t size)
3675 struct dmar_domain *dmar_domain = domain->priv;
3677 if (!size)
3678 return;
3680 dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
3681 (iova + size - 1) >> VTD_PAGE_SHIFT);
3683 if (dmar_domain->max_addr == iova + size)
3684 dmar_domain->max_addr = iova;
3687 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3688 unsigned long iova)
3690 struct dmar_domain *dmar_domain = domain->priv;
3691 struct dma_pte *pte;
3692 u64 phys = 0;
3694 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
3695 if (pte)
3696 phys = dma_pte_addr(pte);
3698 return phys;
3701 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3702 unsigned long cap)
3704 struct dmar_domain *dmar_domain = domain->priv;
3706 if (cap == IOMMU_CAP_CACHE_COHERENCY)
3707 return dmar_domain->iommu_snooping;
3709 return 0;
3712 static struct iommu_ops intel_iommu_ops = {
3713 .domain_init = intel_iommu_domain_init,
3714 .domain_destroy = intel_iommu_domain_destroy,
3715 .attach_dev = intel_iommu_attach_device,
3716 .detach_dev = intel_iommu_detach_device,
3717 .map = intel_iommu_map_range,
3718 .unmap = intel_iommu_unmap_range,
3719 .iova_to_phys = intel_iommu_iova_to_phys,
3720 .domain_has_cap = intel_iommu_domain_has_cap,
3723 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3726 * Mobile 4 Series Chipset neglects to set RWBF capability,
3727 * but needs it:
3729 printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3730 rwbf_quirk = 1;
3733 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
3735 /* On Tylersburg chipsets, some BIOSes have been known to enable the
3736 ISOCH DMAR unit for the Azalia sound device, but not give it any
3737 TLB entries, which causes it to deadlock. Check for that. We do
3738 this in a function called from init_dmars(), instead of in a PCI
3739 quirk, because we don't want to print the obnoxious "BIOS broken"
3740 message if VT-d is actually disabled.
3742 static void __init check_tylersburg_isoch(void)
3744 struct pci_dev *pdev;
3745 uint32_t vtisochctrl;
3747 /* If there's no Azalia in the system anyway, forget it. */
3748 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
3749 if (!pdev)
3750 return;
3751 pci_dev_put(pdev);
3753 /* System Management Registers. Might be hidden, in which case
3754 we can't do the sanity check. But that's OK, because the
3755 known-broken BIOSes _don't_ actually hide it, so far. */
3756 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
3757 if (!pdev)
3758 return;
3760 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
3761 pci_dev_put(pdev);
3762 return;
3765 pci_dev_put(pdev);
3767 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
3768 if (vtisochctrl & 1)
3769 return;
3771 /* Drop all bits other than the number of TLB entries */
3772 vtisochctrl &= 0x1c;
3774 /* If we have the recommended number of TLB entries (16), fine. */
3775 if (vtisochctrl == 0x10)
3776 return;
3778 /* Zero TLB entries? You get to ride the short bus to school. */
3779 if (!vtisochctrl) {
3780 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
3781 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3782 dmi_get_system_info(DMI_BIOS_VENDOR),
3783 dmi_get_system_info(DMI_BIOS_VERSION),
3784 dmi_get_system_info(DMI_PRODUCT_VERSION));
3785 iommu_identity_mapping |= IDENTMAP_AZALIA;
3786 return;
3789 printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
3790 vtisochctrl);