drivers/iommu/intel-iommu.c [linux-2.6/next.git]
1 /*
2 * Copyright (c) 2006, Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 * Author: Fenghua Yu <fenghua.yu@intel.com>
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/syscore_ops.h>
40 #include <linux/tboot.h>
41 #include <linux/dmi.h>
42 #include <linux/pci-ats.h>
43 #include <asm/cacheflush.h>
44 #include <asm/iommu.h>
46 #define ROOT_SIZE VTD_PAGE_SIZE
47 #define CONTEXT_SIZE VTD_PAGE_SIZE
49 #define IS_BRIDGE_HOST_DEVICE(pdev) \
50 ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
51 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
52 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
53 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
55 #define IOAPIC_RANGE_START (0xfee00000)
56 #define IOAPIC_RANGE_END (0xfeefffff)
57 #define IOVA_START_ADDR (0x1000)
59 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
61 #define MAX_AGAW_WIDTH 64
63 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
64 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
66 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
67 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
68 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
69 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
70 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
72 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
73 #define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
74 #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64))
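/*
 * Worked example (illustrative only, not used by the driver): with
 * VTD_PAGE_SHIFT == 12 and the default gaw of 48 bits,
 * __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1, i.e. the last addressable
 * 4KiB DMA page frame, and DOMAIN_MAX_ADDR(48) == 2^48 - 4096.
 * Likewise DMA_32BIT_PFN == IOVA_PFN(0xffffffff) == 0xfffff, the upper
 * bound used when allocating IOVAs below 4GiB.
 */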
76 /* page table handling */
77 #define LEVEL_STRIDE (9)
78 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
80 static inline int agaw_to_level(int agaw)
82 return agaw + 2;
85 static inline int agaw_to_width(int agaw)
87 return 30 + agaw * LEVEL_STRIDE;
90 static inline int width_to_agaw(int width)
92 return (width - 30) / LEVEL_STRIDE;
95 static inline unsigned int level_to_offset_bits(int level)
97 return (level - 1) * LEVEL_STRIDE;
100 static inline int pfn_level_offset(unsigned long pfn, int level)
102 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
105 static inline unsigned long level_mask(int level)
107 return -1UL << level_to_offset_bits(level);
110 static inline unsigned long level_size(int level)
112 return 1UL << level_to_offset_bits(level);
115 static inline unsigned long align_to_level(unsigned long pfn, int level)
117 return (pfn + level_size(level) - 1) & level_mask(level);
120 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
122 return 1 << ((lvl - 1) * LEVEL_STRIDE);
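/*
 * Worked example (illustrative only): with LEVEL_STRIDE == 9, an agaw of 2
 * gives agaw_to_width(2) == 48 bits and agaw_to_level(2) == 4 page-table
 * levels. Level N then indexes bits [9*(N-1), 9*N-1] of the DMA pfn, i.e.
 * pfn_level_offset(pfn, N) == (pfn >> (9 * (N - 1))) & 0x1ff, and
 * lvl_to_nr_pages(2) == 512 4KiB pages (one 2MiB superpage).
 */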
125 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
126 are never going to work. */
127 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
129 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
132 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
134 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
136 static inline unsigned long page_to_dma_pfn(struct page *pg)
138 return mm_to_dma_pfn(page_to_pfn(pg));
140 static inline unsigned long virt_to_dma_pfn(void *p)
142 return page_to_dma_pfn(virt_to_page(p));
145 /* global iommu list, set NULL for ignored DMAR units */
146 static struct intel_iommu **g_iommus;
148 static void __init check_tylersburg_isoch(void);
149 static int rwbf_quirk;
152 * set to 1 to panic the kernel if VT-d can't be successfully enabled
153 * (used when the kernel is launched w/ TXT)
155 static int force_on = 0;
158 * 0: Present
159 * 1-11: Reserved
160 * 12-63: Context Ptr (12 - (haw-1))
161 * 64-127: Reserved
163 struct root_entry {
164 u64 val;
165 u64 rsvd1;
167 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
168 static inline bool root_present(struct root_entry *root)
170 return (root->val & 1);
172 static inline void set_root_present(struct root_entry *root)
174 root->val |= 1;
176 static inline void set_root_value(struct root_entry *root, unsigned long value)
178 root->val |= value & VTD_PAGE_MASK;
181 static inline struct context_entry *
182 get_context_addr_from_root(struct root_entry *root)
184 return (struct context_entry *)
185 (root_present(root)?phys_to_virt(
186 root->val & VTD_PAGE_MASK) :
187 NULL);
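/*
 * Illustrative note (not driver code): a programmed root entry is simply the
 * physical address of the bus's context-entry table with the present bit set,
 * i.e. root->val == (context_table_phys & VTD_PAGE_MASK) | 1, which is what
 * set_root_value() followed by set_root_present() above produce.
 */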
191 * low 64 bits:
192 * 0: present
193 * 1: fault processing disable
194 * 2-3: translation type
195 * 12-63: address space root
196 * high 64 bits:
197 * 0-2: address width
198 * 3-6: avail
199 * 8-23: domain id
201 struct context_entry {
202 u64 lo;
203 u64 hi;
206 static inline bool context_present(struct context_entry *context)
208 return (context->lo & 1);
210 static inline void context_set_present(struct context_entry *context)
212 context->lo |= 1;
215 static inline void context_set_fault_enable(struct context_entry *context)
217 context->lo &= (((u64)-1) << 2) | 1;
220 static inline void context_set_translation_type(struct context_entry *context,
221 unsigned long value)
223 context->lo &= (((u64)-1) << 4) | 3;
224 context->lo |= (value & 3) << 2;
227 static inline void context_set_address_root(struct context_entry *context,
228 unsigned long value)
230 context->lo |= value & VTD_PAGE_MASK;
233 static inline void context_set_address_width(struct context_entry *context,
234 unsigned long value)
236 context->hi |= value & 7;
239 static inline void context_set_domain_id(struct context_entry *context,
240 unsigned long value)
242 context->hi |= (value & ((1 << 16) - 1)) << 8;
245 static inline void context_clear_entry(struct context_entry *context)
247 context->lo = 0;
248 context->hi = 0;
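/*
 * Illustrative sketch (assumes a freshly cleared entry; 'ce' is a
 * hypothetical pointer name): programming a context entry for ordinary
 * multi-level translation boils down to
 *
 *	context_clear_entry(ce);
 *	context_set_domain_id(ce, did);
 *	context_set_address_width(ce, agaw);
 *	context_set_address_root(ce, virt_to_phys(pgd));
 *	context_set_translation_type(ce, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(ce);
 *	context_set_present(ce);
 *
 * which is essentially the sequence domain_context_mapping_one() performs
 * further down in this file (minus the initial clear, since it only
 * programs entries that are not yet present).
 */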
252 * 0: readable
253 * 1: writable
254 * 2-6: reserved
255 * 7: super page
256 * 8-10: available
257 * 11: snoop behavior
258 * 12-63: Host physical address
260 struct dma_pte {
261 u64 val;
264 static inline void dma_clear_pte(struct dma_pte *pte)
266 pte->val = 0;
269 static inline void dma_set_pte_readable(struct dma_pte *pte)
271 pte->val |= DMA_PTE_READ;
274 static inline void dma_set_pte_writable(struct dma_pte *pte)
276 pte->val |= DMA_PTE_WRITE;
279 static inline void dma_set_pte_snp(struct dma_pte *pte)
281 pte->val |= DMA_PTE_SNP;
284 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
286 pte->val = (pte->val & ~3) | (prot & 3);
289 static inline u64 dma_pte_addr(struct dma_pte *pte)
291 #ifdef CONFIG_64BIT
292 return pte->val & VTD_PAGE_MASK;
293 #else
294 /* Must have a full atomic 64-bit read */
295 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
296 #endif
299 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
301 pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
304 static inline bool dma_pte_present(struct dma_pte *pte)
306 return (pte->val & 3) != 0;
309 static inline int first_pte_in_page(struct dma_pte *pte)
311 return !((unsigned long)pte & ~VTD_PAGE_MASK);
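/*
 * Illustrative note (not driver code): a leaf PTE is just the 4KiB-aligned
 * host physical address ORed with the permission/attribute bits above, e.g.
 * pte->val == (hpa & VTD_PAGE_MASK) | DMA_PTE_READ | DMA_PTE_WRITE,
 * optionally with DMA_PTE_SNP when the IOMMU supports snoop control and
 * DMA_PTE_LARGE_PAGE for superpage leaves. dma_pte_present() only checks
 * the two low permission bits.
 */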
315 * This domain is a statically identity-mapped domain.
316 * 1. This domain creates a static 1:1 mapping to all usable memory.
317 * 2. It maps to each iommu if successful.
318 * 3. Each iommu maps to this domain if successful.
320 static struct dmar_domain *si_domain;
321 static int hw_pass_through = 1;
323 /* devices under the same p2p bridge are owned in one domain */
324 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
326 /* domain represents a virtual machine; more than one device
327 * across iommus may be owned by one domain, e.g. a kvm guest.
329 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
331 /* si_domain contains multiple devices */
332 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 2)
334 struct dmar_domain {
335 int id; /* domain id */
336 int nid; /* node id */
337 unsigned long iommu_bmp; /* bitmap of iommus this domain uses*/
339 struct list_head devices; /* all devices' list */
340 struct iova_domain iovad; /* iova's that belong to this domain */
342 struct dma_pte *pgd; /* virtual address */
343 int gaw; /* max guest address width */
345 /* adjusted guest address width, 0 is level 2 30-bit */
346 int agaw;
348 int flags; /* flags to find out type of domain */
350 int iommu_coherency;/* indicate coherency of iommu access */
351 int iommu_snooping; /* indicate snooping control feature*/
352 int iommu_count; /* reference count of iommu */
353 int iommu_superpage;/* Level of superpages supported:
354 0 == 4KiB (no superpages), 1 == 2MiB,
355 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
356 spinlock_t iommu_lock; /* protect iommu set in domain */
357 u64 max_addr; /* maximum mapped address */
360 /* PCI domain-device relationship */
361 struct device_domain_info {
362 struct list_head link; /* link to domain siblings */
363 struct list_head global; /* link to global list */
364 int segment; /* PCI domain */
365 u8 bus; /* PCI bus number */
366 u8 devfn; /* PCI devfn number */
367 struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
368 struct intel_iommu *iommu; /* IOMMU used by this device */
369 struct dmar_domain *domain; /* pointer to domain */
372 static void flush_unmaps_timeout(unsigned long data);
374 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
376 #define HIGH_WATER_MARK 250
377 struct deferred_flush_tables {
378 int next;
379 struct iova *iova[HIGH_WATER_MARK];
380 struct dmar_domain *domain[HIGH_WATER_MARK];
383 static struct deferred_flush_tables *deferred_flush;
385 /* bitmap for indexing intel_iommus */
386 static int g_num_of_iommus;
388 static DEFINE_SPINLOCK(async_umap_flush_lock);
389 static LIST_HEAD(unmaps_to_do);
391 static int timer_on;
392 static long list_size;
394 static void domain_remove_dev_info(struct dmar_domain *domain);
396 #ifdef CONFIG_DMAR_DEFAULT_ON
397 int dmar_disabled = 0;
398 #else
399 int dmar_disabled = 1;
400 #endif /*CONFIG_DMAR_DEFAULT_ON*/
402 static int dmar_map_gfx = 1;
403 static int dmar_forcedac;
404 static int intel_iommu_strict;
405 static int intel_iommu_superpage = 1;
407 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
408 static DEFINE_SPINLOCK(device_domain_lock);
409 static LIST_HEAD(device_domain_list);
411 static struct iommu_ops intel_iommu_ops;
413 static int __init intel_iommu_setup(char *str)
415 if (!str)
416 return -EINVAL;
417 while (*str) {
418 if (!strncmp(str, "on", 2)) {
419 dmar_disabled = 0;
420 printk(KERN_INFO "Intel-IOMMU: enabled\n");
421 } else if (!strncmp(str, "off", 3)) {
422 dmar_disabled = 1;
423 printk(KERN_INFO "Intel-IOMMU: disabled\n");
424 } else if (!strncmp(str, "igfx_off", 8)) {
425 dmar_map_gfx = 0;
426 printk(KERN_INFO
427 "Intel-IOMMU: disable GFX device mapping\n");
428 } else if (!strncmp(str, "forcedac", 8)) {
429 printk(KERN_INFO
430 "Intel-IOMMU: Forcing DAC for PCI devices\n");
431 dmar_forcedac = 1;
432 } else if (!strncmp(str, "strict", 6)) {
433 printk(KERN_INFO
434 "Intel-IOMMU: disable batched IOTLB flush\n");
435 intel_iommu_strict = 1;
436 } else if (!strncmp(str, "sp_off", 6)) {
437 printk(KERN_INFO
438 "Intel-IOMMU: disable supported super page\n");
439 intel_iommu_superpage = 0;
442 str += strcspn(str, ",");
443 while (*str == ',')
444 str++;
446 return 0;
448 __setup("intel_iommu=", intel_iommu_setup);
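/*
 * Usage note (illustrative): the options parsed above are comma separated
 * on the kernel command line, e.g.
 *
 *	intel_iommu=on,strict,sp_off
 *
 * enables the IOMMU, disables batched IOTLB flushing and disables
 * superpage support, while intel_iommu=off disables DMA remapping entirely.
 */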
450 static struct kmem_cache *iommu_domain_cache;
451 static struct kmem_cache *iommu_devinfo_cache;
452 static struct kmem_cache *iommu_iova_cache;
454 static inline void *alloc_pgtable_page(int node)
456 struct page *page;
457 void *vaddr = NULL;
459 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
460 if (page)
461 vaddr = page_address(page);
462 return vaddr;
465 static inline void free_pgtable_page(void *vaddr)
467 free_page((unsigned long)vaddr);
470 static inline void *alloc_domain_mem(void)
472 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
475 static void free_domain_mem(void *vaddr)
477 kmem_cache_free(iommu_domain_cache, vaddr);
480 static inline void * alloc_devinfo_mem(void)
482 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
485 static inline void free_devinfo_mem(void *vaddr)
487 kmem_cache_free(iommu_devinfo_cache, vaddr);
490 struct iova *alloc_iova_mem(void)
492 return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
495 void free_iova_mem(struct iova *iova)
497 kmem_cache_free(iommu_iova_cache, iova);
501 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
503 unsigned long sagaw;
504 int agaw = -1;
506 sagaw = cap_sagaw(iommu->cap);
507 for (agaw = width_to_agaw(max_gaw);
508 agaw >= 0; agaw--) {
509 if (test_bit(agaw, &sagaw))
510 break;
513 return agaw;
517 * Calculate max SAGAW for each iommu.
519 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
521 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
525 * calculate agaw for each iommu.
526 * "SAGAW" may be different across iommus; use a default agaw, and
527 * fall back to a smaller supported agaw for iommus that don't support the default agaw.
529 int iommu_calculate_agaw(struct intel_iommu *iommu)
531 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
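/*
 * Worked example (illustrative only): if an IOMMU reports SAGAW with only
 * bit 2 set (4-level, i.e. 48-bit, tables), then for the default domain
 * width of 48 bits width_to_agaw(48) == 2, the bit is set, and
 * iommu_calculate_agaw() returns 2. If the hardware only supported 3-level
 * tables (bit 1), the loop above would step down and return 1, i.e. a
 * 39-bit adjusted guest address width.
 */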
534 /* This function only returns a single iommu in a domain */
535 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
537 int iommu_id;
539 /* si_domain and vm domain should not get here. */
540 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
541 BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
543 iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
544 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
545 return NULL;
547 return g_iommus[iommu_id];
550 static void domain_update_iommu_coherency(struct dmar_domain *domain)
552 int i;
554 domain->iommu_coherency = 1;
556 for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
557 if (!ecap_coherent(g_iommus[i]->ecap)) {
558 domain->iommu_coherency = 0;
559 break;
564 static void domain_update_iommu_snooping(struct dmar_domain *domain)
566 int i;
568 domain->iommu_snooping = 1;
570 for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
571 if (!ecap_sc_support(g_iommus[i]->ecap)) {
572 domain->iommu_snooping = 0;
573 break;
578 static void domain_update_iommu_superpage(struct dmar_domain *domain)
580 int i, mask = 0xf;
582 if (!intel_iommu_superpage) {
583 domain->iommu_superpage = 0;
584 return;
587 domain->iommu_superpage = 4; /* 1TiB */
589 for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
590 mask &= cap_super_page_val(g_iommus[i]->cap);
591 if (!mask) {
592 break;
595 domain->iommu_superpage = fls(mask);
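/*
 * Worked example (illustrative only): cap_super_page_val() is a bitmask
 * with bit 0 = 2MiB and bit 1 = 1GiB support. If one IOMMU in the domain
 * reports 0x3 and another only 0x1, the intersection above leaves
 * mask == 0x1 and fls(0x1) == 1, so the domain is limited to 2MiB
 * superpages. If any IOMMU reports 0, mask becomes 0 and superpages are
 * disabled for the whole domain.
 */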
598 /* Some capabilities may be different across iommus */
599 static void domain_update_iommu_cap(struct dmar_domain *domain)
601 domain_update_iommu_coherency(domain);
602 domain_update_iommu_snooping(domain);
603 domain_update_iommu_superpage(domain);
606 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
608 struct dmar_drhd_unit *drhd = NULL;
609 int i;
611 for_each_drhd_unit(drhd) {
612 if (drhd->ignored)
613 continue;
614 if (segment != drhd->segment)
615 continue;
617 for (i = 0; i < drhd->devices_cnt; i++) {
618 if (drhd->devices[i] &&
619 drhd->devices[i]->bus->number == bus &&
620 drhd->devices[i]->devfn == devfn)
621 return drhd->iommu;
622 if (drhd->devices[i] &&
623 drhd->devices[i]->subordinate &&
624 drhd->devices[i]->subordinate->number <= bus &&
625 drhd->devices[i]->subordinate->subordinate >= bus)
626 return drhd->iommu;
629 if (drhd->include_all)
630 return drhd->iommu;
633 return NULL;
636 #ifdef CONFIG_HOTPLUG
637 struct dev_dmaru {
638 struct list_head list;
639 void *dmaru;
640 int index;
641 int segment;
642 unsigned char bus;
643 unsigned int devfn;
646 static int
647 save_dev_dmaru(int segment, unsigned char bus, unsigned int devfn,
648 void *dmaru, int index, struct list_head *lh)
650 struct dev_dmaru *m;
652 m = kzalloc(sizeof(*m), GFP_KERNEL);
653 if (!m)
654 return -ENOMEM;
656 m->segment = segment;
657 m->bus = bus;
658 m->devfn = devfn;
659 m->dmaru = dmaru;
660 m->index = index;
662 list_add(&m->list, lh);
664 return 0;
667 static void
668 *get_dev_dmaru(int segment, unsigned char bus, unsigned int devfn,
669 int *index, struct list_head *lh)
671 struct dev_dmaru *m;
672 void *dmaru = NULL;
674 list_for_each_entry(m, lh, list) {
675 if (m->segment == segment &&
676 m->bus == bus && m->devfn == devfn) {
677 *index = m->index;
678 dmaru = m->dmaru;
679 list_del(&m->list);
680 kfree(m);
681 break;
685 return dmaru;
688 static LIST_HEAD(saved_dev_drhd_list);
690 static void remove_dev_from_drhd(struct pci_dev *dev)
692 struct dmar_drhd_unit *drhd = NULL;
693 int segment = pci_domain_nr(dev->bus);
694 int i;
696 for_each_drhd_unit(drhd) {
697 if (drhd->ignored)
698 continue;
699 if (segment != drhd->segment)
700 continue;
702 for (i = 0; i < drhd->devices_cnt; i++) {
703 if (drhd->devices[i] == dev) {
704 /* save it first if it is in the drhd */
705 save_dev_dmaru(segment, dev->bus->number,
706 dev->devfn, drhd, i,
707 &saved_dev_drhd_list);
708 /* always remove it */
709 drhd->devices[i] = NULL;
710 return;
716 static void restore_dev_to_drhd(struct pci_dev *dev)
718 struct dmar_drhd_unit *drhd = NULL;
719 int i;
721 /* find the stored drhd */
722 drhd = get_dev_dmaru(pci_domain_nr(dev->bus), dev->bus->number,
723 dev->devfn, &i, &saved_dev_drhd_list);
724 /* restore that into drhd */
725 if (drhd)
726 drhd->devices[i] = dev;
728 #else
729 static void remove_dev_from_drhd(struct pci_dev *dev)
733 static void restore_dev_to_drhd(struct pci_dev *dev)
736 #endif
738 #if defined(CONFIG_DMAR) && defined(CONFIG_HOTPLUG)
739 static LIST_HEAD(saved_dev_atsr_list);
741 static void remove_dev_from_atsr(struct pci_dev *dev)
743 struct dmar_atsr_unit *atsr = NULL;
744 int segment = pci_domain_nr(dev->bus);
745 int i;
747 for_each_atsr_unit(atsr) {
748 if (segment != atsr->segment)
749 continue;
751 for (i = 0; i < atsr->devices_cnt; i++) {
752 if (atsr->devices[i] == dev) {
753 /* save it first if it is in the atsr */
754 save_dev_dmaru(segment, dev->bus->number,
755 dev->devfn, atsr, i,
756 &saved_dev_atsr_list);
757 /* always remove it */
758 atsr->devices[i] = NULL;
759 return;
765 static void restore_dev_to_atsr(struct pci_dev *dev)
767 struct dmar_atsr_unit *atsr = NULL;
768 int i;
770 /* find the stored atsr */
771 atsr = get_dev_dmaru(pci_domain_nr(dev->bus), dev->bus->number,
772 dev->devfn, &i, &saved_dev_atsr_list);
773 /* restore that into atsr */
774 if (atsr)
775 atsr->devices[i] = dev;
777 #else
778 static void remove_dev_from_atsr(struct pci_dev *dev)
782 static void restore_dev_to_atsr(struct pci_dev *dev)
785 #endif
787 static void domain_flush_cache(struct dmar_domain *domain,
788 void *addr, int size)
790 if (!domain->iommu_coherency)
791 clflush_cache_range(addr, size);
794 /* Gets context entry for a given bus and devfn */
795 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
796 u8 bus, u8 devfn)
798 struct root_entry *root;
799 struct context_entry *context;
800 unsigned long phy_addr;
801 unsigned long flags;
803 spin_lock_irqsave(&iommu->lock, flags);
804 root = &iommu->root_entry[bus];
805 context = get_context_addr_from_root(root);
806 if (!context) {
807 context = (struct context_entry *)
808 alloc_pgtable_page(iommu->node);
809 if (!context) {
810 spin_unlock_irqrestore(&iommu->lock, flags);
811 return NULL;
813 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
814 phy_addr = virt_to_phys((void *)context);
815 set_root_value(root, phy_addr);
816 set_root_present(root);
817 __iommu_flush_cache(iommu, root, sizeof(*root));
819 spin_unlock_irqrestore(&iommu->lock, flags);
820 return &context[devfn];
823 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
825 struct root_entry *root;
826 struct context_entry *context;
827 int ret;
828 unsigned long flags;
830 spin_lock_irqsave(&iommu->lock, flags);
831 root = &iommu->root_entry[bus];
832 context = get_context_addr_from_root(root);
833 if (!context) {
834 ret = 0;
835 goto out;
837 ret = context_present(&context[devfn]);
838 out:
839 spin_unlock_irqrestore(&iommu->lock, flags);
840 return ret;
843 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
845 struct root_entry *root;
846 struct context_entry *context;
847 unsigned long flags;
849 spin_lock_irqsave(&iommu->lock, flags);
850 root = &iommu->root_entry[bus];
851 context = get_context_addr_from_root(root);
852 if (context) {
853 context_clear_entry(&context[devfn]);
854 __iommu_flush_cache(iommu, &context[devfn], \
855 sizeof(*context));
857 spin_unlock_irqrestore(&iommu->lock, flags);
860 static void free_context_table(struct intel_iommu *iommu)
862 struct root_entry *root;
863 int i;
864 unsigned long flags;
865 struct context_entry *context;
867 spin_lock_irqsave(&iommu->lock, flags);
868 if (!iommu->root_entry) {
869 goto out;
871 for (i = 0; i < ROOT_ENTRY_NR; i++) {
872 root = &iommu->root_entry[i];
873 context = get_context_addr_from_root(root);
874 if (context)
875 free_pgtable_page(context);
877 free_pgtable_page(iommu->root_entry);
878 iommu->root_entry = NULL;
879 out:
880 spin_unlock_irqrestore(&iommu->lock, flags);
883 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
884 unsigned long pfn, int large_level)
886 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
887 struct dma_pte *parent, *pte = NULL;
888 int level = agaw_to_level(domain->agaw);
889 int offset, target_level;
891 BUG_ON(!domain->pgd);
892 BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
893 parent = domain->pgd;
895 /* Search pte */
896 if (!large_level)
897 target_level = 1;
898 else
899 target_level = large_level;
901 while (level > 0) {
902 void *tmp_page;
904 offset = pfn_level_offset(pfn, level);
905 pte = &parent[offset];
906 if (!large_level && (pte->val & DMA_PTE_LARGE_PAGE))
907 break;
908 if (level == target_level)
909 break;
911 if (!dma_pte_present(pte)) {
912 uint64_t pteval;
914 tmp_page = alloc_pgtable_page(domain->nid);
916 if (!tmp_page)
917 return NULL;
919 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
920 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
921 if (cmpxchg64(&pte->val, 0ULL, pteval)) {
922 /* Someone else set it while we were thinking; use theirs. */
923 free_pgtable_page(tmp_page);
924 } else {
925 dma_pte_addr(pte);
926 domain_flush_cache(domain, pte, sizeof(*pte));
929 parent = phys_to_virt(dma_pte_addr(pte));
930 level--;
933 return pte;
937 /* return the address's pte at a specific level */
938 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
939 unsigned long pfn,
940 int level, int *large_page)
942 struct dma_pte *parent, *pte = NULL;
943 int total = agaw_to_level(domain->agaw);
944 int offset;
946 parent = domain->pgd;
947 while (level <= total) {
948 offset = pfn_level_offset(pfn, total);
949 pte = &parent[offset];
950 if (level == total)
951 return pte;
953 if (!dma_pte_present(pte)) {
954 *large_page = total;
955 break;
958 if (pte->val & DMA_PTE_LARGE_PAGE) {
959 *large_page = total;
960 return pte;
963 parent = phys_to_virt(dma_pte_addr(pte));
964 total--;
966 return NULL;
969 /* clear last level pte; a tlb flush should follow */
970 static void dma_pte_clear_range(struct dmar_domain *domain,
971 unsigned long start_pfn,
972 unsigned long last_pfn)
974 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
975 unsigned int large_page = 1;
976 struct dma_pte *first_pte, *pte;
978 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
979 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
980 BUG_ON(start_pfn > last_pfn);
982 /* we don't need lock here; nobody else touches the iova range */
983 do {
984 large_page = 1;
985 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
986 if (!pte) {
987 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
988 continue;
990 do {
991 dma_clear_pte(pte);
992 start_pfn += lvl_to_nr_pages(large_page);
993 pte++;
994 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
996 domain_flush_cache(domain, first_pte,
997 (void *)pte - (void *)first_pte);
999 } while (start_pfn && start_pfn <= last_pfn);
1002 /* free page table pages. last level pte should already be cleared */
1003 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1004 unsigned long start_pfn,
1005 unsigned long last_pfn)
1007 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1008 struct dma_pte *first_pte, *pte;
1009 int total = agaw_to_level(domain->agaw);
1010 int level;
1011 unsigned long tmp;
1012 int large_page = 2;
1014 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
1015 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
1016 BUG_ON(start_pfn > last_pfn);
1018 /* We don't need lock here; nobody else touches the iova range */
1019 level = 2;
1020 while (level <= total) {
1021 tmp = align_to_level(start_pfn, level);
1023 /* If we can't even clear one PTE at this level, we're done */
1024 if (tmp + level_size(level) - 1 > last_pfn)
1025 return;
1027 do {
1028 large_page = level;
1029 first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
1030 if (large_page > level)
1031 level = large_page + 1;
1032 if (!pte) {
1033 tmp = align_to_level(tmp + 1, level + 1);
1034 continue;
1036 do {
1037 if (dma_pte_present(pte)) {
1038 free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
1039 dma_clear_pte(pte);
1041 pte++;
1042 tmp += level_size(level);
1043 } while (!first_pte_in_page(pte) &&
1044 tmp + level_size(level) - 1 <= last_pfn);
1046 domain_flush_cache(domain, first_pte,
1047 (void *)pte - (void *)first_pte);
1049 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
1050 level++;
1052 /* free pgd */
1053 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1054 free_pgtable_page(domain->pgd);
1055 domain->pgd = NULL;
1059 /* iommu handling */
1060 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1062 struct root_entry *root;
1063 unsigned long flags;
1065 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1066 if (!root)
1067 return -ENOMEM;
1069 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1071 spin_lock_irqsave(&iommu->lock, flags);
1072 iommu->root_entry = root;
1073 spin_unlock_irqrestore(&iommu->lock, flags);
1075 return 0;
1078 static void iommu_set_root_entry(struct intel_iommu *iommu)
1080 void *addr;
1081 u32 sts;
1082 unsigned long flag;
1084 addr = iommu->root_entry;
1086 spin_lock_irqsave(&iommu->register_lock, flag);
1087 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
1089 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1091 /* Make sure hardware completes it */
1092 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1093 readl, (sts & DMA_GSTS_RTPS), sts);
1095 spin_unlock_irqrestore(&iommu->register_lock, flag);
1098 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1100 u32 val;
1101 unsigned long flag;
1103 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1104 return;
1106 spin_lock_irqsave(&iommu->register_lock, flag);
1107 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1110 /* Make sure hardware completes it */
1110 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1111 readl, (!(val & DMA_GSTS_WBFS)), val);
1113 spin_unlock_irqrestore(&iommu->register_lock, flag);
1116 /* return value determines whether we need a write buffer flush */
1117 static void __iommu_flush_context(struct intel_iommu *iommu,
1118 u16 did, u16 source_id, u8 function_mask,
1119 u64 type)
1121 u64 val = 0;
1122 unsigned long flag;
1124 switch (type) {
1125 case DMA_CCMD_GLOBAL_INVL:
1126 val = DMA_CCMD_GLOBAL_INVL;
1127 break;
1128 case DMA_CCMD_DOMAIN_INVL:
1129 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1130 break;
1131 case DMA_CCMD_DEVICE_INVL:
1132 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1133 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1134 break;
1135 default:
1136 BUG();
1138 val |= DMA_CCMD_ICC;
1140 spin_lock_irqsave(&iommu->register_lock, flag);
1141 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1143 /* Make sure hardware completes it */
1144 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1145 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1147 spin_unlock_irqrestore(&iommu->register_lock, flag);
1150 /* return value determines whether we need a write buffer flush */
1151 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1152 u64 addr, unsigned int size_order, u64 type)
1154 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1155 u64 val = 0, val_iva = 0;
1156 unsigned long flag;
1158 switch (type) {
1159 case DMA_TLB_GLOBAL_FLUSH:
1160 /* global flush doesn't need set IVA_REG */
1161 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1162 break;
1163 case DMA_TLB_DSI_FLUSH:
1164 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1165 break;
1166 case DMA_TLB_PSI_FLUSH:
1167 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1168 /* Note: always flush non-leaf currently */
1169 val_iva = size_order | addr;
1170 break;
1171 default:
1172 BUG();
1174 /* Note: set drain read/write */
1175 #if 0
1177 * This is probably to be super secure.. Looks like we can
1178 * ignore it without any impact.
1180 if (cap_read_drain(iommu->cap))
1181 val |= DMA_TLB_READ_DRAIN;
1182 #endif
1183 if (cap_write_drain(iommu->cap))
1184 val |= DMA_TLB_WRITE_DRAIN;
1186 spin_lock_irqsave(&iommu->register_lock, flag);
1187 /* Note: Only uses first TLB reg currently */
1188 if (val_iva)
1189 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1190 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1192 /* Make sure hardware completes it */
1193 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1194 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1196 spin_unlock_irqrestore(&iommu->register_lock, flag);
1198 /* check IOTLB invalidation granularity */
1199 if (DMA_TLB_IAIG(val) == 0)
1200 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1201 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1202 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1203 (unsigned long long)DMA_TLB_IIRG(type),
1204 (unsigned long long)DMA_TLB_IAIG(val));
1207 static struct device_domain_info *iommu_support_dev_iotlb(
1208 struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1210 int found = 0;
1211 unsigned long flags;
1212 struct device_domain_info *info;
1213 struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1215 if (!ecap_dev_iotlb_support(iommu->ecap))
1216 return NULL;
1218 if (!iommu->qi)
1219 return NULL;
1221 spin_lock_irqsave(&device_domain_lock, flags);
1222 list_for_each_entry(info, &domain->devices, link)
1223 if (info->bus == bus && info->devfn == devfn) {
1224 found = 1;
1225 break;
1227 spin_unlock_irqrestore(&device_domain_lock, flags);
1229 if (!found || !info->dev)
1230 return NULL;
1232 if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1233 return NULL;
1235 if (!dmar_find_matched_atsr_unit(info->dev))
1236 return NULL;
1238 info->iommu = iommu;
1240 return info;
1243 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1245 if (!info)
1246 return;
1248 pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1251 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1253 if (!info->dev || !pci_ats_enabled(info->dev))
1254 return;
1256 pci_disable_ats(info->dev);
1259 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1260 u64 addr, unsigned mask)
1262 u16 sid, qdep;
1263 unsigned long flags;
1264 struct device_domain_info *info;
1266 spin_lock_irqsave(&device_domain_lock, flags);
1267 list_for_each_entry(info, &domain->devices, link) {
1268 if (!info->dev || !pci_ats_enabled(info->dev))
1269 continue;
1271 sid = info->bus << 8 | info->devfn;
1272 qdep = pci_ats_queue_depth(info->dev);
1273 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1275 spin_unlock_irqrestore(&device_domain_lock, flags);
1278 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1279 unsigned long pfn, unsigned int pages, int map)
1281 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1282 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1284 BUG_ON(pages == 0);
1287 * Fall back to domain-selective flush if there is no PSI support or the
1288 * size is too big.
1289 * PSI requires the number of pages to be a power of two (2^x), and the
1290 * base address to be naturally aligned to that size.
1292 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1293 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1294 DMA_TLB_DSI_FLUSH);
1295 else
1296 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1297 DMA_TLB_PSI_FLUSH);
1300 * In caching mode, changes of pages from non-present to present require
1301 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1303 if (!cap_caching_mode(iommu->cap) || !map)
1304 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
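/*
 * Worked example (illustrative only): for a 3-page flush request,
 * __roundup_pow_of_two(3) == 4 and mask == ilog2(4) == 2, so the PSI
 * invalidation covers 4 pages starting at the (naturally aligned) address.
 * If mask exceeded cap_max_amask_val(), the code above falls back to a
 * domain-selective flush instead.
 */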
1307 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1309 u32 pmen;
1310 unsigned long flags;
1312 spin_lock_irqsave(&iommu->register_lock, flags);
1313 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1314 pmen &= ~DMA_PMEN_EPM;
1315 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1317 /* wait for the protected region status bit to clear */
1318 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1319 readl, !(pmen & DMA_PMEN_PRS), pmen);
1321 spin_unlock_irqrestore(&iommu->register_lock, flags);
1324 static int iommu_enable_translation(struct intel_iommu *iommu)
1326 u32 sts;
1327 unsigned long flags;
1329 spin_lock_irqsave(&iommu->register_lock, flags);
1330 iommu->gcmd |= DMA_GCMD_TE;
1331 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1333 /* Make sure hardware completes it */
1334 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1335 readl, (sts & DMA_GSTS_TES), sts);
1337 spin_unlock_irqrestore(&iommu->register_lock, flags);
1338 return 0;
1341 static int iommu_disable_translation(struct intel_iommu *iommu)
1343 u32 sts;
1344 unsigned long flag;
1346 spin_lock_irqsave(&iommu->register_lock, flag);
1347 iommu->gcmd &= ~DMA_GCMD_TE;
1348 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1350 /* Make sure hardware completes it */
1351 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1352 readl, (!(sts & DMA_GSTS_TES)), sts);
1354 spin_unlock_irqrestore(&iommu->register_lock, flag);
1355 return 0;
1359 static int iommu_init_domains(struct intel_iommu *iommu)
1361 unsigned long ndomains;
1362 unsigned long nlongs;
1364 ndomains = cap_ndoms(iommu->cap);
1365 pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1366 ndomains);
1367 nlongs = BITS_TO_LONGS(ndomains);
1369 spin_lock_init(&iommu->lock);
1371 /* TBD: there might be 64K domains,
1372 * consider other allocation for future chip
1374 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1375 if (!iommu->domain_ids) {
1376 printk(KERN_ERR "Allocating domain id array failed\n");
1377 return -ENOMEM;
1379 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1380 GFP_KERNEL);
1381 if (!iommu->domains) {
1382 printk(KERN_ERR "Allocating domain array failed\n");
1383 return -ENOMEM;
1387 * If caching mode is set, then invalid translations are tagged
1388 * with domain id 0. Hence we need to pre-allocate it.
1390 if (cap_caching_mode(iommu->cap))
1391 set_bit(0, iommu->domain_ids);
1392 return 0;
1396 static void domain_exit(struct dmar_domain *domain);
1397 static void vm_domain_exit(struct dmar_domain *domain);
1399 void free_dmar_iommu(struct intel_iommu *iommu)
1401 struct dmar_domain *domain;
1402 int i;
1403 unsigned long flags;
1405 if ((iommu->domains) && (iommu->domain_ids)) {
1406 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1407 domain = iommu->domains[i];
1408 clear_bit(i, iommu->domain_ids);
1410 spin_lock_irqsave(&domain->iommu_lock, flags);
1411 if (--domain->iommu_count == 0) {
1412 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1413 vm_domain_exit(domain);
1414 else
1415 domain_exit(domain);
1417 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1421 if (iommu->gcmd & DMA_GCMD_TE)
1422 iommu_disable_translation(iommu);
1424 if (iommu->irq) {
1425 irq_set_handler_data(iommu->irq, NULL);
1426 /* This will mask the irq */
1427 free_irq(iommu->irq, iommu);
1428 destroy_irq(iommu->irq);
1431 kfree(iommu->domains);
1432 kfree(iommu->domain_ids);
1434 g_iommus[iommu->seq_id] = NULL;
1436 /* if all iommus are freed, free g_iommus */
1437 for (i = 0; i < g_num_of_iommus; i++) {
1438 if (g_iommus[i])
1439 break;
1442 if (i == g_num_of_iommus)
1443 kfree(g_iommus);
1445 /* free context mapping */
1446 free_context_table(iommu);
1449 static struct dmar_domain *alloc_domain(void)
1451 struct dmar_domain *domain;
1453 domain = alloc_domain_mem();
1454 if (!domain)
1455 return NULL;
1457 domain->nid = -1;
1458 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1459 domain->flags = 0;
1461 return domain;
1464 static int iommu_attach_domain(struct dmar_domain *domain,
1465 struct intel_iommu *iommu)
1467 int num;
1468 unsigned long ndomains;
1469 unsigned long flags;
1471 ndomains = cap_ndoms(iommu->cap);
1473 spin_lock_irqsave(&iommu->lock, flags);
1475 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1476 if (num >= ndomains) {
1477 spin_unlock_irqrestore(&iommu->lock, flags);
1478 printk(KERN_ERR "IOMMU: no free domain ids\n");
1479 return -ENOMEM;
1482 domain->id = num;
1483 set_bit(num, iommu->domain_ids);
1484 set_bit(iommu->seq_id, &domain->iommu_bmp);
1485 iommu->domains[num] = domain;
1486 spin_unlock_irqrestore(&iommu->lock, flags);
1488 return 0;
1491 static void iommu_detach_domain(struct dmar_domain *domain,
1492 struct intel_iommu *iommu)
1494 unsigned long flags;
1495 int num, ndomains;
1496 int found = 0;
1498 spin_lock_irqsave(&iommu->lock, flags);
1499 ndomains = cap_ndoms(iommu->cap);
1500 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1501 if (iommu->domains[num] == domain) {
1502 found = 1;
1503 break;
1507 if (found) {
1508 clear_bit(num, iommu->domain_ids);
1509 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1510 iommu->domains[num] = NULL;
1512 spin_unlock_irqrestore(&iommu->lock, flags);
1515 static struct iova_domain reserved_iova_list;
1516 static struct lock_class_key reserved_rbtree_key;
1518 static int dmar_init_reserved_ranges(void)
1520 struct pci_dev *pdev = NULL;
1521 struct iova *iova;
1522 int i;
1524 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1526 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1527 &reserved_rbtree_key);
1529 /* IOAPIC ranges shouldn't be accessed by DMA */
1530 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1531 IOVA_PFN(IOAPIC_RANGE_END));
1532 if (!iova) {
1533 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1534 return -ENODEV;
1537 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1538 for_each_pci_dev(pdev) {
1539 struct resource *r;
1541 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1542 r = &pdev->resource[i];
1543 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1544 continue;
1545 iova = reserve_iova(&reserved_iova_list,
1546 IOVA_PFN(r->start),
1547 IOVA_PFN(r->end));
1548 if (!iova) {
1549 printk(KERN_ERR "Reserve iova failed\n");
1550 return -ENODEV;
1554 return 0;
1557 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1559 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1562 static inline int guestwidth_to_adjustwidth(int gaw)
1564 int agaw;
1565 int r = (gaw - 12) % 9;
1567 if (r == 0)
1568 agaw = gaw;
1569 else
1570 agaw = gaw + 9 - r;
1571 if (agaw > 64)
1572 agaw = 64;
1573 return agaw;
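/*
 * Worked example (illustrative only): the adjusted width is gaw rounded up
 * so that (agaw - 12) is a multiple of 9, matching the 9-bit page-table
 * stride. For gaw == 48, r == 0 and agaw == 48; for gaw == 36,
 * r == (36 - 12) % 9 == 6, so agaw == 36 + 9 - 6 == 39.
 */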
1576 static int domain_init(struct dmar_domain *domain, int guest_width)
1578 struct intel_iommu *iommu;
1579 int adjust_width, agaw;
1580 unsigned long sagaw;
1582 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1583 spin_lock_init(&domain->iommu_lock);
1585 domain_reserve_special_ranges(domain);
1587 /* calculate AGAW */
1588 iommu = domain_get_iommu(domain);
1589 if (guest_width > cap_mgaw(iommu->cap))
1590 guest_width = cap_mgaw(iommu->cap);
1591 domain->gaw = guest_width;
1592 adjust_width = guestwidth_to_adjustwidth(guest_width);
1593 agaw = width_to_agaw(adjust_width);
1594 sagaw = cap_sagaw(iommu->cap);
1595 if (!test_bit(agaw, &sagaw)) {
1596 /* hardware doesn't support it, choose a bigger one */
1597 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1598 agaw = find_next_bit(&sagaw, 5, agaw);
1599 if (agaw >= 5)
1600 return -ENODEV;
1602 domain->agaw = agaw;
1603 INIT_LIST_HEAD(&domain->devices);
1605 if (ecap_coherent(iommu->ecap))
1606 domain->iommu_coherency = 1;
1607 else
1608 domain->iommu_coherency = 0;
1610 if (ecap_sc_support(iommu->ecap))
1611 domain->iommu_snooping = 1;
1612 else
1613 domain->iommu_snooping = 0;
1615 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1616 domain->iommu_count = 1;
1617 domain->nid = iommu->node;
1619 /* always allocate the top pgd */
1620 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1621 if (!domain->pgd)
1622 return -ENOMEM;
1623 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1624 return 0;
1627 static void domain_exit(struct dmar_domain *domain)
1629 struct dmar_drhd_unit *drhd;
1630 struct intel_iommu *iommu;
1632 /* Domain 0 is reserved, so don't process it */
1633 if (!domain)
1634 return;
1636 /* Flush any lazy unmaps that may reference this domain */
1637 if (!intel_iommu_strict)
1638 flush_unmaps_timeout(0);
1640 domain_remove_dev_info(domain);
1641 /* destroy iovas */
1642 put_iova_domain(&domain->iovad);
1644 /* clear ptes */
1645 dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1647 /* free page tables */
1648 dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1650 for_each_active_iommu(iommu, drhd)
1651 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1652 iommu_detach_domain(domain, iommu);
1654 free_domain_mem(domain);
1657 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1658 u8 bus, u8 devfn, int translation)
1660 struct context_entry *context;
1661 unsigned long flags;
1662 struct intel_iommu *iommu;
1663 struct dma_pte *pgd;
1664 unsigned long num;
1665 unsigned long ndomains;
1666 int id;
1667 int agaw;
1668 struct device_domain_info *info = NULL;
1670 pr_debug("Set context mapping for %02x:%02x.%d\n",
1671 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1673 BUG_ON(!domain->pgd);
1674 BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1675 translation != CONTEXT_TT_MULTI_LEVEL);
1677 iommu = device_to_iommu(segment, bus, devfn);
1678 if (!iommu)
1679 return -ENODEV;
1681 context = device_to_context_entry(iommu, bus, devfn);
1682 if (!context)
1683 return -ENOMEM;
1684 spin_lock_irqsave(&iommu->lock, flags);
1685 if (context_present(context)) {
1686 spin_unlock_irqrestore(&iommu->lock, flags);
1687 return 0;
1690 id = domain->id;
1691 pgd = domain->pgd;
1693 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1694 domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1695 int found = 0;
1697 /* find an available domain id for this device in iommu */
1698 ndomains = cap_ndoms(iommu->cap);
1699 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1700 if (iommu->domains[num] == domain) {
1701 id = num;
1702 found = 1;
1703 break;
1707 if (found == 0) {
1708 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1709 if (num >= ndomains) {
1710 spin_unlock_irqrestore(&iommu->lock, flags);
1711 printk(KERN_ERR "IOMMU: no free domain ids\n");
1712 return -EFAULT;
1715 set_bit(num, iommu->domain_ids);
1716 iommu->domains[num] = domain;
1717 id = num;
1720 /* Skip top levels of page tables for
1721 * an iommu which has a smaller agaw than the default.
1722 * Unnecessary for PT mode.
1724 if (translation != CONTEXT_TT_PASS_THROUGH) {
1725 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1726 pgd = phys_to_virt(dma_pte_addr(pgd));
1727 if (!dma_pte_present(pgd)) {
1728 spin_unlock_irqrestore(&iommu->lock, flags);
1729 return -ENOMEM;
1735 context_set_domain_id(context, id);
1737 if (translation != CONTEXT_TT_PASS_THROUGH) {
1738 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1739 translation = info ? CONTEXT_TT_DEV_IOTLB :
1740 CONTEXT_TT_MULTI_LEVEL;
1743 * In pass through mode, AW must be programmed to indicate the largest
1744 * AGAW value supported by hardware. And ASR is ignored by hardware.
1746 if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1747 context_set_address_width(context, iommu->msagaw);
1748 else {
1749 context_set_address_root(context, virt_to_phys(pgd));
1750 context_set_address_width(context, iommu->agaw);
1753 context_set_translation_type(context, translation);
1754 context_set_fault_enable(context);
1755 context_set_present(context);
1756 domain_flush_cache(domain, context, sizeof(*context));
1759 * It's a non-present to present mapping. If hardware doesn't cache
1760 * non-present entries we only need to flush the write-buffer. If it
1761 * _does_ cache non-present entries, then it does so in the special
1762 * domain #0, which we have to flush:
1764 if (cap_caching_mode(iommu->cap)) {
1765 iommu->flush.flush_context(iommu, 0,
1766 (((u16)bus) << 8) | devfn,
1767 DMA_CCMD_MASK_NOBIT,
1768 DMA_CCMD_DEVICE_INVL);
1769 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1770 } else {
1771 iommu_flush_write_buffer(iommu);
1773 iommu_enable_dev_iotlb(info);
1774 spin_unlock_irqrestore(&iommu->lock, flags);
1776 spin_lock_irqsave(&domain->iommu_lock, flags);
1777 if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1778 domain->iommu_count++;
1779 if (domain->iommu_count == 1)
1780 domain->nid = iommu->node;
1781 domain_update_iommu_cap(domain);
1783 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1784 return 0;
1787 static int
1788 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1789 int translation)
1791 int ret;
1792 struct pci_dev *tmp, *parent;
1794 ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1795 pdev->bus->number, pdev->devfn,
1796 translation);
1797 if (ret)
1798 return ret;
1800 /* dependent device mapping */
1801 tmp = pci_find_upstream_pcie_bridge(pdev);
1802 if (!tmp)
1803 return 0;
1804 /* Secondary interface's bus number and devfn 0 */
1805 parent = pdev->bus->self;
1806 while (parent != tmp) {
1807 ret = domain_context_mapping_one(domain,
1808 pci_domain_nr(parent->bus),
1809 parent->bus->number,
1810 parent->devfn, translation);
1811 if (ret)
1812 return ret;
1813 parent = parent->bus->self;
1815 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1816 return domain_context_mapping_one(domain,
1817 pci_domain_nr(tmp->subordinate),
1818 tmp->subordinate->number, 0,
1819 translation);
1820 else /* this is a legacy PCI bridge */
1821 return domain_context_mapping_one(domain,
1822 pci_domain_nr(tmp->bus),
1823 tmp->bus->number,
1824 tmp->devfn,
1825 translation);
1828 static int domain_context_mapped(struct pci_dev *pdev)
1830 int ret;
1831 struct pci_dev *tmp, *parent;
1832 struct intel_iommu *iommu;
1834 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1835 pdev->devfn);
1836 if (!iommu)
1837 return -ENODEV;
1839 ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1840 if (!ret)
1841 return ret;
1842 /* dependent device mapping */
1843 tmp = pci_find_upstream_pcie_bridge(pdev);
1844 if (!tmp)
1845 return ret;
1846 /* Secondary interface's bus number and devfn 0 */
1847 parent = pdev->bus->self;
1848 while (parent != tmp) {
1849 ret = device_context_mapped(iommu, parent->bus->number,
1850 parent->devfn);
1851 if (!ret)
1852 return ret;
1853 parent = parent->bus->self;
1855 if (pci_is_pcie(tmp))
1856 return device_context_mapped(iommu, tmp->subordinate->number,
1857 0);
1858 else
1859 return device_context_mapped(iommu, tmp->bus->number,
1860 tmp->devfn);
1863 /* Returns a number of VTD pages, but aligned to MM page size */
1864 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1865 size_t size)
1867 host_addr &= ~PAGE_MASK;
1868 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
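/*
 * Worked example (illustrative only, assuming 4KiB MM pages): for
 * host_addr == 0x1234 and size == 0x2000, the offset within the MM page is
 * 0x234, PAGE_ALIGN(0x234 + 0x2000) == 0x3000, and the result is 3 VT-d
 * pages, i.e. the buffer straddles three 4KiB frames.
 */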
1871 /* Return largest possible superpage level for a given mapping */
1872 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1873 unsigned long iov_pfn,
1874 unsigned long phy_pfn,
1875 unsigned long pages)
1877 int support, level = 1;
1878 unsigned long pfnmerge;
1880 support = domain->iommu_superpage;
1882 /* To use a large page, the virtual *and* physical addresses
1883 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1884 of them will mean we have to use smaller pages. So just
1885 merge them and check both at once. */
1886 pfnmerge = iov_pfn | phy_pfn;
1888 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1889 pages >>= VTD_STRIDE_SHIFT;
1890 if (!pages)
1891 break;
1892 pfnmerge >>= VTD_STRIDE_SHIFT;
1893 level++;
1894 support--;
1896 return level;
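/*
 * Worked example (illustrative only): with domain->iommu_superpage == 1
 * (2MiB capable), an IOVA pfn and physical pfn that are both 2MiB aligned
 * (low 9 bits clear in iov_pfn | phy_pfn) and a run of at least 512 pages
 * yield level 2, so __domain_mapping() below can install a single
 * DMA_PTE_LARGE_PAGE leaf instead of 512 4KiB PTEs.
 */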
1899 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1900 struct scatterlist *sg, unsigned long phys_pfn,
1901 unsigned long nr_pages, int prot)
1903 struct dma_pte *first_pte = NULL, *pte = NULL;
1904 phys_addr_t uninitialized_var(pteval);
1905 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1906 unsigned long sg_res;
1907 unsigned int largepage_lvl = 0;
1908 unsigned long lvl_pages = 0;
1910 BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1912 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1913 return -EINVAL;
1915 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1917 if (sg)
1918 sg_res = 0;
1919 else {
1920 sg_res = nr_pages + 1;
1921 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1924 while (nr_pages > 0) {
1925 uint64_t tmp;
1927 if (!sg_res) {
1928 sg_res = aligned_nrpages(sg->offset, sg->length);
1929 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1930 sg->dma_length = sg->length;
1931 pteval = page_to_phys(sg_page(sg)) | prot;
1932 phys_pfn = pteval >> VTD_PAGE_SHIFT;
1935 if (!pte) {
1936 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1938 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1939 if (!pte)
1940 return -ENOMEM;
1941 /* It is a large page */
1942 if (largepage_lvl > 1)
1943 pteval |= DMA_PTE_LARGE_PAGE;
1944 else
1945 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1948 /* We don't need lock here, nobody else
1949 * touches the iova range
1951 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1952 if (tmp) {
1953 static int dumps = 5;
1954 printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1955 iov_pfn, tmp, (unsigned long long)pteval);
1956 if (dumps) {
1957 dumps--;
1958 debug_dma_dump_mappings(NULL);
1960 WARN_ON(1);
1963 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1965 BUG_ON(nr_pages < lvl_pages);
1966 BUG_ON(sg_res < lvl_pages);
1968 nr_pages -= lvl_pages;
1969 iov_pfn += lvl_pages;
1970 phys_pfn += lvl_pages;
1971 pteval += lvl_pages * VTD_PAGE_SIZE;
1972 sg_res -= lvl_pages;
1974 /* If the next PTE would be the first in a new page, then we
1975 need to flush the cache on the entries we've just written.
1976 And then we'll need to recalculate 'pte', so clear it and
1977 let it get set again in the if (!pte) block above.
1979 If we're done (!nr_pages) we need to flush the cache too.
1981 Also if we've been setting superpages, we may need to
1982 recalculate 'pte' and switch back to smaller pages for the
1983 end of the mapping, if the trailing size is not enough to
1984 use another superpage (i.e. sg_res < lvl_pages). */
1985 pte++;
1986 if (!nr_pages || first_pte_in_page(pte) ||
1987 (largepage_lvl > 1 && sg_res < lvl_pages)) {
1988 domain_flush_cache(domain, first_pte,
1989 (void *)pte - (void *)first_pte);
1990 pte = NULL;
1993 if (!sg_res && nr_pages)
1994 sg = sg_next(sg);
1996 return 0;
1999 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2000 struct scatterlist *sg, unsigned long nr_pages,
2001 int prot)
2003 return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2006 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2007 unsigned long phys_pfn, unsigned long nr_pages,
2008 int prot)
2010 return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2013 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
2015 if (!iommu)
2016 return;
2018 clear_context_table(iommu, bus, devfn);
2019 iommu->flush.flush_context(iommu, 0, 0, 0,
2020 DMA_CCMD_GLOBAL_INVL);
2021 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2024 static void domain_remove_dev_info(struct dmar_domain *domain)
2026 struct device_domain_info *info;
2027 unsigned long flags;
2028 struct intel_iommu *iommu;
2030 spin_lock_irqsave(&device_domain_lock, flags);
2031 while (!list_empty(&domain->devices)) {
2032 info = list_entry(domain->devices.next,
2033 struct device_domain_info, link);
2034 list_del(&info->link);
2035 list_del(&info->global);
2036 if (info->dev)
2037 info->dev->dev.archdata.iommu = NULL;
2038 spin_unlock_irqrestore(&device_domain_lock, flags);
2040 iommu_disable_dev_iotlb(info);
2041 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
2042 iommu_detach_dev(iommu, info->bus, info->devfn);
2043 free_devinfo_mem(info);
2045 spin_lock_irqsave(&device_domain_lock, flags);
2047 spin_unlock_irqrestore(&device_domain_lock, flags);
2051 * find_domain
2052 * Note: we use struct pci_dev->dev.archdata.iommu to store the info
2054 static struct dmar_domain *
2055 find_domain(struct pci_dev *pdev)
2057 struct device_domain_info *info;
2059 /* No lock here, assumes no domain exit in normal case */
2060 info = pdev->dev.archdata.iommu;
2061 if (info)
2062 return info->domain;
2063 return NULL;
2066 /* domain is initialized */
2067 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
2069 struct dmar_domain *domain, *found = NULL;
2070 struct intel_iommu *iommu;
2071 struct dmar_drhd_unit *drhd;
2072 struct device_domain_info *info, *tmp;
2073 struct pci_dev *dev_tmp;
2074 unsigned long flags;
2075 int bus = 0, devfn = 0;
2076 int segment;
2077 int ret;
2079 domain = find_domain(pdev);
2080 if (domain)
2081 return domain;
2083 segment = pci_domain_nr(pdev->bus);
2085 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
2086 if (dev_tmp) {
2087 if (pci_is_pcie(dev_tmp)) {
2088 bus = dev_tmp->subordinate->number;
2089 devfn = 0;
2090 } else {
2091 bus = dev_tmp->bus->number;
2092 devfn = dev_tmp->devfn;
2094 spin_lock_irqsave(&device_domain_lock, flags);
2095 list_for_each_entry(info, &device_domain_list, global) {
2096 if (info->segment == segment &&
2097 info->bus == bus && info->devfn == devfn) {
2098 found = info->domain;
2099 break;
2102 spin_unlock_irqrestore(&device_domain_lock, flags);
2103 /* pcie-pci bridge already has a domain, use it */
2104 if (found) {
2105 domain = found;
2106 goto found_domain;
2110 domain = alloc_domain();
2111 if (!domain)
2112 goto error;
2114 /* Allocate new domain for the device */
2115 drhd = dmar_find_matched_drhd_unit(pdev);
2116 if (!drhd) {
2117 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2118 pci_name(pdev));
2119 return NULL;
2121 iommu = drhd->iommu;
2123 ret = iommu_attach_domain(domain, iommu);
2124 if (ret) {
2125 free_domain_mem(domain);
2126 goto error;
2129 if (domain_init(domain, gaw)) {
2130 domain_exit(domain);
2131 goto error;
2134 /* register pcie-to-pci device */
2135 if (dev_tmp) {
2136 info = alloc_devinfo_mem();
2137 if (!info) {
2138 domain_exit(domain);
2139 goto error;
2141 info->segment = segment;
2142 info->bus = bus;
2143 info->devfn = devfn;
2144 info->dev = NULL;
2145 info->domain = domain;
2146 /* This domain is shared by devices under p2p bridge */
2147 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2149 /* pcie-to-pci bridge already has a domain, use it */
2150 found = NULL;
2151 spin_lock_irqsave(&device_domain_lock, flags);
2152 list_for_each_entry(tmp, &device_domain_list, global) {
2153 if (tmp->segment == segment &&
2154 tmp->bus == bus && tmp->devfn == devfn) {
2155 found = tmp->domain;
2156 break;
2159 if (found) {
2160 spin_unlock_irqrestore(&device_domain_lock, flags);
2161 free_devinfo_mem(info);
2162 domain_exit(domain);
2163 domain = found;
2164 } else {
2165 list_add(&info->link, &domain->devices);
2166 list_add(&info->global, &device_domain_list);
2167 spin_unlock_irqrestore(&device_domain_lock, flags);
2171 found_domain:
2172 info = alloc_devinfo_mem();
2173 if (!info)
2174 goto error;
2175 info->segment = segment;
2176 info->bus = pdev->bus->number;
2177 info->devfn = pdev->devfn;
2178 info->dev = pdev;
2179 info->domain = domain;
2180 spin_lock_irqsave(&device_domain_lock, flags);
2181 /* somebody else may have beaten us to it */
2182 found = find_domain(pdev);
2183 if (found != NULL) {
2184 spin_unlock_irqrestore(&device_domain_lock, flags);
2185 if (found != domain) {
2186 domain_exit(domain);
2187 domain = found;
2189 free_devinfo_mem(info);
2190 return domain;
2192 list_add(&info->link, &domain->devices);
2193 list_add(&info->global, &device_domain_list);
2194 pdev->dev.archdata.iommu = info;
2195 spin_unlock_irqrestore(&device_domain_lock, flags);
2196 return domain;
2197 error:
2198 /* recheck here; another path may have set it meanwhile */
2199 return find_domain(pdev);
2202 static int iommu_identity_mapping;
2203 #define IDENTMAP_ALL 1
2204 #define IDENTMAP_GFX 2
2205 #define IDENTMAP_AZALIA 4
2207 static int iommu_domain_identity_map(struct dmar_domain *domain,
2208 unsigned long long start,
2209 unsigned long long end)
2211 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2212 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2214 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2215 dma_to_mm_pfn(last_vpfn))) {
2216 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2217 return -ENOMEM;
2220 pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2221 start, end, domain->id);
2223 * The RMRR range might overlap the physical memory range,
2224 * so clear it first
2226 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2228 return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2229 last_vpfn - first_vpfn + 1,
2230 DMA_PTE_READ|DMA_PTE_WRITE);
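/*
 * Worked example (hypothetical RMRR, for illustration only): a reserved
 * region 0x000e8000 - 0x000effff would come through here as
 *
 *	first_vpfn = 0x000e8000 >> VTD_PAGE_SHIFT;	=> 0xe8
 *	last_vpfn  = 0x000effff >> VTD_PAGE_SHIFT;	=> 0xef
 *	domain_pfn_mapping(domain, 0xe8, 0xe8, 8,
 *			   DMA_PTE_READ|DMA_PTE_WRITE);
 *
 * i.e. eight 4KiB pages mapped so that IOVA == physical address.
 */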
2233 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2234 unsigned long long start,
2235 unsigned long long end)
2237 struct dmar_domain *domain;
2238 int ret;
2240 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2241 if (!domain)
2242 return -ENOMEM;
2244 /* For _hardware_ passthrough, don't bother. But for software
2245 passthrough, we do it anyway -- it may indicate a memory
2246 range which is reserved in E820 and therefore didn't get set
2247 up in si_domain to start with */
2248 if (domain == si_domain && hw_pass_through) {
2249 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2250 pci_name(pdev), start, end);
2251 return 0;
2254 printk(KERN_INFO
2255 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2256 pci_name(pdev), start, end);
2258 if (end < start) {
2259 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2260 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2261 dmi_get_system_info(DMI_BIOS_VENDOR),
2262 dmi_get_system_info(DMI_BIOS_VERSION),
2263 dmi_get_system_info(DMI_PRODUCT_VERSION));
2264 ret = -EIO;
2265 goto error;
2268 if (end >> agaw_to_width(domain->agaw)) {
2269 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2270 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2271 agaw_to_width(domain->agaw),
2272 dmi_get_system_info(DMI_BIOS_VENDOR),
2273 dmi_get_system_info(DMI_BIOS_VERSION),
2274 dmi_get_system_info(DMI_PRODUCT_VERSION));
2275 ret = -EIO;
2276 goto error;
2279 ret = iommu_domain_identity_map(domain, start, end);
2280 if (ret)
2281 goto error;
2283 /* context entry init */
2284 ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2285 if (ret)
2286 goto error;
2288 return 0;
2290 error:
2291 domain_exit(domain);
2292 return ret;
2295 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2296 struct pci_dev *pdev)
2298 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2299 return 0;
2300 return iommu_prepare_identity_map(pdev, rmrr->base_address,
2301 rmrr->end_address);
2304 #ifdef CONFIG_DMAR_FLOPPY_WA
2305 static inline void iommu_prepare_isa(void)
2307 struct pci_dev *pdev;
2308 int ret;
2310 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2311 if (!pdev)
2312 return;
2314 printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2315 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2317 if (ret)
2318 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2319 "floppy might not work\n");
2322 #else
2323 static inline void iommu_prepare_isa(void)
2325 return;
2327 #endif /* !CONFIG_DMAR_FLOPPY_WA */
2329 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2331 static int __init si_domain_work_fn(unsigned long start_pfn,
2332 unsigned long end_pfn, void *datax)
2334 int *ret = datax;
2336 *ret = iommu_domain_identity_map(si_domain,
2337 (uint64_t)start_pfn << PAGE_SHIFT,
2338 (uint64_t)end_pfn << PAGE_SHIFT);
2339 return *ret;
2343 static int __init si_domain_init(int hw)
2345 struct dmar_drhd_unit *drhd;
2346 struct intel_iommu *iommu;
2347 int nid, ret = 0;
2349 si_domain = alloc_domain();
2350 if (!si_domain)
2351 return -EFAULT;
2353 pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2355 for_each_active_iommu(iommu, drhd) {
2356 ret = iommu_attach_domain(si_domain, iommu);
2357 if (ret) {
2358 domain_exit(si_domain);
2359 return -EFAULT;
2363 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2364 domain_exit(si_domain);
2365 return -EFAULT;
2368 si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2370 if (hw)
2371 return 0;
2373 for_each_online_node(nid) {
2374 work_with_active_regions(nid, si_domain_work_fn, &ret);
2375 if (ret)
2376 return ret;
2379 return 0;
2382 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2383 struct pci_dev *pdev);
2384 static int identity_mapping(struct pci_dev *pdev)
2386 struct device_domain_info *info;
2388 if (likely(!iommu_identity_mapping))
2389 return 0;
2391 info = pdev->dev.archdata.iommu;
2392 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2393 return (info->domain == si_domain);
2395 return 0;
2398 static int domain_add_dev_info(struct dmar_domain *domain,
2399 struct pci_dev *pdev,
2400 int translation)
2402 struct device_domain_info *info;
2403 unsigned long flags;
2404 int ret;
2406 info = alloc_devinfo_mem();
2407 if (!info)
2408 return -ENOMEM;
2410 ret = domain_context_mapping(domain, pdev, translation);
2411 if (ret) {
2412 free_devinfo_mem(info);
2413 return ret;
2416 info->segment = pci_domain_nr(pdev->bus);
2417 info->bus = pdev->bus->number;
2418 info->devfn = pdev->devfn;
2419 info->dev = pdev;
2420 info->domain = domain;
2422 spin_lock_irqsave(&device_domain_lock, flags);
2423 list_add(&info->link, &domain->devices);
2424 list_add(&info->global, &device_domain_list);
2425 pdev->dev.archdata.iommu = info;
2426 spin_unlock_irqrestore(&device_domain_lock, flags);
2428 return 0;
2431 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2433 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2434 return 1;
2436 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2437 return 1;
2439 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2440 return 0;
2443 * We want to start off with all devices in the 1:1 domain, and
2444 * take them out later if we find they can't access all of memory.
2446 * However, we can't do this for PCI devices behind bridges,
2447 * because all PCI devices behind the same bridge will end up
2448 * with the same source-id on their transactions.
2450 * Practically speaking, we can't change things around for these
2451 * devices at run-time, because we can't be sure there'll be no
2452 * DMA transactions in flight for any of their siblings.
2454 * So PCI devices (unless they're on the root bus) as well as
2455 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2456 * the 1:1 domain, just in _case_ one of their siblings turns out
2457 * not to be able to map all of memory.
2459 if (!pci_is_pcie(pdev)) {
2460 if (!pci_is_root_bus(pdev->bus))
2461 return 0;
2462 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2463 return 0;
2464 } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2465 return 0;
2468 * At boot time, we don't yet know if devices will be 64-bit capable.
2469 * Assume that they will -- if they turn out not to be, then we can
2470 * take them out of the 1:1 domain later.
2472 if (!startup) {
2474 * If the device's dma_mask is less than the system's memory
2475 * size then this is not a candidate for identity mapping.
2477 u64 dma_mask = pdev->dma_mask;
2479 if (pdev->dev.coherent_dma_mask &&
2480 pdev->dev.coherent_dma_mask < dma_mask)
2481 dma_mask = pdev->dev.coherent_dma_mask;
2483 return dma_mask >= dma_get_required_mask(&pdev->dev);
2486 return 1;
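/*
 * Rough illustration of the policy above (hypothetical devices): with
 * IDENTMAP_ALL set, e.g. via iommu=pt,
 *
 *	iommu_should_identity_map(pcie_nic_on_root_bus, 1)		=> 1
 *	iommu_should_identity_map(pci_card_behind_pcie_bridge, 1)	=> 0
 *
 * because the card behind the bridge shares its source-id with any
 * siblings and so cannot be moved out of the 1:1 domain individually.
 */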
2489 static int __init iommu_prepare_static_identity_mapping(int hw)
2491 struct pci_dev *pdev = NULL;
2492 int ret;
2494 ret = si_domain_init(hw);
2495 if (ret)
2496 return -EFAULT;
2498 for_each_pci_dev(pdev) {
2499 /* Skip Host/PCI Bridge devices */
2500 if (IS_BRIDGE_HOST_DEVICE(pdev))
2501 continue;
2502 if (iommu_should_identity_map(pdev, 1)) {
2503 printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2504 hw ? "hardware" : "software", pci_name(pdev));
2506 ret = domain_add_dev_info(si_domain, pdev,
2507 hw ? CONTEXT_TT_PASS_THROUGH :
2508 CONTEXT_TT_MULTI_LEVEL);
2509 if (ret)
2510 return ret;
2514 return 0;
2517 static int __init init_dmars(void)
2519 struct dmar_drhd_unit *drhd;
2520 struct dmar_rmrr_unit *rmrr;
2521 struct pci_dev *pdev;
2522 struct intel_iommu *iommu;
2523 int i, ret;
2526 * for each drhd
2527 * allocate root
2528 * initialize and program root entry to not present
2529 * endfor
2531 for_each_drhd_unit(drhd) {
2532 g_num_of_iommus++;
2534 * lock not needed as this is only incremented in the single-
2535 * threaded kernel __init code path; all other accesses are
2536 * read only
2540 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2541 GFP_KERNEL);
2542 if (!g_iommus) {
2543 printk(KERN_ERR "Allocating global iommu array failed\n");
2544 ret = -ENOMEM;
2545 goto error;
2548 deferred_flush = kzalloc(g_num_of_iommus *
2549 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2550 if (!deferred_flush) {
2551 ret = -ENOMEM;
2552 goto error;
2555 for_each_drhd_unit(drhd) {
2556 if (drhd->ignored)
2557 continue;
2559 iommu = drhd->iommu;
2560 g_iommus[iommu->seq_id] = iommu;
2562 ret = iommu_init_domains(iommu);
2563 if (ret)
2564 goto error;
2567 * TBD:
2568 * we could share the same root & context tables
2569 * among all IOMMUs. Need to split it later.
2571 ret = iommu_alloc_root_entry(iommu);
2572 if (ret) {
2573 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2574 goto error;
2576 if (!ecap_pass_through(iommu->ecap))
2577 hw_pass_through = 0;
2581 * Start from a sane iommu hardware state.
2583 for_each_drhd_unit(drhd) {
2584 if (drhd->ignored)
2585 continue;
2587 iommu = drhd->iommu;
2590 * If the queued invalidation is already initialized by us
2591 * (for example, while enabling interrupt-remapping) then
2592 * things are already rolling from a sane state.
2594 if (iommu->qi)
2595 continue;
2598 * Clear any previous faults.
2600 dmar_fault(-1, iommu);
2602 * Disable queued invalidation if supported and already enabled
2603 * before OS handover.
2605 dmar_disable_qi(iommu);
2608 for_each_drhd_unit(drhd) {
2609 if (drhd->ignored)
2610 continue;
2612 iommu = drhd->iommu;
2614 if (dmar_enable_qi(iommu)) {
2616 * Queued Invalidate not enabled, use Register Based
2617 * Invalidate
2619 iommu->flush.flush_context = __iommu_flush_context;
2620 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2621 printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2622 "invalidation\n",
2623 iommu->seq_id,
2624 (unsigned long long)drhd->reg_base_addr);
2625 } else {
2626 iommu->flush.flush_context = qi_flush_context;
2627 iommu->flush.flush_iotlb = qi_flush_iotlb;
2628 printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2629 "invalidation\n",
2630 iommu->seq_id,
2631 (unsigned long long)drhd->reg_base_addr);
2635 if (iommu_pass_through)
2636 iommu_identity_mapping |= IDENTMAP_ALL;
2638 #ifdef CONFIG_DMAR_BROKEN_GFX_WA
2639 iommu_identity_mapping |= IDENTMAP_GFX;
2640 #endif
2642 check_tylersburg_isoch();
2645 * If pass-through is not set or not enabled, set up context entries for
2646 * identity mappings for RMRR, GFX and ISA, and possibly fall back to
2647 * static identity mapping if iommu_identity_mapping is set.
2649 if (iommu_identity_mapping) {
2650 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2651 if (ret) {
2652 printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2653 goto error;
2657 * For each rmrr
2658 * for each dev attached to rmrr
2659 * do
2660 * locate drhd for dev, alloc domain for dev
2661 * allocate free domain
2662 * allocate page table entries for rmrr
2663 * if context not allocated for bus
2664 * allocate and init context
2665 * set present in root table for this bus
2666 * init context with domain, translation etc
2667 * endfor
2668 * endfor
2670 printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2671 for_each_rmrr_units(rmrr) {
2672 for (i = 0; i < rmrr->devices_cnt; i++) {
2673 pdev = rmrr->devices[i];
2675 * some BIOSes list non-existent devices in the
2676 * DMAR table.
2678 if (!pdev)
2679 continue;
2680 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2681 if (ret)
2682 printk(KERN_ERR
2683 "IOMMU: mapping reserved region failed\n");
2687 iommu_prepare_isa();
2690 * for each drhd
2691 * enable fault log
2692 * global invalidate context cache
2693 * global invalidate iotlb
2694 * enable translation
2696 for_each_drhd_unit(drhd) {
2697 if (drhd->ignored) {
2699 * we always have to disable PMRs or DMA may fail on
2700 * this device
2702 if (force_on)
2703 iommu_disable_protect_mem_regions(drhd->iommu);
2704 continue;
2706 iommu = drhd->iommu;
2708 iommu_flush_write_buffer(iommu);
2710 ret = dmar_set_interrupt(iommu);
2711 if (ret)
2712 goto error;
2714 iommu_set_root_entry(iommu);
2716 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2717 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2719 ret = iommu_enable_translation(iommu);
2720 if (ret)
2721 goto error;
2723 iommu_disable_protect_mem_regions(iommu);
2726 return 0;
2727 error:
2728 for_each_drhd_unit(drhd) {
2729 if (drhd->ignored)
2730 continue;
2731 iommu = drhd->iommu;
2732 free_iommu(iommu);
2734 kfree(g_iommus);
2735 return ret;
2738 /* This takes a number of _MM_ pages, not VTD pages */
2739 static struct iova *intel_alloc_iova(struct device *dev,
2740 struct dmar_domain *domain,
2741 unsigned long nrpages, uint64_t dma_mask)
2743 struct pci_dev *pdev = to_pci_dev(dev);
2744 struct iova *iova = NULL;
2746 /* Restrict dma_mask to the width that the iommu can handle */
2747 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2749 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2751 * First try to allocate an io virtual address in
2752 * DMA_BIT_MASK(32) and if that fails then try allocating
2753 * from higher range
2755 iova = alloc_iova(&domain->iovad, nrpages,
2756 IOVA_PFN(DMA_BIT_MASK(32)), 1);
2757 if (iova)
2758 return iova;
2760 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2761 if (unlikely(!iova)) {
2762 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2763 nrpages, pci_name(pdev));
2764 return NULL;
2767 return iova;
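/*
 * Illustration (hypothetical 64-bit capable device, dmar_forcedac clear):
 * a 16KiB mapping request becomes
 *
 *	iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(4),
 *				DMA_BIT_MASK(64));
 *
 * which first tries the space below 4GiB (IOVA_PFN(DMA_BIT_MASK(32)))
 * and only then falls back to the full mask.
 */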
2770 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2772 struct dmar_domain *domain;
2773 int ret;
2775 domain = get_domain_for_dev(pdev,
2776 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2777 if (!domain) {
2778 printk(KERN_ERR
2779 "Allocating domain for %s failed", pci_name(pdev));
2780 return NULL;
2783 /* make sure context mapping is ok */
2784 if (unlikely(!domain_context_mapped(pdev))) {
2785 ret = domain_context_mapping(domain, pdev,
2786 CONTEXT_TT_MULTI_LEVEL);
2787 if (ret) {
2788 printk(KERN_ERR
2789 "Domain context map for %s failed",
2790 pci_name(pdev));
2791 return NULL;
2795 return domain;
2798 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2800 struct device_domain_info *info;
2802 /* No lock here, assumes no domain exit in normal case */
2803 info = dev->dev.archdata.iommu;
2804 if (likely(info))
2805 return info->domain;
2807 return __get_valid_domain_for_dev(dev);
2810 static int iommu_dummy(struct pci_dev *pdev)
2812 return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2815 /* Check if the pdev needs to go through the non-identity map/unmap process. */
2816 static int iommu_no_mapping(struct device *dev)
2818 struct pci_dev *pdev;
2819 int found;
2821 if (unlikely(dev->bus != &pci_bus_type))
2822 return 1;
2824 pdev = to_pci_dev(dev);
2825 if (iommu_dummy(pdev))
2826 return 1;
2828 if (!iommu_identity_mapping)
2829 return 0;
2831 found = identity_mapping(pdev);
2832 if (found) {
2833 if (iommu_should_identity_map(pdev, 0))
2834 return 1;
2835 else {
2837 * 32 bit DMA device is removed from si_domain and falls
2838 * back to non-identity mapping.
2840 domain_remove_one_dev_info(si_domain, pdev);
2841 printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2842 pci_name(pdev));
2843 return 0;
2845 } else {
2847 * If a 64 bit DMA device was detached from a VM, put the
2848 * device back into si_domain for identity mapping.
2850 if (iommu_should_identity_map(pdev, 0)) {
2851 int ret;
2852 ret = domain_add_dev_info(si_domain, pdev,
2853 hw_pass_through ?
2854 CONTEXT_TT_PASS_THROUGH :
2855 CONTEXT_TT_MULTI_LEVEL);
2856 if (!ret) {
2857 printk(KERN_INFO "64bit %s uses identity mapping\n",
2858 pci_name(pdev));
2859 return 1;
2864 return 0;
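/*
 * Example of the run-time (!startup) check above, with hypothetical
 * numbers: on a machine with 8GiB of RAM, dma_get_required_mask() covers
 * at least 33 bits, so a device whose dma_mask is DMA_BIT_MASK(32) fails
 * iommu_should_identity_map(pdev, 0), is removed from si_domain on its
 * first mapping and gets ordinary DMA remapping instead.
 */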
2867 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2868 size_t size, int dir, u64 dma_mask)
2870 struct pci_dev *pdev = to_pci_dev(hwdev);
2871 struct dmar_domain *domain;
2872 phys_addr_t start_paddr;
2873 struct iova *iova;
2874 int prot = 0;
2875 int ret;
2876 struct intel_iommu *iommu;
2877 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2879 BUG_ON(dir == DMA_NONE);
2881 if (iommu_no_mapping(hwdev))
2882 return paddr;
2884 domain = get_valid_domain_for_dev(pdev);
2885 if (!domain)
2886 return 0;
2888 iommu = domain_get_iommu(domain);
2889 size = aligned_nrpages(paddr, size);
2891 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2892 if (!iova)
2893 goto error;
2896 * Check if DMAR supports zero-length reads on write-only
2897 * mappings.
2899 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2900 !cap_zlr(iommu->cap))
2901 prot |= DMA_PTE_READ;
2902 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2903 prot |= DMA_PTE_WRITE;
2905 * paddr ~ paddr + size might span a partial page, so we should map the
2906 * whole page. Note: if two parts of one page are mapped separately, we
2907 * might have two guest addresses mapping to the same host paddr, but
2908 * this is not a big problem
2910 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2911 mm_to_dma_pfn(paddr_pfn), size, prot);
2912 if (ret)
2913 goto error;
2915 /* it's a non-present to present mapping. Only flush if caching mode */
2916 if (cap_caching_mode(iommu->cap))
2917 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2918 else
2919 iommu_flush_write_buffer(iommu);
2921 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2922 start_paddr += paddr & ~PAGE_MASK;
2923 return start_paddr;
2925 error:
2926 if (iova)
2927 __free_iova(&domain->iovad, iova);
2928 printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
2929 pci_name(pdev), size, (unsigned long long)paddr, dir);
2930 return 0;
2933 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2934 unsigned long offset, size_t size,
2935 enum dma_data_direction dir,
2936 struct dma_attrs *attrs)
2938 return __intel_map_single(dev, page_to_phys(page) + offset, size,
2939 dir, to_pci_dev(dev)->dma_mask);
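/*
 * Once dma_ops points at intel_dma_ops (see intel_iommu_init() below), an
 * ordinary streaming mapping in a PCI driver lands in __intel_map_single()
 * via intel_map_page().  Hypothetical driver-side sketch:
 *
 *	void *buf = kmalloc(2048, GFP_KERNEL);
 *	dma_addr_t handle = dma_map_single(&pdev->dev, buf, 2048,
 *					   DMA_TO_DEVICE);
 *	... point the device at 'handle' and start the transfer ...
 *	dma_unmap_single(&pdev->dev, handle, 2048, DMA_TO_DEVICE);
 */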
2942 static void flush_unmaps(void)
2944 int i, j;
2946 timer_on = 0;
2948 /* just flush them all */
2949 for (i = 0; i < g_num_of_iommus; i++) {
2950 struct intel_iommu *iommu = g_iommus[i];
2951 if (!iommu)
2952 continue;
2954 if (!deferred_flush[i].next)
2955 continue;
2957 /* In caching mode, global flushes make emulation expensive */
2958 if (!cap_caching_mode(iommu->cap))
2959 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2960 DMA_TLB_GLOBAL_FLUSH);
2961 for (j = 0; j < deferred_flush[i].next; j++) {
2962 unsigned long mask;
2963 struct iova *iova = deferred_flush[i].iova[j];
2964 struct dmar_domain *domain = deferred_flush[i].domain[j];
2966 /* On real hardware multiple invalidations are expensive */
2967 if (cap_caching_mode(iommu->cap))
2968 iommu_flush_iotlb_psi(iommu, domain->id,
2969 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2970 else {
2971 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2972 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2973 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2975 __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2977 deferred_flush[i].next = 0;
2980 list_size = 0;
2983 static void flush_unmaps_timeout(unsigned long data)
2985 unsigned long flags;
2987 spin_lock_irqsave(&async_umap_flush_lock, flags);
2988 flush_unmaps();
2989 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2992 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2994 unsigned long flags;
2995 int next, iommu_id;
2996 struct intel_iommu *iommu;
2998 spin_lock_irqsave(&async_umap_flush_lock, flags);
2999 if (list_size == HIGH_WATER_MARK)
3000 flush_unmaps();
3002 iommu = domain_get_iommu(dom);
3003 iommu_id = iommu->seq_id;
3005 next = deferred_flush[iommu_id].next;
3006 deferred_flush[iommu_id].domain[next] = dom;
3007 deferred_flush[iommu_id].iova[next] = iova;
3008 deferred_flush[iommu_id].next++;
3010 if (!timer_on) {
3011 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3012 timer_on = 1;
3014 list_size++;
3015 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
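/*
 * Lazy unmapping in a nutshell: instead of flushing the IOTLB on every
 * unmap, entries are queued per IOMMU here and released in one batch by
 * flush_unmaps(), either when the 10ms unmap_timer fires or once
 * HIGH_WATER_MARK entries have piled up.  Booting with intel_iommu=strict
 * (intel_iommu_strict) skips this path and flushes synchronously in
 * intel_unmap_page()/intel_unmap_sg().
 */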
3018 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3019 size_t size, enum dma_data_direction dir,
3020 struct dma_attrs *attrs)
3022 struct pci_dev *pdev = to_pci_dev(dev);
3023 struct dmar_domain *domain;
3024 unsigned long start_pfn, last_pfn;
3025 struct iova *iova;
3026 struct intel_iommu *iommu;
3028 if (iommu_no_mapping(dev))
3029 return;
3031 domain = find_domain(pdev);
3032 BUG_ON(!domain);
3034 iommu = domain_get_iommu(domain);
3036 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3037 if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3038 (unsigned long long)dev_addr))
3039 return;
3041 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3042 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3044 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3045 pci_name(pdev), start_pfn, last_pfn);
3047 /* clear the whole page */
3048 dma_pte_clear_range(domain, start_pfn, last_pfn);
3050 /* free page tables */
3051 dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3053 if (intel_iommu_strict) {
3054 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3055 last_pfn - start_pfn + 1, 0);
3056 /* free iova */
3057 __free_iova(&domain->iovad, iova);
3058 } else {
3059 add_unmap(domain, iova);
3061 * queue up the release of the unmap to save the roughly 1/6th of
3062 * the cpu time used up by the iotlb flush operation...
3067 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
3068 dma_addr_t *dma_handle, gfp_t flags)
3070 void *vaddr;
3071 int order;
3073 size = PAGE_ALIGN(size);
3074 order = get_order(size);
3076 if (!iommu_no_mapping(hwdev))
3077 flags &= ~(GFP_DMA | GFP_DMA32);
3078 else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
3079 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
3080 flags |= GFP_DMA;
3081 else
3082 flags |= GFP_DMA32;
3085 vaddr = (void *)__get_free_pages(flags, order);
3086 if (!vaddr)
3087 return NULL;
3088 memset(vaddr, 0, size);
3090 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
3091 DMA_BIDIRECTIONAL,
3092 hwdev->coherent_dma_mask);
3093 if (*dma_handle)
3094 return vaddr;
3095 free_pages((unsigned long)vaddr, order);
3096 return NULL;
3099 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3100 dma_addr_t dma_handle)
3102 int order;
3104 size = PAGE_ALIGN(size);
3105 order = get_order(size);
3107 intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3108 free_pages((unsigned long)vaddr, order);
3111 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3112 int nelems, enum dma_data_direction dir,
3113 struct dma_attrs *attrs)
3115 struct pci_dev *pdev = to_pci_dev(hwdev);
3116 struct dmar_domain *domain;
3117 unsigned long start_pfn, last_pfn;
3118 struct iova *iova;
3119 struct intel_iommu *iommu;
3121 if (iommu_no_mapping(hwdev))
3122 return;
3124 domain = find_domain(pdev);
3125 BUG_ON(!domain);
3127 iommu = domain_get_iommu(domain);
3129 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3130 if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3131 (unsigned long long)sglist[0].dma_address))
3132 return;
3134 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3135 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3137 /* clear the whole page */
3138 dma_pte_clear_range(domain, start_pfn, last_pfn);
3140 /* free page tables */
3141 dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3143 if (intel_iommu_strict) {
3144 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3145 last_pfn - start_pfn + 1, 0);
3146 /* free iova */
3147 __free_iova(&domain->iovad, iova);
3148 } else {
3149 add_unmap(domain, iova);
3151 * queue up the release of the unmap to save the roughly 1/6th of
3152 * the cpu time used up by the iotlb flush operation...
3157 static int intel_nontranslate_map_sg(struct device *hddev,
3158 struct scatterlist *sglist, int nelems, int dir)
3160 int i;
3161 struct scatterlist *sg;
3163 for_each_sg(sglist, sg, nelems, i) {
3164 BUG_ON(!sg_page(sg));
3165 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3166 sg->dma_length = sg->length;
3168 return nelems;
3171 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3172 enum dma_data_direction dir, struct dma_attrs *attrs)
3174 int i;
3175 struct pci_dev *pdev = to_pci_dev(hwdev);
3176 struct dmar_domain *domain;
3177 size_t size = 0;
3178 int prot = 0;
3179 struct iova *iova = NULL;
3180 int ret;
3181 struct scatterlist *sg;
3182 unsigned long start_vpfn;
3183 struct intel_iommu *iommu;
3185 BUG_ON(dir == DMA_NONE);
3186 if (iommu_no_mapping(hwdev))
3187 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3189 domain = get_valid_domain_for_dev(pdev);
3190 if (!domain)
3191 return 0;
3193 iommu = domain_get_iommu(domain);
3195 for_each_sg(sglist, sg, nelems, i)
3196 size += aligned_nrpages(sg->offset, sg->length);
3198 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3199 pdev->dma_mask);
3200 if (!iova) {
3201 sglist->dma_length = 0;
3202 return 0;
3206 * Check if DMAR supports zero-length reads on write-only
3207 * mappings.
3209 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3210 !cap_zlr(iommu->cap))
3211 prot |= DMA_PTE_READ;
3212 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3213 prot |= DMA_PTE_WRITE;
3215 start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3217 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3218 if (unlikely(ret)) {
3219 /* clear the page */
3220 dma_pte_clear_range(domain, start_vpfn,
3221 start_vpfn + size - 1);
3222 /* free page tables */
3223 dma_pte_free_pagetable(domain, start_vpfn,
3224 start_vpfn + size - 1);
3225 /* free iova */
3226 __free_iova(&domain->iovad, iova);
3227 return 0;
3230 /* it's a non-present to present mapping. Only flush if caching mode */
3231 if (cap_caching_mode(iommu->cap))
3232 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3233 else
3234 iommu_flush_write_buffer(iommu);
3236 return nelems;
3239 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3241 return !dma_addr;
3244 struct dma_map_ops intel_dma_ops = {
3245 .alloc_coherent = intel_alloc_coherent,
3246 .free_coherent = intel_free_coherent,
3247 .map_sg = intel_map_sg,
3248 .unmap_sg = intel_unmap_sg,
3249 .map_page = intel_map_page,
3250 .unmap_page = intel_unmap_page,
3251 .mapping_error = intel_mapping_error,
3254 static inline int iommu_domain_cache_init(void)
3256 int ret = 0;
3258 iommu_domain_cache = kmem_cache_create("iommu_domain",
3259 sizeof(struct dmar_domain),
3261 SLAB_HWCACHE_ALIGN,
3263 NULL);
3264 if (!iommu_domain_cache) {
3265 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3266 ret = -ENOMEM;
3269 return ret;
3272 static inline int iommu_devinfo_cache_init(void)
3274 int ret = 0;
3276 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3277 sizeof(struct device_domain_info),
3279 SLAB_HWCACHE_ALIGN,
3280 NULL);
3281 if (!iommu_devinfo_cache) {
3282 printk(KERN_ERR "Couldn't create devinfo cache\n");
3283 ret = -ENOMEM;
3286 return ret;
3289 static inline int iommu_iova_cache_init(void)
3291 int ret = 0;
3293 iommu_iova_cache = kmem_cache_create("iommu_iova",
3294 sizeof(struct iova),
3296 SLAB_HWCACHE_ALIGN,
3297 NULL);
3298 if (!iommu_iova_cache) {
3299 printk(KERN_ERR "Couldn't create iova cache\n");
3300 ret = -ENOMEM;
3303 return ret;
3306 static int __init iommu_init_mempool(void)
3308 int ret;
3309 ret = iommu_iova_cache_init();
3310 if (ret)
3311 return ret;
3313 ret = iommu_domain_cache_init();
3314 if (ret)
3315 goto domain_error;
3317 ret = iommu_devinfo_cache_init();
3318 if (!ret)
3319 return ret;
3321 kmem_cache_destroy(iommu_domain_cache);
3322 domain_error:
3323 kmem_cache_destroy(iommu_iova_cache);
3325 return -ENOMEM;
3328 static void __init iommu_exit_mempool(void)
3330 kmem_cache_destroy(iommu_devinfo_cache);
3331 kmem_cache_destroy(iommu_domain_cache);
3332 kmem_cache_destroy(iommu_iova_cache);
3336 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3338 struct dmar_drhd_unit *drhd;
3339 u32 vtbar;
3340 int rc;
3342 /* We know that this device on this chipset has its own IOMMU.
3343 * If we find it under a different IOMMU, then the BIOS is lying
3344 * to us. Hope that the IOMMU for this device is actually
3345 * disabled, and it needs no translation...
3347 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3348 if (rc) {
3349 /* "can't" happen */
3350 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3351 return;
3353 vtbar &= 0xffff0000;
3355 /* we know that this iommu should be at offset 0xa000 from vtbar */
3356 drhd = dmar_find_matched_drhd_unit(pdev);
3357 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3358 TAINT_FIRMWARE_WORKAROUND,
3359 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3360 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3362 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3364 static void __init init_no_remapping_devices(void)
3366 struct dmar_drhd_unit *drhd;
3368 for_each_drhd_unit(drhd) {
3369 if (!drhd->include_all) {
3370 int i;
3371 for (i = 0; i < drhd->devices_cnt; i++)
3372 if (drhd->devices[i] != NULL)
3373 break;
3374 /* ignore DMAR unit if no pci devices exist */
3375 if (i == drhd->devices_cnt)
3376 drhd->ignored = 1;
3380 if (dmar_map_gfx)
3381 return;
3383 for_each_drhd_unit(drhd) {
3384 int i;
3385 if (drhd->ignored || drhd->include_all)
3386 continue;
3388 for (i = 0; i < drhd->devices_cnt; i++)
3389 if (drhd->devices[i] &&
3390 !IS_GFX_DEVICE(drhd->devices[i]))
3391 break;
3393 if (i < drhd->devices_cnt)
3394 continue;
3396 /* bypass IOMMU if it is just for gfx devices */
3397 drhd->ignored = 1;
3398 for (i = 0; i < drhd->devices_cnt; i++) {
3399 if (!drhd->devices[i])
3400 continue;
3401 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3406 #ifdef CONFIG_SUSPEND
3407 static int init_iommu_hw(void)
3409 struct dmar_drhd_unit *drhd;
3410 struct intel_iommu *iommu = NULL;
3412 for_each_active_iommu(iommu, drhd)
3413 if (iommu->qi)
3414 dmar_reenable_qi(iommu);
3416 for_each_iommu(iommu, drhd) {
3417 if (drhd->ignored) {
3419 * we always have to disable PMRs or DMA may fail on
3420 * this device
3422 if (force_on)
3423 iommu_disable_protect_mem_regions(iommu);
3424 continue;
3427 iommu_flush_write_buffer(iommu);
3429 iommu_set_root_entry(iommu);
3431 iommu->flush.flush_context(iommu, 0, 0, 0,
3432 DMA_CCMD_GLOBAL_INVL);
3433 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3434 DMA_TLB_GLOBAL_FLUSH);
3435 if (iommu_enable_translation(iommu))
3436 return 1;
3437 iommu_disable_protect_mem_regions(iommu);
3440 return 0;
3443 static void iommu_flush_all(void)
3445 struct dmar_drhd_unit *drhd;
3446 struct intel_iommu *iommu;
3448 for_each_active_iommu(iommu, drhd) {
3449 iommu->flush.flush_context(iommu, 0, 0, 0,
3450 DMA_CCMD_GLOBAL_INVL);
3451 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3452 DMA_TLB_GLOBAL_FLUSH);
3456 static int iommu_suspend(void)
3458 struct dmar_drhd_unit *drhd;
3459 struct intel_iommu *iommu = NULL;
3460 unsigned long flag;
3462 for_each_active_iommu(iommu, drhd) {
3463 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3464 GFP_ATOMIC);
3465 if (!iommu->iommu_state)
3466 goto nomem;
3469 iommu_flush_all();
3471 for_each_active_iommu(iommu, drhd) {
3472 iommu_disable_translation(iommu);
3474 spin_lock_irqsave(&iommu->register_lock, flag);
3476 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3477 readl(iommu->reg + DMAR_FECTL_REG);
3478 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3479 readl(iommu->reg + DMAR_FEDATA_REG);
3480 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3481 readl(iommu->reg + DMAR_FEADDR_REG);
3482 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3483 readl(iommu->reg + DMAR_FEUADDR_REG);
3485 spin_unlock_irqrestore(&iommu->register_lock, flag);
3487 return 0;
3489 nomem:
3490 for_each_active_iommu(iommu, drhd)
3491 kfree(iommu->iommu_state);
3493 return -ENOMEM;
3496 static void iommu_resume(void)
3498 struct dmar_drhd_unit *drhd;
3499 struct intel_iommu *iommu = NULL;
3500 unsigned long flag;
3502 if (init_iommu_hw()) {
3503 if (force_on)
3504 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3505 else
3506 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3507 return;
3510 for_each_active_iommu(iommu, drhd) {
3512 spin_lock_irqsave(&iommu->register_lock, flag);
3514 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3515 iommu->reg + DMAR_FECTL_REG);
3516 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3517 iommu->reg + DMAR_FEDATA_REG);
3518 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3519 iommu->reg + DMAR_FEADDR_REG);
3520 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3521 iommu->reg + DMAR_FEUADDR_REG);
3523 spin_unlock_irqrestore(&iommu->register_lock, flag);
3526 for_each_active_iommu(iommu, drhd)
3527 kfree(iommu->iommu_state);
3530 static struct syscore_ops iommu_syscore_ops = {
3531 .resume = iommu_resume,
3532 .suspend = iommu_suspend,
3535 static void __init init_iommu_pm_ops(void)
3537 register_syscore_ops(&iommu_syscore_ops);
3540 #else
3541 static inline void init_iommu_pm_ops(void) {}
3542 #endif /* CONFIG_SUSPEND */
3545 * Here we only respond to the action of a device being unbound from its driver.
3547 * An added device is not attached to its DMAR domain here yet. That will happen
3548 * when the device is mapped to an iova.
3550 static int device_notifier(struct notifier_block *nb,
3551 unsigned long action, void *data)
3553 struct device *dev = data;
3554 struct pci_dev *pdev = to_pci_dev(dev);
3555 struct dmar_domain *domain;
3557 if (unlikely(dev->bus != &pci_bus_type))
3558 return 0;
3560 switch (action) {
3561 case BUS_NOTIFY_UNBOUND_DRIVER:
3562 if (iommu_no_mapping(dev))
3563 goto out;
3565 if (iommu_pass_through)
3566 goto out;
3568 domain = find_domain(pdev);
3569 if (!domain)
3570 goto out;
3572 domain_remove_one_dev_info(domain, pdev);
3574 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3575 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3576 list_empty(&domain->devices))
3577 domain_exit(domain);
3578 out:
3579 remove_dev_from_drhd(pdev);
3580 remove_dev_from_atsr(pdev);
3582 break;
3583 case BUS_NOTIFY_ADD_DEVICE:
3584 restore_dev_to_drhd(pdev);
3585 restore_dev_to_atsr(pdev);
3586 break;
3589 return 0;
3592 static struct notifier_block device_nb = {
3593 .notifier_call = device_notifier,
3596 int __init intel_iommu_init(void)
3598 int ret = 0;
3600 /* VT-d is required for a TXT/tboot launch, so enforce that */
3601 force_on = tboot_force_iommu();
3603 if (dmar_table_init()) {
3604 if (force_on)
3605 panic("tboot: Failed to initialize DMAR table\n");
3606 return -ENODEV;
3609 if (dmar_dev_scope_init()) {
3610 if (force_on)
3611 panic("tboot: Failed to initialize DMAR device scope\n");
3612 return -ENODEV;
3616 * Check the need for DMA-remapping initialization now.
3617 * The above initialization will also be used by interrupt remapping.
3619 if (no_iommu || dmar_disabled)
3620 return -ENODEV;
3622 if (iommu_init_mempool()) {
3623 if (force_on)
3624 panic("tboot: Failed to initialize iommu memory\n");
3625 return -ENODEV;
3628 if (dmar_init_reserved_ranges()) {
3629 if (force_on)
3630 panic("tboot: Failed to reserve iommu ranges\n");
3631 return -ENODEV;
3634 init_no_remapping_devices();
3636 ret = init_dmars();
3637 if (ret) {
3638 if (force_on)
3639 panic("tboot: Failed to initialize DMARs\n");
3640 printk(KERN_ERR "IOMMU: dmar init failed\n");
3641 put_iova_domain(&reserved_iova_list);
3642 iommu_exit_mempool();
3643 return ret;
3645 printk(KERN_INFO
3646 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3648 init_timer(&unmap_timer);
3649 #ifdef CONFIG_SWIOTLB
3650 swiotlb = 0;
3651 #endif
3652 dma_ops = &intel_dma_ops;
3654 init_iommu_pm_ops();
3656 register_iommu(&intel_iommu_ops);
3658 bus_register_notifier(&pci_bus_type, &device_nb);
3660 return 0;
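/*
 * The behaviour set up above can be steered from the kernel command line;
 * the usual knobs (parsed elsewhere in this file and in arch code) are:
 *
 *	intel_iommu=off		disable DMA remapping (dmar_disabled)
 *	intel_iommu=igfx_off	leave graphics untranslated (dmar_map_gfx = 0)
 *	intel_iommu=forcedac	don't prefer IOVAs below 4GiB (dmar_forcedac)
 *	intel_iommu=strict	synchronous IOTLB flush on unmap
 *	iommu=pt		identity map ("pass through") all devices
 */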
3663 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3664 struct pci_dev *pdev)
3666 struct pci_dev *tmp, *parent;
3668 if (!iommu || !pdev)
3669 return;
3671 /* dependent device detach */
3672 tmp = pci_find_upstream_pcie_bridge(pdev);
3673 /* Secondary interface's bus number and devfn 0 */
3674 if (tmp) {
3675 parent = pdev->bus->self;
3676 while (parent != tmp) {
3677 iommu_detach_dev(iommu, parent->bus->number,
3678 parent->devfn);
3679 parent = parent->bus->self;
3681 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3682 iommu_detach_dev(iommu,
3683 tmp->subordinate->number, 0);
3684 else /* this is a legacy PCI bridge */
3685 iommu_detach_dev(iommu, tmp->bus->number,
3686 tmp->devfn);
3690 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3691 struct pci_dev *pdev)
3693 struct device_domain_info *info;
3694 struct intel_iommu *iommu;
3695 unsigned long flags;
3696 int found = 0;
3697 struct list_head *entry, *tmp;
3699 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3700 pdev->devfn);
3701 if (!iommu)
3702 return;
3704 spin_lock_irqsave(&device_domain_lock, flags);
3705 list_for_each_safe(entry, tmp, &domain->devices) {
3706 info = list_entry(entry, struct device_domain_info, link);
3707 if (info->segment == pci_domain_nr(pdev->bus) &&
3708 info->bus == pdev->bus->number &&
3709 info->devfn == pdev->devfn) {
3710 list_del(&info->link);
3711 list_del(&info->global);
3712 if (info->dev)
3713 info->dev->dev.archdata.iommu = NULL;
3714 spin_unlock_irqrestore(&device_domain_lock, flags);
3716 iommu_disable_dev_iotlb(info);
3717 iommu_detach_dev(iommu, info->bus, info->devfn);
3718 iommu_detach_dependent_devices(iommu, pdev);
3719 free_devinfo_mem(info);
3721 spin_lock_irqsave(&device_domain_lock, flags);
3723 if (found)
3724 break;
3725 else
3726 continue;
3729 /* if there are no other devices under the same iommu
3730 * owned by this domain, clear this iommu in iommu_bmp,
3731 * update iommu count and coherency
3733 if (iommu == device_to_iommu(info->segment, info->bus,
3734 info->devfn))
3735 found = 1;
3738 if (found == 0) {
3739 unsigned long tmp_flags;
3740 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3741 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3742 domain->iommu_count--;
3743 domain_update_iommu_cap(domain);
3744 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3746 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3747 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3748 spin_lock_irqsave(&iommu->lock, tmp_flags);
3749 clear_bit(domain->id, iommu->domain_ids);
3750 iommu->domains[domain->id] = NULL;
3751 spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3755 spin_unlock_irqrestore(&device_domain_lock, flags);
3758 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3760 struct device_domain_info *info;
3761 struct intel_iommu *iommu;
3762 unsigned long flags1, flags2;
3764 spin_lock_irqsave(&device_domain_lock, flags1);
3765 while (!list_empty(&domain->devices)) {
3766 info = list_entry(domain->devices.next,
3767 struct device_domain_info, link);
3768 list_del(&info->link);
3769 list_del(&info->global);
3770 if (info->dev)
3771 info->dev->dev.archdata.iommu = NULL;
3773 spin_unlock_irqrestore(&device_domain_lock, flags1);
3775 iommu_disable_dev_iotlb(info);
3776 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3777 iommu_detach_dev(iommu, info->bus, info->devfn);
3778 iommu_detach_dependent_devices(iommu, info->dev);
3780 /* clear this iommu in iommu_bmp, update iommu count
3781 * and capabilities
3783 spin_lock_irqsave(&domain->iommu_lock, flags2);
3784 if (test_and_clear_bit(iommu->seq_id,
3785 &domain->iommu_bmp)) {
3786 domain->iommu_count--;
3787 domain_update_iommu_cap(domain);
3789 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3791 free_devinfo_mem(info);
3792 spin_lock_irqsave(&device_domain_lock, flags1);
3794 spin_unlock_irqrestore(&device_domain_lock, flags1);
3798 /* domain id for a virtual machine; it won't be set in the context entry */
3798 static unsigned long vm_domid;
3800 static struct dmar_domain *iommu_alloc_vm_domain(void)
3802 struct dmar_domain *domain;
3804 domain = alloc_domain_mem();
3805 if (!domain)
3806 return NULL;
3808 domain->id = vm_domid++;
3809 domain->nid = -1;
3810 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3811 domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3813 return domain;
3816 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3818 int adjust_width;
3820 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3821 spin_lock_init(&domain->iommu_lock);
3823 domain_reserve_special_ranges(domain);
3825 /* calculate AGAW */
3826 domain->gaw = guest_width;
3827 adjust_width = guestwidth_to_adjustwidth(guest_width);
3828 domain->agaw = width_to_agaw(adjust_width);
3830 INIT_LIST_HEAD(&domain->devices);
3832 domain->iommu_count = 0;
3833 domain->iommu_coherency = 0;
3834 domain->iommu_snooping = 0;
3835 domain->iommu_superpage = 0;
3836 domain->max_addr = 0;
3837 domain->nid = -1;
3839 /* always allocate the top pgd */
3840 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3841 if (!domain->pgd)
3842 return -ENOMEM;
3843 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3844 return 0;
3847 static void iommu_free_vm_domain(struct dmar_domain *domain)
3849 unsigned long flags;
3850 struct dmar_drhd_unit *drhd;
3851 struct intel_iommu *iommu;
3852 unsigned long i;
3853 unsigned long ndomains;
3855 for_each_drhd_unit(drhd) {
3856 if (drhd->ignored)
3857 continue;
3858 iommu = drhd->iommu;
3860 ndomains = cap_ndoms(iommu->cap);
3861 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3862 if (iommu->domains[i] == domain) {
3863 spin_lock_irqsave(&iommu->lock, flags);
3864 clear_bit(i, iommu->domain_ids);
3865 iommu->domains[i] = NULL;
3866 spin_unlock_irqrestore(&iommu->lock, flags);
3867 break;
3873 static void vm_domain_exit(struct dmar_domain *domain)
3875 /* Domain 0 is reserved, so don't process it */
3876 if (!domain)
3877 return;
3879 vm_domain_remove_all_dev_info(domain);
3880 /* destroy iovas */
3881 put_iova_domain(&domain->iovad);
3883 /* clear ptes */
3884 dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3886 /* free page tables */
3887 dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3889 iommu_free_vm_domain(domain);
3890 free_domain_mem(domain);
3893 static int intel_iommu_domain_init(struct iommu_domain *domain)
3895 struct dmar_domain *dmar_domain;
3897 dmar_domain = iommu_alloc_vm_domain();
3898 if (!dmar_domain) {
3899 printk(KERN_ERR
3900 "intel_iommu_domain_init: dmar_domain == NULL\n");
3901 return -ENOMEM;
3903 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3904 printk(KERN_ERR
3905 "intel_iommu_domain_init() failed\n");
3906 vm_domain_exit(dmar_domain);
3907 return -ENOMEM;
3909 domain->priv = dmar_domain;
3911 return 0;
3914 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3916 struct dmar_domain *dmar_domain = domain->priv;
3918 domain->priv = NULL;
3919 vm_domain_exit(dmar_domain);
3922 static int intel_iommu_attach_device(struct iommu_domain *domain,
3923 struct device *dev)
3925 struct dmar_domain *dmar_domain = domain->priv;
3926 struct pci_dev *pdev = to_pci_dev(dev);
3927 struct intel_iommu *iommu;
3928 int addr_width;
3930 /* normally pdev is not mapped */
3931 if (unlikely(domain_context_mapped(pdev))) {
3932 struct dmar_domain *old_domain;
3934 old_domain = find_domain(pdev);
3935 if (old_domain) {
3936 if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3937 dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3938 domain_remove_one_dev_info(old_domain, pdev);
3939 else
3940 domain_remove_dev_info(old_domain);
3944 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3945 pdev->devfn);
3946 if (!iommu)
3947 return -ENODEV;
3949 /* check if this iommu agaw is sufficient for max mapped address */
3950 addr_width = agaw_to_width(iommu->agaw);
3951 if (addr_width > cap_mgaw(iommu->cap))
3952 addr_width = cap_mgaw(iommu->cap);
3954 if (dmar_domain->max_addr > (1LL << addr_width)) {
3955 printk(KERN_ERR "%s: iommu width (%d) is not "
3956 "sufficient for the mapped address (%llx)\n",
3957 __func__, addr_width, dmar_domain->max_addr);
3958 return -EFAULT;
3960 dmar_domain->gaw = addr_width;
3963 * Knock out extra levels of page tables if necessary
3965 while (iommu->agaw < dmar_domain->agaw) {
3966 struct dma_pte *pte;
3968 pte = dmar_domain->pgd;
3969 if (dma_pte_present(pte)) {
3970 dmar_domain->pgd = (struct dma_pte *)
3971 phys_to_virt(dma_pte_addr(pte));
3972 free_pgtable_page(pte);
3974 dmar_domain->agaw--;
3977 return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3980 static void intel_iommu_detach_device(struct iommu_domain *domain,
3981 struct device *dev)
3983 struct dmar_domain *dmar_domain = domain->priv;
3984 struct pci_dev *pdev = to_pci_dev(dev);
3986 domain_remove_one_dev_info(dmar_domain, pdev);
3989 static int intel_iommu_map(struct iommu_domain *domain,
3990 unsigned long iova, phys_addr_t hpa,
3991 int gfp_order, int iommu_prot)
3993 struct dmar_domain *dmar_domain = domain->priv;
3994 u64 max_addr;
3995 int prot = 0;
3996 size_t size;
3997 int ret;
3999 if (iommu_prot & IOMMU_READ)
4000 prot |= DMA_PTE_READ;
4001 if (iommu_prot & IOMMU_WRITE)
4002 prot |= DMA_PTE_WRITE;
4003 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4004 prot |= DMA_PTE_SNP;
4006 size = PAGE_SIZE << gfp_order;
4007 max_addr = iova + size;
4008 if (dmar_domain->max_addr < max_addr) {
4009 u64 end;
4011 /* check if minimum agaw is sufficient for mapped address */
4012 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4013 if (end < max_addr) {
4014 printk(KERN_ERR "%s: iommu width (%d) is not "
4015 "sufficient for the mapped address (%llx)\n",
4016 __func__, dmar_domain->gaw, max_addr);
4017 return -EFAULT;
4019 dmar_domain->max_addr = max_addr;
4021 /* Round size up to the next multiple of PAGE_SIZE if it, together
4022 with the low bits of hpa, would take us onto the next page */
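/*
 * e.g. (hypothetical values) hpa = 0x12340234 and size = 0x2000: the low
 * 0x234 bits push the end of the range onto a third page, so
 * aligned_nrpages(0x12340234, 0x2000) returns 3 and the whole of that
 * third VT-d page is mapped as well.
 */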
4023 size = aligned_nrpages(hpa, size);
4024 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4025 hpa >> VTD_PAGE_SHIFT, size, prot);
4026 return ret;
4029 static int intel_iommu_unmap(struct iommu_domain *domain,
4030 unsigned long iova, int gfp_order)
4032 struct dmar_domain *dmar_domain = domain->priv;
4033 size_t size = PAGE_SIZE << gfp_order;
4035 dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4036 (iova + size - 1) >> VTD_PAGE_SHIFT);
4038 if (dmar_domain->max_addr == iova + size)
4039 dmar_domain->max_addr = iova;
4041 return gfp_order;
4044 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4045 unsigned long iova)
4047 struct dmar_domain *dmar_domain = domain->priv;
4048 struct dma_pte *pte;
4049 u64 phys = 0;
4051 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4052 if (pte)
4053 phys = dma_pte_addr(pte);
4055 return phys;
4058 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4059 unsigned long cap)
4061 struct dmar_domain *dmar_domain = domain->priv;
4063 if (cap == IOMMU_CAP_CACHE_COHERENCY)
4064 return dmar_domain->iommu_snooping;
4065 if (cap == IOMMU_CAP_INTR_REMAP)
4066 return intr_remapping_enabled;
4068 return 0;
4071 static struct iommu_ops intel_iommu_ops = {
4072 .domain_init = intel_iommu_domain_init,
4073 .domain_destroy = intel_iommu_domain_destroy,
4074 .attach_dev = intel_iommu_attach_device,
4075 .detach_dev = intel_iommu_detach_device,
4076 .map = intel_iommu_map,
4077 .unmap = intel_iommu_unmap,
4078 .iova_to_phys = intel_iommu_iova_to_phys,
4079 .domain_has_cap = intel_iommu_domain_has_cap,
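/*
 * These callbacks back the generic IOMMU API once register_iommu() has run
 * (see intel_iommu_init()).  A minimal, hypothetical consumer would look
 * roughly like:
 *
 *	struct iommu_domain *dom = iommu_domain_alloc();
 *	iommu_attach_device(dom, &pdev->dev);
 *	iommu_map(dom, iova, hpa, 0, IOMMU_READ | IOMMU_WRITE);  (order 0 == one 4KiB page)
 *	...
 *	iommu_unmap(dom, iova, 0);
 *	iommu_detach_device(dom, &pdev->dev);
 *	iommu_domain_free(dom);
 */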
4082 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
4085 * Mobile 4 Series Chipset neglects to set RWBF capability,
4086 * but needs it:
4088 printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4089 rwbf_quirk = 1;
4091 /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
4092 if (dev->revision == 0x07) {
4093 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4094 dmar_map_gfx = 0;
4098 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4100 #define GGC 0x52
4101 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4102 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4103 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4104 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4105 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4106 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4107 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4108 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4110 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4112 unsigned short ggc;
4114 if (pci_read_config_word(dev, GGC, &ggc))
4115 return;
4117 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4118 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4119 dmar_map_gfx = 0;
4122 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4123 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4124 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4125 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4127 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4128 ISOCH DMAR unit for the Azalia sound device, but not give it any
4129 TLB entries, which causes it to deadlock. Check for that. We do
4130 this in a function called from init_dmars(), instead of in a PCI
4131 quirk, because we don't want to print the obnoxious "BIOS broken"
4132 message if VT-d is actually disabled.
4134 static void __init check_tylersburg_isoch(void)
4136 struct pci_dev *pdev;
4137 uint32_t vtisochctrl;
4139 /* If there's no Azalia in the system anyway, forget it. */
4140 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4141 if (!pdev)
4142 return;
4143 pci_dev_put(pdev);
4145 /* System Management Registers. Might be hidden, in which case
4146 we can't do the sanity check. But that's OK, because the
4147 known-broken BIOSes _don't_ actually hide it, so far. */
4148 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4149 if (!pdev)
4150 return;
4152 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4153 pci_dev_put(pdev);
4154 return;
4157 pci_dev_put(pdev);
4159 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4160 if (vtisochctrl & 1)
4161 return;
4163 /* Drop all bits other than the number of TLB entries */
4164 vtisochctrl &= 0x1c;
4166 /* If we have the recommended number of TLB entries (16), fine. */
4167 if (vtisochctrl == 0x10)
4168 return;
4170 /* Zero TLB entries? You get to ride the short bus to school. */
4171 if (!vtisochctrl) {
4172 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4173 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4174 dmi_get_system_info(DMI_BIOS_VENDOR),
4175 dmi_get_system_info(DMI_BIOS_VERSION),
4176 dmi_get_system_info(DMI_PRODUCT_VERSION));
4177 iommu_identity_mapping |= IDENTMAP_AZALIA;
4178 return;
4181 printk(KERN_WARNING "DMAR: Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4182 vtisochctrl);