linux/fpc-iii.git: drivers/iommu/intel-iommu.c (blob 41a4b8808802b8bcc30106c37b748eb3507943c0)
1 /*
2 * Copyright © 2006-2014 Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
13 * Authors: David Woodhouse <dwmw2@infradead.org>,
14 * Ashok Raj <ashok.raj@intel.com>,
15 * Shaohua Li <shaohua.li@intel.com>,
16 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17 * Fenghua Yu <fenghua.yu@intel.com>
18 * Joerg Roedel <jroedel@suse.de>
21 #define pr_fmt(fmt) "DMAR: " fmt
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/export.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/memory.h>
36 #include <linux/cpu.h>
37 #include <linux/timer.h>
38 #include <linux/io.h>
39 #include <linux/iova.h>
40 #include <linux/iommu.h>
41 #include <linux/intel-iommu.h>
42 #include <linux/syscore_ops.h>
43 #include <linux/tboot.h>
44 #include <linux/dmi.h>
45 #include <linux/pci-ats.h>
46 #include <linux/memblock.h>
47 #include <linux/dma-contiguous.h>
48 #include <linux/dma-direct.h>
49 #include <linux/crash_dump.h>
50 #include <asm/irq_remapping.h>
51 #include <asm/cacheflush.h>
52 #include <asm/iommu.h>
54 #include "irq_remapping.h"
55 #include "intel-pasid.h"
57 #define ROOT_SIZE VTD_PAGE_SIZE
58 #define CONTEXT_SIZE VTD_PAGE_SIZE
60 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
61 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
62 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
63 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
65 #define IOAPIC_RANGE_START (0xfee00000)
66 #define IOAPIC_RANGE_END (0xfeefffff)
67 #define IOVA_START_ADDR (0x1000)
69 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
71 #define MAX_AGAW_WIDTH 64
72 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
74 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
75 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
77 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
78 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
79 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
80 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
81 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
83 /* IO virtual address start page frame number */
84 #define IOVA_START_PFN (1)
86 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
88 /* page table handling */
89 #define LEVEL_STRIDE (9)
90 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
93 * This bitmap is used to advertise the page sizes our hardware supports
94 * to the IOMMU core, which will then use this information to split
95 * physically contiguous memory regions it is mapping into page sizes
96 * that we support.
98 * Traditionally the IOMMU core just handed us the mappings directly,
99 * after making sure the size is an order of a 4KiB page and that the
100 * mapping has natural alignment.
102 * To retain this behavior, we currently advertise that we support
103 * all page sizes that are an order of 4KiB.
105 * If at some point we'd like to utilize the IOMMU core's new behavior,
106 * we could change this to advertise the real page sizes we support.
108 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
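/*
 * Worked example (illustrative, assuming the usual 4KiB base page,
 * i.e. VTD_PAGE_SHIFT == 12): ~0xFFFUL leaves bits 12..BITS_PER_LONG-1
 * set, so every power-of-two size that is a multiple of 4KiB is
 * advertised:
 *
 *	4KiB -> bit 12 -> advertised
 *	2MiB -> bit 21 -> advertised
 *	1GiB -> bit 30 -> advertised
 *	2KiB -> bit 11 -> not advertised (smaller than the base page)
 *
 * The IOMMU core consults this bitmap to choose the chunk sizes it
 * hands to our map callback.
 */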
110 static inline int agaw_to_level(int agaw)
112 return agaw + 2;
115 static inline int agaw_to_width(int agaw)
117 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
120 static inline int width_to_agaw(int width)
122 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
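/*
 * Illustrative sketch of the AGAW arithmetic above, using the 9-bit
 * LEVEL_STRIDE defined earlier:
 *
 *	agaw 1 -> 3-level table -> agaw_to_width() == 39 bits
 *	agaw 2 -> 4-level table -> agaw_to_width() == 48 bits
 *	agaw 3 -> 5-level table -> agaw_to_width() == 57 bits
 *
 * width_to_agaw() inverts the mapping, e.g. width_to_agaw(48) ==
 * DIV_ROUND_UP(48 - 30, 9) == 2.
 */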
125 static inline unsigned int level_to_offset_bits(int level)
127 return (level - 1) * LEVEL_STRIDE;
130 static inline int pfn_level_offset(unsigned long pfn, int level)
132 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
135 static inline unsigned long level_mask(int level)
137 return -1UL << level_to_offset_bits(level);
140 static inline unsigned long level_size(int level)
142 return 1UL << level_to_offset_bits(level);
145 static inline unsigned long align_to_level(unsigned long pfn, int level)
147 return (pfn + level_size(level) - 1) & level_mask(level);
150 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
152 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
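/*
 * Worked example for the level helpers above (illustrative): for
 * pfn 0x12345 the per-level table indexes are
 *
 *	pfn_level_offset(0x12345, 1) ==  0x12345        & 0x1ff == 0x145
 *	pfn_level_offset(0x12345, 2) == (0x12345 >> 9)  & 0x1ff == 0x91
 *	pfn_level_offset(0x12345, 3) == (0x12345 >> 18) & 0x1ff == 0
 *
 * while level_size(2) == lvl_to_nr_pages(2) == 512 pages, i.e. 2MiB
 * with 4KiB pages.
 */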
155 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
156 are never going to work. */
157 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
159 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
162 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
164 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
166 static inline unsigned long page_to_dma_pfn(struct page *pg)
168 return mm_to_dma_pfn(page_to_pfn(pg));
170 static inline unsigned long virt_to_dma_pfn(void *p)
172 return page_to_dma_pfn(virt_to_page(p));
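/*
 * Illustrative example for the pfn conversions above: with 4KiB kernel
 * pages (PAGE_SHIFT == 12) the two pfn spaces coincide and both
 * helpers are identity operations. On a hypothetical kernel built with
 * 64KiB pages (PAGE_SHIFT == 16), one MM pfn would cover 16 VT-d pfns,
 * so mm_to_dma_pfn(pfn) == pfn << 4 and dma_to_mm_pfn(pfn) == pfn >> 4.
 */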
175 /* global iommu list, set NULL for ignored DMAR units */
176 static struct intel_iommu **g_iommus;
178 static void __init check_tylersburg_isoch(void);
179 static int rwbf_quirk;
182 * set to 1 to panic kernel if can't successfully enable VT-d
183 * (used when kernel is launched w/ TXT)
185 static int force_on = 0;
186 int intel_iommu_tboot_noforce;
188 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
191 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
192 * if marked present.
194 static phys_addr_t root_entry_lctp(struct root_entry *re)
196 if (!(re->lo & 1))
197 return 0;
199 return re->lo & VTD_PAGE_MASK;
203 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
204 * if marked present.
206 static phys_addr_t root_entry_uctp(struct root_entry *re)
208 if (!(re->hi & 1))
209 return 0;
211 return re->hi & VTD_PAGE_MASK;
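/*
 * Worked example (illustrative): a root entry whose re->lo is
 * 0x00000001234f5001 has the present bit (bit 0) set, so
 * root_entry_lctp() returns 0x00000001234f5000, the 4KiB-aligned
 * physical address of the context table. With extended context tables
 * the lower pointer covers devfns 0x00-0x7f and root_entry_uctp()
 * gives the table for devfns 0x80-0xff; in legacy mode re->lo covers
 * the whole bus.
 */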
214 static inline void context_clear_pasid_enable(struct context_entry *context)
216 context->lo &= ~(1ULL << 11);
219 static inline bool context_pasid_enabled(struct context_entry *context)
221 return !!(context->lo & (1ULL << 11));
224 static inline void context_set_copied(struct context_entry *context)
226 context->hi |= (1ull << 3);
229 static inline bool context_copied(struct context_entry *context)
231 return !!(context->hi & (1ULL << 3));
234 static inline bool __context_present(struct context_entry *context)
236 return (context->lo & 1);
239 bool context_present(struct context_entry *context)
241 return context_pasid_enabled(context) ?
242 __context_present(context) :
243 __context_present(context) && !context_copied(context);
246 static inline void context_set_present(struct context_entry *context)
248 context->lo |= 1;
251 static inline void context_set_fault_enable(struct context_entry *context)
253 context->lo &= (((u64)-1) << 2) | 1;
256 static inline void context_set_translation_type(struct context_entry *context,
257 unsigned long value)
259 context->lo &= (((u64)-1) << 4) | 3;
260 context->lo |= (value & 3) << 2;
263 static inline void context_set_address_root(struct context_entry *context,
264 unsigned long value)
266 context->lo &= ~VTD_PAGE_MASK;
267 context->lo |= value & VTD_PAGE_MASK;
270 static inline void context_set_address_width(struct context_entry *context,
271 unsigned long value)
273 context->hi |= value & 7;
276 static inline void context_set_domain_id(struct context_entry *context,
277 unsigned long value)
279 context->hi |= (value & ((1 << 16) - 1)) << 8;
282 static inline int context_domain_id(struct context_entry *c)
284 return((c->hi >> 8) & 0xffff);
287 static inline void context_clear_entry(struct context_entry *context)
289 context->lo = 0;
290 context->hi = 0;
294 * 0: readable
295 * 1: writable
296 * 2-6: reserved
297 * 7: super page
298 * 8-10: available
299 * 11: snoop behavior
300 * 12-63: Host physical address
302 struct dma_pte {
303 u64 val;
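/*
 * Illustrative example of the layout above: a leaf PTE mapping host
 * physical address 0x12340000 with read and write permission holds
 *
 *	val == 0x12340000 | DMA_PTE_READ | DMA_PTE_WRITE == 0x12340003
 *
 * A superpage mapping additionally sets bit 7 (DMA_PTE_LARGE_PAGE),
 * and bit 11 (DMA_PTE_SNP) requests snooping where the IOMMU
 * supports it.
 */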
306 static inline void dma_clear_pte(struct dma_pte *pte)
308 pte->val = 0;
311 static inline u64 dma_pte_addr(struct dma_pte *pte)
313 #ifdef CONFIG_64BIT
314 return pte->val & VTD_PAGE_MASK;
315 #else
316 /* Must have a full atomic 64-bit read */
317 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
318 #endif
321 static inline bool dma_pte_present(struct dma_pte *pte)
323 return (pte->val & 3) != 0;
326 static inline bool dma_pte_superpage(struct dma_pte *pte)
328 return (pte->val & DMA_PTE_LARGE_PAGE);
331 static inline int first_pte_in_page(struct dma_pte *pte)
333 return !((unsigned long)pte & ~VTD_PAGE_MASK);
337 * This domain is a static identity mapping domain.
338 * 1. This domain creates a static 1:1 mapping to all usable memory.
339 * 2. It maps to each iommu if successful.
340 * 3. Each iommu maps to this domain if successful.
342 static struct dmar_domain *si_domain;
343 static int hw_pass_through = 1;
346 * Domain represents a virtual machine; more than one device
347 * across iommus may be owned by one domain, e.g. kvm guest.
349 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 0)
351 /* si_domain contains multiple devices */
352 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 1)
354 #define for_each_domain_iommu(idx, domain) \
355 for (idx = 0; idx < g_num_of_iommus; idx++) \
356 if (domain->iommu_refcnt[idx])
358 struct dmar_rmrr_unit {
359 struct list_head list; /* list of rmrr units */
360 struct acpi_dmar_header *hdr; /* ACPI header */
361 u64 base_address; /* reserved base address*/
362 u64 end_address; /* reserved end address */
363 struct dmar_dev_scope *devices; /* target devices */
364 int devices_cnt; /* target device count */
365 struct iommu_resv_region *resv; /* reserved region handle */
368 struct dmar_atsr_unit {
369 struct list_head list; /* list of ATSR units */
370 struct acpi_dmar_header *hdr; /* ACPI header */
371 struct dmar_dev_scope *devices; /* target devices */
372 int devices_cnt; /* target device count */
373 u8 include_all:1; /* include all ports */
376 static LIST_HEAD(dmar_atsr_units);
377 static LIST_HEAD(dmar_rmrr_units);
379 #define for_each_rmrr_units(rmrr) \
380 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
382 /* bitmap for indexing intel_iommus */
383 static int g_num_of_iommus;
385 static void domain_exit(struct dmar_domain *domain);
386 static void domain_remove_dev_info(struct dmar_domain *domain);
387 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
388 struct device *dev);
389 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
390 static void domain_context_clear(struct intel_iommu *iommu,
391 struct device *dev);
392 static int domain_detach_iommu(struct dmar_domain *domain,
393 struct intel_iommu *iommu);
395 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
396 int dmar_disabled = 0;
397 #else
398 int dmar_disabled = 1;
399 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
401 int intel_iommu_enabled = 0;
402 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
404 static int dmar_map_gfx = 1;
405 static int dmar_forcedac;
406 static int intel_iommu_strict;
407 static int intel_iommu_superpage = 1;
408 static int intel_iommu_ecs = 1;
409 static int intel_iommu_pasid28;
410 static int iommu_identity_mapping;
412 #define IDENTMAP_ALL 1
413 #define IDENTMAP_GFX 2
414 #define IDENTMAP_AZALIA 4
416 /* Broadwell and Skylake have broken ECS support — normal so-called "second
417 * level" translation of DMA requests-without-PASID doesn't actually happen
418 * unless you also set the NESTE bit in an extended context-entry. Which of
419 * course means that SVM doesn't work because it's trying to do nested
420 * translation of the physical addresses it finds in the process page tables,
421 * through the IOVA->phys mapping found in the "second level" page tables.
423 * The VT-d specification was retroactively changed to change the definition
424 * of the capability bits and pretend that Broadwell/Skylake never happened...
425 * but unfortunately the wrong bit was changed. It's ECS which is broken, but
426 * for some reason it was the PASID capability bit which was redefined (from
427 * bit 28 on BDW/SKL to bit 40 in future).
429 * So our test for ECS needs to eschew those implementations which set the old
430 * PASID capability bit 28, since those are the ones on which ECS is broken.
431 * Unless we are working around the 'pasid28' limitations, that is, by putting
432 * the device into passthrough mode for normal DMA and thus masking the bug.
434 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
435 (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap)))
436 /* PASID support is thus enabled if ECS is enabled and *either* of the old
437 * or new capability bits are set. */
438 #define pasid_enabled(iommu) (ecs_enabled(iommu) && \
439 (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap)))
441 int intel_iommu_gfx_mapped;
442 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
444 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
445 static DEFINE_SPINLOCK(device_domain_lock);
446 static LIST_HEAD(device_domain_list);
449 * Iterate over elements in device_domain_list and call the specified
450 * callback @fn against each element. This helper should only be used
451 * in a context where the device_domain_lock is already held.
453 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
454 void *data), void *data)
456 int ret = 0;
457 struct device_domain_info *info;
459 assert_spin_locked(&device_domain_lock);
460 list_for_each_entry(info, &device_domain_list, global) {
461 ret = fn(info, data);
462 if (ret)
463 return ret;
466 return 0;
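/*
 * Usage sketch for the iterator above (illustrative only; the callback
 * and variable names are made up). The walk stops at the first
 * non-zero return value, which is passed back to the caller, and the
 * caller must already hold device_domain_lock:
 *
 *	static int match_bus_cb(struct device_domain_info *info, void *data)
 *	{
 *		return info->bus == *(u8 *)data;
 *	}
 *
 *	spin_lock_irqsave(&device_domain_lock, flags);
 *	ret = for_each_device_domain(match_bus_cb, &bus);
 *	spin_unlock_irqrestore(&device_domain_lock, flags);
 */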
469 const struct iommu_ops intel_iommu_ops;
471 static bool translation_pre_enabled(struct intel_iommu *iommu)
473 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
476 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
478 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
481 static void init_translation_status(struct intel_iommu *iommu)
483 u32 gsts;
485 gsts = readl(iommu->reg + DMAR_GSTS_REG);
486 if (gsts & DMA_GSTS_TES)
487 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
490 /* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
491 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
493 return container_of(dom, struct dmar_domain, domain);
496 static int __init intel_iommu_setup(char *str)
498 if (!str)
499 return -EINVAL;
500 while (*str) {
501 if (!strncmp(str, "on", 2)) {
502 dmar_disabled = 0;
503 pr_info("IOMMU enabled\n");
504 } else if (!strncmp(str, "off", 3)) {
505 dmar_disabled = 1;
506 pr_info("IOMMU disabled\n");
507 } else if (!strncmp(str, "igfx_off", 8)) {
508 dmar_map_gfx = 0;
509 pr_info("Disable GFX device mapping\n");
510 } else if (!strncmp(str, "forcedac", 8)) {
511 pr_info("Forcing DAC for PCI devices\n");
512 dmar_forcedac = 1;
513 } else if (!strncmp(str, "strict", 6)) {
514 pr_info("Disable batched IOTLB flush\n");
515 intel_iommu_strict = 1;
516 } else if (!strncmp(str, "sp_off", 6)) {
517 pr_info("Disable supported super page\n");
518 intel_iommu_superpage = 0;
519 } else if (!strncmp(str, "ecs_off", 7)) {
520 printk(KERN_INFO
521 "Intel-IOMMU: disable extended context table support\n");
522 intel_iommu_ecs = 0;
523 } else if (!strncmp(str, "pasid28", 7)) {
524 printk(KERN_INFO
525 "Intel-IOMMU: enable pre-production PASID support\n");
526 intel_iommu_pasid28 = 1;
527 iommu_identity_mapping |= IDENTMAP_GFX;
528 } else if (!strncmp(str, "tboot_noforce", 13)) {
529 printk(KERN_INFO
530 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
531 intel_iommu_tboot_noforce = 1;
534 str += strcspn(str, ",");
535 while (*str == ',')
536 str++;
538 return 0;
540 __setup("intel_iommu=", intel_iommu_setup);
542 static struct kmem_cache *iommu_domain_cache;
543 static struct kmem_cache *iommu_devinfo_cache;
545 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
547 struct dmar_domain **domains;
548 int idx = did >> 8;
550 domains = iommu->domains[idx];
551 if (!domains)
552 return NULL;
554 return domains[did & 0xff];
557 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
558 struct dmar_domain *domain)
560 struct dmar_domain **domains;
561 int idx = did >> 8;
563 if (!iommu->domains[idx]) {
564 size_t size = 256 * sizeof(struct dmar_domain *);
565 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
568 domains = iommu->domains[idx];
569 if (WARN_ON(!domains))
570 return;
571 else
572 domains[did & 0xff] = domain;
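/*
 * Illustrative example of the two-level lookup above: a domain-id is
 * split into a 256-entry page index and a slot within that page, so
 * for did == 0x1234 we get idx == 0x12 and slot == 0x34, i.e. the
 * domain lives at iommu->domains[0x12][0x34]. Pages of pointers are
 * only allocated on first use, which keeps the footprint small when
 * hardware supports many domain-ids but few are actually used.
 */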
575 void *alloc_pgtable_page(int node)
577 struct page *page;
578 void *vaddr = NULL;
580 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
581 if (page)
582 vaddr = page_address(page);
583 return vaddr;
586 void free_pgtable_page(void *vaddr)
588 free_page((unsigned long)vaddr);
591 static inline void *alloc_domain_mem(void)
593 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
596 static void free_domain_mem(void *vaddr)
598 kmem_cache_free(iommu_domain_cache, vaddr);
601 static inline void * alloc_devinfo_mem(void)
603 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
606 static inline void free_devinfo_mem(void *vaddr)
608 kmem_cache_free(iommu_devinfo_cache, vaddr);
611 static inline int domain_type_is_vm(struct dmar_domain *domain)
613 return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
616 static inline int domain_type_is_si(struct dmar_domain *domain)
618 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
621 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
623 return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
624 DOMAIN_FLAG_STATIC_IDENTITY);
627 static inline int domain_pfn_supported(struct dmar_domain *domain,
628 unsigned long pfn)
630 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
632 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
635 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
637 unsigned long sagaw;
638 int agaw = -1;
640 sagaw = cap_sagaw(iommu->cap);
641 for (agaw = width_to_agaw(max_gaw);
642 agaw >= 0; agaw--) {
643 if (test_bit(agaw, &sagaw))
644 break;
647 return agaw;
651 * Calculate max SAGAW for each iommu.
653 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
655 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
659 * Calculate agaw for each iommu.
660 * "SAGAW" may differ across iommus, so use a default agaw and fall
661 * back to a smaller supported agaw for iommus that don't support it.
663 int iommu_calculate_agaw(struct intel_iommu *iommu)
665 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
668 /* This function only returns a single iommu in a domain */
669 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
671 int iommu_id;
673 /* si_domain and vm domain should not get here. */
674 BUG_ON(domain_type_is_vm_or_si(domain));
675 for_each_domain_iommu(iommu_id, domain)
676 break;
678 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
679 return NULL;
681 return g_iommus[iommu_id];
684 static void domain_update_iommu_coherency(struct dmar_domain *domain)
686 struct dmar_drhd_unit *drhd;
687 struct intel_iommu *iommu;
688 bool found = false;
689 int i;
691 domain->iommu_coherency = 1;
693 for_each_domain_iommu(i, domain) {
694 found = true;
695 if (!ecap_coherent(g_iommus[i]->ecap)) {
696 domain->iommu_coherency = 0;
697 break;
700 if (found)
701 return;
703 /* No hardware attached; use lowest common denominator */
704 rcu_read_lock();
705 for_each_active_iommu(iommu, drhd) {
706 if (!ecap_coherent(iommu->ecap)) {
707 domain->iommu_coherency = 0;
708 break;
711 rcu_read_unlock();
714 static int domain_update_iommu_snooping(struct intel_iommu *skip)
716 struct dmar_drhd_unit *drhd;
717 struct intel_iommu *iommu;
718 int ret = 1;
720 rcu_read_lock();
721 for_each_active_iommu(iommu, drhd) {
722 if (iommu != skip) {
723 if (!ecap_sc_support(iommu->ecap)) {
724 ret = 0;
725 break;
729 rcu_read_unlock();
731 return ret;
734 static int domain_update_iommu_superpage(struct intel_iommu *skip)
736 struct dmar_drhd_unit *drhd;
737 struct intel_iommu *iommu;
738 int mask = 0xf;
740 if (!intel_iommu_superpage) {
741 return 0;
744 /* set iommu_superpage to the smallest common denominator */
745 rcu_read_lock();
746 for_each_active_iommu(iommu, drhd) {
747 if (iommu != skip) {
748 mask &= cap_super_page_val(iommu->cap);
749 if (!mask)
750 break;
753 rcu_read_unlock();
755 return fls(mask);
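/*
 * Worked example (illustrative): cap_super_page_val() is a bitmask in
 * which bit 0 means 2MiB and bit 1 means 1GiB superpage support. If
 * one active IOMMU reports 0x3 (2MiB + 1GiB) and another reports 0x1
 * (2MiB only), the intersection is 0x1 and fls(0x1) == 1, so the
 * domain is limited to 2MiB superpages. A return value of 0 disables
 * superpages entirely.
 */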
758 /* Some capabilities may be different across iommus */
759 static void domain_update_iommu_cap(struct dmar_domain *domain)
761 domain_update_iommu_coherency(domain);
762 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
763 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
766 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
767 u8 devfn, int alloc)
769 struct root_entry *root = &iommu->root_entry[bus];
770 struct context_entry *context;
771 u64 *entry;
773 entry = &root->lo;
774 if (ecs_enabled(iommu)) {
775 if (devfn >= 0x80) {
776 devfn -= 0x80;
777 entry = &root->hi;
779 devfn *= 2;
781 if (*entry & 1)
782 context = phys_to_virt(*entry & VTD_PAGE_MASK);
783 else {
784 unsigned long phy_addr;
785 if (!alloc)
786 return NULL;
788 context = alloc_pgtable_page(iommu->node);
789 if (!context)
790 return NULL;
792 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
793 phy_addr = virt_to_phys((void *)context);
794 *entry = phy_addr | 1;
795 __iommu_flush_cache(iommu, entry, sizeof(*entry));
797 return &context[devfn];
800 static int iommu_dummy(struct device *dev)
802 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
805 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
807 struct dmar_drhd_unit *drhd = NULL;
808 struct intel_iommu *iommu;
809 struct device *tmp;
810 struct pci_dev *ptmp, *pdev = NULL;
811 u16 segment = 0;
812 int i;
814 if (iommu_dummy(dev))
815 return NULL;
817 if (dev_is_pci(dev)) {
818 struct pci_dev *pf_pdev;
820 pdev = to_pci_dev(dev);
822 #ifdef CONFIG_X86
823 /* VMD child devices currently cannot be handled individually */
824 if (is_vmd(pdev->bus))
825 return NULL;
826 #endif
828 /* VFs aren't listed in scope tables; we need to look up
829 * the PF instead to find the IOMMU. */
830 pf_pdev = pci_physfn(pdev);
831 dev = &pf_pdev->dev;
832 segment = pci_domain_nr(pdev->bus);
833 } else if (has_acpi_companion(dev))
834 dev = &ACPI_COMPANION(dev)->dev;
836 rcu_read_lock();
837 for_each_active_iommu(iommu, drhd) {
838 if (pdev && segment != drhd->segment)
839 continue;
841 for_each_active_dev_scope(drhd->devices,
842 drhd->devices_cnt, i, tmp) {
843 if (tmp == dev) {
844 /* For a VF use its original BDF# not that of the PF
845 * which we used for the IOMMU lookup. Strictly speaking
846 * we could do this for all PCI devices; we only need to
847 * get the BDF# from the scope table for ACPI matches. */
848 if (pdev && pdev->is_virtfn)
849 goto got_pdev;
851 *bus = drhd->devices[i].bus;
852 *devfn = drhd->devices[i].devfn;
853 goto out;
856 if (!pdev || !dev_is_pci(tmp))
857 continue;
859 ptmp = to_pci_dev(tmp);
860 if (ptmp->subordinate &&
861 ptmp->subordinate->number <= pdev->bus->number &&
862 ptmp->subordinate->busn_res.end >= pdev->bus->number)
863 goto got_pdev;
866 if (pdev && drhd->include_all) {
867 got_pdev:
868 *bus = pdev->bus->number;
869 *devfn = pdev->devfn;
870 goto out;
873 iommu = NULL;
874 out:
875 rcu_read_unlock();
877 return iommu;
880 static void domain_flush_cache(struct dmar_domain *domain,
881 void *addr, int size)
883 if (!domain->iommu_coherency)
884 clflush_cache_range(addr, size);
887 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
889 struct context_entry *context;
890 int ret = 0;
891 unsigned long flags;
893 spin_lock_irqsave(&iommu->lock, flags);
894 context = iommu_context_addr(iommu, bus, devfn, 0);
895 if (context)
896 ret = context_present(context);
897 spin_unlock_irqrestore(&iommu->lock, flags);
898 return ret;
901 static void free_context_table(struct intel_iommu *iommu)
903 int i;
904 unsigned long flags;
905 struct context_entry *context;
907 spin_lock_irqsave(&iommu->lock, flags);
908 if (!iommu->root_entry) {
909 goto out;
911 for (i = 0; i < ROOT_ENTRY_NR; i++) {
912 context = iommu_context_addr(iommu, i, 0, 0);
913 if (context)
914 free_pgtable_page(context);
916 if (!ecs_enabled(iommu))
917 continue;
919 context = iommu_context_addr(iommu, i, 0x80, 0);
920 if (context)
921 free_pgtable_page(context);
924 free_pgtable_page(iommu->root_entry);
925 iommu->root_entry = NULL;
926 out:
927 spin_unlock_irqrestore(&iommu->lock, flags);
930 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
931 unsigned long pfn, int *target_level)
933 struct dma_pte *parent, *pte = NULL;
934 int level = agaw_to_level(domain->agaw);
935 int offset;
937 BUG_ON(!domain->pgd);
939 if (!domain_pfn_supported(domain, pfn))
940 /* Address beyond IOMMU's addressing capabilities. */
941 return NULL;
943 parent = domain->pgd;
945 while (1) {
946 void *tmp_page;
948 offset = pfn_level_offset(pfn, level);
949 pte = &parent[offset];
950 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
951 break;
952 if (level == *target_level)
953 break;
955 if (!dma_pte_present(pte)) {
956 uint64_t pteval;
958 tmp_page = alloc_pgtable_page(domain->nid);
960 if (!tmp_page)
961 return NULL;
963 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
964 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
965 if (cmpxchg64(&pte->val, 0ULL, pteval))
966 /* Someone else set it while we were thinking; use theirs. */
967 free_pgtable_page(tmp_page);
968 else
969 domain_flush_cache(domain, pte, sizeof(*pte));
971 if (level == 1)
972 break;
974 parent = phys_to_virt(dma_pte_addr(pte));
975 level--;
978 if (!*target_level)
979 *target_level = level;
981 return pte;
985 /* return address's pte at specific level */
986 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
987 unsigned long pfn,
988 int level, int *large_page)
990 struct dma_pte *parent, *pte = NULL;
991 int total = agaw_to_level(domain->agaw);
992 int offset;
994 parent = domain->pgd;
995 while (level <= total) {
996 offset = pfn_level_offset(pfn, total);
997 pte = &parent[offset];
998 if (level == total)
999 return pte;
1001 if (!dma_pte_present(pte)) {
1002 *large_page = total;
1003 break;
1006 if (dma_pte_superpage(pte)) {
1007 *large_page = total;
1008 return pte;
1011 parent = phys_to_virt(dma_pte_addr(pte));
1012 total--;
1014 return NULL;
1017 /* clear last level pte; a tlb flush should follow */
1018 static void dma_pte_clear_range(struct dmar_domain *domain,
1019 unsigned long start_pfn,
1020 unsigned long last_pfn)
1022 unsigned int large_page = 1;
1023 struct dma_pte *first_pte, *pte;
1025 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1026 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1027 BUG_ON(start_pfn > last_pfn);
1029 /* we don't need lock here; nobody else touches the iova range */
1030 do {
1031 large_page = 1;
1032 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1033 if (!pte) {
1034 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1035 continue;
1037 do {
1038 dma_clear_pte(pte);
1039 start_pfn += lvl_to_nr_pages(large_page);
1040 pte++;
1041 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1043 domain_flush_cache(domain, first_pte,
1044 (void *)pte - (void *)first_pte);
1046 } while (start_pfn && start_pfn <= last_pfn);
1049 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1050 int retain_level, struct dma_pte *pte,
1051 unsigned long pfn, unsigned long start_pfn,
1052 unsigned long last_pfn)
1054 pfn = max(start_pfn, pfn);
1055 pte = &pte[pfn_level_offset(pfn, level)];
1057 do {
1058 unsigned long level_pfn;
1059 struct dma_pte *level_pte;
1061 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1062 goto next;
1064 level_pfn = pfn & level_mask(level);
1065 level_pte = phys_to_virt(dma_pte_addr(pte));
1067 if (level > 2) {
1068 dma_pte_free_level(domain, level - 1, retain_level,
1069 level_pte, level_pfn, start_pfn,
1070 last_pfn);
1074 * Free the page table if we're below the level we want to
1075 * retain and the range covers the entire table.
1077 if (level < retain_level && !(start_pfn > level_pfn ||
1078 last_pfn < level_pfn + level_size(level) - 1)) {
1079 dma_clear_pte(pte);
1080 domain_flush_cache(domain, pte, sizeof(*pte));
1081 free_pgtable_page(level_pte);
1083 next:
1084 pfn += level_size(level);
1085 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1089 * clear last level (leaf) ptes and free page table pages below the
1090 * level we wish to keep intact.
1092 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1093 unsigned long start_pfn,
1094 unsigned long last_pfn,
1095 int retain_level)
1097 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1098 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1099 BUG_ON(start_pfn > last_pfn);
1101 dma_pte_clear_range(domain, start_pfn, last_pfn);
1103 /* We don't need lock here; nobody else touches the iova range */
1104 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1105 domain->pgd, 0, start_pfn, last_pfn);
1107 /* free pgd */
1108 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1109 free_pgtable_page(domain->pgd);
1110 domain->pgd = NULL;
1114 /* When a page at a given level is being unlinked from its parent, we don't
1115 need to *modify* it at all. All we need to do is make a list of all the
1116 pages which can be freed just as soon as we've flushed the IOTLB and we
1117 know the hardware page-walk will no longer touch them.
1118 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1119 be freed. */
1120 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1121 int level, struct dma_pte *pte,
1122 struct page *freelist)
1124 struct page *pg;
1126 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1127 pg->freelist = freelist;
1128 freelist = pg;
1130 if (level == 1)
1131 return freelist;
1133 pte = page_address(pg);
1134 do {
1135 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1136 freelist = dma_pte_list_pagetables(domain, level - 1,
1137 pte, freelist);
1138 pte++;
1139 } while (!first_pte_in_page(pte));
1141 return freelist;
1144 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1145 struct dma_pte *pte, unsigned long pfn,
1146 unsigned long start_pfn,
1147 unsigned long last_pfn,
1148 struct page *freelist)
1150 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1152 pfn = max(start_pfn, pfn);
1153 pte = &pte[pfn_level_offset(pfn, level)];
1155 do {
1156 unsigned long level_pfn;
1158 if (!dma_pte_present(pte))
1159 goto next;
1161 level_pfn = pfn & level_mask(level);
1163 /* If range covers entire pagetable, free it */
1164 if (start_pfn <= level_pfn &&
1165 last_pfn >= level_pfn + level_size(level) - 1) {
1166 /* These subordinate page tables are going away entirely. Don't
1167 bother to clear them; we're just going to *free* them. */
1168 if (level > 1 && !dma_pte_superpage(pte))
1169 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1171 dma_clear_pte(pte);
1172 if (!first_pte)
1173 first_pte = pte;
1174 last_pte = pte;
1175 } else if (level > 1) {
1176 /* Recurse down into a level that isn't *entirely* obsolete */
1177 freelist = dma_pte_clear_level(domain, level - 1,
1178 phys_to_virt(dma_pte_addr(pte)),
1179 level_pfn, start_pfn, last_pfn,
1180 freelist);
1182 next:
1183 pfn += level_size(level);
1184 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1186 if (first_pte)
1187 domain_flush_cache(domain, first_pte,
1188 (void *)++last_pte - (void *)first_pte);
1190 return freelist;
1193 /* We can't just free the pages because the IOMMU may still be walking
1194 the page tables, and may have cached the intermediate levels. The
1195 pages can only be freed after the IOTLB flush has been done. */
1196 static struct page *domain_unmap(struct dmar_domain *domain,
1197 unsigned long start_pfn,
1198 unsigned long last_pfn)
1200 struct page *freelist = NULL;
1202 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1203 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1204 BUG_ON(start_pfn > last_pfn);
1206 /* we don't need lock here; nobody else touches the iova range */
1207 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1208 domain->pgd, 0, start_pfn, last_pfn, NULL);
1210 /* free pgd */
1211 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1212 struct page *pgd_page = virt_to_page(domain->pgd);
1213 pgd_page->freelist = freelist;
1214 freelist = pgd_page;
1216 domain->pgd = NULL;
1219 return freelist;
1222 static void dma_free_pagelist(struct page *freelist)
1224 struct page *pg;
1226 while ((pg = freelist)) {
1227 freelist = pg->freelist;
1228 free_pgtable_page(page_address(pg));
1232 static void iova_entry_free(unsigned long data)
1234 struct page *freelist = (struct page *)data;
1236 dma_free_pagelist(freelist);
1239 /* iommu handling */
1240 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1242 struct root_entry *root;
1243 unsigned long flags;
1245 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1246 if (!root) {
1247 pr_err("Allocating root entry for %s failed\n",
1248 iommu->name);
1249 return -ENOMEM;
1252 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1254 spin_lock_irqsave(&iommu->lock, flags);
1255 iommu->root_entry = root;
1256 spin_unlock_irqrestore(&iommu->lock, flags);
1258 return 0;
1261 static void iommu_set_root_entry(struct intel_iommu *iommu)
1263 u64 addr;
1264 u32 sts;
1265 unsigned long flag;
1267 addr = virt_to_phys(iommu->root_entry);
1268 if (ecs_enabled(iommu))
1269 addr |= DMA_RTADDR_RTT;
1271 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1272 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1274 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1276 /* Make sure hardware complete it */
1277 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1278 readl, (sts & DMA_GSTS_RTPS), sts);
1280 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1283 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1285 u32 val;
1286 unsigned long flag;
1288 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1289 return;
1291 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1292 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1294 /* Make sure hardware complete it */
1295 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1296 readl, (!(val & DMA_GSTS_WBFS)), val);
1298 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1301 /* return value determines if we need a write buffer flush */
1302 static void __iommu_flush_context(struct intel_iommu *iommu,
1303 u16 did, u16 source_id, u8 function_mask,
1304 u64 type)
1306 u64 val = 0;
1307 unsigned long flag;
1309 switch (type) {
1310 case DMA_CCMD_GLOBAL_INVL:
1311 val = DMA_CCMD_GLOBAL_INVL;
1312 break;
1313 case DMA_CCMD_DOMAIN_INVL:
1314 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1315 break;
1316 case DMA_CCMD_DEVICE_INVL:
1317 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1318 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1319 break;
1320 default:
1321 BUG();
1323 val |= DMA_CCMD_ICC;
1325 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1326 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1328 /* Make sure hardware complete it */
1329 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1330 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1332 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1335 /* return value determines if we need a write buffer flush */
1336 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1337 u64 addr, unsigned int size_order, u64 type)
1339 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1340 u64 val = 0, val_iva = 0;
1341 unsigned long flag;
1343 switch (type) {
1344 case DMA_TLB_GLOBAL_FLUSH:
1345 /* global flush doesn't need to set IVA_REG */
1346 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1347 break;
1348 case DMA_TLB_DSI_FLUSH:
1349 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1350 break;
1351 case DMA_TLB_PSI_FLUSH:
1352 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1353 /* IH bit is passed in as part of address */
1354 val_iva = size_order | addr;
1355 break;
1356 default:
1357 BUG();
1359 /* Note: set drain read/write */
1360 #if 0
1362 * This is probably to be super secure.. Looks like we can
1363 * ignore it without any impact.
1365 if (cap_read_drain(iommu->cap))
1366 val |= DMA_TLB_READ_DRAIN;
1367 #endif
1368 if (cap_write_drain(iommu->cap))
1369 val |= DMA_TLB_WRITE_DRAIN;
1371 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1372 /* Note: Only uses first TLB reg currently */
1373 if (val_iva)
1374 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1375 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1377 /* Make sure hardware complete it */
1378 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1379 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1381 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1383 /* check IOTLB invalidation granularity */
1384 if (DMA_TLB_IAIG(val) == 0)
1385 pr_err("Flush IOTLB failed\n");
1386 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1387 pr_debug("TLB flush request %Lx, actual %Lx\n",
1388 (unsigned long long)DMA_TLB_IIRG(type),
1389 (unsigned long long)DMA_TLB_IAIG(val));
1392 static struct device_domain_info *
1393 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1394 u8 bus, u8 devfn)
1396 struct device_domain_info *info;
1398 assert_spin_locked(&device_domain_lock);
1400 if (!iommu->qi)
1401 return NULL;
1403 list_for_each_entry(info, &domain->devices, link)
1404 if (info->iommu == iommu && info->bus == bus &&
1405 info->devfn == devfn) {
1406 if (info->ats_supported && info->dev)
1407 return info;
1408 break;
1411 return NULL;
1414 static void domain_update_iotlb(struct dmar_domain *domain)
1416 struct device_domain_info *info;
1417 bool has_iotlb_device = false;
1419 assert_spin_locked(&device_domain_lock);
1421 list_for_each_entry(info, &domain->devices, link) {
1422 struct pci_dev *pdev;
1424 if (!info->dev || !dev_is_pci(info->dev))
1425 continue;
1427 pdev = to_pci_dev(info->dev);
1428 if (pdev->ats_enabled) {
1429 has_iotlb_device = true;
1430 break;
1434 domain->has_iotlb_device = has_iotlb_device;
1437 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1439 struct pci_dev *pdev;
1441 assert_spin_locked(&device_domain_lock);
1443 if (!info || !dev_is_pci(info->dev))
1444 return;
1446 pdev = to_pci_dev(info->dev);
1447 /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1448 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1449 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1450 * reserved, which should be set to 0.
1452 if (!ecap_dit(info->iommu->ecap))
1453 info->pfsid = 0;
1454 else {
1455 struct pci_dev *pf_pdev;
1457 /* pdev will be returned if device is not a vf */
1458 pf_pdev = pci_physfn(pdev);
1459 info->pfsid = PCI_DEVID(pf_pdev->bus->number, pf_pdev->devfn);
1462 #ifdef CONFIG_INTEL_IOMMU_SVM
1463 /* The PCIe spec, in its wisdom, declares that the behaviour of
1464 the device if you enable PASID support after ATS support is
1465 undefined. So always enable PASID support on devices which
1466 have it, even if we can't yet know if we're ever going to
1467 use it. */
1468 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1469 info->pasid_enabled = 1;
1471 if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1472 info->pri_enabled = 1;
1473 #endif
1474 if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1475 info->ats_enabled = 1;
1476 domain_update_iotlb(info->domain);
1477 info->ats_qdep = pci_ats_queue_depth(pdev);
1481 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1483 struct pci_dev *pdev;
1485 assert_spin_locked(&device_domain_lock);
1487 if (!dev_is_pci(info->dev))
1488 return;
1490 pdev = to_pci_dev(info->dev);
1492 if (info->ats_enabled) {
1493 pci_disable_ats(pdev);
1494 info->ats_enabled = 0;
1495 domain_update_iotlb(info->domain);
1497 #ifdef CONFIG_INTEL_IOMMU_SVM
1498 if (info->pri_enabled) {
1499 pci_disable_pri(pdev);
1500 info->pri_enabled = 0;
1502 if (info->pasid_enabled) {
1503 pci_disable_pasid(pdev);
1504 info->pasid_enabled = 0;
1506 #endif
1509 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1510 u64 addr, unsigned mask)
1512 u16 sid, qdep;
1513 unsigned long flags;
1514 struct device_domain_info *info;
1516 if (!domain->has_iotlb_device)
1517 return;
1519 spin_lock_irqsave(&device_domain_lock, flags);
1520 list_for_each_entry(info, &domain->devices, link) {
1521 if (!info->ats_enabled)
1522 continue;
1524 sid = info->bus << 8 | info->devfn;
1525 qdep = info->ats_qdep;
1526 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1527 qdep, addr, mask);
1529 spin_unlock_irqrestore(&device_domain_lock, flags);
1532 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1533 struct dmar_domain *domain,
1534 unsigned long pfn, unsigned int pages,
1535 int ih, int map)
1537 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1538 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1539 u16 did = domain->iommu_did[iommu->seq_id];
1541 BUG_ON(pages == 0);
1543 if (ih)
1544 ih = 1 << 6;
1546 * Fall back to domain-selective flush if there is no PSI support or the size is
1547 * too big.
1548 * PSI requires page size to be 2 ^ x, and the base address is naturally
1549 * aligned to the size
1551 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1552 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1553 DMA_TLB_DSI_FLUSH);
1554 else
1555 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1556 DMA_TLB_PSI_FLUSH);
1559 * In caching mode, changes of pages from non-present to present require
1560 * flush. However, device IOTLB doesn't need to be flushed in this case.
1562 if (!cap_caching_mode(iommu->cap) || !map)
1563 iommu_flush_dev_iotlb(domain, addr, mask);
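/*
 * Worked example for the PSI path above (illustrative): for
 * pages == 300, __roundup_pow_of_two(300) == 512 and
 * mask == ilog2(512) == 9, so the hardware is asked to invalidate a
 * naturally aligned 512-page (2MiB) region containing addr. If mask
 * exceeded cap_max_amask_val(), or PSI were not supported at all, the
 * code above falls back to the domain-selective flush instead.
 */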
1566 /* Notification for newly created mappings */
1567 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1568 struct dmar_domain *domain,
1569 unsigned long pfn, unsigned int pages)
1571 /* It's a non-present to present mapping. Only flush if caching mode */
1572 if (cap_caching_mode(iommu->cap))
1573 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1574 else
1575 iommu_flush_write_buffer(iommu);
1578 static void iommu_flush_iova(struct iova_domain *iovad)
1580 struct dmar_domain *domain;
1581 int idx;
1583 domain = container_of(iovad, struct dmar_domain, iovad);
1585 for_each_domain_iommu(idx, domain) {
1586 struct intel_iommu *iommu = g_iommus[idx];
1587 u16 did = domain->iommu_did[iommu->seq_id];
1589 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1591 if (!cap_caching_mode(iommu->cap))
1592 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1593 0, MAX_AGAW_PFN_WIDTH);
1597 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1599 u32 pmen;
1600 unsigned long flags;
1602 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1603 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1604 pmen &= ~DMA_PMEN_EPM;
1605 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1607 /* wait for the protected region status bit to clear */
1608 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1609 readl, !(pmen & DMA_PMEN_PRS), pmen);
1611 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1614 static void iommu_enable_translation(struct intel_iommu *iommu)
1616 u32 sts;
1617 unsigned long flags;
1619 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1620 iommu->gcmd |= DMA_GCMD_TE;
1621 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1623 /* Make sure hardware complete it */
1624 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1625 readl, (sts & DMA_GSTS_TES), sts);
1627 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1630 static void iommu_disable_translation(struct intel_iommu *iommu)
1632 u32 sts;
1633 unsigned long flag;
1635 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1636 iommu->gcmd &= ~DMA_GCMD_TE;
1637 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1639 /* Make sure hardware complete it */
1640 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1641 readl, (!(sts & DMA_GSTS_TES)), sts);
1643 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1647 static int iommu_init_domains(struct intel_iommu *iommu)
1649 u32 ndomains, nlongs;
1650 size_t size;
1652 ndomains = cap_ndoms(iommu->cap);
1653 pr_debug("%s: Number of Domains supported <%d>\n",
1654 iommu->name, ndomains);
1655 nlongs = BITS_TO_LONGS(ndomains);
1657 spin_lock_init(&iommu->lock);
1659 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1660 if (!iommu->domain_ids) {
1661 pr_err("%s: Allocating domain id array failed\n",
1662 iommu->name);
1663 return -ENOMEM;
1666 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1667 iommu->domains = kzalloc(size, GFP_KERNEL);
1669 if (iommu->domains) {
1670 size = 256 * sizeof(struct dmar_domain *);
1671 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1674 if (!iommu->domains || !iommu->domains[0]) {
1675 pr_err("%s: Allocating domain array failed\n",
1676 iommu->name);
1677 kfree(iommu->domain_ids);
1678 kfree(iommu->domains);
1679 iommu->domain_ids = NULL;
1680 iommu->domains = NULL;
1681 return -ENOMEM;
1687 * If Caching mode is set, then invalid translations are tagged
1688 * with domain-id 0, hence we need to pre-allocate it. We also
1689 * use domain-id 0 as a marker for non-allocated domain-id, so
1690 * make sure it is not used for a real domain.
1692 set_bit(0, iommu->domain_ids);
1694 return 0;
1697 static void disable_dmar_iommu(struct intel_iommu *iommu)
1699 struct device_domain_info *info, *tmp;
1700 unsigned long flags;
1702 if (!iommu->domains || !iommu->domain_ids)
1703 return;
1705 again:
1706 spin_lock_irqsave(&device_domain_lock, flags);
1707 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1708 struct dmar_domain *domain;
1710 if (info->iommu != iommu)
1711 continue;
1713 if (!info->dev || !info->domain)
1714 continue;
1716 domain = info->domain;
1718 __dmar_remove_one_dev_info(info);
1720 if (!domain_type_is_vm_or_si(domain)) {
1722 * The domain_exit() function can't be called under
1723 * device_domain_lock, as it takes this lock itself.
1724 * So release the lock here and re-run the loop
1725 * afterwards.
1727 spin_unlock_irqrestore(&device_domain_lock, flags);
1728 domain_exit(domain);
1729 goto again;
1732 spin_unlock_irqrestore(&device_domain_lock, flags);
1734 if (iommu->gcmd & DMA_GCMD_TE)
1735 iommu_disable_translation(iommu);
1738 static void free_dmar_iommu(struct intel_iommu *iommu)
1740 if ((iommu->domains) && (iommu->domain_ids)) {
1741 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1742 int i;
1744 for (i = 0; i < elems; i++)
1745 kfree(iommu->domains[i]);
1746 kfree(iommu->domains);
1747 kfree(iommu->domain_ids);
1748 iommu->domains = NULL;
1749 iommu->domain_ids = NULL;
1752 g_iommus[iommu->seq_id] = NULL;
1754 /* free context mapping */
1755 free_context_table(iommu);
1757 #ifdef CONFIG_INTEL_IOMMU_SVM
1758 if (pasid_enabled(iommu)) {
1759 if (ecap_prs(iommu->ecap))
1760 intel_svm_finish_prq(iommu);
1761 intel_svm_exit(iommu);
1763 #endif
1766 static struct dmar_domain *alloc_domain(int flags)
1768 struct dmar_domain *domain;
1770 domain = alloc_domain_mem();
1771 if (!domain)
1772 return NULL;
1774 memset(domain, 0, sizeof(*domain));
1775 domain->nid = -1;
1776 domain->flags = flags;
1777 domain->has_iotlb_device = false;
1778 INIT_LIST_HEAD(&domain->devices);
1780 return domain;
1783 /* Must be called with iommu->lock */
1784 static int domain_attach_iommu(struct dmar_domain *domain,
1785 struct intel_iommu *iommu)
1787 unsigned long ndomains;
1788 int num;
1790 assert_spin_locked(&device_domain_lock);
1791 assert_spin_locked(&iommu->lock);
1793 domain->iommu_refcnt[iommu->seq_id] += 1;
1794 domain->iommu_count += 1;
1795 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1796 ndomains = cap_ndoms(iommu->cap);
1797 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1799 if (num >= ndomains) {
1800 pr_err("%s: No free domain ids\n", iommu->name);
1801 domain->iommu_refcnt[iommu->seq_id] -= 1;
1802 domain->iommu_count -= 1;
1803 return -ENOSPC;
1806 set_bit(num, iommu->domain_ids);
1807 set_iommu_domain(iommu, num, domain);
1809 domain->iommu_did[iommu->seq_id] = num;
1810 domain->nid = iommu->node;
1812 domain_update_iommu_cap(domain);
1815 return 0;
1818 static int domain_detach_iommu(struct dmar_domain *domain,
1819 struct intel_iommu *iommu)
1821 int num, count = INT_MAX;
1823 assert_spin_locked(&device_domain_lock);
1824 assert_spin_locked(&iommu->lock);
1826 domain->iommu_refcnt[iommu->seq_id] -= 1;
1827 count = --domain->iommu_count;
1828 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1829 num = domain->iommu_did[iommu->seq_id];
1830 clear_bit(num, iommu->domain_ids);
1831 set_iommu_domain(iommu, num, NULL);
1833 domain_update_iommu_cap(domain);
1834 domain->iommu_did[iommu->seq_id] = 0;
1837 return count;
1840 static struct iova_domain reserved_iova_list;
1841 static struct lock_class_key reserved_rbtree_key;
1843 static int dmar_init_reserved_ranges(void)
1845 struct pci_dev *pdev = NULL;
1846 struct iova *iova;
1847 int i;
1849 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1851 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1852 &reserved_rbtree_key);
1854 /* IOAPIC ranges shouldn't be accessed by DMA */
1855 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1856 IOVA_PFN(IOAPIC_RANGE_END));
1857 if (!iova) {
1858 pr_err("Reserve IOAPIC range failed\n");
1859 return -ENODEV;
1862 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1863 for_each_pci_dev(pdev) {
1864 struct resource *r;
1866 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1867 r = &pdev->resource[i];
1868 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1869 continue;
1870 iova = reserve_iova(&reserved_iova_list,
1871 IOVA_PFN(r->start),
1872 IOVA_PFN(r->end));
1873 if (!iova) {
1874 pr_err("Reserve iova failed\n");
1875 return -ENODEV;
1879 return 0;
1882 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1884 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1887 static inline int guestwidth_to_adjustwidth(int gaw)
1889 int agaw;
1890 int r = (gaw - 12) % 9;
1892 if (r == 0)
1893 agaw = gaw;
1894 else
1895 agaw = gaw + 9 - r;
1896 if (agaw > 64)
1897 agaw = 64;
1898 return agaw;
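/*
 * Worked examples for the adjustment above (illustrative): the width
 * is rounded up so that (gaw - 12) becomes a multiple of the 9-bit
 * stride, then clamped to 64:
 *
 *	gaw 39 -> r == 0 -> agaw 39	(3-level table)
 *	gaw 48 -> r == 0 -> agaw 48	(4-level table)
 *	gaw 40 -> r == 1 -> agaw 48
 *	gaw 62 -> r == 5 -> agaw 66, clamped to 64
 */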
1901 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1902 int guest_width)
1904 int adjust_width, agaw;
1905 unsigned long sagaw;
1906 int err;
1908 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1910 err = init_iova_flush_queue(&domain->iovad,
1911 iommu_flush_iova, iova_entry_free);
1912 if (err)
1913 return err;
1915 domain_reserve_special_ranges(domain);
1917 /* calculate AGAW */
1918 if (guest_width > cap_mgaw(iommu->cap))
1919 guest_width = cap_mgaw(iommu->cap);
1920 domain->gaw = guest_width;
1921 adjust_width = guestwidth_to_adjustwidth(guest_width);
1922 agaw = width_to_agaw(adjust_width);
1923 sagaw = cap_sagaw(iommu->cap);
1924 if (!test_bit(agaw, &sagaw)) {
1925 /* hardware doesn't support it, choose a bigger one */
1926 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1927 agaw = find_next_bit(&sagaw, 5, agaw);
1928 if (agaw >= 5)
1929 return -ENODEV;
1931 domain->agaw = agaw;
1933 if (ecap_coherent(iommu->ecap))
1934 domain->iommu_coherency = 1;
1935 else
1936 domain->iommu_coherency = 0;
1938 if (ecap_sc_support(iommu->ecap))
1939 domain->iommu_snooping = 1;
1940 else
1941 domain->iommu_snooping = 0;
1943 if (intel_iommu_superpage)
1944 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1945 else
1946 domain->iommu_superpage = 0;
1948 domain->nid = iommu->node;
1950 /* always allocate the top pgd */
1951 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1952 if (!domain->pgd)
1953 return -ENOMEM;
1954 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1955 return 0;
1958 static void domain_exit(struct dmar_domain *domain)
1960 struct page *freelist = NULL;
1962 /* Domain 0 is reserved, so don't process it */
1963 if (!domain)
1964 return;
1966 /* Remove associated devices and clear attached or cached domains */
1967 rcu_read_lock();
1968 domain_remove_dev_info(domain);
1969 rcu_read_unlock();
1971 /* destroy iovas */
1972 put_iova_domain(&domain->iovad);
1974 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1976 dma_free_pagelist(freelist);
1978 free_domain_mem(domain);
1981 static int domain_context_mapping_one(struct dmar_domain *domain,
1982 struct intel_iommu *iommu,
1983 u8 bus, u8 devfn)
1985 u16 did = domain->iommu_did[iommu->seq_id];
1986 int translation = CONTEXT_TT_MULTI_LEVEL;
1987 struct device_domain_info *info = NULL;
1988 struct context_entry *context;
1989 unsigned long flags;
1990 struct dma_pte *pgd;
1991 int ret, agaw;
1993 WARN_ON(did == 0);
1995 if (hw_pass_through && domain_type_is_si(domain))
1996 translation = CONTEXT_TT_PASS_THROUGH;
1998 pr_debug("Set context mapping for %02x:%02x.%d\n",
1999 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2001 BUG_ON(!domain->pgd);
2003 spin_lock_irqsave(&device_domain_lock, flags);
2004 spin_lock(&iommu->lock);
2006 ret = -ENOMEM;
2007 context = iommu_context_addr(iommu, bus, devfn, 1);
2008 if (!context)
2009 goto out_unlock;
2011 ret = 0;
2012 if (context_present(context))
2013 goto out_unlock;
2016 * For kdump cases, old valid entries may be cached due to the
2017 * in-flight DMA and copied pgtable, but there is no unmapping
2018 * behaviour for them, thus we need an explicit cache flush for
2019 * the newly-mapped device. For kdump, at this point, the device
2020 * is supposed to finish reset at its driver probe stage, so no
2021 * in-flight DMA will exist, and we don't need to worry anymore
2022 * hereafter.
2024 if (context_copied(context)) {
2025 u16 did_old = context_domain_id(context);
2027 if (did_old < cap_ndoms(iommu->cap)) {
2028 iommu->flush.flush_context(iommu, did_old,
2029 (((u16)bus) << 8) | devfn,
2030 DMA_CCMD_MASK_NOBIT,
2031 DMA_CCMD_DEVICE_INVL);
2032 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2033 DMA_TLB_DSI_FLUSH);
2037 pgd = domain->pgd;
2039 context_clear_entry(context);
2040 context_set_domain_id(context, did);
2043 * Skip top levels of page tables for an iommu whose agaw is smaller
2044 * than the default. Unnecessary for PT mode.
2046 if (translation != CONTEXT_TT_PASS_THROUGH) {
2047 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
2048 ret = -ENOMEM;
2049 pgd = phys_to_virt(dma_pte_addr(pgd));
2050 if (!dma_pte_present(pgd))
2051 goto out_unlock;
2054 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2055 if (info && info->ats_supported)
2056 translation = CONTEXT_TT_DEV_IOTLB;
2057 else
2058 translation = CONTEXT_TT_MULTI_LEVEL;
2060 context_set_address_root(context, virt_to_phys(pgd));
2061 context_set_address_width(context, iommu->agaw);
2062 } else {
2064 * In pass through mode, AW must be programmed to
2065 * indicate the largest AGAW value supported by
2066 * hardware. And ASR is ignored by hardware.
2068 context_set_address_width(context, iommu->msagaw);
2071 context_set_translation_type(context, translation);
2072 context_set_fault_enable(context);
2073 context_set_present(context);
2074 domain_flush_cache(domain, context, sizeof(*context));
2077 * It's a non-present to present mapping. If hardware doesn't cache
2078 * non-present entry we only need to flush the write-buffer. If it
2079 * _does_ cache non-present entries, then it does so in the special
2080 * domain #0, which we have to flush:
2082 if (cap_caching_mode(iommu->cap)) {
2083 iommu->flush.flush_context(iommu, 0,
2084 (((u16)bus) << 8) | devfn,
2085 DMA_CCMD_MASK_NOBIT,
2086 DMA_CCMD_DEVICE_INVL);
2087 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2088 } else {
2089 iommu_flush_write_buffer(iommu);
2091 iommu_enable_dev_iotlb(info);
2093 ret = 0;
2095 out_unlock:
2096 spin_unlock(&iommu->lock);
2097 spin_unlock_irqrestore(&device_domain_lock, flags);
2099 return ret;
2102 struct domain_context_mapping_data {
2103 struct dmar_domain *domain;
2104 struct intel_iommu *iommu;
2107 static int domain_context_mapping_cb(struct pci_dev *pdev,
2108 u16 alias, void *opaque)
2110 struct domain_context_mapping_data *data = opaque;
2112 return domain_context_mapping_one(data->domain, data->iommu,
2113 PCI_BUS_NUM(alias), alias & 0xff);
2116 static int
2117 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2119 struct intel_iommu *iommu;
2120 u8 bus, devfn;
2121 struct domain_context_mapping_data data;
2123 iommu = device_to_iommu(dev, &bus, &devfn);
2124 if (!iommu)
2125 return -ENODEV;
2127 if (!dev_is_pci(dev))
2128 return domain_context_mapping_one(domain, iommu, bus, devfn);
2130 data.domain = domain;
2131 data.iommu = iommu;
2133 return pci_for_each_dma_alias(to_pci_dev(dev),
2134 &domain_context_mapping_cb, &data);
2137 static int domain_context_mapped_cb(struct pci_dev *pdev,
2138 u16 alias, void *opaque)
2140 struct intel_iommu *iommu = opaque;
2142 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2145 static int domain_context_mapped(struct device *dev)
2147 struct intel_iommu *iommu;
2148 u8 bus, devfn;
2150 iommu = device_to_iommu(dev, &bus, &devfn);
2151 if (!iommu)
2152 return -ENODEV;
2154 if (!dev_is_pci(dev))
2155 return device_context_mapped(iommu, bus, devfn);
2157 return !pci_for_each_dma_alias(to_pci_dev(dev),
2158 domain_context_mapped_cb, iommu);
2161 /* Returns a number of VTD pages, but aligned to MM page size */
2162 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2163 size_t size)
2165 host_addr &= ~PAGE_MASK;
2166 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2169 /* Return largest possible superpage level for a given mapping */
2170 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2171 unsigned long iov_pfn,
2172 unsigned long phy_pfn,
2173 unsigned long pages)
2175 int support, level = 1;
2176 unsigned long pfnmerge;
2178 support = domain->iommu_superpage;
2180 /* To use a large page, the virtual *and* physical addresses
2181 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2182 of them will mean we have to use smaller pages. So just
2183 merge them and check both at once. */
2184 pfnmerge = iov_pfn | phy_pfn;
2186 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2187 pages >>= VTD_STRIDE_SHIFT;
2188 if (!pages)
2189 break;
2190 pfnmerge >>= VTD_STRIDE_SHIFT;
2191 level++;
2192 support--;
2194 return level;
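/*
 * Map @nr_pages VT-d pages starting at @iov_pfn, taking the physical
 * pages either from @sg or from the contiguous range starting at
 * @phys_pfn. Superpage PTEs are used whenever alignment and remaining
 * size allow.
 */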
2197 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2198 struct scatterlist *sg, unsigned long phys_pfn,
2199 unsigned long nr_pages, int prot)
2201 struct dma_pte *first_pte = NULL, *pte = NULL;
2202 phys_addr_t uninitialized_var(pteval);
2203 unsigned long sg_res = 0;
2204 unsigned int largepage_lvl = 0;
2205 unsigned long lvl_pages = 0;
2207 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2209 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2210 return -EINVAL;
2212 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2214 if (!sg) {
2215 sg_res = nr_pages;
2216 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2219 while (nr_pages > 0) {
2220 uint64_t tmp;
2222 if (!sg_res) {
2223 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2225 sg_res = aligned_nrpages(sg->offset, sg->length);
2226 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2227 sg->dma_length = sg->length;
2228 pteval = (sg_phys(sg) - pgoff) | prot;
2229 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2232 if (!pte) {
2233 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2235 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2236 if (!pte)
2237 return -ENOMEM;
2238 /* It is a large page */
2239 if (largepage_lvl > 1) {
2240 unsigned long nr_superpages, end_pfn;
2242 pteval |= DMA_PTE_LARGE_PAGE;
2243 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2245 nr_superpages = sg_res / lvl_pages;
2246 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2249 * Ensure that old small page tables are
2250 * removed to make room for superpage(s).
2251 * We're adding new large pages, so make sure
2252 * we don't remove their parent tables.
2254 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2255 largepage_lvl + 1);
2256 } else {
2257 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2261 /* We don't need a lock here; nobody else
2262 * touches the iova range
2264 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2265 if (tmp) {
2266 static int dumps = 5;
2267 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2268 iov_pfn, tmp, (unsigned long long)pteval);
2269 if (dumps) {
2270 dumps--;
2271 debug_dma_dump_mappings(NULL);
2273 WARN_ON(1);
2276 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2278 BUG_ON(nr_pages < lvl_pages);
2279 BUG_ON(sg_res < lvl_pages);
2281 nr_pages -= lvl_pages;
2282 iov_pfn += lvl_pages;
2283 phys_pfn += lvl_pages;
2284 pteval += lvl_pages * VTD_PAGE_SIZE;
2285 sg_res -= lvl_pages;
2287 /* If the next PTE would be the first in a new page, then we
2288 need to flush the cache on the entries we've just written.
2289 And then we'll need to recalculate 'pte', so clear it and
2290 let it get set again in the if (!pte) block above.
2292 If we're done (!nr_pages) we need to flush the cache too.
2294 Also if we've been setting superpages, we may need to
2295 recalculate 'pte' and switch back to smaller pages for the
2296 end of the mapping, if the trailing size is not enough to
2297 use another superpage (i.e. sg_res < lvl_pages). */
2298 pte++;
2299 if (!nr_pages || first_pte_in_page(pte) ||
2300 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2301 domain_flush_cache(domain, first_pte,
2302 (void *)pte - (void *)first_pte);
2303 pte = NULL;
2306 if (!sg_res && nr_pages)
2307 sg = sg_next(sg);
2309 return 0;
2312 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2313 struct scatterlist *sg, unsigned long phys_pfn,
2314 unsigned long nr_pages, int prot)
2316 int ret;
2317 struct intel_iommu *iommu;
2319 /* Do the real mapping first */
2320 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2321 if (ret)
2322 return ret;
2324 /* Notify about the new mapping */
2325 if (domain_type_is_vm(domain)) {
2326 /* VM typed domains can have more than one IOMMU */
2327 int iommu_id;
2328 for_each_domain_iommu(iommu_id, domain) {
2329 iommu = g_iommus[iommu_id];
2330 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2332 } else {
2333 /* General domains only have one IOMMU */
2334 iommu = domain_get_iommu(domain);
2335 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2338 return 0;
2341 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2342 struct scatterlist *sg, unsigned long nr_pages,
2343 int prot)
2345 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2348 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2349 unsigned long phys_pfn, unsigned long nr_pages,
2350 int prot)
2352 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
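/*
 * Clear the context entry for (bus, devfn) and invalidate the
 * context-cache and IOTLB entries tagged with its old domain id.
 */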
2355 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2357 unsigned long flags;
2358 struct context_entry *context;
2359 u16 did_old;
2361 if (!iommu)
2362 return;
2364 spin_lock_irqsave(&iommu->lock, flags);
2365 context = iommu_context_addr(iommu, bus, devfn, 0);
2366 if (!context) {
2367 spin_unlock_irqrestore(&iommu->lock, flags);
2368 return;
2370 did_old = context_domain_id(context);
2371 context_clear_entry(context);
2372 __iommu_flush_cache(iommu, context, sizeof(*context));
2373 spin_unlock_irqrestore(&iommu->lock, flags);
2374 iommu->flush.flush_context(iommu,
2375 did_old,
2376 (((u16)bus) << 8) | devfn,
2377 DMA_CCMD_MASK_NOBIT,
2378 DMA_CCMD_DEVICE_INVL);
2379 iommu->flush.flush_iotlb(iommu,
2380 did_old,
2383 DMA_TLB_DSI_FLUSH);
2386 static inline void unlink_domain_info(struct device_domain_info *info)
2388 assert_spin_locked(&device_domain_lock);
2389 list_del(&info->link);
2390 list_del(&info->global);
2391 if (info->dev)
2392 info->dev->archdata.iommu = NULL;
2395 static void domain_remove_dev_info(struct dmar_domain *domain)
2397 struct device_domain_info *info, *tmp;
2398 unsigned long flags;
2400 spin_lock_irqsave(&device_domain_lock, flags);
2401 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2402 __dmar_remove_one_dev_info(info);
2403 spin_unlock_irqrestore(&device_domain_lock, flags);
2407 * find_domain
2408 * Note: we use struct device->archdata.iommu to store the info
2410 static struct dmar_domain *find_domain(struct device *dev)
2412 struct device_domain_info *info;
2414 /* No lock here, assumes no domain exit in normal case */
2415 info = dev->archdata.iommu;
2416 if (likely(info))
2417 return info->domain;
2418 return NULL;
2421 static inline struct device_domain_info *
2422 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2424 struct device_domain_info *info;
2426 list_for_each_entry(info, &device_domain_list, global)
2427 if (info->iommu->segment == segment && info->bus == bus &&
2428 info->devfn == devfn)
2429 return info;
2431 return NULL;
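/*
 * Allocate and register a device_domain_info for (bus, devfn): probe
 * ATS/PASID/PRI support, attach @domain to @iommu and set up the
 * context mapping. Returns the domain actually in use for the device,
 * which may differ from @domain if another caller got there first.
 */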
2434 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2435 int bus, int devfn,
2436 struct device *dev,
2437 struct dmar_domain *domain)
2439 struct dmar_domain *found = NULL;
2440 struct device_domain_info *info;
2441 unsigned long flags;
2442 int ret;
2444 info = alloc_devinfo_mem();
2445 if (!info)
2446 return NULL;
2448 info->bus = bus;
2449 info->devfn = devfn;
2450 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2451 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2452 info->ats_qdep = 0;
2453 info->dev = dev;
2454 info->domain = domain;
2455 info->iommu = iommu;
2456 info->pasid_table = NULL;
2458 if (dev && dev_is_pci(dev)) {
2459 struct pci_dev *pdev = to_pci_dev(info->dev);
2461 if (!pci_ats_disabled() &&
2462 ecap_dev_iotlb_support(iommu->ecap) &&
2463 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2464 dmar_find_matched_atsr_unit(pdev))
2465 info->ats_supported = 1;
2467 if (ecs_enabled(iommu)) {
2468 if (pasid_enabled(iommu)) {
2469 int features = pci_pasid_features(pdev);
2470 if (features >= 0)
2471 info->pasid_supported = features | 1;
2474 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2475 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2476 info->pri_supported = 1;
2480 spin_lock_irqsave(&device_domain_lock, flags);
2481 if (dev)
2482 found = find_domain(dev);
2484 if (!found) {
2485 struct device_domain_info *info2;
2486 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2487 if (info2) {
2488 found = info2->domain;
2489 info2->dev = dev;
2493 if (found) {
2494 spin_unlock_irqrestore(&device_domain_lock, flags);
2495 free_devinfo_mem(info);
2496 /* Caller must free the original domain */
2497 return found;
2500 spin_lock(&iommu->lock);
2501 ret = domain_attach_iommu(domain, iommu);
2502 spin_unlock(&iommu->lock);
2504 if (ret) {
2505 spin_unlock_irqrestore(&device_domain_lock, flags);
2506 free_devinfo_mem(info);
2507 return NULL;
2510 list_add(&info->link, &domain->devices);
2511 list_add(&info->global, &device_domain_list);
2512 if (dev)
2513 dev->archdata.iommu = info;
2515 if (dev && dev_is_pci(dev) && info->pasid_supported) {
2516 ret = intel_pasid_alloc_table(dev);
2517 if (ret) {
2518 pr_warn("No pasid table for %s, pasid disabled\n",
2519 dev_name(dev));
2520 info->pasid_supported = 0;
2523 spin_unlock_irqrestore(&device_domain_lock, flags);
2525 if (dev && domain_context_mapping(domain, dev)) {
2526 pr_err("Domain context map for %s failed\n", dev_name(dev));
2527 dmar_remove_one_dev_info(domain, dev);
2528 return NULL;
2531 return domain;
2534 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2536 *(u16 *)opaque = alias;
2537 return 0;
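/*
 * Find the domain already used by the device's DMA alias, or allocate
 * and initialize a fresh one with the requested guest address width.
 */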
2540 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2542 struct device_domain_info *info = NULL;
2543 struct dmar_domain *domain = NULL;
2544 struct intel_iommu *iommu;
2545 u16 dma_alias;
2546 unsigned long flags;
2547 u8 bus, devfn;
2549 iommu = device_to_iommu(dev, &bus, &devfn);
2550 if (!iommu)
2551 return NULL;
2553 if (dev_is_pci(dev)) {
2554 struct pci_dev *pdev = to_pci_dev(dev);
2556 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2558 spin_lock_irqsave(&device_domain_lock, flags);
2559 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2560 PCI_BUS_NUM(dma_alias),
2561 dma_alias & 0xff);
2562 if (info) {
2563 iommu = info->iommu;
2564 domain = info->domain;
2566 spin_unlock_irqrestore(&device_domain_lock, flags);
2568 /* DMA alias already has a domain, use it */
2569 if (info)
2570 goto out;
2573 /* Allocate and initialize new domain for the device */
2574 domain = alloc_domain(0);
2575 if (!domain)
2576 return NULL;
2577 if (domain_init(domain, iommu, gaw)) {
2578 domain_exit(domain);
2579 return NULL;
2582 out:
2584 return domain;
2587 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2588 struct dmar_domain *domain)
2590 struct intel_iommu *iommu;
2591 struct dmar_domain *tmp;
2592 u16 req_id, dma_alias;
2593 u8 bus, devfn;
2595 iommu = device_to_iommu(dev, &bus, &devfn);
2596 if (!iommu)
2597 return NULL;
2599 req_id = ((u16)bus << 8) | devfn;
2601 if (dev_is_pci(dev)) {
2602 struct pci_dev *pdev = to_pci_dev(dev);
2604 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2606 /* register PCI DMA alias device */
2607 if (req_id != dma_alias) {
2608 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2609 dma_alias & 0xff, NULL, domain);
2611 if (!tmp || tmp != domain)
2612 return tmp;
2616 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2617 if (!tmp || tmp != domain)
2618 return tmp;
2620 return domain;
2623 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2625 struct dmar_domain *domain, *tmp;
2627 domain = find_domain(dev);
2628 if (domain)
2629 goto out;
2631 domain = find_or_alloc_domain(dev, gaw);
2632 if (!domain)
2633 goto out;
2635 tmp = set_domain_for_dev(dev, domain);
2636 if (!tmp || domain != tmp) {
2637 domain_exit(domain);
2638 domain = tmp;
2641 out:
2643 return domain;
2646 static int iommu_domain_identity_map(struct dmar_domain *domain,
2647 unsigned long long start,
2648 unsigned long long end)
2650 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2651 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2653 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2654 dma_to_mm_pfn(last_vpfn))) {
2655 pr_err("Reserving iova failed\n");
2656 return -ENOMEM;
2659 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2661 * RMRR range might have overlap with physical memory range,
2662 * clear it first
2664 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2666 return __domain_mapping(domain, first_vpfn, NULL,
2667 first_vpfn, last_vpfn - first_vpfn + 1,
2668 DMA_PTE_READ|DMA_PTE_WRITE);
2671 static int domain_prepare_identity_map(struct device *dev,
2672 struct dmar_domain *domain,
2673 unsigned long long start,
2674 unsigned long long end)
2676 /* For _hardware_ passthrough, don't bother. But for software
2677 passthrough, we do it anyway -- it may indicate a memory
2678 range which is reserved in E820, and so didn't get set
2679 up to start with in si_domain */
2680 if (domain == si_domain && hw_pass_through) {
2681 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2682 dev_name(dev), start, end);
2683 return 0;
2686 pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2687 dev_name(dev), start, end);
2689 if (end < start) {
2690 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2691 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2692 dmi_get_system_info(DMI_BIOS_VENDOR),
2693 dmi_get_system_info(DMI_BIOS_VERSION),
2694 dmi_get_system_info(DMI_PRODUCT_VERSION));
2695 return -EIO;
2698 if (end >> agaw_to_width(domain->agaw)) {
2699 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2700 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2701 agaw_to_width(domain->agaw),
2702 dmi_get_system_info(DMI_BIOS_VENDOR),
2703 dmi_get_system_info(DMI_BIOS_VERSION),
2704 dmi_get_system_info(DMI_PRODUCT_VERSION));
2705 return -EIO;
2708 return iommu_domain_identity_map(domain, start, end);
2711 static int iommu_prepare_identity_map(struct device *dev,
2712 unsigned long long start,
2713 unsigned long long end)
2715 struct dmar_domain *domain;
2716 int ret;
2718 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2719 if (!domain)
2720 return -ENOMEM;
2722 ret = domain_prepare_identity_map(dev, domain, start, end);
2723 if (ret)
2724 domain_exit(domain);
2726 return ret;
2729 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2730 struct device *dev)
2732 if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2733 return 0;
2734 return iommu_prepare_identity_map(dev, rmrr->base_address,
2735 rmrr->end_address);
2738 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2739 static inline void iommu_prepare_isa(void)
2741 struct pci_dev *pdev;
2742 int ret;
2744 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2745 if (!pdev)
2746 return;
2748 pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2749 ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2751 if (ret)
2752 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2754 pci_dev_put(pdev);
2756 #else
2757 static inline void iommu_prepare_isa(void)
2759 return;
2761 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2763 static int md_domain_init(struct dmar_domain *domain, int guest_width);
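/*
 * Allocate the static identity (si) domain and, unless hardware
 * passthrough is in use, populate it with 1:1 mappings for all online
 * memory.
 */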
2765 static int __init si_domain_init(int hw)
2767 int nid, ret = 0;
2769 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2770 if (!si_domain)
2771 return -EFAULT;
2773 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2774 domain_exit(si_domain);
2775 return -EFAULT;
2778 pr_debug("Identity mapping domain allocated\n");
2780 if (hw)
2781 return 0;
2783 for_each_online_node(nid) {
2784 unsigned long start_pfn, end_pfn;
2785 int i;
2787 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2788 ret = iommu_domain_identity_map(si_domain,
2789 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2790 if (ret)
2791 return ret;
2795 return 0;
2798 static int identity_mapping(struct device *dev)
2800 struct device_domain_info *info;
2802 if (likely(!iommu_identity_mapping))
2803 return 0;
2805 info = dev->archdata.iommu;
2806 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2807 return (info->domain == si_domain);
2809 return 0;
2812 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2814 struct dmar_domain *ndomain;
2815 struct intel_iommu *iommu;
2816 u8 bus, devfn;
2818 iommu = device_to_iommu(dev, &bus, &devfn);
2819 if (!iommu)
2820 return -ENODEV;
2822 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2823 if (ndomain != domain)
2824 return -EBUSY;
2826 return 0;
2829 static bool device_has_rmrr(struct device *dev)
2831 struct dmar_rmrr_unit *rmrr;
2832 struct device *tmp;
2833 int i;
2835 rcu_read_lock();
2836 for_each_rmrr_units(rmrr) {
2838 * Return TRUE if this RMRR contains the device that
2839 * is passed in.
2841 for_each_active_dev_scope(rmrr->devices,
2842 rmrr->devices_cnt, i, tmp)
2843 if (tmp == dev) {
2844 rcu_read_unlock();
2845 return true;
2848 rcu_read_unlock();
2849 return false;
2853 * There are a couple cases where we need to restrict the functionality of
2854 * devices associated with RMRRs. The first is when evaluating a device for
2855 * identity mapping because problems exist when devices are moved in and out
2856 * of domains and their respective RMRR information is lost. This means that
2857 * a device with associated RMRRs will never be in a "passthrough" domain.
2858 * The second is use of the device through the IOMMU API. This interface
2859 * expects to have full control of the IOVA space for the device. We cannot
2860 * satisfy both the requirement that RMRR access is maintained and have an
2861 * unencumbered IOVA space. We also have no ability to quiesce the device's
2862 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2863 * We therefore prevent devices associated with an RMRR from participating in
2864 * the IOMMU API, which eliminates them from device assignment.
2866 * In both cases we assume that PCI USB devices with RMRRs have them largely
2867 * for historical reasons and that the RMRR space is not actively used post
2868 * boot. This exclusion may change if vendors begin to abuse it.
2870 * The same exception is made for graphics devices, with the requirement that
2871 * any use of the RMRR regions will be torn down before assigning the device
2872 * to a guest.
2874 static bool device_is_rmrr_locked(struct device *dev)
2876 if (!device_has_rmrr(dev))
2877 return false;
2879 if (dev_is_pci(dev)) {
2880 struct pci_dev *pdev = to_pci_dev(dev);
2882 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2883 return false;
2886 return true;
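/*
 * Decide whether a device should live in the static identity (1:1)
 * domain, based on RMRR restrictions, the IDENTMAP_* policy bits,
 * bridge topology for legacy PCI and, after boot, whether the device's
 * DMA mask covers all of memory.
 */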
2889 static int iommu_should_identity_map(struct device *dev, int startup)
2892 if (dev_is_pci(dev)) {
2893 struct pci_dev *pdev = to_pci_dev(dev);
2895 if (device_is_rmrr_locked(dev))
2896 return 0;
2898 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2899 return 1;
2901 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2902 return 1;
2904 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2905 return 0;
2908 * We want to start off with all devices in the 1:1 domain, and
2909 * take them out later if we find they can't access all of memory.
2911 * However, we can't do this for PCI devices behind bridges,
2912 * because all PCI devices behind the same bridge will end up
2913 * with the same source-id on their transactions.
2915 * Practically speaking, we can't change things around for these
2916 * devices at run-time, because we can't be sure there'll be no
2917 * DMA transactions in flight for any of their siblings.
2919 * So PCI devices (unless they're on the root bus) as well as
2920 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2921 * the 1:1 domain, just in _case_ one of their siblings turns out
2922 * not to be able to map all of memory.
2924 if (!pci_is_pcie(pdev)) {
2925 if (!pci_is_root_bus(pdev->bus))
2926 return 0;
2927 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2928 return 0;
2929 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2930 return 0;
2931 } else {
2932 if (device_has_rmrr(dev))
2933 return 0;
2937 * At boot time, we don't yet know if devices will be 64-bit capable.
2938 * Assume that they will — if they turn out not to be, then we can
2939 * take them out of the 1:1 domain later.
2941 if (!startup) {
2943 * If the device's dma_mask is less than the system's memory
2944 * size then this is not a candidate for identity mapping.
2946 u64 dma_mask = *dev->dma_mask;
2948 if (dev->coherent_dma_mask &&
2949 dev->coherent_dma_mask < dma_mask)
2950 dma_mask = dev->coherent_dma_mask;
2952 return dma_mask >= dma_get_required_mask(dev);
2955 return 1;
2958 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2960 int ret;
2962 if (!iommu_should_identity_map(dev, 1))
2963 return 0;
2965 ret = domain_add_dev_info(si_domain, dev);
2966 if (!ret)
2967 pr_info("%s identity mapping for device %s\n",
2968 hw ? "Hardware" : "Software", dev_name(dev));
2969 else if (ret == -ENODEV)
2970 /* device not associated with an iommu */
2971 ret = 0;
2973 return ret;
2977 static int __init iommu_prepare_static_identity_mapping(int hw)
2979 struct pci_dev *pdev = NULL;
2980 struct dmar_drhd_unit *drhd;
2981 struct intel_iommu *iommu;
2982 struct device *dev;
2983 int i;
2984 int ret = 0;
2986 for_each_pci_dev(pdev) {
2987 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2988 if (ret)
2989 return ret;
2992 for_each_active_iommu(iommu, drhd)
2993 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2994 struct acpi_device_physical_node *pn;
2995 struct acpi_device *adev;
2997 if (dev->bus != &acpi_bus_type)
2998 continue;
3000 adev = to_acpi_device(dev);
3001 mutex_lock(&adev->physical_node_lock);
3002 list_for_each_entry(pn, &adev->physical_node_list, node) {
3003 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
3004 if (ret)
3005 break;
3007 mutex_unlock(&adev->physical_node_lock);
3008 if (ret)
3009 return ret;
3012 return 0;
3015 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3018 * Start from the sane iommu hardware state.
3019 * If the queued invalidation is already initialized by us
3020 * (for example, while enabling interrupt-remapping) then
3021 * things are already rolling from a sane state.
3023 if (!iommu->qi) {
3025 * Clear any previous faults.
3027 dmar_fault(-1, iommu);
3029 * Disable queued invalidation if supported and already enabled
3030 * before OS handover.
3032 dmar_disable_qi(iommu);
3035 if (dmar_enable_qi(iommu)) {
3037 * Queued Invalidate not enabled, use Register Based Invalidate
3039 iommu->flush.flush_context = __iommu_flush_context;
3040 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3041 pr_info("%s: Using Register based invalidation\n",
3042 iommu->name);
3043 } else {
3044 iommu->flush.flush_context = qi_flush_context;
3045 iommu->flush.flush_iotlb = qi_flush_iotlb;
3046 pr_info("%s: Using Queued invalidation\n", iommu->name);
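/*
 * kdump support: copy one bus's context table from the previous kernel,
 * marking each entry as copied and reserving its domain id.
 */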
3050 static int copy_context_table(struct intel_iommu *iommu,
3051 struct root_entry *old_re,
3052 struct context_entry **tbl,
3053 int bus, bool ext)
3055 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3056 struct context_entry *new_ce = NULL, ce;
3057 struct context_entry *old_ce = NULL;
3058 struct root_entry re;
3059 phys_addr_t old_ce_phys;
3061 tbl_idx = ext ? bus * 2 : bus;
3062 memcpy(&re, old_re, sizeof(re));
3064 for (devfn = 0; devfn < 256; devfn++) {
3065 /* First calculate the correct index */
3066 idx = (ext ? devfn * 2 : devfn) % 256;
3068 if (idx == 0) {
3069 /* First save what we may have and clean up */
3070 if (new_ce) {
3071 tbl[tbl_idx] = new_ce;
3072 __iommu_flush_cache(iommu, new_ce,
3073 VTD_PAGE_SIZE);
3074 pos = 1;
3077 if (old_ce)
3078 memunmap(old_ce);
3080 ret = 0;
3081 if (devfn < 0x80)
3082 old_ce_phys = root_entry_lctp(&re);
3083 else
3084 old_ce_phys = root_entry_uctp(&re);
3086 if (!old_ce_phys) {
3087 if (ext && devfn == 0) {
3088 /* No LCTP, try UCTP */
3089 devfn = 0x7f;
3090 continue;
3091 } else {
3092 goto out;
3096 ret = -ENOMEM;
3097 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3098 MEMREMAP_WB);
3099 if (!old_ce)
3100 goto out;
3102 new_ce = alloc_pgtable_page(iommu->node);
3103 if (!new_ce)
3104 goto out_unmap;
3106 ret = 0;
3109 /* Now copy the context entry */
3110 memcpy(&ce, old_ce + idx, sizeof(ce));
3112 if (!__context_present(&ce))
3113 continue;
3115 did = context_domain_id(&ce);
3116 if (did >= 0 && did < cap_ndoms(iommu->cap))
3117 set_bit(did, iommu->domain_ids);
3120 * We need a marker for copied context entries. This
3121 * marker needs to work for the old format as well as
3122 * for extended context entries.
3124 * Bit 67 of the context entry is used. In the old
3125 * format this bit is available to software, in the
3126 * extended format it is the PGE bit, but PGE is ignored
3127 * by HW if PASIDs are disabled (and thus still
3128 * available).
3130 * So disable PASIDs first and then mark the entry
3131 * copied. This means that we don't copy PASID
3132 * translations from the old kernel, but this is fine as
3133 * faults there are not fatal.
3135 context_clear_pasid_enable(&ce);
3136 context_set_copied(&ce);
3138 new_ce[idx] = ce;
3141 tbl[tbl_idx + pos] = new_ce;
3143 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3145 out_unmap:
3146 memunmap(old_ce);
3148 out:
3149 return ret;
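/*
 * kdump support: take over the old kernel's root/context tables so that
 * DMA set up before the crash keeps working while the dump is taken.
 */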
3152 static int copy_translation_tables(struct intel_iommu *iommu)
3154 struct context_entry **ctxt_tbls;
3155 struct root_entry *old_rt;
3156 phys_addr_t old_rt_phys;
3157 int ctxt_table_entries;
3158 unsigned long flags;
3159 u64 rtaddr_reg;
3160 int bus, ret;
3161 bool new_ext, ext;
3163 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3164 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3165 new_ext = !!ecap_ecs(iommu->ecap);
3168 * The RTT bit can only be changed when translation is disabled,
3169 * but disabling translation means opening a window for data
3170 * corruption. So bail out and don't copy anything if we would
3171 * have to change the bit.
3173 if (new_ext != ext)
3174 return -EINVAL;
3176 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3177 if (!old_rt_phys)
3178 return -EINVAL;
3180 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3181 if (!old_rt)
3182 return -ENOMEM;
3184 /* This is too big for the stack - allocate it from slab */
3185 ctxt_table_entries = ext ? 512 : 256;
3186 ret = -ENOMEM;
3187 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3188 if (!ctxt_tbls)
3189 goto out_unmap;
3191 for (bus = 0; bus < 256; bus++) {
3192 ret = copy_context_table(iommu, &old_rt[bus],
3193 ctxt_tbls, bus, ext);
3194 if (ret) {
3195 pr_err("%s: Failed to copy context table for bus %d\n",
3196 iommu->name, bus);
3197 continue;
3201 spin_lock_irqsave(&iommu->lock, flags);
3203 /* Context tables are copied, now write them to the root_entry table */
3204 for (bus = 0; bus < 256; bus++) {
3205 int idx = ext ? bus * 2 : bus;
3206 u64 val;
3208 if (ctxt_tbls[idx]) {
3209 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3210 iommu->root_entry[bus].lo = val;
3213 if (!ext || !ctxt_tbls[idx + 1])
3214 continue;
3216 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3217 iommu->root_entry[bus].hi = val;
3220 spin_unlock_irqrestore(&iommu->lock, flags);
3222 kfree(ctxt_tbls);
3224 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3226 ret = 0;
3228 out_unmap:
3229 memunmap(old_rt);
3231 return ret;
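/*
 * Boot-time initialization: allocate per-IOMMU state and root entries
 * (or copy them in the kdump case), create identity and RMRR mappings,
 * then enable fault reporting and translation on each unit.
 */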
3234 static int __init init_dmars(void)
3236 struct dmar_drhd_unit *drhd;
3237 struct dmar_rmrr_unit *rmrr;
3238 bool copied_tables = false;
3239 struct device *dev;
3240 struct intel_iommu *iommu;
3241 int i, ret;
3244 * for each drhd
3245 * allocate root
3246 * initialize and program root entry to not present
3247 * endfor
3249 for_each_drhd_unit(drhd) {
3251 * Lock not needed as this is only incremented in the single-
3252 * threaded kernel __init code path; all other accesses are
3253 * read only.
3255 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3256 g_num_of_iommus++;
3257 continue;
3259 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3262 /* Preallocate enough resources for IOMMU hot-addition */
3263 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3264 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3266 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3267 GFP_KERNEL);
3268 if (!g_iommus) {
3269 pr_err("Allocating global iommu array failed\n");
3270 ret = -ENOMEM;
3271 goto error;
3274 for_each_active_iommu(iommu, drhd) {
3276 * Find the max pasid size of all IOMMUs in the system.
3277 * We need to ensure the system pasid table is no bigger
3278 * than the smallest supported.
3280 if (pasid_enabled(iommu)) {
3281 u32 temp = 2 << ecap_pss(iommu->ecap);
3283 intel_pasid_max_id = min_t(u32, temp,
3284 intel_pasid_max_id);
3287 g_iommus[iommu->seq_id] = iommu;
3289 intel_iommu_init_qi(iommu);
3291 ret = iommu_init_domains(iommu);
3292 if (ret)
3293 goto free_iommu;
3295 init_translation_status(iommu);
3297 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3298 iommu_disable_translation(iommu);
3299 clear_translation_pre_enabled(iommu);
3300 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3301 iommu->name);
3305 * TBD:
3306 * we could share the same root & context tables
3307 * among all IOMMUs. Need to split it later.
3309 ret = iommu_alloc_root_entry(iommu);
3310 if (ret)
3311 goto free_iommu;
3313 if (translation_pre_enabled(iommu)) {
3314 pr_info("Translation already enabled - trying to copy translation structures\n");
3316 ret = copy_translation_tables(iommu);
3317 if (ret) {
3319 * We found the IOMMU with translation
3320 * enabled - but failed to copy over the
3321 * old root-entry table. Try to proceed
3322 * by disabling translation now and
3323 * allocating a clean root-entry table.
3324 * This might cause DMAR faults, but
3325 * probably the dump will still succeed.
3327 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3328 iommu->name);
3329 iommu_disable_translation(iommu);
3330 clear_translation_pre_enabled(iommu);
3331 } else {
3332 pr_info("Copied translation tables from previous kernel for %s\n",
3333 iommu->name);
3334 copied_tables = true;
3338 if (!ecap_pass_through(iommu->ecap))
3339 hw_pass_through = 0;
3340 #ifdef CONFIG_INTEL_IOMMU_SVM
3341 if (pasid_enabled(iommu))
3342 intel_svm_init(iommu);
3343 #endif
3347 * Now that qi is enabled on all iommus, set the root entry and flush
3348 * caches. This is required on some Intel X58 chipsets, otherwise the
3349 * flush_context function will loop forever and the boot hangs.
3351 for_each_active_iommu(iommu, drhd) {
3352 iommu_flush_write_buffer(iommu);
3353 iommu_set_root_entry(iommu);
3354 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3355 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3358 if (iommu_pass_through)
3359 iommu_identity_mapping |= IDENTMAP_ALL;
3361 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3362 iommu_identity_mapping |= IDENTMAP_GFX;
3363 #endif
3365 check_tylersburg_isoch();
3367 if (iommu_identity_mapping) {
3368 ret = si_domain_init(hw_pass_through);
3369 if (ret)
3370 goto free_iommu;
3375 * If we copied translations from a previous kernel in the kdump
3376 * case, we can not assign the devices to domains now, as that
3377 * would eliminate the old mappings. So skip this part and defer
3378 * the assignment to device driver initialization time.
3380 if (copied_tables)
3381 goto domains_done;
3384 * If pass through is not set or not enabled, set up context entries for
3385 * identity mappings for rmrr, gfx, and isa and may fall back to static
3386 * identity mapping if iommu_identity_mapping is set.
3388 if (iommu_identity_mapping) {
3389 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3390 if (ret) {
3391 pr_crit("Failed to setup IOMMU pass-through\n");
3392 goto free_iommu;
3396 * For each rmrr
3397 * for each dev attached to rmrr
3398 * do
3399 * locate drhd for dev, alloc domain for dev
3400 * allocate free domain
3401 * allocate page table entries for rmrr
3402 * if context not allocated for bus
3403 * allocate and init context
3404 * set present in root table for this bus
3405 * init context with domain, translation etc
3406 * endfor
3407 * endfor
3409 pr_info("Setting RMRR:\n");
3410 for_each_rmrr_units(rmrr) {
3411 /* Some BIOSes list non-existent devices in the DMAR table. */
3412 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3413 i, dev) {
3414 ret = iommu_prepare_rmrr_dev(rmrr, dev);
3415 if (ret)
3416 pr_err("Mapping reserved region failed\n");
3420 iommu_prepare_isa();
3422 domains_done:
3425 * for each drhd
3426 * enable fault log
3427 * global invalidate context cache
3428 * global invalidate iotlb
3429 * enable translation
3431 for_each_iommu(iommu, drhd) {
3432 if (drhd->ignored) {
3434 * we always have to disable PMRs or DMA may fail on
3435 * this device
3437 if (force_on)
3438 iommu_disable_protect_mem_regions(iommu);
3439 continue;
3442 iommu_flush_write_buffer(iommu);
3444 #ifdef CONFIG_INTEL_IOMMU_SVM
3445 if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
3446 ret = intel_svm_enable_prq(iommu);
3447 if (ret)
3448 goto free_iommu;
3450 #endif
3451 ret = dmar_set_interrupt(iommu);
3452 if (ret)
3453 goto free_iommu;
3455 if (!translation_pre_enabled(iommu))
3456 iommu_enable_translation(iommu);
3458 iommu_disable_protect_mem_regions(iommu);
3461 return 0;
3463 free_iommu:
3464 for_each_active_iommu(iommu, drhd) {
3465 disable_dmar_iommu(iommu);
3466 free_dmar_iommu(iommu);
3469 kfree(g_iommus);
3471 error:
3472 return ret;
3475 /* This takes a number of _MM_ pages, not VTD pages */
3476 static unsigned long intel_alloc_iova(struct device *dev,
3477 struct dmar_domain *domain,
3478 unsigned long nrpages, uint64_t dma_mask)
3480 unsigned long iova_pfn = 0;
3482 /* Restrict dma_mask to the width that the iommu can handle */
3483 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3484 /* Ensure we reserve the whole size-aligned region */
3485 nrpages = __roundup_pow_of_two(nrpages);
3487 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3489 * First try to allocate an io virtual address in
3490 * DMA_BIT_MASK(32) and if that fails then try allocating
3491 * from higher range
3493 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3494 IOVA_PFN(DMA_BIT_MASK(32)), false);
3495 if (iova_pfn)
3496 return iova_pfn;
3498 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3499 IOVA_PFN(dma_mask), true);
3500 if (unlikely(!iova_pfn)) {
3501 pr_err("Allocating %ld-page iova for %s failed",
3502 nrpages, dev_name(dev));
3503 return 0;
3506 return iova_pfn;
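/*
 * Find the device's DMA domain, allocating one (and mapping any RMRRs
 * that cover the device) if it does not have one yet.
 */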
3509 struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3511 struct dmar_domain *domain, *tmp;
3512 struct dmar_rmrr_unit *rmrr;
3513 struct device *i_dev;
3514 int i, ret;
3516 domain = find_domain(dev);
3517 if (domain)
3518 goto out;
3520 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3521 if (!domain)
3522 goto out;
3524 /* We have a new domain - setup possible RMRRs for the device */
3525 rcu_read_lock();
3526 for_each_rmrr_units(rmrr) {
3527 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3528 i, i_dev) {
3529 if (i_dev != dev)
3530 continue;
3532 ret = domain_prepare_identity_map(dev, domain,
3533 rmrr->base_address,
3534 rmrr->end_address);
3535 if (ret)
3536 dev_err(dev, "Mapping reserved region failed\n");
3539 rcu_read_unlock();
3541 tmp = set_domain_for_dev(dev, domain);
3542 if (!tmp || domain != tmp) {
3543 domain_exit(domain);
3544 domain = tmp;
3547 out:
3549 if (!domain)
3550 pr_err("Allocating domain for %s failed\n", dev_name(dev));
3553 return domain;
3556 /* Check if the dev needs to go through non-identity map and unmap process.*/
3557 static int iommu_no_mapping(struct device *dev)
3559 int found;
3561 if (iommu_dummy(dev))
3562 return 1;
3564 if (!iommu_identity_mapping)
3565 return 0;
3567 found = identity_mapping(dev);
3568 if (found) {
3569 if (iommu_should_identity_map(dev, 0))
3570 return 1;
3571 else {
3573 * 32 bit DMA is removed from si_domain and fall back
3574 * to non-identity mapping.
3576 dmar_remove_one_dev_info(si_domain, dev);
3577 pr_info("32bit %s uses non-identity mapping\n",
3578 dev_name(dev));
3579 return 0;
3581 } else {
3583 * In case a 64 bit DMA device is detached from a VM, the device
3584 * is put into si_domain for identity mapping.
3586 if (iommu_should_identity_map(dev, 0)) {
3587 int ret;
3588 ret = domain_add_dev_info(si_domain, dev);
3589 if (!ret) {
3590 pr_info("64bit %s uses identity mapping\n",
3591 dev_name(dev));
3592 return 1;
3597 return 0;
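/*
 * Core of the DMA map path: for translated devices, allocate an IOVA
 * range below the device's DMA mask and map it to @paddr with the
 * protection implied by the DMA direction; identity-mapped devices get
 * @paddr back unchanged.
 */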
3600 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3601 size_t size, int dir, u64 dma_mask)
3603 struct dmar_domain *domain;
3604 phys_addr_t start_paddr;
3605 unsigned long iova_pfn;
3606 int prot = 0;
3607 int ret;
3608 struct intel_iommu *iommu;
3609 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3611 BUG_ON(dir == DMA_NONE);
3613 if (iommu_no_mapping(dev))
3614 return paddr;
3616 domain = get_valid_domain_for_dev(dev);
3617 if (!domain)
3618 return 0;
3620 iommu = domain_get_iommu(domain);
3621 size = aligned_nrpages(paddr, size);
3623 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3624 if (!iova_pfn)
3625 goto error;
3628 * Check if DMAR supports zero-length reads on write-only
3629 * mappings.
3631 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3632 !cap_zlr(iommu->cap))
3633 prot |= DMA_PTE_READ;
3634 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3635 prot |= DMA_PTE_WRITE;
3637 * paddr to (paddr + size) might span a partial page; we should map the
3638 * whole page. Note: if two parts of one page are separately mapped, we
3639 * might have two guest_addr mappings to the same host paddr, but this
3640 * is not a big problem
3642 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3643 mm_to_dma_pfn(paddr_pfn), size, prot);
3644 if (ret)
3645 goto error;
3647 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3648 start_paddr += paddr & ~PAGE_MASK;
3649 return start_paddr;
3651 error:
3652 if (iova_pfn)
3653 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3654 pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3655 dev_name(dev), size, (unsigned long long)paddr, dir);
3656 return 0;
3659 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3660 unsigned long offset, size_t size,
3661 enum dma_data_direction dir,
3662 unsigned long attrs)
3664 return __intel_map_single(dev, page_to_phys(page) + offset, size,
3665 dir, *dev->dma_mask);
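/*
 * Unmap a previously mapped IOVA range. In strict mode the IOTLB is
 * flushed and the IOVA freed immediately; otherwise both are deferred
 * via the flush queue.
 */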
3668 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3670 struct dmar_domain *domain;
3671 unsigned long start_pfn, last_pfn;
3672 unsigned long nrpages;
3673 unsigned long iova_pfn;
3674 struct intel_iommu *iommu;
3675 struct page *freelist;
3677 if (iommu_no_mapping(dev))
3678 return;
3680 domain = find_domain(dev);
3681 BUG_ON(!domain);
3683 iommu = domain_get_iommu(domain);
3685 iova_pfn = IOVA_PFN(dev_addr);
3687 nrpages = aligned_nrpages(dev_addr, size);
3688 start_pfn = mm_to_dma_pfn(iova_pfn);
3689 last_pfn = start_pfn + nrpages - 1;
3691 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3692 dev_name(dev), start_pfn, last_pfn);
3694 freelist = domain_unmap(domain, start_pfn, last_pfn);
3696 if (intel_iommu_strict) {
3697 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3698 nrpages, !freelist, 0);
3699 /* free iova */
3700 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3701 dma_free_pagelist(freelist);
3702 } else {
3703 queue_iova(&domain->iovad, iova_pfn, nrpages,
3704 (unsigned long)freelist);
3706 * queue up the release of the unmap to save the 1/6th of the
3707 * cpu used up by the iotlb flush operation...
3712 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3713 size_t size, enum dma_data_direction dir,
3714 unsigned long attrs)
3716 intel_unmap(dev, dev_addr, size);
3719 static void *intel_alloc_coherent(struct device *dev, size_t size,
3720 dma_addr_t *dma_handle, gfp_t flags,
3721 unsigned long attrs)
3723 struct page *page = NULL;
3724 int order;
3726 size = PAGE_ALIGN(size);
3727 order = get_order(size);
3729 if (!iommu_no_mapping(dev))
3730 flags &= ~(GFP_DMA | GFP_DMA32);
3731 else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3732 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3733 flags |= GFP_DMA;
3734 else
3735 flags |= GFP_DMA32;
3738 if (gfpflags_allow_blocking(flags)) {
3739 unsigned int count = size >> PAGE_SHIFT;
3741 page = dma_alloc_from_contiguous(dev, count, order,
3742 flags & __GFP_NOWARN);
3743 if (page && iommu_no_mapping(dev) &&
3744 page_to_phys(page) + size > dev->coherent_dma_mask) {
3745 dma_release_from_contiguous(dev, page, count);
3746 page = NULL;
3750 if (!page)
3751 page = alloc_pages(flags, order);
3752 if (!page)
3753 return NULL;
3754 memset(page_address(page), 0, size);
3756 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3757 DMA_BIDIRECTIONAL,
3758 dev->coherent_dma_mask);
3759 if (*dma_handle)
3760 return page_address(page);
3761 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3762 __free_pages(page, order);
3764 return NULL;
3767 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3768 dma_addr_t dma_handle, unsigned long attrs)
3770 int order;
3771 struct page *page = virt_to_page(vaddr);
3773 size = PAGE_ALIGN(size);
3774 order = get_order(size);
3776 intel_unmap(dev, dma_handle, size);
3777 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3778 __free_pages(page, order);
3781 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3782 int nelems, enum dma_data_direction dir,
3783 unsigned long attrs)
3785 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3786 unsigned long nrpages = 0;
3787 struct scatterlist *sg;
3788 int i;
3790 for_each_sg(sglist, sg, nelems, i) {
3791 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3794 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3797 static int intel_nontranslate_map_sg(struct device *hddev,
3798 struct scatterlist *sglist, int nelems, int dir)
3800 int i;
3801 struct scatterlist *sg;
3803 for_each_sg(sglist, sg, nelems, i) {
3804 BUG_ON(!sg_page(sg));
3805 sg->dma_address = sg_phys(sg);
3806 sg->dma_length = sg->length;
3808 return nelems;
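/*
 * DMA scatterlist map: allocate one IOVA range large enough for all
 * segments and map them back to back in IOVA space.
 */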
3811 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3812 enum dma_data_direction dir, unsigned long attrs)
3814 int i;
3815 struct dmar_domain *domain;
3816 size_t size = 0;
3817 int prot = 0;
3818 unsigned long iova_pfn;
3819 int ret;
3820 struct scatterlist *sg;
3821 unsigned long start_vpfn;
3822 struct intel_iommu *iommu;
3824 BUG_ON(dir == DMA_NONE);
3825 if (iommu_no_mapping(dev))
3826 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3828 domain = get_valid_domain_for_dev(dev);
3829 if (!domain)
3830 return 0;
3832 iommu = domain_get_iommu(domain);
3834 for_each_sg(sglist, sg, nelems, i)
3835 size += aligned_nrpages(sg->offset, sg->length);
3837 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3838 *dev->dma_mask);
3839 if (!iova_pfn) {
3840 sglist->dma_length = 0;
3841 return 0;
3845 * Check if DMAR supports zero-length reads on write-only
3846 * mappings.
3848 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3849 !cap_zlr(iommu->cap))
3850 prot |= DMA_PTE_READ;
3851 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3852 prot |= DMA_PTE_WRITE;
3854 start_vpfn = mm_to_dma_pfn(iova_pfn);
3856 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3857 if (unlikely(ret)) {
3858 dma_pte_free_pagetable(domain, start_vpfn,
3859 start_vpfn + size - 1,
3860 agaw_to_level(domain->agaw) + 1);
3861 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3862 return 0;
3865 return nelems;
3868 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3870 return !dma_addr;
3873 static const struct dma_map_ops intel_dma_ops = {
3874 .alloc = intel_alloc_coherent,
3875 .free = intel_free_coherent,
3876 .map_sg = intel_map_sg,
3877 .unmap_sg = intel_unmap_sg,
3878 .map_page = intel_map_page,
3879 .unmap_page = intel_unmap_page,
3880 .mapping_error = intel_mapping_error,
3881 .dma_supported = dma_direct_supported,
3884 static inline int iommu_domain_cache_init(void)
3886 int ret = 0;
3888 iommu_domain_cache = kmem_cache_create("iommu_domain",
3889 sizeof(struct dmar_domain),
3891 SLAB_HWCACHE_ALIGN,
3893 NULL);
3894 if (!iommu_domain_cache) {
3895 pr_err("Couldn't create iommu_domain cache\n");
3896 ret = -ENOMEM;
3899 return ret;
3902 static inline int iommu_devinfo_cache_init(void)
3904 int ret = 0;
3906 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3907 sizeof(struct device_domain_info),
3909 SLAB_HWCACHE_ALIGN,
3910 NULL);
3911 if (!iommu_devinfo_cache) {
3912 pr_err("Couldn't create devinfo cache\n");
3913 ret = -ENOMEM;
3916 return ret;
3919 static int __init iommu_init_mempool(void)
3921 int ret;
3922 ret = iova_cache_get();
3923 if (ret)
3924 return ret;
3926 ret = iommu_domain_cache_init();
3927 if (ret)
3928 goto domain_error;
3930 ret = iommu_devinfo_cache_init();
3931 if (!ret)
3932 return ret;
3934 kmem_cache_destroy(iommu_domain_cache);
3935 domain_error:
3936 iova_cache_put();
3938 return -ENOMEM;
3941 static void __init iommu_exit_mempool(void)
3943 kmem_cache_destroy(iommu_devinfo_cache);
3944 kmem_cache_destroy(iommu_domain_cache);
3945 iova_cache_put();
3948 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3950 struct dmar_drhd_unit *drhd;
3951 u32 vtbar;
3952 int rc;
3954 /* We know that this device on this chipset has its own IOMMU.
3955 * If we find it under a different IOMMU, then the BIOS is lying
3956 * to us. Hope that the IOMMU for this device is actually
3957 * disabled, and it needs no translation...
3959 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3960 if (rc) {
3961 /* "can't" happen */
3962 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3963 return;
3965 vtbar &= 0xffff0000;
3967 /* we know that this iommu should be at offset 0xa000 from vtbar */
3968 drhd = dmar_find_matched_drhd_unit(pdev);
3969 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3970 TAINT_FIRMWARE_WORKAROUND,
3971 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3972 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3974 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3976 static void __init init_no_remapping_devices(void)
3978 struct dmar_drhd_unit *drhd;
3979 struct device *dev;
3980 int i;
3982 for_each_drhd_unit(drhd) {
3983 if (!drhd->include_all) {
3984 for_each_active_dev_scope(drhd->devices,
3985 drhd->devices_cnt, i, dev)
3986 break;
3987 /* ignore DMAR unit if no devices exist */
3988 if (i == drhd->devices_cnt)
3989 drhd->ignored = 1;
3993 for_each_active_drhd_unit(drhd) {
3994 if (drhd->include_all)
3995 continue;
3997 for_each_active_dev_scope(drhd->devices,
3998 drhd->devices_cnt, i, dev)
3999 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4000 break;
4001 if (i < drhd->devices_cnt)
4002 continue;
4004 /* This IOMMU has *only* gfx devices. Either bypass it or
4005 set the gfx_mapped flag, as appropriate */
4006 if (dmar_map_gfx) {
4007 intel_iommu_gfx_mapped = 1;
4008 } else {
4009 drhd->ignored = 1;
4010 for_each_active_dev_scope(drhd->devices,
4011 drhd->devices_cnt, i, dev)
4012 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4017 #ifdef CONFIG_SUSPEND
4018 static int init_iommu_hw(void)
4020 struct dmar_drhd_unit *drhd;
4021 struct intel_iommu *iommu = NULL;
4023 for_each_active_iommu(iommu, drhd)
4024 if (iommu->qi)
4025 dmar_reenable_qi(iommu);
4027 for_each_iommu(iommu, drhd) {
4028 if (drhd->ignored) {
4030 * we always have to disable PMRs or DMA may fail on
4031 * this device
4033 if (force_on)
4034 iommu_disable_protect_mem_regions(iommu);
4035 continue;
4038 iommu_flush_write_buffer(iommu);
4040 iommu_set_root_entry(iommu);
4042 iommu->flush.flush_context(iommu, 0, 0, 0,
4043 DMA_CCMD_GLOBAL_INVL);
4044 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4045 iommu_enable_translation(iommu);
4046 iommu_disable_protect_mem_regions(iommu);
4049 return 0;
4052 static void iommu_flush_all(void)
4054 struct dmar_drhd_unit *drhd;
4055 struct intel_iommu *iommu;
4057 for_each_active_iommu(iommu, drhd) {
4058 iommu->flush.flush_context(iommu, 0, 0, 0,
4059 DMA_CCMD_GLOBAL_INVL);
4060 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4061 DMA_TLB_GLOBAL_FLUSH);
4065 static int iommu_suspend(void)
4067 struct dmar_drhd_unit *drhd;
4068 struct intel_iommu *iommu = NULL;
4069 unsigned long flag;
4071 for_each_active_iommu(iommu, drhd) {
4072 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4073 GFP_ATOMIC);
4074 if (!iommu->iommu_state)
4075 goto nomem;
4078 iommu_flush_all();
4080 for_each_active_iommu(iommu, drhd) {
4081 iommu_disable_translation(iommu);
4083 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4085 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4086 readl(iommu->reg + DMAR_FECTL_REG);
4087 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4088 readl(iommu->reg + DMAR_FEDATA_REG);
4089 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4090 readl(iommu->reg + DMAR_FEADDR_REG);
4091 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4092 readl(iommu->reg + DMAR_FEUADDR_REG);
4094 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4096 return 0;
4098 nomem:
4099 for_each_active_iommu(iommu, drhd)
4100 kfree(iommu->iommu_state);
4102 return -ENOMEM;
4105 static void iommu_resume(void)
4107 struct dmar_drhd_unit *drhd;
4108 struct intel_iommu *iommu = NULL;
4109 unsigned long flag;
4111 if (init_iommu_hw()) {
4112 if (force_on)
4113 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4114 else
4115 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4116 return;
4119 for_each_active_iommu(iommu, drhd) {
4121 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4123 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4124 iommu->reg + DMAR_FECTL_REG);
4125 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4126 iommu->reg + DMAR_FEDATA_REG);
4127 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4128 iommu->reg + DMAR_FEADDR_REG);
4129 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4130 iommu->reg + DMAR_FEUADDR_REG);
4132 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4135 for_each_active_iommu(iommu, drhd)
4136 kfree(iommu->iommu_state);
4139 static struct syscore_ops iommu_syscore_ops = {
4140 .resume = iommu_resume,
4141 .suspend = iommu_suspend,
4144 static void __init init_iommu_pm_ops(void)
4146 register_syscore_ops(&iommu_syscore_ops);
4149 #else
4150 static inline void init_iommu_pm_ops(void) {}
4151 #endif /* CONFIG_SUSPEND */
4154 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4156 struct acpi_dmar_reserved_memory *rmrr;
4157 int prot = DMA_PTE_READ|DMA_PTE_WRITE;
4158 struct dmar_rmrr_unit *rmrru;
4159 size_t length;
4161 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4162 if (!rmrru)
4163 goto out;
4165 rmrru->hdr = header;
4166 rmrr = (struct acpi_dmar_reserved_memory *)header;
4167 rmrru->base_address = rmrr->base_address;
4168 rmrru->end_address = rmrr->end_address;
4170 length = rmrr->end_address - rmrr->base_address + 1;
4171 rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
4172 IOMMU_RESV_DIRECT);
4173 if (!rmrru->resv)
4174 goto free_rmrru;
4176 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4177 ((void *)rmrr) + rmrr->header.length,
4178 &rmrru->devices_cnt);
4179 if (rmrru->devices_cnt && rmrru->devices == NULL)
4180 goto free_all;
4182 list_add(&rmrru->list, &dmar_rmrr_units);
4184 return 0;
4185 free_all:
4186 kfree(rmrru->resv);
4187 free_rmrru:
4188 kfree(rmrru);
4189 out:
4190 return -ENOMEM;
4193 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4195 struct dmar_atsr_unit *atsru;
4196 struct acpi_dmar_atsr *tmp;
4198 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4199 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4200 if (atsr->segment != tmp->segment)
4201 continue;
4202 if (atsr->header.length != tmp->header.length)
4203 continue;
4204 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4205 return atsru;
4208 return NULL;
4211 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4213 struct acpi_dmar_atsr *atsr;
4214 struct dmar_atsr_unit *atsru;
4216 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4217 return 0;
4219 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4220 atsru = dmar_find_atsr(atsr);
4221 if (atsru)
4222 return 0;
4224 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4225 if (!atsru)
4226 return -ENOMEM;
4229 * If memory is allocated from slab by ACPI _DSM method, we need to
4230 * copy the memory content because the memory buffer will be freed
4231 * on return.
4233 atsru->hdr = (void *)(atsru + 1);
4234 memcpy(atsru->hdr, hdr, hdr->length);
4235 atsru->include_all = atsr->flags & 0x1;
4236 if (!atsru->include_all) {
4237 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4238 (void *)atsr + atsr->header.length,
4239 &atsru->devices_cnt);
4240 if (atsru->devices_cnt && atsru->devices == NULL) {
4241 kfree(atsru);
4242 return -ENOMEM;
4246 list_add_rcu(&atsru->list, &dmar_atsr_units);
4248 return 0;
4251 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4253 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4254 kfree(atsru);
4257 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4259 struct acpi_dmar_atsr *atsr;
4260 struct dmar_atsr_unit *atsru;
4262 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4263 atsru = dmar_find_atsr(atsr);
4264 if (atsru) {
4265 list_del_rcu(&atsru->list);
4266 synchronize_rcu();
4267 intel_iommu_free_atsr(atsru);
4270 return 0;
int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	int i;
	struct device *dev;
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (!atsru)
		return 0;

	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
					  i, dev)
			return -EBUSY;
	}

	return 0;
}
4294 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4296 int sp, ret = 0;
4297 struct intel_iommu *iommu = dmaru->iommu;
4299 if (g_iommus[iommu->seq_id])
4300 return 0;
4302 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4303 pr_warn("%s: Doesn't support hardware pass through.\n",
4304 iommu->name);
4305 return -ENXIO;
4307 if (!ecap_sc_support(iommu->ecap) &&
4308 domain_update_iommu_snooping(iommu)) {
4309 pr_warn("%s: Doesn't support snooping.\n",
4310 iommu->name);
4311 return -ENXIO;
4313 sp = domain_update_iommu_superpage(iommu) - 1;
4314 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4315 pr_warn("%s: Doesn't support large page.\n",
4316 iommu->name);
4317 return -ENXIO;
	/*
	 * Disable translation if already enabled prior to OS handover.
	 */
4323 if (iommu->gcmd & DMA_GCMD_TE)
4324 iommu_disable_translation(iommu);
4326 g_iommus[iommu->seq_id] = iommu;
4327 ret = iommu_init_domains(iommu);
4328 if (ret == 0)
4329 ret = iommu_alloc_root_entry(iommu);
4330 if (ret)
4331 goto out;
4333 #ifdef CONFIG_INTEL_IOMMU_SVM
4334 if (pasid_enabled(iommu))
4335 intel_svm_init(iommu);
4336 #endif
	if (dmaru->ignored) {
		/*
		 * we always have to disable PMRs or DMA may fail on this device
		 */
		if (force_on)
			iommu_disable_protect_mem_regions(iommu);
		return 0;
	}
4347 intel_iommu_init_qi(iommu);
4348 iommu_flush_write_buffer(iommu);
4350 #ifdef CONFIG_INTEL_IOMMU_SVM
4351 if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
4352 ret = intel_svm_enable_prq(iommu);
4353 if (ret)
4354 goto disable_iommu;
4356 #endif
4357 ret = dmar_set_interrupt(iommu);
4358 if (ret)
4359 goto disable_iommu;
4361 iommu_set_root_entry(iommu);
4362 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4363 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4364 iommu_enable_translation(iommu);
4366 iommu_disable_protect_mem_regions(iommu);
4367 return 0;
disable_iommu:
	disable_dmar_iommu(iommu);
out:
	free_dmar_iommu(iommu);
	return ret;
}
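/*
 * DMAR unit hotplug entry point: bring a newly reported DRHD online via
 * intel_iommu_add(), or tear its state down again on removal.
 */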
int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
{
	int ret = 0;
	struct intel_iommu *iommu = dmaru->iommu;

	if (!intel_iommu_enabled)
		return 0;
	if (iommu == NULL)
		return -EINVAL;

	if (insert) {
		ret = intel_iommu_add(dmaru);
	} else {
		disable_dmar_iommu(iommu);
		free_dmar_iommu(iommu);
	}

	return ret;
}
static void intel_iommu_free_dmars(void)
{
	struct dmar_rmrr_unit *rmrru, *rmrr_n;
	struct dmar_atsr_unit *atsru, *atsr_n;

	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
		list_del(&rmrru->list);
		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
		kfree(rmrru->resv);
		kfree(rmrru);
	}

	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
		list_del(&atsru->list);
		intel_iommu_free_atsr(atsru);
	}
}
4414 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4416 int i, ret = 1;
4417 struct pci_bus *bus;
4418 struct pci_dev *bridge = NULL;
4419 struct device *tmp;
4420 struct acpi_dmar_atsr *atsr;
4421 struct dmar_atsr_unit *atsru;
4423 dev = pci_physfn(dev);
4424 for (bus = dev->bus; bus; bus = bus->parent) {
4425 bridge = bus->self;
4426 /* If it's an integrated device, allow ATS */
4427 if (!bridge)
4428 return 1;
4429 /* Connected via non-PCIe: no ATS */
4430 if (!pci_is_pcie(bridge) ||
4431 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4432 return 0;
4433 /* If we found the root port, look it up in the ATSR */
4434 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4435 break;
4438 rcu_read_lock();
4439 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4440 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4441 if (atsr->segment != pci_domain_nr(dev->bus))
4442 continue;
4444 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4445 if (tmp == &bridge->dev)
4446 goto out;
4448 if (atsru->include_all)
4449 goto out;
4451 ret = 0;
4452 out:
4453 rcu_read_unlock();
4455 return ret;
4458 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4460 int ret = 0;
4461 struct dmar_rmrr_unit *rmrru;
4462 struct dmar_atsr_unit *atsru;
4463 struct acpi_dmar_atsr *atsr;
4464 struct acpi_dmar_reserved_memory *rmrr;
4466 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4467 return 0;
4469 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4470 rmrr = container_of(rmrru->hdr,
4471 struct acpi_dmar_reserved_memory, header);
4472 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4473 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4474 ((void *)rmrr) + rmrr->header.length,
4475 rmrr->segment, rmrru->devices,
4476 rmrru->devices_cnt);
			if (ret < 0)
4478 return ret;
4479 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4480 dmar_remove_dev_scope(info, rmrr->segment,
4481 rmrru->devices, rmrru->devices_cnt);
4485 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4486 if (atsru->include_all)
4487 continue;
4489 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4490 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4491 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4492 (void *)atsr + atsr->header.length,
4493 atsr->segment, atsru->devices,
4494 atsru->devices_cnt);
4495 if (ret > 0)
4496 break;
			else if (ret < 0)
4498 return ret;
4499 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4500 if (dmar_remove_dev_scope(info, atsr->segment,
4501 atsru->devices, atsru->devices_cnt))
4502 break;
4506 return 0;
/*
 * Here we only respond to action of unbound device from driver.
 *
 * Added device is not attached to its DMAR domain here yet. That will happen
 * when mapping the device to iova.
 */
4515 static int device_notifier(struct notifier_block *nb,
4516 unsigned long action, void *data)
4518 struct device *dev = data;
4519 struct dmar_domain *domain;
4521 if (iommu_dummy(dev))
4522 return 0;
4524 if (action != BUS_NOTIFY_REMOVED_DEVICE)
4525 return 0;
4527 domain = find_domain(dev);
4528 if (!domain)
4529 return 0;
4531 dmar_remove_one_dev_info(domain, dev);
4532 if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4533 domain_exit(domain);
4535 return 0;
static struct notifier_block device_nb = {
	.notifier_call = device_notifier,
};
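/*
 * Keep the si_domain identity map in sync with memory hotplug: build an
 * identity mapping when a range goes online, and unmap and flush it again
 * when the range goes offline.
 */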
4542 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4543 unsigned long val, void *v)
4545 struct memory_notify *mhp = v;
4546 unsigned long long start, end;
4547 unsigned long start_vpfn, last_vpfn;
4549 switch (val) {
4550 case MEM_GOING_ONLINE:
4551 start = mhp->start_pfn << PAGE_SHIFT;
4552 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4553 if (iommu_domain_identity_map(si_domain, start, end)) {
4554 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4555 start, end);
4556 return NOTIFY_BAD;
4558 break;
4560 case MEM_OFFLINE:
4561 case MEM_CANCEL_ONLINE:
4562 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4563 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4564 while (start_vpfn <= last_vpfn) {
4565 struct iova *iova;
4566 struct dmar_drhd_unit *drhd;
4567 struct intel_iommu *iommu;
4568 struct page *freelist;
4570 iova = find_iova(&si_domain->iovad, start_vpfn);
4571 if (iova == NULL) {
4572 pr_debug("Failed get IOVA for PFN %lx\n",
4573 start_vpfn);
4574 break;
4577 iova = split_and_remove_iova(&si_domain->iovad, iova,
4578 start_vpfn, last_vpfn);
4579 if (iova == NULL) {
4580 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4581 start_vpfn, last_vpfn);
4582 return NOTIFY_BAD;
4585 freelist = domain_unmap(si_domain, iova->pfn_lo,
4586 iova->pfn_hi);
4588 rcu_read_lock();
4589 for_each_active_iommu(iommu, drhd)
4590 iommu_flush_iotlb_psi(iommu, si_domain,
4591 iova->pfn_lo, iova_size(iova),
4592 !freelist, 0);
4593 rcu_read_unlock();
4594 dma_free_pagelist(freelist);
4596 start_vpfn = iova->pfn_hi + 1;
4597 free_iova_mem(iova);
4599 break;
4602 return NOTIFY_OK;
static struct notifier_block intel_iommu_memory_nb = {
	.notifier_call = intel_iommu_memory_notifier,
	.priority = 0
};
4610 static void free_all_cpu_cached_iovas(unsigned int cpu)
4612 int i;
4614 for (i = 0; i < g_num_of_iommus; i++) {
4615 struct intel_iommu *iommu = g_iommus[i];
4616 struct dmar_domain *domain;
4617 int did;
4619 if (!iommu)
4620 continue;
4622 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4623 domain = get_iommu_domain(iommu, (u16)did);
4625 if (!domain)
4626 continue;
4627 free_cpu_cached_iovas(cpu, &domain->iovad);
4632 static int intel_iommu_cpu_dead(unsigned int cpu)
4634 free_all_cpu_cached_iovas(cpu);
4635 return 0;
4638 static void intel_disable_iommus(void)
4640 struct intel_iommu *iommu = NULL;
4641 struct dmar_drhd_unit *drhd;
4643 for_each_iommu(iommu, drhd)
4644 iommu_disable_translation(iommu);
4647 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4649 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4651 return container_of(iommu_dev, struct intel_iommu, iommu);
4654 static ssize_t intel_iommu_show_version(struct device *dev,
4655 struct device_attribute *attr,
4656 char *buf)
4658 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4659 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4660 return sprintf(buf, "%d:%d\n",
4661 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4663 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4665 static ssize_t intel_iommu_show_address(struct device *dev,
4666 struct device_attribute *attr,
4667 char *buf)
4669 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4670 return sprintf(buf, "%llx\n", iommu->reg_phys);
4672 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4674 static ssize_t intel_iommu_show_cap(struct device *dev,
4675 struct device_attribute *attr,
4676 char *buf)
4678 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4679 return sprintf(buf, "%llx\n", iommu->cap);
4681 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4683 static ssize_t intel_iommu_show_ecap(struct device *dev,
4684 struct device_attribute *attr,
4685 char *buf)
4687 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4688 return sprintf(buf, "%llx\n", iommu->ecap);
4690 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4692 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4693 struct device_attribute *attr,
4694 char *buf)
4696 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4697 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4699 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4701 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4702 struct device_attribute *attr,
4703 char *buf)
4705 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4706 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4707 cap_ndoms(iommu->cap)));
4709 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
static struct attribute *intel_iommu_attrs[] = {
	&dev_attr_version.attr,
	&dev_attr_address.attr,
	&dev_attr_cap.attr,
	&dev_attr_ecap.attr,
	&dev_attr_domains_supported.attr,
	&dev_attr_domains_used.attr,
	NULL,
};

static struct attribute_group intel_iommu_group = {
	.name = "intel-iommu",
	.attrs = intel_iommu_attrs,
};

const struct attribute_group *intel_iommu_groups[] = {
	&intel_iommu_group,
	NULL,
};
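/*
 * Note: these attributes are registered per DMAR unit via
 * iommu_device_sysfs_add() below; on a typical system they are expected to
 * appear under /sys/class/iommu/dmar<N>/intel-iommu/ (path given for
 * illustration).
 */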
4731 int __init intel_iommu_init(void)
4733 int ret = -ENODEV;
4734 struct dmar_drhd_unit *drhd;
4735 struct intel_iommu *iommu;
4737 /* VT-d is required for a TXT/tboot launch, so enforce that */
4738 force_on = tboot_force_iommu();
4740 if (iommu_init_mempool()) {
4741 if (force_on)
4742 panic("tboot: Failed to initialize iommu memory\n");
4743 return -ENOMEM;
4746 down_write(&dmar_global_lock);
4747 if (dmar_table_init()) {
4748 if (force_on)
4749 panic("tboot: Failed to initialize DMAR table\n");
4750 goto out_free_dmar;
4753 if (dmar_dev_scope_init() < 0) {
4754 if (force_on)
4755 panic("tboot: Failed to initialize DMAR device scope\n");
4756 goto out_free_dmar;
4759 up_write(&dmar_global_lock);
	/*
	 * The bus notifier takes the dmar_global_lock, so lockdep will
	 * complain later when we register it under the lock.
	 */
4765 dmar_register_bus_notifier();
4767 down_write(&dmar_global_lock);
	if (no_iommu || dmar_disabled) {
		/*
		 * We exit the function here to ensure IOMMU's remapping and
		 * mempool aren't setup, which means that the IOMMU's PMRs
		 * won't be disabled via the call to init_dmars(). So disable
		 * it explicitly here. The PMRs were setup by tboot prior to
		 * calling SENTER, but the kernel is expected to reset/tear
		 * down the PMRs.
		 */
		if (intel_iommu_tboot_noforce) {
			for_each_iommu(iommu, drhd)
				iommu_disable_protect_mem_regions(iommu);
		}

		/*
		 * Make sure the IOMMUs are switched off, even when we
		 * boot into a kexec kernel and the previous kernel left
		 * them enabled
		 */
		intel_disable_iommus();
		goto out_free_dmar;
	}
4792 if (list_empty(&dmar_rmrr_units))
4793 pr_info("No RMRR found\n");
4795 if (list_empty(&dmar_atsr_units))
4796 pr_info("No ATSR found\n");
4798 if (dmar_init_reserved_ranges()) {
4799 if (force_on)
4800 panic("tboot: Failed to reserve iommu ranges\n");
4801 goto out_free_reserved_range;
4804 init_no_remapping_devices();
4806 ret = init_dmars();
4807 if (ret) {
4808 if (force_on)
4809 panic("tboot: Failed to initialize DMARs\n");
4810 pr_err("Initialization failed\n");
4811 goto out_free_reserved_range;
4813 up_write(&dmar_global_lock);
4814 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4816 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4817 swiotlb = 0;
4818 #endif
4819 dma_ops = &intel_dma_ops;
4821 init_iommu_pm_ops();
4823 for_each_active_iommu(iommu, drhd) {
4824 iommu_device_sysfs_add(&iommu->iommu, NULL,
4825 intel_iommu_groups,
4826 "%s", iommu->name);
4827 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4828 iommu_device_register(&iommu->iommu);
4831 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4832 bus_register_notifier(&pci_bus_type, &device_nb);
4833 if (si_domain && !hw_pass_through)
4834 register_memory_notifier(&intel_iommu_memory_nb);
4835 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4836 intel_iommu_cpu_dead);
4837 intel_iommu_enabled = 1;
4838 intel_iommu_debugfs_init();
4840 return 0;
4842 out_free_reserved_range:
4843 put_iova_domain(&reserved_iova_list);
4844 out_free_dmar:
4845 intel_iommu_free_dmars();
4846 up_write(&dmar_global_lock);
4847 iommu_exit_mempool();
4848 return ret;
4851 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4853 struct intel_iommu *iommu = opaque;
4855 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4856 return 0;
/*
 * NB - intel-iommu lacks any sort of reference counting for the users of
 * dependent devices.  If multiple endpoints have intersecting dependent
 * devices, unbinding the driver from any one of them will possibly leave
 * the others unable to operate.
 */
4865 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4867 if (!iommu || !dev || !dev_is_pci(dev))
4868 return;
4870 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4873 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4875 struct intel_iommu *iommu;
4876 unsigned long flags;
4878 assert_spin_locked(&device_domain_lock);
4880 if (WARN_ON(!info))
4881 return;
4883 iommu = info->iommu;
4885 if (info->dev) {
4886 iommu_disable_dev_iotlb(info);
4887 domain_context_clear(iommu, info->dev);
4888 intel_pasid_free_table(info->dev);
4891 unlink_domain_info(info);
4893 spin_lock_irqsave(&iommu->lock, flags);
4894 domain_detach_iommu(info->domain, iommu);
4895 spin_unlock_irqrestore(&iommu->lock, flags);
4897 free_devinfo_mem(info);
4900 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4901 struct device *dev)
4903 struct device_domain_info *info;
4904 unsigned long flags;
4906 spin_lock_irqsave(&device_domain_lock, flags);
4907 info = dev->archdata.iommu;
4908 __dmar_remove_one_dev_info(info);
4909 spin_unlock_irqrestore(&device_domain_lock, flags);
4912 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4914 int adjust_width;
4916 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
4917 domain_reserve_special_ranges(domain);
4919 /* calculate AGAW */
4920 domain->gaw = guest_width;
4921 adjust_width = guestwidth_to_adjustwidth(guest_width);
4922 domain->agaw = width_to_agaw(adjust_width);
4924 domain->iommu_coherency = 0;
4925 domain->iommu_snooping = 0;
4926 domain->iommu_superpage = 0;
4927 domain->max_addr = 0;
4929 /* always allocate the top pgd */
4930 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4931 if (!domain->pgd)
4932 return -ENOMEM;
4933 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4934 return 0;
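/*
 * IOMMU-API entry point: allocate an UNMANAGED (virtual-machine style)
 * domain with the default address width and a fixed aperture geometry.
 */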
4937 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4939 struct dmar_domain *dmar_domain;
4940 struct iommu_domain *domain;
4942 if (type != IOMMU_DOMAIN_UNMANAGED)
4943 return NULL;
4945 dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4946 if (!dmar_domain) {
4947 pr_err("Can't allocate dmar_domain\n");
4948 return NULL;
4950 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4951 pr_err("Domain initialization failed\n");
4952 domain_exit(dmar_domain);
4953 return NULL;
4955 domain_update_iommu_cap(dmar_domain);
4957 domain = &dmar_domain->domain;
4958 domain->geometry.aperture_start = 0;
4959 domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4960 domain->geometry.force_aperture = true;
4962 return domain;
4965 static void intel_iommu_domain_free(struct iommu_domain *domain)
4967 domain_exit(to_dmar_domain(domain));
4970 static int intel_iommu_attach_device(struct iommu_domain *domain,
4971 struct device *dev)
4973 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4974 struct intel_iommu *iommu;
4975 int addr_width;
4976 u8 bus, devfn;
4978 if (device_is_rmrr_locked(dev)) {
4979 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
4980 return -EPERM;
4983 /* normally dev is not mapped */
4984 if (unlikely(domain_context_mapped(dev))) {
4985 struct dmar_domain *old_domain;
4987 old_domain = find_domain(dev);
4988 if (old_domain) {
4989 rcu_read_lock();
4990 dmar_remove_one_dev_info(old_domain, dev);
4991 rcu_read_unlock();
4993 if (!domain_type_is_vm_or_si(old_domain) &&
4994 list_empty(&old_domain->devices))
4995 domain_exit(old_domain);
4999 iommu = device_to_iommu(dev, &bus, &devfn);
5000 if (!iommu)
5001 return -ENODEV;
5003 /* check if this iommu agaw is sufficient for max mapped address */
5004 addr_width = agaw_to_width(iommu->agaw);
5005 if (addr_width > cap_mgaw(iommu->cap))
5006 addr_width = cap_mgaw(iommu->cap);
5008 if (dmar_domain->max_addr > (1LL << addr_width)) {
5009 pr_err("%s: iommu width (%d) is not "
5010 "sufficient for the mapped address (%llx)\n",
5011 __func__, addr_width, dmar_domain->max_addr);
5012 return -EFAULT;
5014 dmar_domain->gaw = addr_width;
	/*
	 * Knock out extra levels of page tables if necessary
	 */
5019 while (iommu->agaw < dmar_domain->agaw) {
5020 struct dma_pte *pte;
5022 pte = dmar_domain->pgd;
5023 if (dma_pte_present(pte)) {
5024 dmar_domain->pgd = (struct dma_pte *)
5025 phys_to_virt(dma_pte_addr(pte));
5026 free_pgtable_page(pte);
5028 dmar_domain->agaw--;
5031 return domain_add_dev_info(dmar_domain, dev);
5034 static void intel_iommu_detach_device(struct iommu_domain *domain,
5035 struct device *dev)
5037 dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
5040 static int intel_iommu_map(struct iommu_domain *domain,
5041 unsigned long iova, phys_addr_t hpa,
5042 size_t size, int iommu_prot)
5044 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5045 u64 max_addr;
5046 int prot = 0;
5047 int ret;
5049 if (iommu_prot & IOMMU_READ)
5050 prot |= DMA_PTE_READ;
5051 if (iommu_prot & IOMMU_WRITE)
5052 prot |= DMA_PTE_WRITE;
5053 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5054 prot |= DMA_PTE_SNP;
5056 max_addr = iova + size;
5057 if (dmar_domain->max_addr < max_addr) {
5058 u64 end;
5060 /* check if minimum agaw is sufficient for mapped address */
5061 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5062 if (end < max_addr) {
5063 pr_err("%s: iommu width (%d) is not "
5064 "sufficient for the mapped address (%llx)\n",
5065 __func__, dmar_domain->gaw, max_addr);
5066 return -EFAULT;
5068 dmar_domain->max_addr = max_addr;
5070 /* Round up size to next multiple of PAGE_SIZE, if it and
5071 the low bits of hpa would take us onto the next page */
5072 size = aligned_nrpages(hpa, size);
5073 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5074 hpa >> VTD_PAGE_SHIFT, size, prot);
5075 return ret;
5078 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5079 unsigned long iova, size_t size)
5081 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5082 struct page *freelist = NULL;
5083 unsigned long start_pfn, last_pfn;
5084 unsigned int npages;
5085 int iommu_id, level = 0;
5087 /* Cope with horrid API which requires us to unmap more than the
5088 size argument if it happens to be a large-page mapping. */
5089 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5091 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5092 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5094 start_pfn = iova >> VTD_PAGE_SHIFT;
5095 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5097 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5099 npages = last_pfn - start_pfn + 1;
5101 for_each_domain_iommu(iommu_id, dmar_domain)
5102 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5103 start_pfn, npages, !freelist, 0);
5105 dma_free_pagelist(freelist);
5107 if (dmar_domain->max_addr == iova + size)
5108 dmar_domain->max_addr = iova;
5110 return size;
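/*
 * Illustrative sketch (not part of this driver): how a caller reaches the
 * map/unmap callbacks above through the generic IOMMU API. The function
 * name, device pointer, IOVA and physical address are placeholders chosen
 * for the example only.
 */
static int __maybe_unused example_map_one_page(struct device *dev,
					       phys_addr_t paddr)
{
	struct iommu_domain *domain;
	unsigned long iova = 0x100000;	/* arbitrary example IOVA */
	int ret;

	domain = iommu_domain_alloc(&pci_bus_type);	/* -> intel_iommu_domain_alloc() */
	if (!domain)
		return -ENOMEM;

	ret = iommu_attach_device(domain, dev);		/* -> intel_iommu_attach_device() */
	if (ret)
		goto out_free;

	ret = iommu_map(domain, iova, paddr, VTD_PAGE_SIZE,
			IOMMU_READ | IOMMU_WRITE);	/* -> intel_iommu_map() */
	if (ret)
		goto out_detach;

	iommu_unmap(domain, iova, VTD_PAGE_SIZE);	/* -> intel_iommu_unmap() */

out_detach:
	iommu_detach_device(domain, dev);
out_free:
	iommu_domain_free(domain);
	return ret;
}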
5113 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5114 dma_addr_t iova)
5116 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5117 struct dma_pte *pte;
5118 int level = 0;
5119 u64 phys = 0;
5121 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5122 if (pte)
5123 phys = dma_pte_addr(pte);
5125 return phys;
5128 static bool intel_iommu_capable(enum iommu_cap cap)
5130 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5131 return domain_update_iommu_snooping(NULL) == 1;
5132 if (cap == IOMMU_CAP_INTR_REMAP)
5133 return irq_remapping_enabled == 1;
5135 return false;
5138 static int intel_iommu_add_device(struct device *dev)
5140 struct intel_iommu *iommu;
5141 struct iommu_group *group;
5142 u8 bus, devfn;
5144 iommu = device_to_iommu(dev, &bus, &devfn);
5145 if (!iommu)
5146 return -ENODEV;
5148 iommu_device_link(&iommu->iommu, dev);
5150 group = iommu_group_get_for_dev(dev);
5152 if (IS_ERR(group))
5153 return PTR_ERR(group);
5155 iommu_group_put(group);
5156 return 0;
5159 static void intel_iommu_remove_device(struct device *dev)
5161 struct intel_iommu *iommu;
5162 u8 bus, devfn;
5164 iommu = device_to_iommu(dev, &bus, &devfn);
5165 if (!iommu)
5166 return;
5168 iommu_group_remove_device(dev);
5170 iommu_device_unlink(&iommu->iommu, dev);
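/*
 * Report the reserved regions for a device: any RMRR range whose device
 * scope names this device, plus the IOAPIC/MSI window that must never be
 * used for ordinary DMA.
 */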
5173 static void intel_iommu_get_resv_regions(struct device *device,
5174 struct list_head *head)
5176 struct iommu_resv_region *reg;
5177 struct dmar_rmrr_unit *rmrr;
5178 struct device *i_dev;
5179 int i;
5181 rcu_read_lock();
5182 for_each_rmrr_units(rmrr) {
5183 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5184 i, i_dev) {
5185 if (i_dev != device)
5186 continue;
5188 list_add_tail(&rmrr->resv->list, head);
5191 rcu_read_unlock();
5193 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5194 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5195 0, IOMMU_RESV_MSI);
5196 if (!reg)
5197 return;
5198 list_add_tail(&reg->list, head);
5201 static void intel_iommu_put_resv_regions(struct device *dev,
5202 struct list_head *head)
5204 struct iommu_resv_region *entry, *next;
5206 list_for_each_entry_safe(entry, next, head, list) {
5207 if (entry->type == IOMMU_RESV_RESERVED)
5208 kfree(entry);
5212 #ifdef CONFIG_INTEL_IOMMU_SVM
5213 #define MAX_NR_PASID_BITS (20)
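/*
 * Compute the PASID Table Size (PTS) field for a context entry; the
 * hardware interprets the field as a table of 2^(PTS + 5) entries, so
 * anything below 32 supported PASIDs is reported as 0.
 */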
static inline unsigned long intel_iommu_get_pts(struct device *dev)
{
	int pts, max_pasid;

	max_pasid = intel_pasid_get_dev_max_id(dev);
	pts = find_first_bit((unsigned long *)&max_pasid, MAX_NR_PASID_BITS);
	if (pts < 5)
		return 0;

	return pts - 5;
}
5226 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
5228 struct device_domain_info *info;
5229 struct context_entry *context;
5230 struct dmar_domain *domain;
5231 unsigned long flags;
5232 u64 ctx_lo;
5233 int ret;
5235 domain = get_valid_domain_for_dev(sdev->dev);
5236 if (!domain)
5237 return -EINVAL;
5239 spin_lock_irqsave(&device_domain_lock, flags);
5240 spin_lock(&iommu->lock);
5242 ret = -EINVAL;
5243 info = sdev->dev->archdata.iommu;
5244 if (!info || !info->pasid_supported)
5245 goto out;
5247 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5248 if (WARN_ON(!context))
5249 goto out;
5251 ctx_lo = context[0].lo;
5253 sdev->did = domain->iommu_did[iommu->seq_id];
5254 sdev->sid = PCI_DEVID(info->bus, info->devfn);
5256 if (!(ctx_lo & CONTEXT_PASIDE)) {
5257 if (iommu->pasid_state_table)
5258 context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
5259 context[1].lo = (u64)virt_to_phys(info->pasid_table->table) |
5260 intel_iommu_get_pts(sdev->dev);
5262 wmb();
5263 /* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
5264 * extended to permit requests-with-PASID if the PASIDE bit
5265 * is set. which makes sense. For CONTEXT_TT_PASS_THROUGH,
5266 * however, the PASIDE bit is ignored and requests-with-PASID
5267 * are unconditionally blocked. Which makes less sense.
5268 * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
5269 * "guest mode" translation types depending on whether ATS
5270 * is available or not. Annoyingly, we can't use the new
5271 * modes *unless* PASIDE is set. */
5272 if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
5273 ctx_lo &= ~CONTEXT_TT_MASK;
5274 if (info->ats_supported)
5275 ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
5276 else
5277 ctx_lo |= CONTEXT_TT_PT_PASID << 2;
5279 ctx_lo |= CONTEXT_PASIDE;
5280 if (iommu->pasid_state_table)
5281 ctx_lo |= CONTEXT_DINVE;
5282 if (info->pri_supported)
5283 ctx_lo |= CONTEXT_PRS;
5284 context[0].lo = ctx_lo;
5285 wmb();
5286 iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
5287 DMA_CCMD_MASK_NOBIT,
5288 DMA_CCMD_DEVICE_INVL);
5291 /* Enable PASID support in the device, if it wasn't already */
5292 if (!info->pasid_enabled)
5293 iommu_enable_dev_iotlb(info);
5295 if (info->ats_enabled) {
5296 sdev->dev_iotlb = 1;
5297 sdev->qdep = info->ats_qdep;
5298 if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
5299 sdev->qdep = 0;
5301 ret = 0;
5303 out:
5304 spin_unlock(&iommu->lock);
5305 spin_unlock_irqrestore(&device_domain_lock, flags);
5307 return ret;
5310 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5312 struct intel_iommu *iommu;
5313 u8 bus, devfn;
5315 if (iommu_dummy(dev)) {
5316 dev_warn(dev,
5317 "No IOMMU translation for device; cannot enable SVM\n");
5318 return NULL;
5321 iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu) {
5323 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5324 return NULL;
5327 return iommu;
5329 #endif /* CONFIG_INTEL_IOMMU_SVM */
const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_free		= intel_iommu_domain_free,
	.attach_dev		= intel_iommu_attach_device,
	.detach_dev		= intel_iommu_detach_device,
	.map			= intel_iommu_map,
	.unmap			= intel_iommu_unmap,
	.iova_to_phys		= intel_iommu_iova_to_phys,
	.add_device		= intel_iommu_add_device,
	.remove_device		= intel_iommu_remove_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.put_resv_regions	= intel_iommu_put_resv_regions,
	.device_group		= pci_device_group,
	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
};
static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
{
	/* G4x/GM45 integrated gfx dmar support is totally busted. */
	pr_info("Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}
5355 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5356 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5357 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5358 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5359 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5360 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5361 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pr_info("Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}
5373 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5374 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5375 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5376 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5377 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5378 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5379 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5381 #define GGC 0x52
5382 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
5383 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
5384 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
5385 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
5386 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
5387 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
5388 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
5389 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pr_info("Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
5407 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5408 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5409 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5410 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5412 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5413 ISOCH DMAR unit for the Azalia sound device, but not give it any
5414 TLB entries, which causes it to deadlock. Check for that. We do
5415 this in a function called from init_dmars(), instead of in a PCI
5416 quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
5419 static void __init check_tylersburg_isoch(void)
5421 struct pci_dev *pdev;
5422 uint32_t vtisochctrl;
5424 /* If there's no Azalia in the system anyway, forget it. */
5425 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5426 if (!pdev)
5427 return;
5428 pci_dev_put(pdev);
5430 /* System Management Registers. Might be hidden, in which case
5431 we can't do the sanity check. But that's OK, because the
5432 known-broken BIOSes _don't_ actually hide it, so far. */
5433 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5434 if (!pdev)
5435 return;
5437 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5438 pci_dev_put(pdev);
5439 return;
5442 pci_dev_put(pdev);
5444 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5445 if (vtisochctrl & 1)
5446 return;
5448 /* Drop all bits other than the number of TLB entries */
5449 vtisochctrl &= 0x1c;
5451 /* If we have the recommended number of TLB entries (16), fine. */
5452 if (vtisochctrl == 0x10)
5453 return;
	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
	       vtisochctrl);
}