drivers/iommu/intel-iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright © 2006-2014 Intel Corporation.
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
53 #define ROOT_SIZE VTD_PAGE_SIZE
54 #define CONTEXT_SIZE VTD_PAGE_SIZE
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61 #define IOAPIC_RANGE_START (0xfee00000)
62 #define IOAPIC_RANGE_END (0xfeefffff)
63 #define IOVA_START_ADDR (0x1000)
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN (1)
82 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
84 /* page table handling */
85 #define LEVEL_STRIDE (9)
86 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
89 * This bitmap is used to advertise the page sizes our hardware supports
90 * to the IOMMU core, which will then use this information to split
91 * physically contiguous memory regions it is mapping into page sizes
92 * that we support.
94 * Traditionally the IOMMU core just handed us the mappings directly,
95 * after making sure the size is an order of a 4KiB page and that the
96 * mapping has natural alignment.
98 * To retain this behavior, we currently advertise that we support
99 * all page sizes that are an order of 4KiB.
101 * If at some point we'd like to utilize the IOMMU core's new behavior,
102 * we could change this to advertise the real page sizes we support.
104 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
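/*
 * Illustrative note (editor's addition, not part of the upstream file):
 * ~0xFFFUL is a page-size bitmap in which bit N set means "2^N bytes is a
 * supported page size". Clearing only the low 12 bits therefore advertises
 * every power-of-two size from 4KiB upward, e.g. bit 12 (4KiB), bit 21
 * (2MiB) and bit 30 (1GiB) are all set, matching the comment above.
 */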
106 static inline int agaw_to_level(int agaw)
108 return agaw + 2;
111 static inline int agaw_to_width(int agaw)
113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
116 static inline int width_to_agaw(int width)
118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
121 static inline unsigned int level_to_offset_bits(int level)
123 return (level - 1) * LEVEL_STRIDE;
126 static inline int pfn_level_offset(u64 pfn, int level)
128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
131 static inline u64 level_mask(int level)
133 return -1ULL << level_to_offset_bits(level);
136 static inline u64 level_size(int level)
138 return 1ULL << level_to_offset_bits(level);
141 static inline u64 align_to_level(u64 pfn, int level)
143 return (pfn + level_size(level) - 1) & level_mask(level);
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
148 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
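/*
 * Worked example (editor's addition, not part of the upstream file), using
 * only the helpers above: for a 48-bit address width,
 *	width_to_agaw(48)	= DIV_ROUND_UP(48 - 30, 9) = 2
 *	agaw_to_level(2)	= 4	(4-level page table)
 *	agaw_to_width(2)	= 48
 *	level_to_offset_bits(2)	= 9, so pfn_level_offset(pfn, 2) picks
 *				  bits 9..17 of the page frame number
 *	level_size(2)		= 512 pages = 2MiB of IOVA space
 *	lvl_to_nr_pages(2)	= 512
 */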
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152 are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
164 return mm_to_dma_pfn(page_to_pfn(pg));
166 static inline unsigned long virt_to_dma_pfn(void *p)
168 return page_to_dma_pfn(virt_to_page(p));
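/*
 * Editor's note (not part of the upstream file): with the usual 4KiB CPU
 * pages PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so dma_to_mm_pfn() and
 * mm_to_dma_pfn() shift by zero and are identity conversions; they only
 * matter when the CPU page size is larger than the 4KiB VT-d page size.
 */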
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
178 * set to 1 to panic the kernel if we can't successfully enable VT-d
179 * (used when kernel is launched w/ TXT)
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189 * if marked present.
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
193 if (!(re->lo & 1))
194 return 0;
196 return re->lo & VTD_PAGE_MASK;
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201 * if marked present.
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
205 if (!(re->hi & 1))
206 return 0;
208 return re->hi & VTD_PAGE_MASK;
211 static inline void context_clear_pasid_enable(struct context_entry *context)
213 context->lo &= ~(1ULL << 11);
216 static inline bool context_pasid_enabled(struct context_entry *context)
218 return !!(context->lo & (1ULL << 11));
221 static inline void context_set_copied(struct context_entry *context)
223 context->hi |= (1ull << 3);
226 static inline bool context_copied(struct context_entry *context)
228 return !!(context->hi & (1ULL << 3));
231 static inline bool __context_present(struct context_entry *context)
233 return (context->lo & 1);
236 bool context_present(struct context_entry *context)
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
243 static inline void context_set_present(struct context_entry *context)
245 context->lo |= 1;
248 static inline void context_set_fault_enable(struct context_entry *context)
250 context->lo &= (((u64)-1) << 2) | 1;
253 static inline void context_set_translation_type(struct context_entry *context,
254 unsigned long value)
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
260 static inline void context_set_address_root(struct context_entry *context,
261 unsigned long value)
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
267 static inline void context_set_address_width(struct context_entry *context,
268 unsigned long value)
270 context->hi |= value & 7;
273 static inline void context_set_domain_id(struct context_entry *context,
274 unsigned long value)
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
279 static inline int context_domain_id(struct context_entry *c)
281 return((c->hi >> 8) & 0xffff);
284 static inline void context_clear_entry(struct context_entry *context)
286 context->lo = 0;
287 context->hi = 0;
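/*
 * Editor's summary (not part of the upstream file) of the legacy context
 * entry layout as used by the helpers above:
 *	lo bit  0	present
 *	lo bit  1	fault-processing disable (cleared by
 *			context_set_fault_enable())
 *	lo bits 2-3	translation type
 *	lo bit 11	PASID enable
 *	lo bits 12-63	second-level page-table pointer (ASR)
 *	hi bits 0-2	address width (AGAW)
 *	hi bit  3	software "copied" flag used for kdump
 *	hi bits 8-23	domain id
 */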
291 * This domain is a statically identity mapping domain.
292 * 1. This domain creates a static 1:1 mapping to all usable memory.
293 * 2. It maps to each iommu if successful.
294 * 3. Each iommu maps to this domain if successful.
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
299 /* si_domain contains multiple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY BIT(0)
303 * This is a DMA domain allocated through the iommu domain allocation
304 * interface. But one or more devices belonging to this domain have
305 * been chosen to use a private domain. We should avoid using the
306 * map/unmap/iova_to_phys APIs on it.
308 #define DOMAIN_FLAG_LOSE_CHILDREN BIT(1)
310 #define for_each_domain_iommu(idx, domain) \
311 for (idx = 0; idx < g_num_of_iommus; idx++) \
312 if (domain->iommu_refcnt[idx])
314 struct dmar_rmrr_unit {
315 struct list_head list; /* list of rmrr units */
316 struct acpi_dmar_header *hdr; /* ACPI header */
317 u64 base_address; /* reserved base address*/
318 u64 end_address; /* reserved end address */
319 struct dmar_dev_scope *devices; /* target devices */
320 int devices_cnt; /* target device count */
323 struct dmar_atsr_unit {
324 struct list_head list; /* list of ATSR units */
325 struct acpi_dmar_header *hdr; /* ACPI header */
326 struct dmar_dev_scope *devices; /* target devices */
327 int devices_cnt; /* target device count */
328 u8 include_all:1; /* include all ports */
331 static LIST_HEAD(dmar_atsr_units);
332 static LIST_HEAD(dmar_rmrr_units);
334 #define for_each_rmrr_units(rmrr) \
335 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
337 /* bitmap for indexing intel_iommus */
338 static int g_num_of_iommus;
340 static void domain_exit(struct dmar_domain *domain);
341 static void domain_remove_dev_info(struct dmar_domain *domain);
342 static void dmar_remove_one_dev_info(struct device *dev);
343 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
344 static void domain_context_clear(struct intel_iommu *iommu,
345 struct device *dev);
346 static int domain_detach_iommu(struct dmar_domain *domain,
347 struct intel_iommu *iommu);
348 static bool device_is_rmrr_locked(struct device *dev);
349 static int intel_iommu_attach_device(struct iommu_domain *domain,
350 struct device *dev);
351 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
352 dma_addr_t iova);
354 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
355 int dmar_disabled = 0;
356 #else
357 int dmar_disabled = 1;
358 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
360 int intel_iommu_sm;
361 int intel_iommu_enabled = 0;
362 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
364 static int dmar_map_gfx = 1;
365 static int dmar_forcedac;
366 static int intel_iommu_strict;
367 static int intel_iommu_superpage = 1;
368 static int iommu_identity_mapping;
369 static int intel_no_bounce;
371 #define IDENTMAP_ALL 1
372 #define IDENTMAP_GFX 2
373 #define IDENTMAP_AZALIA 4
375 int intel_iommu_gfx_mapped;
376 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
378 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
379 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
380 static DEFINE_SPINLOCK(device_domain_lock);
381 static LIST_HEAD(device_domain_list);
383 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
384 to_pci_dev(d)->untrusted)
387 * Iterate over elements in device_domain_list and call the specified
388 * callback @fn against each element.
390 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
391 void *data), void *data)
393 int ret = 0;
394 unsigned long flags;
395 struct device_domain_info *info;
397 spin_lock_irqsave(&device_domain_lock, flags);
398 list_for_each_entry(info, &device_domain_list, global) {
399 ret = fn(info, data);
400 if (ret) {
401 spin_unlock_irqrestore(&device_domain_lock, flags);
402 return ret;
405 spin_unlock_irqrestore(&device_domain_lock, flags);
407 return 0;
410 const struct iommu_ops intel_iommu_ops;
412 static bool translation_pre_enabled(struct intel_iommu *iommu)
414 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
417 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
419 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
422 static void init_translation_status(struct intel_iommu *iommu)
424 u32 gsts;
426 gsts = readl(iommu->reg + DMAR_GSTS_REG);
427 if (gsts & DMA_GSTS_TES)
428 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
431 /* Convert generic 'struct iommu_domain' to private struct dmar_domain */
432 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
434 return container_of(dom, struct dmar_domain, domain);
437 static int __init intel_iommu_setup(char *str)
439 if (!str)
440 return -EINVAL;
441 while (*str) {
442 if (!strncmp(str, "on", 2)) {
443 dmar_disabled = 0;
444 pr_info("IOMMU enabled\n");
445 } else if (!strncmp(str, "off", 3)) {
446 dmar_disabled = 1;
447 no_platform_optin = 1;
448 pr_info("IOMMU disabled\n");
449 } else if (!strncmp(str, "igfx_off", 8)) {
450 dmar_map_gfx = 0;
451 pr_info("Disable GFX device mapping\n");
452 } else if (!strncmp(str, "forcedac", 8)) {
453 pr_info("Forcing DAC for PCI devices\n");
454 dmar_forcedac = 1;
455 } else if (!strncmp(str, "strict", 6)) {
456 pr_info("Disable batched IOTLB flush\n");
457 intel_iommu_strict = 1;
458 } else if (!strncmp(str, "sp_off", 6)) {
459 pr_info("Disable supported super page\n");
460 intel_iommu_superpage = 0;
461 } else if (!strncmp(str, "sm_on", 5)) {
462 pr_info("Intel-IOMMU: scalable mode supported\n");
463 intel_iommu_sm = 1;
464 } else if (!strncmp(str, "tboot_noforce", 13)) {
465 printk(KERN_INFO
466 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
467 intel_iommu_tboot_noforce = 1;
468 } else if (!strncmp(str, "nobounce", 8)) {
469 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
470 intel_no_bounce = 1;
473 str += strcspn(str, ",");
474 while (*str == ',')
475 str++;
477 return 0;
479 __setup("intel_iommu=", intel_iommu_setup);
481 static struct kmem_cache *iommu_domain_cache;
482 static struct kmem_cache *iommu_devinfo_cache;
484 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
486 struct dmar_domain **domains;
487 int idx = did >> 8;
489 domains = iommu->domains[idx];
490 if (!domains)
491 return NULL;
493 return domains[did & 0xff];
496 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
497 struct dmar_domain *domain)
499 struct dmar_domain **domains;
500 int idx = did >> 8;
502 if (!iommu->domains[idx]) {
503 size_t size = 256 * sizeof(struct dmar_domain *);
504 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
507 domains = iommu->domains[idx];
508 if (WARN_ON(!domains))
509 return;
510 else
511 domains[did & 0xff] = domain;
514 void *alloc_pgtable_page(int node)
516 struct page *page;
517 void *vaddr = NULL;
519 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
520 if (page)
521 vaddr = page_address(page);
522 return vaddr;
525 void free_pgtable_page(void *vaddr)
527 free_page((unsigned long)vaddr);
530 static inline void *alloc_domain_mem(void)
532 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
535 static void free_domain_mem(void *vaddr)
537 kmem_cache_free(iommu_domain_cache, vaddr);
540 static inline void * alloc_devinfo_mem(void)
542 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
545 static inline void free_devinfo_mem(void *vaddr)
547 kmem_cache_free(iommu_devinfo_cache, vaddr);
550 static inline int domain_type_is_si(struct dmar_domain *domain)
552 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
555 static inline int domain_pfn_supported(struct dmar_domain *domain,
556 unsigned long pfn)
558 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
560 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
563 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
565 unsigned long sagaw;
566 int agaw = -1;
568 sagaw = cap_sagaw(iommu->cap);
569 for (agaw = width_to_agaw(max_gaw);
570 agaw >= 0; agaw--) {
571 if (test_bit(agaw, &sagaw))
572 break;
575 return agaw;
579 * Calculate max SAGAW for each iommu.
581 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
583 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
587 * calculate agaw for each iommu.
588 * "SAGAW" may be different across iommus, use a default agaw, and
589 * fall back to a smaller supported agaw for iommus that don't support the default agaw.
591 int iommu_calculate_agaw(struct intel_iommu *iommu)
593 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
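/*
 * Worked example (editor's addition, not part of the upstream file):
 * cap_sagaw() yields a bitmap of supported AGAW values. Starting from
 * width_to_agaw(57) == 3 (5-level), __iommu_calculate_agaw() walks
 * downwards until it hits a bit that is set; on hardware that only sets
 * bit 2 in SAGAW it returns agaw 2, i.e. a 4-level, 48-bit table.
 */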
596 /* This function only returns a single iommu in a domain */
597 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
599 int iommu_id;
601 /* si_domain and vm domain should not get here. */
602 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
603 return NULL;
605 for_each_domain_iommu(iommu_id, domain)
606 break;
608 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
609 return NULL;
611 return g_iommus[iommu_id];
614 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
616 return sm_supported(iommu) ?
617 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
620 static void domain_update_iommu_coherency(struct dmar_domain *domain)
622 struct dmar_drhd_unit *drhd;
623 struct intel_iommu *iommu;
624 bool found = false;
625 int i;
627 domain->iommu_coherency = 1;
629 for_each_domain_iommu(i, domain) {
630 found = true;
631 if (!iommu_paging_structure_coherency(g_iommus[i])) {
632 domain->iommu_coherency = 0;
633 break;
636 if (found)
637 return;
639 /* No hardware attached; use lowest common denominator */
640 rcu_read_lock();
641 for_each_active_iommu(iommu, drhd) {
642 if (!iommu_paging_structure_coherency(iommu)) {
643 domain->iommu_coherency = 0;
644 break;
647 rcu_read_unlock();
650 static int domain_update_iommu_snooping(struct intel_iommu *skip)
652 struct dmar_drhd_unit *drhd;
653 struct intel_iommu *iommu;
654 int ret = 1;
656 rcu_read_lock();
657 for_each_active_iommu(iommu, drhd) {
658 if (iommu != skip) {
659 if (!ecap_sc_support(iommu->ecap)) {
660 ret = 0;
661 break;
665 rcu_read_unlock();
667 return ret;
670 static int domain_update_iommu_superpage(struct intel_iommu *skip)
672 struct dmar_drhd_unit *drhd;
673 struct intel_iommu *iommu;
674 int mask = 0xf;
676 if (!intel_iommu_superpage) {
677 return 0;
680 /* set iommu_superpage to the smallest common denominator */
681 rcu_read_lock();
682 for_each_active_iommu(iommu, drhd) {
683 if (iommu != skip) {
684 mask &= cap_super_page_val(iommu->cap);
685 if (!mask)
686 break;
689 rcu_read_unlock();
691 return fls(mask);
694 /* Some capabilities may be different across iommus */
695 static void domain_update_iommu_cap(struct dmar_domain *domain)
697 domain_update_iommu_coherency(domain);
698 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
699 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
702 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
703 u8 devfn, int alloc)
705 struct root_entry *root = &iommu->root_entry[bus];
706 struct context_entry *context;
707 u64 *entry;
709 entry = &root->lo;
710 if (sm_supported(iommu)) {
711 if (devfn >= 0x80) {
712 devfn -= 0x80;
713 entry = &root->hi;
715 devfn *= 2;
717 if (*entry & 1)
718 context = phys_to_virt(*entry & VTD_PAGE_MASK);
719 else {
720 unsigned long phy_addr;
721 if (!alloc)
722 return NULL;
724 context = alloc_pgtable_page(iommu->node);
725 if (!context)
726 return NULL;
728 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
729 phy_addr = virt_to_phys((void *)context);
730 *entry = phy_addr | 1;
731 __iommu_flush_cache(iommu, entry, sizeof(*entry));
733 return &context[devfn];
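/*
 * Editor's note (not part of the upstream file): in scalable mode a root
 * entry is split in two halves; root->lo points at the context table for
 * devfn 0x00-0x7f and root->hi at the table for devfn 0x80-0xff, which is
 * why the devfn is rebased above. Scalable-mode context entries are twice
 * as wide as legacy ones, hence the devfn *= 2 before indexing.
 */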
736 static int iommu_dummy(struct device *dev)
738 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
742 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
743 * sub-hierarchy of a candidate PCI-PCI bridge
744 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
745 * @bridge: the candidate PCI-PCI bridge
747 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
749 static bool
750 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
752 struct pci_dev *pdev, *pbridge;
754 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
755 return false;
757 pdev = to_pci_dev(dev);
758 pbridge = to_pci_dev(bridge);
760 if (pbridge->subordinate &&
761 pbridge->subordinate->number <= pdev->bus->number &&
762 pbridge->subordinate->busn_res.end >= pdev->bus->number)
763 return true;
765 return false;
768 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
770 struct dmar_drhd_unit *drhd = NULL;
771 struct intel_iommu *iommu;
772 struct device *tmp;
773 struct pci_dev *pdev = NULL;
774 u16 segment = 0;
775 int i;
777 if (iommu_dummy(dev))
778 return NULL;
780 if (dev_is_pci(dev)) {
781 struct pci_dev *pf_pdev;
783 pdev = to_pci_dev(dev);
785 #ifdef CONFIG_X86
786 /* VMD child devices currently cannot be handled individually */
787 if (is_vmd(pdev->bus))
788 return NULL;
789 #endif
791 /* VFs aren't listed in scope tables; we need to look up
792 * the PF instead to find the IOMMU. */
793 pf_pdev = pci_physfn(pdev);
794 dev = &pf_pdev->dev;
795 segment = pci_domain_nr(pdev->bus);
796 } else if (has_acpi_companion(dev))
797 dev = &ACPI_COMPANION(dev)->dev;
799 rcu_read_lock();
800 for_each_active_iommu(iommu, drhd) {
801 if (pdev && segment != drhd->segment)
802 continue;
804 for_each_active_dev_scope(drhd->devices,
805 drhd->devices_cnt, i, tmp) {
806 if (tmp == dev) {
807 /* For a VF use its original BDF# not that of the PF
808 * which we used for the IOMMU lookup. Strictly speaking
809 * we could do this for all PCI devices; we only need to
810 * get the BDF# from the scope table for ACPI matches. */
811 if (pdev && pdev->is_virtfn)
812 goto got_pdev;
814 *bus = drhd->devices[i].bus;
815 *devfn = drhd->devices[i].devfn;
816 goto out;
819 if (is_downstream_to_pci_bridge(dev, tmp))
820 goto got_pdev;
823 if (pdev && drhd->include_all) {
824 got_pdev:
825 *bus = pdev->bus->number;
826 *devfn = pdev->devfn;
827 goto out;
830 iommu = NULL;
831 out:
832 rcu_read_unlock();
834 return iommu;
837 static void domain_flush_cache(struct dmar_domain *domain,
838 void *addr, int size)
840 if (!domain->iommu_coherency)
841 clflush_cache_range(addr, size);
844 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
846 struct context_entry *context;
847 int ret = 0;
848 unsigned long flags;
850 spin_lock_irqsave(&iommu->lock, flags);
851 context = iommu_context_addr(iommu, bus, devfn, 0);
852 if (context)
853 ret = context_present(context);
854 spin_unlock_irqrestore(&iommu->lock, flags);
855 return ret;
858 static void free_context_table(struct intel_iommu *iommu)
860 int i;
861 unsigned long flags;
862 struct context_entry *context;
864 spin_lock_irqsave(&iommu->lock, flags);
865 if (!iommu->root_entry) {
866 goto out;
868 for (i = 0; i < ROOT_ENTRY_NR; i++) {
869 context = iommu_context_addr(iommu, i, 0, 0);
870 if (context)
871 free_pgtable_page(context);
873 if (!sm_supported(iommu))
874 continue;
876 context = iommu_context_addr(iommu, i, 0x80, 0);
877 if (context)
878 free_pgtable_page(context);
881 free_pgtable_page(iommu->root_entry);
882 iommu->root_entry = NULL;
883 out:
884 spin_unlock_irqrestore(&iommu->lock, flags);
887 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
888 unsigned long pfn, int *target_level)
890 struct dma_pte *parent, *pte;
891 int level = agaw_to_level(domain->agaw);
892 int offset;
894 BUG_ON(!domain->pgd);
896 if (!domain_pfn_supported(domain, pfn))
897 /* Address beyond IOMMU's addressing capabilities. */
898 return NULL;
900 parent = domain->pgd;
902 while (1) {
903 void *tmp_page;
905 offset = pfn_level_offset(pfn, level);
906 pte = &parent[offset];
907 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
908 break;
909 if (level == *target_level)
910 break;
912 if (!dma_pte_present(pte)) {
913 uint64_t pteval;
915 tmp_page = alloc_pgtable_page(domain->nid);
917 if (!tmp_page)
918 return NULL;
920 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
921 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
922 if (cmpxchg64(&pte->val, 0ULL, pteval))
923 /* Someone else set it while we were thinking; use theirs. */
924 free_pgtable_page(tmp_page);
925 else
926 domain_flush_cache(domain, pte, sizeof(*pte));
928 if (level == 1)
929 break;
931 parent = phys_to_virt(dma_pte_addr(pte));
932 level--;
935 if (!*target_level)
936 *target_level = level;
938 return pte;
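/*
 * Editor's note (not part of the upstream file): the walk above descends
 * from the domain's pgd, allocating any missing level on the way down.
 * cmpxchg64() installs the new table only if the slot is still empty, so a
 * racing walker's table wins and ours is freed. Passing *target_level == 0
 * means "stop at the first superpage or non-present entry"; on return it
 * holds the level that was actually reached.
 */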
941 /* return address's pte at specific level */
942 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
943 unsigned long pfn,
944 int level, int *large_page)
946 struct dma_pte *parent, *pte;
947 int total = agaw_to_level(domain->agaw);
948 int offset;
950 parent = domain->pgd;
951 while (level <= total) {
952 offset = pfn_level_offset(pfn, total);
953 pte = &parent[offset];
954 if (level == total)
955 return pte;
957 if (!dma_pte_present(pte)) {
958 *large_page = total;
959 break;
962 if (dma_pte_superpage(pte)) {
963 *large_page = total;
964 return pte;
967 parent = phys_to_virt(dma_pte_addr(pte));
968 total--;
970 return NULL;
973 /* clear last level pte, a tlb flush should follow */
974 static void dma_pte_clear_range(struct dmar_domain *domain,
975 unsigned long start_pfn,
976 unsigned long last_pfn)
978 unsigned int large_page;
979 struct dma_pte *first_pte, *pte;
981 BUG_ON(!domain_pfn_supported(domain, start_pfn));
982 BUG_ON(!domain_pfn_supported(domain, last_pfn));
983 BUG_ON(start_pfn > last_pfn);
985 /* we don't need lock here; nobody else touches the iova range */
986 do {
987 large_page = 1;
988 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
989 if (!pte) {
990 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
991 continue;
993 do {
994 dma_clear_pte(pte);
995 start_pfn += lvl_to_nr_pages(large_page);
996 pte++;
997 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
999 domain_flush_cache(domain, first_pte,
1000 (void *)pte - (void *)first_pte);
1002 } while (start_pfn && start_pfn <= last_pfn);
1005 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1006 int retain_level, struct dma_pte *pte,
1007 unsigned long pfn, unsigned long start_pfn,
1008 unsigned long last_pfn)
1010 pfn = max(start_pfn, pfn);
1011 pte = &pte[pfn_level_offset(pfn, level)];
1013 do {
1014 unsigned long level_pfn;
1015 struct dma_pte *level_pte;
1017 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1018 goto next;
1020 level_pfn = pfn & level_mask(level);
1021 level_pte = phys_to_virt(dma_pte_addr(pte));
1023 if (level > 2) {
1024 dma_pte_free_level(domain, level - 1, retain_level,
1025 level_pte, level_pfn, start_pfn,
1026 last_pfn);
1030 * Free the page table if we're below the level we want to
1031 * retain and the range covers the entire table.
1033 if (level < retain_level && !(start_pfn > level_pfn ||
1034 last_pfn < level_pfn + level_size(level) - 1)) {
1035 dma_clear_pte(pte);
1036 domain_flush_cache(domain, pte, sizeof(*pte));
1037 free_pgtable_page(level_pte);
1039 next:
1040 pfn += level_size(level);
1041 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1045 * clear last level (leaf) ptes and free page table pages below the
1046 * level we wish to keep intact.
1048 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1049 unsigned long start_pfn,
1050 unsigned long last_pfn,
1051 int retain_level)
1053 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1054 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1055 BUG_ON(start_pfn > last_pfn);
1057 dma_pte_clear_range(domain, start_pfn, last_pfn);
1059 /* We don't need lock here; nobody else touches the iova range */
1060 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1061 domain->pgd, 0, start_pfn, last_pfn);
1063 /* free pgd */
1064 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1065 free_pgtable_page(domain->pgd);
1066 domain->pgd = NULL;
1070 /* When a page at a given level is being unlinked from its parent, we don't
1071 need to *modify* it at all. All we need to do is make a list of all the
1072 pages which can be freed just as soon as we've flushed the IOTLB and we
1073 know the hardware page-walk will no longer touch them.
1074 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1075 be freed. */
1076 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1077 int level, struct dma_pte *pte,
1078 struct page *freelist)
1080 struct page *pg;
1082 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1083 pg->freelist = freelist;
1084 freelist = pg;
1086 if (level == 1)
1087 return freelist;
1089 pte = page_address(pg);
1090 do {
1091 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1092 freelist = dma_pte_list_pagetables(domain, level - 1,
1093 pte, freelist);
1094 pte++;
1095 } while (!first_pte_in_page(pte));
1097 return freelist;
1100 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1101 struct dma_pte *pte, unsigned long pfn,
1102 unsigned long start_pfn,
1103 unsigned long last_pfn,
1104 struct page *freelist)
1106 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1108 pfn = max(start_pfn, pfn);
1109 pte = &pte[pfn_level_offset(pfn, level)];
1111 do {
1112 unsigned long level_pfn;
1114 if (!dma_pte_present(pte))
1115 goto next;
1117 level_pfn = pfn & level_mask(level);
1119 /* If range covers entire pagetable, free it */
1120 if (start_pfn <= level_pfn &&
1121 last_pfn >= level_pfn + level_size(level) - 1) {
1122 /* These subordinate page tables are going away entirely. Don't
1123 bother to clear them; we're just going to *free* them. */
1124 if (level > 1 && !dma_pte_superpage(pte))
1125 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1127 dma_clear_pte(pte);
1128 if (!first_pte)
1129 first_pte = pte;
1130 last_pte = pte;
1131 } else if (level > 1) {
1132 /* Recurse down into a level that isn't *entirely* obsolete */
1133 freelist = dma_pte_clear_level(domain, level - 1,
1134 phys_to_virt(dma_pte_addr(pte)),
1135 level_pfn, start_pfn, last_pfn,
1136 freelist);
1138 next:
1139 pfn += level_size(level);
1140 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1142 if (first_pte)
1143 domain_flush_cache(domain, first_pte,
1144 (void *)++last_pte - (void *)first_pte);
1146 return freelist;
1149 /* We can't just free the pages because the IOMMU may still be walking
1150 the page tables, and may have cached the intermediate levels. The
1151 pages can only be freed after the IOTLB flush has been done. */
1152 static struct page *domain_unmap(struct dmar_domain *domain,
1153 unsigned long start_pfn,
1154 unsigned long last_pfn)
1156 struct page *freelist;
1158 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1159 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1160 BUG_ON(start_pfn > last_pfn);
1162 /* we don't need lock here; nobody else touches the iova range */
1163 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1164 domain->pgd, 0, start_pfn, last_pfn, NULL);
1166 /* free pgd */
1167 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1168 struct page *pgd_page = virt_to_page(domain->pgd);
1169 pgd_page->freelist = freelist;
1170 freelist = pgd_page;
1172 domain->pgd = NULL;
1175 return freelist;
1178 static void dma_free_pagelist(struct page *freelist)
1180 struct page *pg;
1182 while ((pg = freelist)) {
1183 freelist = pg->freelist;
1184 free_pgtable_page(page_address(pg));
1188 static void iova_entry_free(unsigned long data)
1190 struct page *freelist = (struct page *)data;
1192 dma_free_pagelist(freelist);
1195 /* iommu handling */
1196 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1198 struct root_entry *root;
1199 unsigned long flags;
1201 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1202 if (!root) {
1203 pr_err("Allocating root entry for %s failed\n",
1204 iommu->name);
1205 return -ENOMEM;
1208 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1210 spin_lock_irqsave(&iommu->lock, flags);
1211 iommu->root_entry = root;
1212 spin_unlock_irqrestore(&iommu->lock, flags);
1214 return 0;
1217 static void iommu_set_root_entry(struct intel_iommu *iommu)
1219 u64 addr;
1220 u32 sts;
1221 unsigned long flag;
1223 addr = virt_to_phys(iommu->root_entry);
1224 if (sm_supported(iommu))
1225 addr |= DMA_RTADDR_SMT;
1227 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1228 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1230 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1232 /* Make sure hardware complete it */
1233 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1234 readl, (sts & DMA_GSTS_RTPS), sts);
1236 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1239 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1241 u32 val;
1242 unsigned long flag;
1244 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1245 return;
1247 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1248 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1250 /* Make sure hardware complete it */
1251 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1252 readl, (!(val & DMA_GSTS_WBFS)), val);
1254 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1257 /* return value determines if we need a write buffer flush */
1258 static void __iommu_flush_context(struct intel_iommu *iommu,
1259 u16 did, u16 source_id, u8 function_mask,
1260 u64 type)
1262 u64 val = 0;
1263 unsigned long flag;
1265 switch (type) {
1266 case DMA_CCMD_GLOBAL_INVL:
1267 val = DMA_CCMD_GLOBAL_INVL;
1268 break;
1269 case DMA_CCMD_DOMAIN_INVL:
1270 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1271 break;
1272 case DMA_CCMD_DEVICE_INVL:
1273 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1274 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1275 break;
1276 default:
1277 BUG();
1279 val |= DMA_CCMD_ICC;
1281 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1282 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1284 /* Make sure hardware complete it */
1285 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1286 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1288 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1291 /* return value determines if we need a write buffer flush */
1292 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1293 u64 addr, unsigned int size_order, u64 type)
1295 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1296 u64 val = 0, val_iva = 0;
1297 unsigned long flag;
1299 switch (type) {
1300 case DMA_TLB_GLOBAL_FLUSH:
1301 /* global flush doesn't need set IVA_REG */
1302 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1303 break;
1304 case DMA_TLB_DSI_FLUSH:
1305 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1306 break;
1307 case DMA_TLB_PSI_FLUSH:
1308 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1309 /* IH bit is passed in as part of address */
1310 val_iva = size_order | addr;
1311 break;
1312 default:
1313 BUG();
1315 /* Note: set drain read/write */
1316 #if 0
1318 * This is probably to be super secure. Looks like we can
1319 * ignore it without any impact.
1321 if (cap_read_drain(iommu->cap))
1322 val |= DMA_TLB_READ_DRAIN;
1323 #endif
1324 if (cap_write_drain(iommu->cap))
1325 val |= DMA_TLB_WRITE_DRAIN;
1327 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1328 /* Note: Only uses first TLB reg currently */
1329 if (val_iva)
1330 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1331 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1333 /* Make sure hardware complete it */
1334 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1335 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1337 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1339 /* check IOTLB invalidation granularity */
1340 if (DMA_TLB_IAIG(val) == 0)
1341 pr_err("Flush IOTLB failed\n");
1342 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1343 pr_debug("TLB flush request %Lx, actual %Lx\n",
1344 (unsigned long long)DMA_TLB_IIRG(type),
1345 (unsigned long long)DMA_TLB_IAIG(val));
1348 static struct device_domain_info *
1349 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1350 u8 bus, u8 devfn)
1352 struct device_domain_info *info;
1354 assert_spin_locked(&device_domain_lock);
1356 if (!iommu->qi)
1357 return NULL;
1359 list_for_each_entry(info, &domain->devices, link)
1360 if (info->iommu == iommu && info->bus == bus &&
1361 info->devfn == devfn) {
1362 if (info->ats_supported && info->dev)
1363 return info;
1364 break;
1367 return NULL;
1370 static void domain_update_iotlb(struct dmar_domain *domain)
1372 struct device_domain_info *info;
1373 bool has_iotlb_device = false;
1375 assert_spin_locked(&device_domain_lock);
1377 list_for_each_entry(info, &domain->devices, link) {
1378 struct pci_dev *pdev;
1380 if (!info->dev || !dev_is_pci(info->dev))
1381 continue;
1383 pdev = to_pci_dev(info->dev);
1384 if (pdev->ats_enabled) {
1385 has_iotlb_device = true;
1386 break;
1390 domain->has_iotlb_device = has_iotlb_device;
1393 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1395 struct pci_dev *pdev;
1397 assert_spin_locked(&device_domain_lock);
1399 if (!info || !dev_is_pci(info->dev))
1400 return;
1402 pdev = to_pci_dev(info->dev);
1403 /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1404 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1405 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1406 * reserved, which should be set to 0.
1408 if (!ecap_dit(info->iommu->ecap))
1409 info->pfsid = 0;
1410 else {
1411 struct pci_dev *pf_pdev;
1413 /* pdev will be returned if device is not a vf */
1414 pf_pdev = pci_physfn(pdev);
1415 info->pfsid = pci_dev_id(pf_pdev);
1418 #ifdef CONFIG_INTEL_IOMMU_SVM
1419 /* The PCIe spec, in its wisdom, declares that the behaviour of
1420 the device if you enable PASID support after ATS support is
1421 undefined. So always enable PASID support on devices which
1422 have it, even if we can't yet know if we're ever going to
1423 use it. */
1424 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1425 info->pasid_enabled = 1;
1427 if (info->pri_supported &&
1428 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1429 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1430 info->pri_enabled = 1;
1431 #endif
1432 if (!pdev->untrusted && info->ats_supported &&
1433 pci_ats_page_aligned(pdev) &&
1434 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1435 info->ats_enabled = 1;
1436 domain_update_iotlb(info->domain);
1437 info->ats_qdep = pci_ats_queue_depth(pdev);
1441 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1443 struct pci_dev *pdev;
1445 assert_spin_locked(&device_domain_lock);
1447 if (!dev_is_pci(info->dev))
1448 return;
1450 pdev = to_pci_dev(info->dev);
1452 if (info->ats_enabled) {
1453 pci_disable_ats(pdev);
1454 info->ats_enabled = 0;
1455 domain_update_iotlb(info->domain);
1457 #ifdef CONFIG_INTEL_IOMMU_SVM
1458 if (info->pri_enabled) {
1459 pci_disable_pri(pdev);
1460 info->pri_enabled = 0;
1462 if (info->pasid_enabled) {
1463 pci_disable_pasid(pdev);
1464 info->pasid_enabled = 0;
1466 #endif
1469 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1470 u64 addr, unsigned mask)
1472 u16 sid, qdep;
1473 unsigned long flags;
1474 struct device_domain_info *info;
1476 if (!domain->has_iotlb_device)
1477 return;
1479 spin_lock_irqsave(&device_domain_lock, flags);
1480 list_for_each_entry(info, &domain->devices, link) {
1481 if (!info->ats_enabled)
1482 continue;
1484 sid = info->bus << 8 | info->devfn;
1485 qdep = info->ats_qdep;
1486 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1487 qdep, addr, mask);
1489 spin_unlock_irqrestore(&device_domain_lock, flags);
1492 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1493 struct dmar_domain *domain,
1494 unsigned long pfn, unsigned int pages,
1495 int ih, int map)
1497 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1498 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1499 u16 did = domain->iommu_did[iommu->seq_id];
1501 BUG_ON(pages == 0);
1503 if (ih)
1504 ih = 1 << 6;
1506 * Fallback to domain selective flush if no PSI support or the size is
1507 * too big.
1508 * PSI requires page size to be 2 ^ x, and the base address is naturally
1509 * aligned to the size
1511 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1512 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1513 DMA_TLB_DSI_FLUSH);
1514 else
1515 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1516 DMA_TLB_PSI_FLUSH);
1519 * In caching mode, changes of pages from non-present to present require
1520 * flush. However, device IOTLB doesn't need to be flushed in this case.
1522 if (!cap_caching_mode(iommu->cap) || !map)
1523 iommu_flush_dev_iotlb(domain, addr, mask);
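/*
 * Worked example (editor's addition, not part of the upstream file):
 * for pages == 9, __roundup_pow_of_two(9) == 16 and mask == ilog2(16) == 4,
 * so the PSI covers 16 VT-d pages (64KiB) naturally aligned to that size.
 * If mask exceeds cap_max_amask_val() the code above falls back to a
 * domain-selective flush instead.
 */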
1526 /* Notification for newly created mappings */
1527 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1528 struct dmar_domain *domain,
1529 unsigned long pfn, unsigned int pages)
1531 /* It's a non-present to present mapping. Only flush if caching mode */
1532 if (cap_caching_mode(iommu->cap))
1533 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1534 else
1535 iommu_flush_write_buffer(iommu);
1538 static void iommu_flush_iova(struct iova_domain *iovad)
1540 struct dmar_domain *domain;
1541 int idx;
1543 domain = container_of(iovad, struct dmar_domain, iovad);
1545 for_each_domain_iommu(idx, domain) {
1546 struct intel_iommu *iommu = g_iommus[idx];
1547 u16 did = domain->iommu_did[iommu->seq_id];
1549 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1551 if (!cap_caching_mode(iommu->cap))
1552 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1553 0, MAX_AGAW_PFN_WIDTH);
1557 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1559 u32 pmen;
1560 unsigned long flags;
1562 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1563 return;
1565 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1566 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1567 pmen &= ~DMA_PMEN_EPM;
1568 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1570 /* wait for the protected region status bit to clear */
1571 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1572 readl, !(pmen & DMA_PMEN_PRS), pmen);
1574 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1577 static void iommu_enable_translation(struct intel_iommu *iommu)
1579 u32 sts;
1580 unsigned long flags;
1582 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1583 iommu->gcmd |= DMA_GCMD_TE;
1584 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1586 /* Make sure hardware complete it */
1587 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1588 readl, (sts & DMA_GSTS_TES), sts);
1590 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1593 static void iommu_disable_translation(struct intel_iommu *iommu)
1595 u32 sts;
1596 unsigned long flag;
1598 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1599 iommu->gcmd &= ~DMA_GCMD_TE;
1600 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1602 /* Make sure hardware complete it */
1603 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1604 readl, (!(sts & DMA_GSTS_TES)), sts);
1606 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1609 static int iommu_init_domains(struct intel_iommu *iommu)
1611 u32 ndomains, nlongs;
1612 size_t size;
1614 ndomains = cap_ndoms(iommu->cap);
1615 pr_debug("%s: Number of Domains supported <%d>\n",
1616 iommu->name, ndomains);
1617 nlongs = BITS_TO_LONGS(ndomains);
1619 spin_lock_init(&iommu->lock);
1621 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1622 if (!iommu->domain_ids) {
1623 pr_err("%s: Allocating domain id array failed\n",
1624 iommu->name);
1625 return -ENOMEM;
1628 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1629 iommu->domains = kzalloc(size, GFP_KERNEL);
1631 if (iommu->domains) {
1632 size = 256 * sizeof(struct dmar_domain *);
1633 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1636 if (!iommu->domains || !iommu->domains[0]) {
1637 pr_err("%s: Allocating domain array failed\n",
1638 iommu->name);
1639 kfree(iommu->domain_ids);
1640 kfree(iommu->domains);
1641 iommu->domain_ids = NULL;
1642 iommu->domains = NULL;
1643 return -ENOMEM;
1647 * If Caching mode is set, then invalid translations are tagged
1648 * with domain-id 0, hence we need to pre-allocate it. We also
1649 * use domain-id 0 as a marker for non-allocated domain-id, so
1650 * make sure it is not used for a real domain.
1652 set_bit(0, iommu->domain_ids);
1655 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1656 * entry for first-level or pass-through translation modes should
1657 * be programmed with a domain id different from those used for
1658 * second-level or nested translation. We reserve a domain id for
1659 * this purpose.
1661 if (sm_supported(iommu))
1662 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1664 return 0;
1667 static void disable_dmar_iommu(struct intel_iommu *iommu)
1669 struct device_domain_info *info, *tmp;
1670 unsigned long flags;
1672 if (!iommu->domains || !iommu->domain_ids)
1673 return;
1675 spin_lock_irqsave(&device_domain_lock, flags);
1676 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1677 if (info->iommu != iommu)
1678 continue;
1680 if (!info->dev || !info->domain)
1681 continue;
1683 __dmar_remove_one_dev_info(info);
1685 spin_unlock_irqrestore(&device_domain_lock, flags);
1687 if (iommu->gcmd & DMA_GCMD_TE)
1688 iommu_disable_translation(iommu);
1691 static void free_dmar_iommu(struct intel_iommu *iommu)
1693 if ((iommu->domains) && (iommu->domain_ids)) {
1694 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1695 int i;
1697 for (i = 0; i < elems; i++)
1698 kfree(iommu->domains[i]);
1699 kfree(iommu->domains);
1700 kfree(iommu->domain_ids);
1701 iommu->domains = NULL;
1702 iommu->domain_ids = NULL;
1705 g_iommus[iommu->seq_id] = NULL;
1707 /* free context mapping */
1708 free_context_table(iommu);
1710 #ifdef CONFIG_INTEL_IOMMU_SVM
1711 if (pasid_supported(iommu)) {
1712 if (ecap_prs(iommu->ecap))
1713 intel_svm_finish_prq(iommu);
1715 #endif
1718 static struct dmar_domain *alloc_domain(int flags)
1720 struct dmar_domain *domain;
1722 domain = alloc_domain_mem();
1723 if (!domain)
1724 return NULL;
1726 memset(domain, 0, sizeof(*domain));
1727 domain->nid = NUMA_NO_NODE;
1728 domain->flags = flags;
1729 domain->has_iotlb_device = false;
1730 INIT_LIST_HEAD(&domain->devices);
1732 return domain;
1735 /* Must be called with iommu->lock */
1736 static int domain_attach_iommu(struct dmar_domain *domain,
1737 struct intel_iommu *iommu)
1739 unsigned long ndomains;
1740 int num;
1742 assert_spin_locked(&device_domain_lock);
1743 assert_spin_locked(&iommu->lock);
1745 domain->iommu_refcnt[iommu->seq_id] += 1;
1746 domain->iommu_count += 1;
1747 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1748 ndomains = cap_ndoms(iommu->cap);
1749 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1751 if (num >= ndomains) {
1752 pr_err("%s: No free domain ids\n", iommu->name);
1753 domain->iommu_refcnt[iommu->seq_id] -= 1;
1754 domain->iommu_count -= 1;
1755 return -ENOSPC;
1758 set_bit(num, iommu->domain_ids);
1759 set_iommu_domain(iommu, num, domain);
1761 domain->iommu_did[iommu->seq_id] = num;
1762 domain->nid = iommu->node;
1764 domain_update_iommu_cap(domain);
1767 return 0;
1770 static int domain_detach_iommu(struct dmar_domain *domain,
1771 struct intel_iommu *iommu)
1773 int num, count;
1775 assert_spin_locked(&device_domain_lock);
1776 assert_spin_locked(&iommu->lock);
1778 domain->iommu_refcnt[iommu->seq_id] -= 1;
1779 count = --domain->iommu_count;
1780 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1781 num = domain->iommu_did[iommu->seq_id];
1782 clear_bit(num, iommu->domain_ids);
1783 set_iommu_domain(iommu, num, NULL);
1785 domain_update_iommu_cap(domain);
1786 domain->iommu_did[iommu->seq_id] = 0;
1789 return count;
1792 static struct iova_domain reserved_iova_list;
1793 static struct lock_class_key reserved_rbtree_key;
1795 static int dmar_init_reserved_ranges(void)
1797 struct pci_dev *pdev = NULL;
1798 struct iova *iova;
1799 int i;
1801 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1803 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1804 &reserved_rbtree_key);
1806 /* IOAPIC ranges shouldn't be accessed by DMA */
1807 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1808 IOVA_PFN(IOAPIC_RANGE_END));
1809 if (!iova) {
1810 pr_err("Reserve IOAPIC range failed\n");
1811 return -ENODEV;
1814 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1815 for_each_pci_dev(pdev) {
1816 struct resource *r;
1818 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1819 r = &pdev->resource[i];
1820 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1821 continue;
1822 iova = reserve_iova(&reserved_iova_list,
1823 IOVA_PFN(r->start),
1824 IOVA_PFN(r->end));
1825 if (!iova) {
1826 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1827 return -ENODEV;
1831 return 0;
1834 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1836 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1839 static inline int guestwidth_to_adjustwidth(int gaw)
1841 int agaw;
1842 int r = (gaw - 12) % 9;
1844 if (r == 0)
1845 agaw = gaw;
1846 else
1847 agaw = gaw + 9 - r;
1848 if (agaw > 64)
1849 agaw = 64;
1850 return agaw;
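/*
 * Worked example (editor's addition, not part of the upstream file):
 * the adjusted width rounds gaw up so that (agaw - 12) is a multiple of
 * the 9-bit stride: gaw 48 gives r = (48 - 12) % 9 = 0, so agaw stays 48;
 * gaw 40 gives r = 1, so agaw = 40 + 9 - 1 = 48.
 */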
1853 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1854 int guest_width)
1856 int adjust_width, agaw;
1857 unsigned long sagaw;
1858 int err;
1860 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1862 err = init_iova_flush_queue(&domain->iovad,
1863 iommu_flush_iova, iova_entry_free);
1864 if (err)
1865 return err;
1867 domain_reserve_special_ranges(domain);
1869 /* calculate AGAW */
1870 if (guest_width > cap_mgaw(iommu->cap))
1871 guest_width = cap_mgaw(iommu->cap);
1872 domain->gaw = guest_width;
1873 adjust_width = guestwidth_to_adjustwidth(guest_width);
1874 agaw = width_to_agaw(adjust_width);
1875 sagaw = cap_sagaw(iommu->cap);
1876 if (!test_bit(agaw, &sagaw)) {
1877 /* hardware doesn't support it, choose a bigger one */
1878 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1879 agaw = find_next_bit(&sagaw, 5, agaw);
1880 if (agaw >= 5)
1881 return -ENODEV;
1883 domain->agaw = agaw;
1885 if (ecap_coherent(iommu->ecap))
1886 domain->iommu_coherency = 1;
1887 else
1888 domain->iommu_coherency = 0;
1890 if (ecap_sc_support(iommu->ecap))
1891 domain->iommu_snooping = 1;
1892 else
1893 domain->iommu_snooping = 0;
1895 if (intel_iommu_superpage)
1896 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1897 else
1898 domain->iommu_superpage = 0;
1900 domain->nid = iommu->node;
1902 /* always allocate the top pgd */
1903 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1904 if (!domain->pgd)
1905 return -ENOMEM;
1906 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1907 return 0;
1910 static void domain_exit(struct dmar_domain *domain)
1913 /* Remove associated devices and clear attached or cached domains */
1914 domain_remove_dev_info(domain);
1916 /* destroy iovas */
1917 put_iova_domain(&domain->iovad);
1919 if (domain->pgd) {
1920 struct page *freelist;
1922 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1923 dma_free_pagelist(freelist);
1926 free_domain_mem(domain);
1930 * Get the PASID directory size for scalable mode context entry.
1931 * Value of X in the PDTS field of a scalable mode context entry
1932 * indicates PASID directory with 2^(X + 7) entries.
1934 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1936 int pds, max_pde;
1938 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1939 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1940 if (pds < 7)
1941 return 0;
1943 return pds - 7;
1947 * Set the RID_PASID field of a scalable mode context entry. The
1948 * IOMMU hardware will use the PASID value set in this field for
1949 * DMA translations of DMA requests without PASID.
1951 static inline void
1952 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1954 context->hi |= pasid & ((1 << 20) - 1);
1955 context->hi |= (1 << 20);
1959 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1960 * entry.
1962 static inline void context_set_sm_dte(struct context_entry *context)
1964 context->lo |= (1 << 2);
1968 * Set the PRE(Page Request Enable) field of a scalable mode context
1969 * entry.
1971 static inline void context_set_sm_pre(struct context_entry *context)
1973 context->lo |= (1 << 4);
1976 /* Convert value to context PASID directory size field coding. */
1977 #define context_pdts(pds) (((pds) & 0x7) << 9)
1979 static int domain_context_mapping_one(struct dmar_domain *domain,
1980 struct intel_iommu *iommu,
1981 struct pasid_table *table,
1982 u8 bus, u8 devfn)
1984 u16 did = domain->iommu_did[iommu->seq_id];
1985 int translation = CONTEXT_TT_MULTI_LEVEL;
1986 struct device_domain_info *info = NULL;
1987 struct context_entry *context;
1988 unsigned long flags;
1989 int ret;
1991 WARN_ON(did == 0);
1993 if (hw_pass_through && domain_type_is_si(domain))
1994 translation = CONTEXT_TT_PASS_THROUGH;
1996 pr_debug("Set context mapping for %02x:%02x.%d\n",
1997 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1999 BUG_ON(!domain->pgd);
2001 spin_lock_irqsave(&device_domain_lock, flags);
2002 spin_lock(&iommu->lock);
2004 ret = -ENOMEM;
2005 context = iommu_context_addr(iommu, bus, devfn, 1);
2006 if (!context)
2007 goto out_unlock;
2009 ret = 0;
2010 if (context_present(context))
2011 goto out_unlock;
2014 * For kdump cases, old valid entries may be cached due to the
2015 * in-flight DMA and copied pgtable, but there is no unmapping
2016 * behaviour for them, thus we need an explicit cache flush for
2017 * the newly-mapped device. For kdump, at this point, the device
2018 * is supposed to finish reset at its driver probe stage, so no
2019 * in-flight DMA will exist, and we don't need to worry anymore
2020 * hereafter.
2022 if (context_copied(context)) {
2023 u16 did_old = context_domain_id(context);
2025 if (did_old < cap_ndoms(iommu->cap)) {
2026 iommu->flush.flush_context(iommu, did_old,
2027 (((u16)bus) << 8) | devfn,
2028 DMA_CCMD_MASK_NOBIT,
2029 DMA_CCMD_DEVICE_INVL);
2030 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2031 DMA_TLB_DSI_FLUSH);
2035 context_clear_entry(context);
2037 if (sm_supported(iommu)) {
2038 unsigned long pds;
2040 WARN_ON(!table);
2042 /* Setup the PASID DIR pointer: */
2043 pds = context_get_sm_pds(table);
2044 context->lo = (u64)virt_to_phys(table->table) |
2045 context_pdts(pds);
2047 /* Setup the RID_PASID field: */
2048 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2051 * Setup the Device-TLB enable bit and Page request
2052 * Enable bit:
2054 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2055 if (info && info->ats_supported)
2056 context_set_sm_dte(context);
2057 if (info && info->pri_supported)
2058 context_set_sm_pre(context);
2059 } else {
2060 struct dma_pte *pgd = domain->pgd;
2061 int agaw;
2063 context_set_domain_id(context, did);
2065 if (translation != CONTEXT_TT_PASS_THROUGH) {
2067 * Skip top levels of page tables for iommu which has
2068 * less agaw than default. Unnecessary for PT mode.
2070 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2071 ret = -ENOMEM;
2072 pgd = phys_to_virt(dma_pte_addr(pgd));
2073 if (!dma_pte_present(pgd))
2074 goto out_unlock;
2077 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2078 if (info && info->ats_supported)
2079 translation = CONTEXT_TT_DEV_IOTLB;
2080 else
2081 translation = CONTEXT_TT_MULTI_LEVEL;
2083 context_set_address_root(context, virt_to_phys(pgd));
2084 context_set_address_width(context, agaw);
2085 } else {
2087 * In pass through mode, AW must be programmed to
2088 * indicate the largest AGAW value supported by
2089 * hardware. And ASR is ignored by hardware.
2091 context_set_address_width(context, iommu->msagaw);
2094 context_set_translation_type(context, translation);
2097 context_set_fault_enable(context);
2098 context_set_present(context);
2099 if (!ecap_coherent(iommu->ecap))
2100 clflush_cache_range(context, sizeof(*context));
2103 * It's a non-present to present mapping. If hardware doesn't cache
2104 * non-present entry we only need to flush the write-buffer. If the hardware
2105 * _does_ cache non-present entries, then it does so in the special
2106 * domain #0, which we have to flush:
2108 if (cap_caching_mode(iommu->cap)) {
2109 iommu->flush.flush_context(iommu, 0,
2110 (((u16)bus) << 8) | devfn,
2111 DMA_CCMD_MASK_NOBIT,
2112 DMA_CCMD_DEVICE_INVL);
2113 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2114 } else {
2115 iommu_flush_write_buffer(iommu);
2117 iommu_enable_dev_iotlb(info);
2119 ret = 0;
2121 out_unlock:
2122 spin_unlock(&iommu->lock);
2123 spin_unlock_irqrestore(&device_domain_lock, flags);
2125 return ret;
2128 struct domain_context_mapping_data {
2129 struct dmar_domain *domain;
2130 struct intel_iommu *iommu;
2131 struct pasid_table *table;
2134 static int domain_context_mapping_cb(struct pci_dev *pdev,
2135 u16 alias, void *opaque)
2137 struct domain_context_mapping_data *data = opaque;
2139 return domain_context_mapping_one(data->domain, data->iommu,
2140 data->table, PCI_BUS_NUM(alias),
2141 alias & 0xff);
2144 static int
2145 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2147 struct domain_context_mapping_data data;
2148 struct pasid_table *table;
2149 struct intel_iommu *iommu;
2150 u8 bus, devfn;
2152 iommu = device_to_iommu(dev, &bus, &devfn);
2153 if (!iommu)
2154 return -ENODEV;
2156 table = intel_pasid_get_table(dev);
2158 if (!dev_is_pci(dev))
2159 return domain_context_mapping_one(domain, iommu, table,
2160 bus, devfn);
2162 data.domain = domain;
2163 data.iommu = iommu;
2164 data.table = table;
2166 return pci_for_each_dma_alias(to_pci_dev(dev),
2167 &domain_context_mapping_cb, &data);
2170 static int domain_context_mapped_cb(struct pci_dev *pdev,
2171 u16 alias, void *opaque)
2173 struct intel_iommu *iommu = opaque;
2175 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2178 static int domain_context_mapped(struct device *dev)
2180 struct intel_iommu *iommu;
2181 u8 bus, devfn;
2183 iommu = device_to_iommu(dev, &bus, &devfn);
2184 if (!iommu)
2185 return -ENODEV;
2187 if (!dev_is_pci(dev))
2188 return device_context_mapped(iommu, bus, devfn);
2190 return !pci_for_each_dma_alias(to_pci_dev(dev),
2191 domain_context_mapped_cb, iommu);
2194 /* Returns a number of VTD pages, but aligned to MM page size */
2195 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2196 size_t size)
2198 host_addr &= ~PAGE_MASK;
2199 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2202 /* Return largest possible superpage level for a given mapping */
2203 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2204 unsigned long iov_pfn,
2205 unsigned long phy_pfn,
2206 unsigned long pages)
2208 int support, level = 1;
2209 unsigned long pfnmerge;
2211 support = domain->iommu_superpage;
2213 /* To use a large page, the virtual *and* physical addresses
2214 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2215 of them will mean we have to use smaller pages. So just
2216 merge them and check both at once. */
2217 pfnmerge = iov_pfn | phy_pfn;
2219 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2220 pages >>= VTD_STRIDE_SHIFT;
2221 if (!pages)
2222 break;
2223 pfnmerge >>= VTD_STRIDE_SHIFT;
2224 level++;
2225 support--;
2227 return level;
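/*
 * __domain_mapping() - install PTEs for a run of IOVA page frames.
 * The physical side comes either from @sg or from @phys_pfn.  Superpages
 * are used whenever alignment and remaining length permit, and the CPU
 * cache is flushed for each page worth of PTEs that has been written.
 */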
2230 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2231 struct scatterlist *sg, unsigned long phys_pfn,
2232 unsigned long nr_pages, int prot)
2234 struct dma_pte *first_pte = NULL, *pte = NULL;
2235 phys_addr_t uninitialized_var(pteval);
2236 unsigned long sg_res = 0;
2237 unsigned int largepage_lvl = 0;
2238 unsigned long lvl_pages = 0;
2240 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2242 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2243 return -EINVAL;
2245 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2247 if (!sg) {
2248 sg_res = nr_pages;
2249 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2252 while (nr_pages > 0) {
2253 uint64_t tmp;
2255 if (!sg_res) {
2256 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2258 sg_res = aligned_nrpages(sg->offset, sg->length);
2259 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2260 sg->dma_length = sg->length;
2261 pteval = (sg_phys(sg) - pgoff) | prot;
2262 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2265 if (!pte) {
2266 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2268 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2269 if (!pte)
2270 return -ENOMEM;
2271 /* It is a large page */
2272 if (largepage_lvl > 1) {
2273 unsigned long nr_superpages, end_pfn;
2275 pteval |= DMA_PTE_LARGE_PAGE;
2276 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2278 nr_superpages = sg_res / lvl_pages;
2279 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2282 * Ensure that old small page tables are
2283 * removed to make room for superpage(s).
2284 * We're adding new large pages, so make sure
2285 * we don't remove their parent tables.
2287 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2288 largepage_lvl + 1);
2289 } else {
2290 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2294 /* We don't need a lock here; nobody else
2295 * touches the iova range
2297 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2298 if (tmp) {
2299 static int dumps = 5;
2300 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2301 iov_pfn, tmp, (unsigned long long)pteval);
2302 if (dumps) {
2303 dumps--;
2304 debug_dma_dump_mappings(NULL);
2306 WARN_ON(1);
2309 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2311 BUG_ON(nr_pages < lvl_pages);
2312 BUG_ON(sg_res < lvl_pages);
2314 nr_pages -= lvl_pages;
2315 iov_pfn += lvl_pages;
2316 phys_pfn += lvl_pages;
2317 pteval += lvl_pages * VTD_PAGE_SIZE;
2318 sg_res -= lvl_pages;
2320 /* If the next PTE would be the first in a new page, then we
2321 need to flush the cache on the entries we've just written.
2322 And then we'll need to recalculate 'pte', so clear it and
2323 let it get set again in the if (!pte) block above.
2325 If we're done (!nr_pages) we need to flush the cache too.
2327 Also if we've been setting superpages, we may need to
2328 recalculate 'pte' and switch back to smaller pages for the
2329 end of the mapping, if the trailing size is not enough to
2330 use another superpage (i.e. sg_res < lvl_pages). */
2331 pte++;
2332 if (!nr_pages || first_pte_in_page(pte) ||
2333 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2334 domain_flush_cache(domain, first_pte,
2335 (void *)pte - (void *)first_pte);
2336 pte = NULL;
2339 if (!sg_res && nr_pages)
2340 sg = sg_next(sg);
2342 return 0;
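/*
 * domain_mapping() - perform the real mapping via __domain_mapping() and
 * then notify every IOMMU that serves this domain about the newly mapped
 * range so the required cache/write-buffer flushing can be done.
 */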
2345 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2346 struct scatterlist *sg, unsigned long phys_pfn,
2347 unsigned long nr_pages, int prot)
2349 int iommu_id, ret;
2350 struct intel_iommu *iommu;
2352 /* Do the real mapping first */
2353 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2354 if (ret)
2355 return ret;
2357 for_each_domain_iommu(iommu_id, domain) {
2358 iommu = g_iommus[iommu_id];
2359 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2362 return 0;
2365 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2366 struct scatterlist *sg, unsigned long nr_pages,
2367 int prot)
2369 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2372 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2373 unsigned long phys_pfn, unsigned long nr_pages,
2374 int prot)
2376 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
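/*
 * domain_context_clear_one() - clear the context entry for one bus/devfn
 * under @iommu and invalidate the context-cache and IOTLB entries tagged
 * with the domain id the entry used to carry.
 */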
2379 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2381 unsigned long flags;
2382 struct context_entry *context;
2383 u16 did_old;
2385 if (!iommu)
2386 return;
2388 spin_lock_irqsave(&iommu->lock, flags);
2389 context = iommu_context_addr(iommu, bus, devfn, 0);
2390 if (!context) {
2391 spin_unlock_irqrestore(&iommu->lock, flags);
2392 return;
2394 did_old = context_domain_id(context);
2395 context_clear_entry(context);
2396 __iommu_flush_cache(iommu, context, sizeof(*context));
2397 spin_unlock_irqrestore(&iommu->lock, flags);
2398 iommu->flush.flush_context(iommu,
2399 did_old,
2400 (((u16)bus) << 8) | devfn,
2401 DMA_CCMD_MASK_NOBIT,
2402 DMA_CCMD_DEVICE_INVL);
2403 iommu->flush.flush_iotlb(iommu,
2404 did_old,
2407 DMA_TLB_DSI_FLUSH);
2410 static inline void unlink_domain_info(struct device_domain_info *info)
2412 assert_spin_locked(&device_domain_lock);
2413 list_del(&info->link);
2414 list_del(&info->global);
2415 if (info->dev)
2416 info->dev->archdata.iommu = NULL;
2419 static void domain_remove_dev_info(struct dmar_domain *domain)
2421 struct device_domain_info *info, *tmp;
2422 unsigned long flags;
2424 spin_lock_irqsave(&device_domain_lock, flags);
2425 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2426 __dmar_remove_one_dev_info(info);
2427 spin_unlock_irqrestore(&device_domain_lock, flags);
2431 * find_domain
2432 * Note: we use struct device->archdata.iommu to store the info
2434 static struct dmar_domain *find_domain(struct device *dev)
2436 struct device_domain_info *info;
2438 if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2439 struct iommu_domain *domain;
2441 dev->archdata.iommu = NULL;
2442 domain = iommu_get_domain_for_dev(dev);
2443 if (domain)
2444 intel_iommu_attach_device(domain, dev);
2447 /* No lock here, assumes no domain exit in normal case */
2448 info = dev->archdata.iommu;
2450 if (likely(info))
2451 return info->domain;
2452 return NULL;
2455 static inline struct device_domain_info *
2456 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2458 struct device_domain_info *info;
2460 list_for_each_entry(info, &device_domain_list, global)
2461 if (info->iommu->segment == segment && info->bus == bus &&
2462 info->devfn == devfn)
2463 return info;
2465 return NULL;
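/*
 * dmar_insert_one_dev_info() - allocate and link a device_domain_info for
 * @dev (or one of its DMA aliases), attach @domain to @iommu, set up the
 * PASID table and RID2PASID entry in scalable mode, and program the
 * context entry.  Returns the domain actually in use, which may be a
 * pre-existing one found for the device or its alias.
 */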
2468 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2469 int bus, int devfn,
2470 struct device *dev,
2471 struct dmar_domain *domain)
2473 struct dmar_domain *found = NULL;
2474 struct device_domain_info *info;
2475 unsigned long flags;
2476 int ret;
2478 info = alloc_devinfo_mem();
2479 if (!info)
2480 return NULL;
2482 info->bus = bus;
2483 info->devfn = devfn;
2484 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2485 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2486 info->ats_qdep = 0;
2487 info->dev = dev;
2488 info->domain = domain;
2489 info->iommu = iommu;
2490 info->pasid_table = NULL;
2491 info->auxd_enabled = 0;
2492 INIT_LIST_HEAD(&info->auxiliary_domains);
2494 if (dev && dev_is_pci(dev)) {
2495 struct pci_dev *pdev = to_pci_dev(info->dev);
2497 if (!pdev->untrusted &&
2498 !pci_ats_disabled() &&
2499 ecap_dev_iotlb_support(iommu->ecap) &&
2500 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2501 dmar_find_matched_atsr_unit(pdev))
2502 info->ats_supported = 1;
2504 if (sm_supported(iommu)) {
2505 if (pasid_supported(iommu)) {
2506 int features = pci_pasid_features(pdev);
2507 if (features >= 0)
2508 info->pasid_supported = features | 1;
2511 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2512 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2513 info->pri_supported = 1;
2517 spin_lock_irqsave(&device_domain_lock, flags);
2518 if (dev)
2519 found = find_domain(dev);
2521 if (!found) {
2522 struct device_domain_info *info2;
2523 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2524 if (info2) {
2525 found = info2->domain;
2526 info2->dev = dev;
2530 if (found) {
2531 spin_unlock_irqrestore(&device_domain_lock, flags);
2532 free_devinfo_mem(info);
2533 /* Caller must free the original domain */
2534 return found;
2537 spin_lock(&iommu->lock);
2538 ret = domain_attach_iommu(domain, iommu);
2539 spin_unlock(&iommu->lock);
2541 if (ret) {
2542 spin_unlock_irqrestore(&device_domain_lock, flags);
2543 free_devinfo_mem(info);
2544 return NULL;
2547 list_add(&info->link, &domain->devices);
2548 list_add(&info->global, &device_domain_list);
2549 if (dev)
2550 dev->archdata.iommu = info;
2551 spin_unlock_irqrestore(&device_domain_lock, flags);
2553 /* PASID table is mandatory for a PCI device in scalable mode. */
2554 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2555 ret = intel_pasid_alloc_table(dev);
2556 if (ret) {
2557 dev_err(dev, "PASID table allocation failed\n");
2558 dmar_remove_one_dev_info(dev);
2559 return NULL;
2562 /* Setup the PASID entry for requests without PASID: */
2563 spin_lock(&iommu->lock);
2564 if (hw_pass_through && domain_type_is_si(domain))
2565 ret = intel_pasid_setup_pass_through(iommu, domain,
2566 dev, PASID_RID2PASID);
2567 else
2568 ret = intel_pasid_setup_second_level(iommu, domain,
2569 dev, PASID_RID2PASID);
2570 spin_unlock(&iommu->lock);
2571 if (ret) {
2572 dev_err(dev, "Setup RID2PASID failed\n");
2573 dmar_remove_one_dev_info(dev);
2574 return NULL;
2578 if (dev && domain_context_mapping(domain, dev)) {
2579 dev_err(dev, "Domain context map failed\n");
2580 dmar_remove_one_dev_info(dev);
2581 return NULL;
2584 return domain;
2587 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2589 *(u16 *)opaque = alias;
2590 return 0;
2593 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2595 struct device_domain_info *info;
2596 struct dmar_domain *domain = NULL;
2597 struct intel_iommu *iommu;
2598 u16 dma_alias;
2599 unsigned long flags;
2600 u8 bus, devfn;
2602 iommu = device_to_iommu(dev, &bus, &devfn);
2603 if (!iommu)
2604 return NULL;
2606 if (dev_is_pci(dev)) {
2607 struct pci_dev *pdev = to_pci_dev(dev);
2609 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2611 spin_lock_irqsave(&device_domain_lock, flags);
2612 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2613 PCI_BUS_NUM(dma_alias),
2614 dma_alias & 0xff);
2615 if (info) {
2616 iommu = info->iommu;
2617 domain = info->domain;
2619 spin_unlock_irqrestore(&device_domain_lock, flags);
2621 /* DMA alias already has a domain, use it */
2622 if (info)
2623 goto out;
2626 /* Allocate and initialize new domain for the device */
2627 domain = alloc_domain(0);
2628 if (!domain)
2629 return NULL;
2630 if (domain_init(domain, iommu, gaw)) {
2631 domain_exit(domain);
2632 return NULL;
2635 out:
2636 return domain;
2639 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2640 struct dmar_domain *domain)
2642 struct intel_iommu *iommu;
2643 struct dmar_domain *tmp;
2644 u16 req_id, dma_alias;
2645 u8 bus, devfn;
2647 iommu = device_to_iommu(dev, &bus, &devfn);
2648 if (!iommu)
2649 return NULL;
2651 req_id = ((u16)bus << 8) | devfn;
2653 if (dev_is_pci(dev)) {
2654 struct pci_dev *pdev = to_pci_dev(dev);
2656 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2658 /* register PCI DMA alias device */
2659 if (req_id != dma_alias) {
2660 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2661 dma_alias & 0xff, NULL, domain);
2663 if (!tmp || tmp != domain)
2664 return tmp;
2668 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2669 if (!tmp || tmp != domain)
2670 return tmp;
2672 return domain;
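/*
 * iommu_domain_identity_map() - reserve the IOVA range [start, end] and
 * install a 1:1 mapping for it in @domain, clearing any PTEs that already
 * cover the range (an RMRR may overlap ordinary physical memory).
 */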
2675 static int iommu_domain_identity_map(struct dmar_domain *domain,
2676 unsigned long long start,
2677 unsigned long long end)
2679 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2680 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2682 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2683 dma_to_mm_pfn(last_vpfn))) {
2684 pr_err("Reserving iova failed\n");
2685 return -ENOMEM;
2688 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2690 * The RMRR range might overlap with the physical memory range,
2691 * so clear it first
2693 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2695 return __domain_mapping(domain, first_vpfn, NULL,
2696 first_vpfn, last_vpfn - first_vpfn + 1,
2697 DMA_PTE_READ|DMA_PTE_WRITE);
2700 static int domain_prepare_identity_map(struct device *dev,
2701 struct dmar_domain *domain,
2702 unsigned long long start,
2703 unsigned long long end)
2705 /* For _hardware_ passthrough, don't bother. But for software
2706 passthrough, we do it anyway -- it may indicate a memory
2707 range which is reserved in E820 and so didn't get set
2708 up to start with in si_domain */
2709 if (domain == si_domain && hw_pass_through) {
2710 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2711 start, end);
2712 return 0;
2715 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2717 if (end < start) {
2718 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2719 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2720 dmi_get_system_info(DMI_BIOS_VENDOR),
2721 dmi_get_system_info(DMI_BIOS_VERSION),
2722 dmi_get_system_info(DMI_PRODUCT_VERSION));
2723 return -EIO;
2726 if (end >> agaw_to_width(domain->agaw)) {
2727 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2728 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2729 agaw_to_width(domain->agaw),
2730 dmi_get_system_info(DMI_BIOS_VENDOR),
2731 dmi_get_system_info(DMI_BIOS_VERSION),
2732 dmi_get_system_info(DMI_PRODUCT_VERSION));
2733 return -EIO;
2736 return iommu_domain_identity_map(domain, start, end);
2739 static int md_domain_init(struct dmar_domain *domain, int guest_width);
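/*
 * si_domain_init() - allocate the static identity domain and, unless
 * hardware pass-through is in use, identity map all usable memory ranges
 * plus all RMRR regions into it.
 */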
2741 static int __init si_domain_init(int hw)
2743 struct dmar_rmrr_unit *rmrr;
2744 struct device *dev;
2745 int i, nid, ret;
2747 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2748 if (!si_domain)
2749 return -EFAULT;
2751 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2752 domain_exit(si_domain);
2753 return -EFAULT;
2756 if (hw)
2757 return 0;
2759 for_each_online_node(nid) {
2760 unsigned long start_pfn, end_pfn;
2761 int i;
2763 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2764 ret = iommu_domain_identity_map(si_domain,
2765 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2766 if (ret)
2767 return ret;
2772 * Identity map the RMRRs so that devices with RMRRs can also use
2773 * the si_domain.
2775 for_each_rmrr_units(rmrr) {
2776 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2777 i, dev) {
2778 unsigned long long start = rmrr->base_address;
2779 unsigned long long end = rmrr->end_address;
2781 if (WARN_ON(end < start ||
2782 end >> agaw_to_width(si_domain->agaw)))
2783 continue;
2785 ret = iommu_domain_identity_map(si_domain, start, end);
2786 if (ret)
2787 return ret;
2791 return 0;
2794 static int identity_mapping(struct device *dev)
2796 struct device_domain_info *info;
2798 info = dev->archdata.iommu;
2799 if (info && info != DUMMY_DEVICE_DOMAIN_INFO && info != DEFER_DEVICE_DOMAIN_INFO)
2800 return (info->domain == si_domain);
2802 return 0;
2805 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2807 struct dmar_domain *ndomain;
2808 struct intel_iommu *iommu;
2809 u8 bus, devfn;
2811 iommu = device_to_iommu(dev, &bus, &devfn);
2812 if (!iommu)
2813 return -ENODEV;
2815 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2816 if (ndomain != domain)
2817 return -EBUSY;
2819 return 0;
2822 static bool device_has_rmrr(struct device *dev)
2824 struct dmar_rmrr_unit *rmrr;
2825 struct device *tmp;
2826 int i;
2828 rcu_read_lock();
2829 for_each_rmrr_units(rmrr) {
2831 * Return TRUE if this RMRR contains the device that
2832 * is passed in.
2834 for_each_active_dev_scope(rmrr->devices,
2835 rmrr->devices_cnt, i, tmp)
2836 if (tmp == dev ||
2837 is_downstream_to_pci_bridge(dev, tmp)) {
2838 rcu_read_unlock();
2839 return true;
2842 rcu_read_unlock();
2843 return false;
2847 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2848 * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2849 * @dev: device handle
2851 * We assume that PCI USB devices with RMRRs have them largely
2852 * for historical reasons and that the RMRR space is not actively used post
2853 * boot. This exclusion may change if vendors begin to abuse it.
2855 * The same exception is made for graphics devices, with the requirement that
2856 * any use of the RMRR regions will be torn down before assigning the device
2857 * to a guest.
2859 * Return: true if the RMRR is relaxable, false otherwise
2861 static bool device_rmrr_is_relaxable(struct device *dev)
2863 struct pci_dev *pdev;
2865 if (!dev_is_pci(dev))
2866 return false;
2868 pdev = to_pci_dev(dev);
2869 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2870 return true;
2871 else
2872 return false;
2876 * There are a couple cases where we need to restrict the functionality of
2877 * devices associated with RMRRs. The first is when evaluating a device for
2878 * identity mapping because problems exist when devices are moved in and out
2879 * of domains and their respective RMRR information is lost. This means that
2880 * a device with associated RMRRs will never be in a "passthrough" domain.
2881 * The second is use of the device through the IOMMU API. This interface
2882 * expects to have full control of the IOVA space for the device. We cannot
2883 * satisfy both the requirement that RMRR access is maintained and have an
2884 * unencumbered IOVA space. We also have no ability to quiesce the device's
2885 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2886 * We therefore prevent devices associated with an RMRR from participating in
2887 * the IOMMU API, which eliminates them from device assignment.
2889 * In both cases, devices which have relaxable RMRRs are not concerned by this
2890 * restriction. See device_rmrr_is_relaxable comment.
2892 static bool device_is_rmrr_locked(struct device *dev)
2894 if (!device_has_rmrr(dev))
2895 return false;
2897 if (device_rmrr_is_relaxable(dev))
2898 return false;
2900 return true;
2904 * Return the required default domain type for a specific device.
2906 * @dev: the device in question
2909 * Returns:
2910 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2911 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2912 * - 0: both identity and dynamic domains work for this device
2914 static int device_def_domain_type(struct device *dev)
2916 if (dev_is_pci(dev)) {
2917 struct pci_dev *pdev = to_pci_dev(dev);
2920 * Prevent any device marked as untrusted from getting
2921 * placed into the statically identity mapping domain.
2923 if (pdev->untrusted)
2924 return IOMMU_DOMAIN_DMA;
2926 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2927 return IOMMU_DOMAIN_IDENTITY;
2929 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2930 return IOMMU_DOMAIN_IDENTITY;
2933 * We want to start off with all devices in the 1:1 domain, and
2934 * take them out later if we find they can't access all of memory.
2936 * However, we can't do this for PCI devices behind bridges,
2937 * because all PCI devices behind the same bridge will end up
2938 * with the same source-id on their transactions.
2940 * Practically speaking, we can't change things around for these
2941 * devices at run-time, because we can't be sure there'll be no
2942 * DMA transactions in flight for any of their siblings.
2944 * So PCI devices (unless they're on the root bus) as well as
2945 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2946 * the 1:1 domain, just in _case_ one of their siblings turns out
2947 * not to be able to map all of memory.
2949 if (!pci_is_pcie(pdev)) {
2950 if (!pci_is_root_bus(pdev->bus))
2951 return IOMMU_DOMAIN_DMA;
2952 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2953 return IOMMU_DOMAIN_DMA;
2954 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2955 return IOMMU_DOMAIN_DMA;
2958 return (iommu_identity_mapping & IDENTMAP_ALL) ?
2959 IOMMU_DOMAIN_IDENTITY : 0;
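/*
 * intel_iommu_init_qi() - try to enable queued invalidation for @iommu and
 * hook up the matching flush callbacks, falling back to register-based
 * invalidation if QI cannot be enabled.
 */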
2962 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2965 * Start from a sane iommu hardware state.
2966 * If queued invalidation was already initialized by us
2967 * (for example, while enabling interrupt-remapping) then
2968 * things are already rolling from a sane state.
2970 if (!iommu->qi) {
2972 * Clear any previous faults.
2974 dmar_fault(-1, iommu);
2976 * Disable queued invalidation if supported and already enabled
2977 * before OS handover.
2979 dmar_disable_qi(iommu);
2982 if (dmar_enable_qi(iommu)) {
2984 * Queued Invalidate not enabled, use Register Based Invalidate
2986 iommu->flush.flush_context = __iommu_flush_context;
2987 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2988 pr_info("%s: Using Register based invalidation\n",
2989 iommu->name);
2990 } else {
2991 iommu->flush.flush_context = qi_flush_context;
2992 iommu->flush.flush_iotlb = qi_flush_iotlb;
2993 pr_info("%s: Using Queued invalidation\n", iommu->name);
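/*
 * copy_context_table() - copy one bus worth of context entries from the
 * previous kernel's tables (kdump case) into freshly allocated pages,
 * marking every copied entry and reserving its domain id.
 */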
2997 static int copy_context_table(struct intel_iommu *iommu,
2998 struct root_entry *old_re,
2999 struct context_entry **tbl,
3000 int bus, bool ext)
3002 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3003 struct context_entry *new_ce = NULL, ce;
3004 struct context_entry *old_ce = NULL;
3005 struct root_entry re;
3006 phys_addr_t old_ce_phys;
3008 tbl_idx = ext ? bus * 2 : bus;
3009 memcpy(&re, old_re, sizeof(re));
3011 for (devfn = 0; devfn < 256; devfn++) {
3012 /* First calculate the correct index */
3013 idx = (ext ? devfn * 2 : devfn) % 256;
3015 if (idx == 0) {
3016 /* First save what we may have and clean up */
3017 if (new_ce) {
3018 tbl[tbl_idx] = new_ce;
3019 __iommu_flush_cache(iommu, new_ce,
3020 VTD_PAGE_SIZE);
3021 pos = 1;
3024 if (old_ce)
3025 memunmap(old_ce);
3027 ret = 0;
3028 if (devfn < 0x80)
3029 old_ce_phys = root_entry_lctp(&re);
3030 else
3031 old_ce_phys = root_entry_uctp(&re);
3033 if (!old_ce_phys) {
3034 if (ext && devfn == 0) {
3035 /* No LCTP, try UCTP */
3036 devfn = 0x7f;
3037 continue;
3038 } else {
3039 goto out;
3043 ret = -ENOMEM;
3044 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3045 MEMREMAP_WB);
3046 if (!old_ce)
3047 goto out;
3049 new_ce = alloc_pgtable_page(iommu->node);
3050 if (!new_ce)
3051 goto out_unmap;
3053 ret = 0;
3056 /* Now copy the context entry */
3057 memcpy(&ce, old_ce + idx, sizeof(ce));
3059 if (!__context_present(&ce))
3060 continue;
3062 did = context_domain_id(&ce);
3063 if (did >= 0 && did < cap_ndoms(iommu->cap))
3064 set_bit(did, iommu->domain_ids);
3067 * We need a marker for copied context entries. This
3068 * marker needs to work for the old format as well as
3069 * for extended context entries.
3071 * Bit 67 of the context entry is used. In the old
3072 * format this bit is available to software, in the
3073 * extended format it is the PGE bit, but PGE is ignored
3074 * by HW if PASIDs are disabled (and thus still
3075 * available).
3077 * So disable PASIDs first and then mark the entry
3078 * copied. This means that we don't copy PASID
3079 * translations from the old kernel, but this is fine as
3080 * faults there are not fatal.
3082 context_clear_pasid_enable(&ce);
3083 context_set_copied(&ce);
3085 new_ce[idx] = ce;
3088 tbl[tbl_idx + pos] = new_ce;
3090 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3092 out_unmap:
3093 memunmap(old_ce);
3095 out:
3096 return ret;
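/*
 * copy_translation_tables() - in a kdump kernel, take over the translation
 * structures left behind by the previous kernel: copy its context tables
 * and hook them into our newly allocated root entry table.
 */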
3099 static int copy_translation_tables(struct intel_iommu *iommu)
3101 struct context_entry **ctxt_tbls;
3102 struct root_entry *old_rt;
3103 phys_addr_t old_rt_phys;
3104 int ctxt_table_entries;
3105 unsigned long flags;
3106 u64 rtaddr_reg;
3107 int bus, ret;
3108 bool new_ext, ext;
3110 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3111 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3112 new_ext = !!ecap_ecs(iommu->ecap);
3115 * The RTT bit can only be changed when translation is disabled,
3116 * but disabling translation would open a window for data
3117 * corruption. So bail out and don't copy anything if we would
3118 * have to change the bit.
3120 if (new_ext != ext)
3121 return -EINVAL;
3123 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3124 if (!old_rt_phys)
3125 return -EINVAL;
3127 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3128 if (!old_rt)
3129 return -ENOMEM;
3131 /* This is too big for the stack - allocate it from slab */
3132 ctxt_table_entries = ext ? 512 : 256;
3133 ret = -ENOMEM;
3134 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3135 if (!ctxt_tbls)
3136 goto out_unmap;
3138 for (bus = 0; bus < 256; bus++) {
3139 ret = copy_context_table(iommu, &old_rt[bus],
3140 ctxt_tbls, bus, ext);
3141 if (ret) {
3142 pr_err("%s: Failed to copy context table for bus %d\n",
3143 iommu->name, bus);
3144 continue;
3148 spin_lock_irqsave(&iommu->lock, flags);
3150 /* Context tables are copied, now write them to the root_entry table */
3151 for (bus = 0; bus < 256; bus++) {
3152 int idx = ext ? bus * 2 : bus;
3153 u64 val;
3155 if (ctxt_tbls[idx]) {
3156 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3157 iommu->root_entry[bus].lo = val;
3160 if (!ext || !ctxt_tbls[idx + 1])
3161 continue;
3163 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3164 iommu->root_entry[bus].hi = val;
3167 spin_unlock_irqrestore(&iommu->lock, flags);
3169 kfree(ctxt_tbls);
3171 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3173 ret = 0;
3175 out_unmap:
3176 memunmap(old_rt);
3178 return ret;
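/*
 * init_dmars() - boot-time setup for all DMAR units: allocate the global
 * iommu array, initialize domain ids, root/context tables and invalidation
 * for every IOMMU, create the static identity domain, and finally enable
 * fault reporting (plus SVM page requests where supported).
 */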
3181 static int __init init_dmars(void)
3183 struct dmar_drhd_unit *drhd;
3184 struct intel_iommu *iommu;
3185 int ret;
3188 * for each drhd
3189 * allocate root
3190 * initialize and program root entry to not present
3191 * endfor
3193 for_each_drhd_unit(drhd) {
3195 * No lock needed, as this is only incremented in the
3196 * single-threaded kernel __init code path; all other accesses
3197 * are read only
3199 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3200 g_num_of_iommus++;
3201 continue;
3203 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3206 /* Preallocate enough resources for IOMMU hot-addition */
3207 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3208 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3210 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3211 GFP_KERNEL);
3212 if (!g_iommus) {
3213 pr_err("Allocating global iommu array failed\n");
3214 ret = -ENOMEM;
3215 goto error;
3218 for_each_iommu(iommu, drhd) {
3219 if (drhd->ignored) {
3220 iommu_disable_translation(iommu);
3221 continue;
3225 * Find the max pasid size of all IOMMUs in the system.
3226 * We need to ensure the system pasid table is no bigger
3227 * than the smallest supported.
3229 if (pasid_supported(iommu)) {
3230 u32 temp = 2 << ecap_pss(iommu->ecap);
3232 intel_pasid_max_id = min_t(u32, temp,
3233 intel_pasid_max_id);
3236 g_iommus[iommu->seq_id] = iommu;
3238 intel_iommu_init_qi(iommu);
3240 ret = iommu_init_domains(iommu);
3241 if (ret)
3242 goto free_iommu;
3244 init_translation_status(iommu);
3246 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3247 iommu_disable_translation(iommu);
3248 clear_translation_pre_enabled(iommu);
3249 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3250 iommu->name);
3254 * TBD:
3255 * we could share the same root & context tables
3256 * among all IOMMUs. Need to split it later.
3258 ret = iommu_alloc_root_entry(iommu);
3259 if (ret)
3260 goto free_iommu;
3262 if (translation_pre_enabled(iommu)) {
3263 pr_info("Translation already enabled - trying to copy translation structures\n");
3265 ret = copy_translation_tables(iommu);
3266 if (ret) {
3268 * We found the IOMMU with translation
3269 * enabled - but failed to copy over the
3270 * old root-entry table. Try to proceed
3271 * by disabling translation now and
3272 * allocating a clean root-entry table.
3273 * This might cause DMAR faults, but
3274 * probably the dump will still succeed.
3276 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3277 iommu->name);
3278 iommu_disable_translation(iommu);
3279 clear_translation_pre_enabled(iommu);
3280 } else {
3281 pr_info("Copied translation tables from previous kernel for %s\n",
3282 iommu->name);
3286 if (!ecap_pass_through(iommu->ecap))
3287 hw_pass_through = 0;
3288 #ifdef CONFIG_INTEL_IOMMU_SVM
3289 if (pasid_supported(iommu))
3290 intel_svm_init(iommu);
3291 #endif
3295 * Now that qi is enabled on all iommus, set the root entry and flush
3296 * caches. This is required on some Intel X58 chipsets, otherwise the
3297 * flush_context function will loop forever and the boot hangs.
3299 for_each_active_iommu(iommu, drhd) {
3300 iommu_flush_write_buffer(iommu);
3301 iommu_set_root_entry(iommu);
3302 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3303 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3306 if (iommu_default_passthrough())
3307 iommu_identity_mapping |= IDENTMAP_ALL;
3309 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3310 dmar_map_gfx = 0;
3311 #endif
3313 if (!dmar_map_gfx)
3314 iommu_identity_mapping |= IDENTMAP_GFX;
3316 check_tylersburg_isoch();
3318 ret = si_domain_init(hw_pass_through);
3319 if (ret)
3320 goto free_iommu;
3323 * for each drhd
3324 * enable fault log
3325 * global invalidate context cache
3326 * global invalidate iotlb
3327 * enable translation
3329 for_each_iommu(iommu, drhd) {
3330 if (drhd->ignored) {
3332 * we always have to disable PMRs or DMA may fail on
3333 * this device
3335 if (force_on)
3336 iommu_disable_protect_mem_regions(iommu);
3337 continue;
3340 iommu_flush_write_buffer(iommu);
3342 #ifdef CONFIG_INTEL_IOMMU_SVM
3343 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3345 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3346 * could cause a lock race condition.
3348 up_write(&dmar_global_lock);
3349 ret = intel_svm_enable_prq(iommu);
3350 down_write(&dmar_global_lock);
3351 if (ret)
3352 goto free_iommu;
3354 #endif
3355 ret = dmar_set_interrupt(iommu);
3356 if (ret)
3357 goto free_iommu;
3360 return 0;
3362 free_iommu:
3363 for_each_active_iommu(iommu, drhd) {
3364 disable_dmar_iommu(iommu);
3365 free_dmar_iommu(iommu);
3368 kfree(g_iommus);
3370 error:
3371 return ret;
3374 /* This takes a number of _MM_ pages, not VTD pages */
3375 static unsigned long intel_alloc_iova(struct device *dev,
3376 struct dmar_domain *domain,
3377 unsigned long nrpages, uint64_t dma_mask)
3379 unsigned long iova_pfn;
3381 /* Restrict dma_mask to the width that the iommu can handle */
3382 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3383 /* Ensure we reserve the whole size-aligned region */
3384 nrpages = __roundup_pow_of_two(nrpages);
3386 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3388 * First try to allocate an io virtual address in
3389 * DMA_BIT_MASK(32) and if that fails then try allocating
3390 * from higher range
3392 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3393 IOVA_PFN(DMA_BIT_MASK(32)), false);
3394 if (iova_pfn)
3395 return iova_pfn;
3397 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3398 IOVA_PFN(dma_mask), true);
3399 if (unlikely(!iova_pfn)) {
3400 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3401 nrpages);
3402 return 0;
3405 return iova_pfn;
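/*
 * get_private_domain_for_dev() - allocate a private DMA domain for a
 * device that is not yet attached to any domain, pre-mapping any RMRR
 * regions that reference the device.
 */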
3408 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3410 struct dmar_domain *domain, *tmp;
3411 struct dmar_rmrr_unit *rmrr;
3412 struct device *i_dev;
3413 int i, ret;
3415 /* The device shouldn't be attached to any domain yet. */
3416 domain = find_domain(dev);
3417 if (domain)
3418 return NULL;
3420 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3421 if (!domain)
3422 goto out;
3424 /* We have a new domain - setup possible RMRRs for the device */
3425 rcu_read_lock();
3426 for_each_rmrr_units(rmrr) {
3427 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3428 i, i_dev) {
3429 if (i_dev != dev)
3430 continue;
3432 ret = domain_prepare_identity_map(dev, domain,
3433 rmrr->base_address,
3434 rmrr->end_address);
3435 if (ret)
3436 dev_err(dev, "Mapping reserved region failed\n");
3439 rcu_read_unlock();
3441 tmp = set_domain_for_dev(dev, domain);
3442 if (!tmp || domain != tmp) {
3443 domain_exit(domain);
3444 domain = tmp;
3447 out:
3448 if (!domain)
3449 dev_err(dev, "Allocating domain failed\n");
3450 else
3451 domain->domain.type = IOMMU_DOMAIN_DMA;
3453 return domain;
3456 /* Check if the dev needs to go through the non-identity map and unmap process. */
3457 static bool iommu_need_mapping(struct device *dev)
3459 int ret;
3461 if (iommu_dummy(dev))
3462 return false;
3464 ret = identity_mapping(dev);
3465 if (ret) {
3466 u64 dma_mask = *dev->dma_mask;
3468 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3469 dma_mask = dev->coherent_dma_mask;
3471 if (dma_mask >= dma_direct_get_required_mask(dev))
3472 return false;
3475 * 32 bit DMA devices are removed from si_domain and fall back
3476 * to non-identity mapping.
3478 dmar_remove_one_dev_info(dev);
3479 ret = iommu_request_dma_domain_for_dev(dev);
3480 if (ret) {
3481 struct iommu_domain *domain;
3482 struct dmar_domain *dmar_domain;
3484 domain = iommu_get_domain_for_dev(dev);
3485 if (domain) {
3486 dmar_domain = to_dmar_domain(domain);
3487 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3489 dmar_remove_one_dev_info(dev);
3490 get_private_domain_for_dev(dev);
3493 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3496 return true;
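/*
 * __intel_map_single() - map @size bytes at @paddr for @dev: allocate an
 * IOVA range that honours @dma_mask, install the page-table entries and
 * return the resulting DMA address (including the sub-page offset).
 */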
3499 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3500 size_t size, int dir, u64 dma_mask)
3502 struct dmar_domain *domain;
3503 phys_addr_t start_paddr;
3504 unsigned long iova_pfn;
3505 int prot = 0;
3506 int ret;
3507 struct intel_iommu *iommu;
3508 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3510 BUG_ON(dir == DMA_NONE);
3512 domain = find_domain(dev);
3513 if (!domain)
3514 return DMA_MAPPING_ERROR;
3516 iommu = domain_get_iommu(domain);
3517 size = aligned_nrpages(paddr, size);
3519 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3520 if (!iova_pfn)
3521 goto error;
3524 * Check if DMAR supports zero-length reads on write-only
3525 * mappings.
3527 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3528 !cap_zlr(iommu->cap))
3529 prot |= DMA_PTE_READ;
3530 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3531 prot |= DMA_PTE_WRITE;
3533 * paddr - (paddr + size) might be a partial page; we should map the
3534 * whole page. Note: if two parts of one page are separately mapped, we
3535 * might have two guest_addr mappings to the same host paddr, but this
3536 * is not a big problem
3538 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3539 mm_to_dma_pfn(paddr_pfn), size, prot);
3540 if (ret)
3541 goto error;
3543 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3544 start_paddr += paddr & ~PAGE_MASK;
3546 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3548 return start_paddr;
3550 error:
3551 if (iova_pfn)
3552 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3553 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3554 size, (unsigned long long)paddr, dir);
3555 return DMA_MAPPING_ERROR;
3558 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3559 unsigned long offset, size_t size,
3560 enum dma_data_direction dir,
3561 unsigned long attrs)
3563 if (iommu_need_mapping(dev))
3564 return __intel_map_single(dev, page_to_phys(page) + offset,
3565 size, dir, *dev->dma_mask);
3566 return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3569 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3570 size_t size, enum dma_data_direction dir,
3571 unsigned long attrs)
3573 if (iommu_need_mapping(dev))
3574 return __intel_map_single(dev, phys_addr, size, dir,
3575 *dev->dma_mask);
3576 return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
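/*
 * intel_unmap() - tear down an existing DMA mapping.  The IOTLB is flushed
 * synchronously in strict mode, for untrusted devices, or when no IOVA
 * flush queue is available; otherwise the range is queued for deferred
 * release.
 */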
3579 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3581 struct dmar_domain *domain;
3582 unsigned long start_pfn, last_pfn;
3583 unsigned long nrpages;
3584 unsigned long iova_pfn;
3585 struct intel_iommu *iommu;
3586 struct page *freelist;
3587 struct pci_dev *pdev = NULL;
3589 domain = find_domain(dev);
3590 BUG_ON(!domain);
3592 iommu = domain_get_iommu(domain);
3594 iova_pfn = IOVA_PFN(dev_addr);
3596 nrpages = aligned_nrpages(dev_addr, size);
3597 start_pfn = mm_to_dma_pfn(iova_pfn);
3598 last_pfn = start_pfn + nrpages - 1;
3600 if (dev_is_pci(dev))
3601 pdev = to_pci_dev(dev);
3603 freelist = domain_unmap(domain, start_pfn, last_pfn);
3604 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3605 !has_iova_flush_queue(&domain->iovad)) {
3606 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3607 nrpages, !freelist, 0);
3608 /* free iova */
3609 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3610 dma_free_pagelist(freelist);
3611 } else {
3612 queue_iova(&domain->iovad, iova_pfn, nrpages,
3613 (unsigned long)freelist);
3615 * queue up the release of the unmap to save the ~1/6th of the
3616 * CPU time used up by the iotlb flush operation...
3620 trace_unmap_single(dev, dev_addr, size);
3623 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3624 size_t size, enum dma_data_direction dir,
3625 unsigned long attrs)
3627 if (iommu_need_mapping(dev))
3628 intel_unmap(dev, dev_addr, size);
3629 else
3630 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3633 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3634 size_t size, enum dma_data_direction dir, unsigned long attrs)
3636 if (iommu_need_mapping(dev))
3637 intel_unmap(dev, dev_addr, size);
3640 static void *intel_alloc_coherent(struct device *dev, size_t size,
3641 dma_addr_t *dma_handle, gfp_t flags,
3642 unsigned long attrs)
3644 struct page *page = NULL;
3645 int order;
3647 if (!iommu_need_mapping(dev))
3648 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3650 size = PAGE_ALIGN(size);
3651 order = get_order(size);
3653 if (gfpflags_allow_blocking(flags)) {
3654 unsigned int count = size >> PAGE_SHIFT;
3656 page = dma_alloc_from_contiguous(dev, count, order,
3657 flags & __GFP_NOWARN);
3660 if (!page)
3661 page = alloc_pages(flags, order);
3662 if (!page)
3663 return NULL;
3664 memset(page_address(page), 0, size);
3666 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3667 DMA_BIDIRECTIONAL,
3668 dev->coherent_dma_mask);
3669 if (*dma_handle != DMA_MAPPING_ERROR)
3670 return page_address(page);
3671 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3672 __free_pages(page, order);
3674 return NULL;
3677 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3678 dma_addr_t dma_handle, unsigned long attrs)
3680 int order;
3681 struct page *page = virt_to_page(vaddr);
3683 if (!iommu_need_mapping(dev))
3684 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3686 size = PAGE_ALIGN(size);
3687 order = get_order(size);
3689 intel_unmap(dev, dma_handle, size);
3690 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3691 __free_pages(page, order);
3694 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3695 int nelems, enum dma_data_direction dir,
3696 unsigned long attrs)
3698 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3699 unsigned long nrpages = 0;
3700 struct scatterlist *sg;
3701 int i;
3703 if (!iommu_need_mapping(dev))
3704 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3706 for_each_sg(sglist, sg, nelems, i) {
3707 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3710 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3712 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3715 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3716 enum dma_data_direction dir, unsigned long attrs)
3718 int i;
3719 struct dmar_domain *domain;
3720 size_t size = 0;
3721 int prot = 0;
3722 unsigned long iova_pfn;
3723 int ret;
3724 struct scatterlist *sg;
3725 unsigned long start_vpfn;
3726 struct intel_iommu *iommu;
3728 BUG_ON(dir == DMA_NONE);
3729 if (!iommu_need_mapping(dev))
3730 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3732 domain = find_domain(dev);
3733 if (!domain)
3734 return 0;
3736 iommu = domain_get_iommu(domain);
3738 for_each_sg(sglist, sg, nelems, i)
3739 size += aligned_nrpages(sg->offset, sg->length);
3741 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3742 *dev->dma_mask);
3743 if (!iova_pfn) {
3744 sglist->dma_length = 0;
3745 return 0;
3749 * Check if DMAR supports zero-length reads on write-only
3750 * mappings.
3752 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3753 !cap_zlr(iommu->cap))
3754 prot |= DMA_PTE_READ;
3755 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3756 prot |= DMA_PTE_WRITE;
3758 start_vpfn = mm_to_dma_pfn(iova_pfn);
3760 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3761 if (unlikely(ret)) {
3762 dma_pte_free_pagetable(domain, start_vpfn,
3763 start_vpfn + size - 1,
3764 agaw_to_level(domain->agaw) + 1);
3765 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3766 return 0;
3769 trace_map_sg(dev, iova_pfn << PAGE_SHIFT,
3770 sg_phys(sglist), size << VTD_PAGE_SHIFT);
3772 return nelems;
3775 static u64 intel_get_required_mask(struct device *dev)
3777 if (!iommu_need_mapping(dev))
3778 return dma_direct_get_required_mask(dev);
3779 return DMA_BIT_MASK(32);
3782 static const struct dma_map_ops intel_dma_ops = {
3783 .alloc = intel_alloc_coherent,
3784 .free = intel_free_coherent,
3785 .map_sg = intel_map_sg,
3786 .unmap_sg = intel_unmap_sg,
3787 .map_page = intel_map_page,
3788 .unmap_page = intel_unmap_page,
3789 .map_resource = intel_map_resource,
3790 .unmap_resource = intel_unmap_resource,
3791 .dma_supported = dma_direct_supported,
3792 .mmap = dma_common_mmap,
3793 .get_sgtable = dma_common_get_sgtable,
3794 .get_required_mask = intel_get_required_mask,
3797 static void
3798 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3799 enum dma_data_direction dir, enum dma_sync_target target)
3801 struct dmar_domain *domain;
3802 phys_addr_t tlb_addr;
3804 domain = find_domain(dev);
3805 if (WARN_ON(!domain))
3806 return;
3808 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3809 if (is_swiotlb_buffer(tlb_addr))
3810 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
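/*
 * bounce_map_single() - map a buffer via a swiotlb bounce page when its
 * start address or size is not VTD-page aligned, so that the (typically
 * untrusted) device cannot DMA to unrelated data sharing the page.
 */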
3813 static dma_addr_t
3814 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3815 enum dma_data_direction dir, unsigned long attrs,
3816 u64 dma_mask)
3818 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3819 struct dmar_domain *domain;
3820 struct intel_iommu *iommu;
3821 unsigned long iova_pfn;
3822 unsigned long nrpages;
3823 phys_addr_t tlb_addr;
3824 int prot = 0;
3825 int ret;
3827 domain = find_domain(dev);
3828 if (WARN_ON(dir == DMA_NONE || !domain))
3829 return DMA_MAPPING_ERROR;
3831 iommu = domain_get_iommu(domain);
3832 if (WARN_ON(!iommu))
3833 return DMA_MAPPING_ERROR;
3835 nrpages = aligned_nrpages(0, size);
3836 iova_pfn = intel_alloc_iova(dev, domain,
3837 dma_to_mm_pfn(nrpages), dma_mask);
3838 if (!iova_pfn)
3839 return DMA_MAPPING_ERROR;
3842 * Check if DMAR supports zero-length reads on write-only
3843 * mappings.
3845 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3846 !cap_zlr(iommu->cap))
3847 prot |= DMA_PTE_READ;
3848 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3849 prot |= DMA_PTE_WRITE;
3852 * If both the physical buffer start address and size are
3853 * page aligned, we don't need to use a bounce page.
3855 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3856 tlb_addr = swiotlb_tbl_map_single(dev,
3857 __phys_to_dma(dev, io_tlb_start),
3858 paddr, size, aligned_size, dir, attrs);
3859 if (tlb_addr == DMA_MAPPING_ERROR) {
3860 goto swiotlb_error;
3861 } else {
3862 /* Cleanup the padding area. */
3863 void *padding_start = phys_to_virt(tlb_addr);
3864 size_t padding_size = aligned_size;
3866 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3867 (dir == DMA_TO_DEVICE ||
3868 dir == DMA_BIDIRECTIONAL)) {
3869 padding_start += size;
3870 padding_size -= size;
3873 memset(padding_start, 0, padding_size);
3875 } else {
3876 tlb_addr = paddr;
3879 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3880 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3881 if (ret)
3882 goto mapping_error;
3884 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3886 return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3888 mapping_error:
3889 if (is_swiotlb_buffer(tlb_addr))
3890 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3891 aligned_size, dir, attrs);
3892 swiotlb_error:
3893 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3894 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3895 size, (unsigned long long)paddr, dir);
3897 return DMA_MAPPING_ERROR;
3900 static void
3901 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3902 enum dma_data_direction dir, unsigned long attrs)
3904 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3905 struct dmar_domain *domain;
3906 phys_addr_t tlb_addr;
3908 domain = find_domain(dev);
3909 if (WARN_ON(!domain))
3910 return;
3912 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3913 if (WARN_ON(!tlb_addr))
3914 return;
3916 intel_unmap(dev, dev_addr, size);
3917 if (is_swiotlb_buffer(tlb_addr))
3918 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3919 aligned_size, dir, attrs);
3921 trace_bounce_unmap_single(dev, dev_addr, size);
3924 static dma_addr_t
3925 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3926 size_t size, enum dma_data_direction dir, unsigned long attrs)
3928 return bounce_map_single(dev, page_to_phys(page) + offset,
3929 size, dir, attrs, *dev->dma_mask);
3932 static dma_addr_t
3933 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3934 enum dma_data_direction dir, unsigned long attrs)
3936 return bounce_map_single(dev, phys_addr, size,
3937 dir, attrs, *dev->dma_mask);
3940 static void
3941 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3942 enum dma_data_direction dir, unsigned long attrs)
3944 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3947 static void
3948 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3949 enum dma_data_direction dir, unsigned long attrs)
3951 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3954 static void
3955 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3956 enum dma_data_direction dir, unsigned long attrs)
3958 struct scatterlist *sg;
3959 int i;
3961 for_each_sg(sglist, sg, nelems, i)
3962 bounce_unmap_page(dev, sg->dma_address,
3963 sg_dma_len(sg), dir, attrs);
3966 static int
3967 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3968 enum dma_data_direction dir, unsigned long attrs)
3970 int i;
3971 struct scatterlist *sg;
3973 for_each_sg(sglist, sg, nelems, i) {
3974 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3975 sg->offset, sg->length,
3976 dir, attrs);
3977 if (sg->dma_address == DMA_MAPPING_ERROR)
3978 goto out_unmap;
3979 sg_dma_len(sg) = sg->length;
3982 return nelems;
3984 out_unmap:
3985 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3986 return 0;
3989 static void
3990 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3991 size_t size, enum dma_data_direction dir)
3993 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3996 static void
3997 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3998 size_t size, enum dma_data_direction dir)
4000 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4003 static void
4004 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4005 int nelems, enum dma_data_direction dir)
4007 struct scatterlist *sg;
4008 int i;
4010 for_each_sg(sglist, sg, nelems, i)
4011 bounce_sync_single(dev, sg_dma_address(sg),
4012 sg_dma_len(sg), dir, SYNC_FOR_CPU);
4015 static void
4016 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4017 int nelems, enum dma_data_direction dir)
4019 struct scatterlist *sg;
4020 int i;
4022 for_each_sg(sglist, sg, nelems, i)
4023 bounce_sync_single(dev, sg_dma_address(sg),
4024 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4027 static const struct dma_map_ops bounce_dma_ops = {
4028 .alloc = intel_alloc_coherent,
4029 .free = intel_free_coherent,
4030 .map_sg = bounce_map_sg,
4031 .unmap_sg = bounce_unmap_sg,
4032 .map_page = bounce_map_page,
4033 .unmap_page = bounce_unmap_page,
4034 .sync_single_for_cpu = bounce_sync_single_for_cpu,
4035 .sync_single_for_device = bounce_sync_single_for_device,
4036 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
4037 .sync_sg_for_device = bounce_sync_sg_for_device,
4038 .map_resource = bounce_map_resource,
4039 .unmap_resource = bounce_unmap_resource,
4040 .dma_supported = dma_direct_supported,
4043 static inline int iommu_domain_cache_init(void)
4045 int ret = 0;
4047 iommu_domain_cache = kmem_cache_create("iommu_domain",
4048 sizeof(struct dmar_domain),
4050 SLAB_HWCACHE_ALIGN,
4052 NULL);
4053 if (!iommu_domain_cache) {
4054 pr_err("Couldn't create iommu_domain cache\n");
4055 ret = -ENOMEM;
4058 return ret;
4061 static inline int iommu_devinfo_cache_init(void)
4063 int ret = 0;
4065 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4066 sizeof(struct device_domain_info),
4068 SLAB_HWCACHE_ALIGN,
4069 NULL);
4070 if (!iommu_devinfo_cache) {
4071 pr_err("Couldn't create devinfo cache\n");
4072 ret = -ENOMEM;
4075 return ret;
4078 static int __init iommu_init_mempool(void)
4080 int ret;
4081 ret = iova_cache_get();
4082 if (ret)
4083 return ret;
4085 ret = iommu_domain_cache_init();
4086 if (ret)
4087 goto domain_error;
4089 ret = iommu_devinfo_cache_init();
4090 if (!ret)
4091 return ret;
4093 kmem_cache_destroy(iommu_domain_cache);
4094 domain_error:
4095 iova_cache_put();
4097 return -ENOMEM;
4100 static void __init iommu_exit_mempool(void)
4102 kmem_cache_destroy(iommu_devinfo_cache);
4103 kmem_cache_destroy(iommu_domain_cache);
4104 iova_cache_put();
4107 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4109 struct dmar_drhd_unit *drhd;
4110 u32 vtbar;
4111 int rc;
4113 /* We know that this device on this chipset has its own IOMMU.
4114 * If we find it under a different IOMMU, then the BIOS is lying
4115 * to us. Hope that the IOMMU for this device is actually
4116 * disabled, and it needs no translation...
4118 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4119 if (rc) {
4120 /* "can't" happen */
4121 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4122 return;
4124 vtbar &= 0xffff0000;
4126 /* we know that this iommu should be at offset 0xa000 from vtbar */
4127 drhd = dmar_find_matched_drhd_unit(pdev);
4128 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
4129 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
4130 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4131 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4134 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4136 static void __init init_no_remapping_devices(void)
4138 struct dmar_drhd_unit *drhd;
4139 struct device *dev;
4140 int i;
4142 for_each_drhd_unit(drhd) {
4143 if (!drhd->include_all) {
4144 for_each_active_dev_scope(drhd->devices,
4145 drhd->devices_cnt, i, dev)
4146 break;
4147 /* ignore DMAR unit if no devices exist */
4148 if (i == drhd->devices_cnt)
4149 drhd->ignored = 1;
4153 for_each_active_drhd_unit(drhd) {
4154 if (drhd->include_all)
4155 continue;
4157 for_each_active_dev_scope(drhd->devices,
4158 drhd->devices_cnt, i, dev)
4159 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4160 break;
4161 if (i < drhd->devices_cnt)
4162 continue;
4164 /* This IOMMU has *only* gfx devices. Either bypass it or
4165 set the gfx_mapped flag, as appropriate */
4166 if (!dmar_map_gfx) {
4167 drhd->ignored = 1;
4168 for_each_active_dev_scope(drhd->devices,
4169 drhd->devices_cnt, i, dev)
4170 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4175 #ifdef CONFIG_SUSPEND
4176 static int init_iommu_hw(void)
4178 struct dmar_drhd_unit *drhd;
4179 struct intel_iommu *iommu = NULL;
4181 for_each_active_iommu(iommu, drhd)
4182 if (iommu->qi)
4183 dmar_reenable_qi(iommu);
4185 for_each_iommu(iommu, drhd) {
4186 if (drhd->ignored) {
4188 * we always have to disable PMRs or DMA may fail on
4189 * this device
4191 if (force_on)
4192 iommu_disable_protect_mem_regions(iommu);
4193 continue;
4196 iommu_flush_write_buffer(iommu);
4198 iommu_set_root_entry(iommu);
4200 iommu->flush.flush_context(iommu, 0, 0, 0,
4201 DMA_CCMD_GLOBAL_INVL);
4202 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4203 iommu_enable_translation(iommu);
4204 iommu_disable_protect_mem_regions(iommu);
4207 return 0;
4210 static void iommu_flush_all(void)
4212 struct dmar_drhd_unit *drhd;
4213 struct intel_iommu *iommu;
4215 for_each_active_iommu(iommu, drhd) {
4216 iommu->flush.flush_context(iommu, 0, 0, 0,
4217 DMA_CCMD_GLOBAL_INVL);
4218 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4219 DMA_TLB_GLOBAL_FLUSH);
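/*
 * Suspend/resume note: iommu_suspend() below disables translation and
 * snapshots the fault-event programming (DMAR_FECTL_REG, DMAR_FEDATA_REG,
 * DMAR_FEADDR_REG, DMAR_FEUADDR_REG) of every active unit into
 * iommu->iommu_state[]; iommu_resume() first re-initializes the hardware
 * through init_iommu_hw() (root entry, cache flushes, translation) and
 * only then writes the saved register values back and frees the snapshot.
 */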
4223 static int iommu_suspend(void)
4225 struct dmar_drhd_unit *drhd;
4226 struct intel_iommu *iommu = NULL;
4227 unsigned long flag;
4229 for_each_active_iommu(iommu, drhd) {
4230 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4231 GFP_ATOMIC);
4232 if (!iommu->iommu_state)
4233 goto nomem;
4236 iommu_flush_all();
4238 for_each_active_iommu(iommu, drhd) {
4239 iommu_disable_translation(iommu);
4241 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4243 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4244 readl(iommu->reg + DMAR_FECTL_REG);
4245 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4246 readl(iommu->reg + DMAR_FEDATA_REG);
4247 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4248 readl(iommu->reg + DMAR_FEADDR_REG);
4249 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4250 readl(iommu->reg + DMAR_FEUADDR_REG);
4252 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4254 return 0;
4256 nomem:
4257 for_each_active_iommu(iommu, drhd)
4258 kfree(iommu->iommu_state);
4260 return -ENOMEM;
4263 static void iommu_resume(void)
4265 struct dmar_drhd_unit *drhd;
4266 struct intel_iommu *iommu = NULL;
4267 unsigned long flag;
4269 if (init_iommu_hw()) {
4270 if (force_on)
4271 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4272 else
4273 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4274 return;
4277 for_each_active_iommu(iommu, drhd) {
4279 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4281 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4282 iommu->reg + DMAR_FECTL_REG);
4283 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4284 iommu->reg + DMAR_FEDATA_REG);
4285 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4286 iommu->reg + DMAR_FEADDR_REG);
4287 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4288 iommu->reg + DMAR_FEUADDR_REG);
4290 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4293 for_each_active_iommu(iommu, drhd)
4294 kfree(iommu->iommu_state);
4297 static struct syscore_ops iommu_syscore_ops = {
4298 .resume = iommu_resume,
4299 .suspend = iommu_suspend,
4302 static void __init init_iommu_pm_ops(void)
4304 register_syscore_ops(&iommu_syscore_ops);
4307 #else
4308 static inline void init_iommu_pm_ops(void) {}
4309 #endif /* CONFIG_SUSPEND */
4311 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4313 struct acpi_dmar_reserved_memory *rmrr;
4314 struct dmar_rmrr_unit *rmrru;
4316 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4317 if (!rmrru)
4318 goto out;
4320 rmrru->hdr = header;
4321 rmrr = (struct acpi_dmar_reserved_memory *)header;
4322 rmrru->base_address = rmrr->base_address;
4323 rmrru->end_address = rmrr->end_address;
4325 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4326 ((void *)rmrr) + rmrr->header.length,
4327 &rmrru->devices_cnt);
4328 if (rmrru->devices_cnt && rmrru->devices == NULL)
4329 goto free_rmrru;
4331 list_add(&rmrru->list, &dmar_rmrr_units);
4333 return 0;
4334 free_rmrru:
4335 kfree(rmrru);
4336 out:
4337 return -ENOMEM;
4340 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4342 struct dmar_atsr_unit *atsru;
4343 struct acpi_dmar_atsr *tmp;
4345 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4346 dmar_rcu_check()) {
4347 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4348 if (atsr->segment != tmp->segment)
4349 continue;
4350 if (atsr->header.length != tmp->header.length)
4351 continue;
4352 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4353 return atsru;
4356 return NULL;
4359 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4361 struct acpi_dmar_atsr *atsr;
4362 struct dmar_atsr_unit *atsru;
4364 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4365 return 0;
4367 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4368 atsru = dmar_find_atsr(atsr);
4369 if (atsru)
4370 return 0;
4372 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4373 if (!atsru)
4374 return -ENOMEM;
4377 * If memory is allocated from slab by ACPI _DSM method, we need to
4378 * copy the memory content because the memory buffer will be freed
4379 * on return.
4381 atsru->hdr = (void *)(atsru + 1);
4382 memcpy(atsru->hdr, hdr, hdr->length);
4383 atsru->include_all = atsr->flags & 0x1;
4384 if (!atsru->include_all) {
4385 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4386 (void *)atsr + atsr->header.length,
4387 &atsru->devices_cnt);
4388 if (atsru->devices_cnt && atsru->devices == NULL) {
4389 kfree(atsru);
4390 return -ENOMEM;
4394 list_add_rcu(&atsru->list, &dmar_atsr_units);
4396 return 0;
4399 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4401 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4402 kfree(atsru);
4405 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4407 struct acpi_dmar_atsr *atsr;
4408 struct dmar_atsr_unit *atsru;
4410 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4411 atsru = dmar_find_atsr(atsr);
4412 if (atsru) {
4413 list_del_rcu(&atsru->list);
4414 synchronize_rcu();
4415 intel_iommu_free_atsr(atsru);
4418 return 0;
4421 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4423 int i;
4424 struct device *dev;
4425 struct acpi_dmar_atsr *atsr;
4426 struct dmar_atsr_unit *atsru;
4428 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4429 atsru = dmar_find_atsr(atsr);
4430 if (!atsru)
4431 return 0;
4433 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4434 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4435 i, dev)
4436 return -EBUSY;
4439 return 0;
4442 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4444 int sp, ret;
4445 struct intel_iommu *iommu = dmaru->iommu;
4447 if (g_iommus[iommu->seq_id])
4448 return 0;
4450 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4451 pr_warn("%s: Doesn't support hardware pass through.\n",
4452 iommu->name);
4453 return -ENXIO;
4455 if (!ecap_sc_support(iommu->ecap) &&
4456 domain_update_iommu_snooping(iommu)) {
4457 pr_warn("%s: Doesn't support snooping.\n",
4458 iommu->name);
4459 return -ENXIO;
4461 sp = domain_update_iommu_superpage(iommu) - 1;
4462 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4463 pr_warn("%s: Doesn't support large page.\n",
4464 iommu->name);
4465 return -ENXIO;
4469 * Disable translation if already enabled prior to OS handover.
4471 if (iommu->gcmd & DMA_GCMD_TE)
4472 iommu_disable_translation(iommu);
4474 g_iommus[iommu->seq_id] = iommu;
4475 ret = iommu_init_domains(iommu);
4476 if (ret == 0)
4477 ret = iommu_alloc_root_entry(iommu);
4478 if (ret)
4479 goto out;
4481 #ifdef CONFIG_INTEL_IOMMU_SVM
4482 if (pasid_supported(iommu))
4483 intel_svm_init(iommu);
4484 #endif
4486 if (dmaru->ignored) {
4488 * we always have to disable PMRs or DMA may fail on this device
4490 if (force_on)
4491 iommu_disable_protect_mem_regions(iommu);
4492 return 0;
4495 intel_iommu_init_qi(iommu);
4496 iommu_flush_write_buffer(iommu);
4498 #ifdef CONFIG_INTEL_IOMMU_SVM
4499 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4500 ret = intel_svm_enable_prq(iommu);
4501 if (ret)
4502 goto disable_iommu;
4504 #endif
4505 ret = dmar_set_interrupt(iommu);
4506 if (ret)
4507 goto disable_iommu;
4509 iommu_set_root_entry(iommu);
4510 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4511 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4512 iommu_enable_translation(iommu);
4514 iommu_disable_protect_mem_regions(iommu);
4515 return 0;
4517 disable_iommu:
4518 disable_dmar_iommu(iommu);
4519 out:
4520 free_dmar_iommu(iommu);
4521 return ret;
4524 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4526 int ret = 0;
4527 struct intel_iommu *iommu = dmaru->iommu;
4529 if (!intel_iommu_enabled)
4530 return 0;
4531 if (iommu == NULL)
4532 return -EINVAL;
4534 if (insert) {
4535 ret = intel_iommu_add(dmaru);
4536 } else {
4537 disable_dmar_iommu(iommu);
4538 free_dmar_iommu(iommu);
4541 return ret;
4544 static void intel_iommu_free_dmars(void)
4546 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4547 struct dmar_atsr_unit *atsru, *atsr_n;
4549 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4550 list_del(&rmrru->list);
4551 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4552 kfree(rmrru);
4555 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4556 list_del(&atsru->list);
4557 intel_iommu_free_atsr(atsru);
4561 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4563 int i, ret = 1;
4564 struct pci_bus *bus;
4565 struct pci_dev *bridge = NULL;
4566 struct device *tmp;
4567 struct acpi_dmar_atsr *atsr;
4568 struct dmar_atsr_unit *atsru;
4570 dev = pci_physfn(dev);
4571 for (bus = dev->bus; bus; bus = bus->parent) {
4572 bridge = bus->self;
4573 /* If it's an integrated device, allow ATS */
4574 if (!bridge)
4575 return 1;
4576 /* Connected via non-PCIe: no ATS */
4577 if (!pci_is_pcie(bridge) ||
4578 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4579 return 0;
4580 /* If we found the root port, look it up in the ATSR */
4581 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4582 break;
4585 rcu_read_lock();
4586 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4587 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4588 if (atsr->segment != pci_domain_nr(dev->bus))
4589 continue;
4591 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4592 if (tmp == &bridge->dev)
4593 goto out;
4595 if (atsru->include_all)
4596 goto out;
4598 ret = 0;
4599 out:
4600 rcu_read_unlock();
4602 return ret;
4605 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4607 int ret;
4608 struct dmar_rmrr_unit *rmrru;
4609 struct dmar_atsr_unit *atsru;
4610 struct acpi_dmar_atsr *atsr;
4611 struct acpi_dmar_reserved_memory *rmrr;
4613 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4614 return 0;
4616 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4617 rmrr = container_of(rmrru->hdr,
4618 struct acpi_dmar_reserved_memory, header);
4619 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4620 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4621 ((void *)rmrr) + rmrr->header.length,
4622 rmrr->segment, rmrru->devices,
4623 rmrru->devices_cnt);
4624 if (ret < 0)
4625 return ret;
4626 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4627 dmar_remove_dev_scope(info, rmrr->segment,
4628 rmrru->devices, rmrru->devices_cnt);
4632 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4633 if (atsru->include_all)
4634 continue;
4636 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4637 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4638 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4639 (void *)atsr + atsr->header.length,
4640 atsr->segment, atsru->devices,
4641 atsru->devices_cnt);
4642 if (ret > 0)
4643 break;
4644 else if (ret < 0)
4645 return ret;
4646 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4647 if (dmar_remove_dev_scope(info, atsr->segment,
4648 atsru->devices, atsru->devices_cnt))
4649 break;
4653 return 0;
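/*
 * Memory hotplug handling for the static identity map: on MEM_GOING_ONLINE
 * the new physical range is added to si_domain's 1:1 mapping, while
 * MEM_OFFLINE/MEM_CANCEL_ONLINE walks the corresponding IOVA range,
 * unmaps it, flushes the IOTLB on every active unit and releases the
 * page-table freelist.
 */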
4656 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4657 unsigned long val, void *v)
4659 struct memory_notify *mhp = v;
4660 unsigned long long start, end;
4661 unsigned long start_vpfn, last_vpfn;
4663 switch (val) {
4664 case MEM_GOING_ONLINE:
4665 start = mhp->start_pfn << PAGE_SHIFT;
4666 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4667 if (iommu_domain_identity_map(si_domain, start, end)) {
4668 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4669 start, end);
4670 return NOTIFY_BAD;
4672 break;
4674 case MEM_OFFLINE:
4675 case MEM_CANCEL_ONLINE:
4676 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4677 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4678 while (start_vpfn <= last_vpfn) {
4679 struct iova *iova;
4680 struct dmar_drhd_unit *drhd;
4681 struct intel_iommu *iommu;
4682 struct page *freelist;
4684 iova = find_iova(&si_domain->iovad, start_vpfn);
4685 if (iova == NULL) {
4686 pr_debug("Failed to get IOVA for PFN %lx\n",
4687 start_vpfn);
4688 break;
4691 iova = split_and_remove_iova(&si_domain->iovad, iova,
4692 start_vpfn, last_vpfn);
4693 if (iova == NULL) {
4694 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4695 start_vpfn, last_vpfn);
4696 return NOTIFY_BAD;
4699 freelist = domain_unmap(si_domain, iova->pfn_lo,
4700 iova->pfn_hi);
4702 rcu_read_lock();
4703 for_each_active_iommu(iommu, drhd)
4704 iommu_flush_iotlb_psi(iommu, si_domain,
4705 iova->pfn_lo, iova_size(iova),
4706 !freelist, 0);
4707 rcu_read_unlock();
4708 dma_free_pagelist(freelist);
4710 start_vpfn = iova->pfn_hi + 1;
4711 free_iova_mem(iova);
4713 break;
4716 return NOTIFY_OK;
4719 static struct notifier_block intel_iommu_memory_nb = {
4720 .notifier_call = intel_iommu_memory_notifier,
4721 .priority = 0
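/*
 * CPU hotplug: intel_iommu_cpu_dead() is registered against
 * CPUHP_IOMMU_INTEL_DEAD in intel_iommu_init(); when a CPU goes away it
 * walks every known domain and drops that CPU's cached IOVA ranges via
 * free_cpu_cached_iovas() so they can be reused.
 */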
4724 static void free_all_cpu_cached_iovas(unsigned int cpu)
4726 int i;
4728 for (i = 0; i < g_num_of_iommus; i++) {
4729 struct intel_iommu *iommu = g_iommus[i];
4730 struct dmar_domain *domain;
4731 int did;
4733 if (!iommu)
4734 continue;
4736 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4737 domain = get_iommu_domain(iommu, (u16)did);
4739 if (!domain)
4740 continue;
4741 free_cpu_cached_iovas(cpu, &domain->iovad);
4746 static int intel_iommu_cpu_dead(unsigned int cpu)
4748 free_all_cpu_cached_iovas(cpu);
4749 return 0;
4752 static void intel_disable_iommus(void)
4754 struct intel_iommu *iommu = NULL;
4755 struct dmar_drhd_unit *drhd;
4757 for_each_iommu(iommu, drhd)
4758 iommu_disable_translation(iommu);
4761 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4763 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4765 return container_of(iommu_dev, struct intel_iommu, iommu);
4768 static ssize_t intel_iommu_show_version(struct device *dev,
4769 struct device_attribute *attr,
4770 char *buf)
4772 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4773 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4774 return sprintf(buf, "%d:%d\n",
4775 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4777 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4779 static ssize_t intel_iommu_show_address(struct device *dev,
4780 struct device_attribute *attr,
4781 char *buf)
4783 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4784 return sprintf(buf, "%llx\n", iommu->reg_phys);
4786 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4788 static ssize_t intel_iommu_show_cap(struct device *dev,
4789 struct device_attribute *attr,
4790 char *buf)
4792 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4793 return sprintf(buf, "%llx\n", iommu->cap);
4795 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4797 static ssize_t intel_iommu_show_ecap(struct device *dev,
4798 struct device_attribute *attr,
4799 char *buf)
4801 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4802 return sprintf(buf, "%llx\n", iommu->ecap);
4804 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4806 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4807 struct device_attribute *attr,
4808 char *buf)
4810 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4811 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4813 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4815 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4816 struct device_attribute *attr,
4817 char *buf)
4819 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4820 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4821 cap_ndoms(iommu->cap)));
4823 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4825 static struct attribute *intel_iommu_attrs[] = {
4826 &dev_attr_version.attr,
4827 &dev_attr_address.attr,
4828 &dev_attr_cap.attr,
4829 &dev_attr_ecap.attr,
4830 &dev_attr_domains_supported.attr,
4831 &dev_attr_domains_used.attr,
4832 NULL,
4835 static struct attribute_group intel_iommu_group = {
4836 .name = "intel-iommu",
4837 .attrs = intel_iommu_attrs,
4840 const struct attribute_group *intel_iommu_groups[] = {
4841 &intel_iommu_group,
4842 NULL,
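/*
 * The attributes above are registered per unit through
 * iommu_device_sysfs_add() in intel_iommu_init(); with the group named
 * "intel-iommu" they typically show up as
 * /sys/class/iommu/dmar<N>/intel-iommu/{version,address,cap,ecap,
 * domains_supported,domains_used}.
 */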
4845 static inline bool has_untrusted_dev(void)
4847 struct pci_dev *pdev = NULL;
4849 for_each_pci_dev(pdev)
4850 if (pdev->untrusted)
4851 return true;
4853 return false;
4856 static int __init platform_optin_force_iommu(void)
4858 if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
4859 return 0;
4861 if (no_iommu || dmar_disabled)
4862 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4865 * If Intel-IOMMU is disabled by default, we will apply identity
4866 * map for all devices except those marked as being untrusted.
4868 if (dmar_disabled)
4869 iommu_identity_mapping |= IDENTMAP_ALL;
4871 dmar_disabled = 0;
4872 no_iommu = 0;
4874 return 1;
4877 static int __init probe_acpi_namespace_devices(void)
4879 struct dmar_drhd_unit *drhd;
4880 /* To avoid a -Wunused-but-set-variable warning. */
4881 struct intel_iommu *iommu __maybe_unused;
4882 struct device *dev;
4883 int i, ret = 0;
4885 for_each_active_iommu(iommu, drhd) {
4886 for_each_active_dev_scope(drhd->devices,
4887 drhd->devices_cnt, i, dev) {
4888 struct acpi_device_physical_node *pn;
4889 struct iommu_group *group;
4890 struct acpi_device *adev;
4892 if (dev->bus != &acpi_bus_type)
4893 continue;
4895 adev = to_acpi_device(dev);
4896 mutex_lock(&adev->physical_node_lock);
4897 list_for_each_entry(pn,
4898 &adev->physical_node_list, node) {
4899 group = iommu_group_get(pn->dev);
4900 if (group) {
4901 iommu_group_put(group);
4902 continue;
4905 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4906 ret = iommu_probe_device(pn->dev);
4907 if (ret)
4908 break;
4910 mutex_unlock(&adev->physical_node_lock);
4912 if (ret)
4913 return ret;
4917 return 0;
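/*
 * intel_iommu_init() below is the boot-time entry point: memory pools and
 * IOVA caches first, then DMAR table and device-scope parsing under
 * dmar_global_lock, reserved-range setup, init_dmars(), sysfs and IOMMU
 * core registration, the memory and CPU hotplug notifiers, ACPI namespace
 * device probing, and finally translation is enabled on every unit that
 * is neither ignored nor already enabled by firmware.
 */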
4920 int __init intel_iommu_init(void)
4922 int ret = -ENODEV;
4923 struct dmar_drhd_unit *drhd;
4924 struct intel_iommu *iommu;
4927 * Intel IOMMU is required for a TXT/tboot launch or platform
4928 * opt in, so enforce that.
4930 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4932 if (iommu_init_mempool()) {
4933 if (force_on)
4934 panic("tboot: Failed to initialize iommu memory\n");
4935 return -ENOMEM;
4938 down_write(&dmar_global_lock);
4939 if (dmar_table_init()) {
4940 if (force_on)
4941 panic("tboot: Failed to initialize DMAR table\n");
4942 goto out_free_dmar;
4945 if (dmar_dev_scope_init() < 0) {
4946 if (force_on)
4947 panic("tboot: Failed to initialize DMAR device scope\n");
4948 goto out_free_dmar;
4951 up_write(&dmar_global_lock);
4954 * The bus notifier takes the dmar_global_lock, so lockdep will
4955 * complain later when we register it under the lock.
4957 dmar_register_bus_notifier();
4959 down_write(&dmar_global_lock);
4961 if (!no_iommu)
4962 intel_iommu_debugfs_init();
4964 if (no_iommu || dmar_disabled) {
4966 * We exit the function here to ensure IOMMU's remapping and
4967 * mempool aren't setup, which means that the IOMMU's PMRs
4968 * won't be disabled via the call to init_dmars(). So disable
4969 * it explicitly here. The PMRs were setup by tboot prior to
4970 * calling SENTER, but the kernel is expected to reset/tear
4971 * down the PMRs.
4973 if (intel_iommu_tboot_noforce) {
4974 for_each_iommu(iommu, drhd)
4975 iommu_disable_protect_mem_regions(iommu);
4979 * Make sure the IOMMUs are switched off, even when we
4980 * boot into a kexec kernel and the previous kernel left
4981 * them enabled
4983 intel_disable_iommus();
4984 goto out_free_dmar;
4987 if (list_empty(&dmar_rmrr_units))
4988 pr_info("No RMRR found\n");
4990 if (list_empty(&dmar_atsr_units))
4991 pr_info("No ATSR found\n");
4993 if (dmar_init_reserved_ranges()) {
4994 if (force_on)
4995 panic("tboot: Failed to reserve iommu ranges\n");
4996 goto out_free_reserved_range;
4999 if (dmar_map_gfx)
5000 intel_iommu_gfx_mapped = 1;
5002 init_no_remapping_devices();
5004 ret = init_dmars();
5005 if (ret) {
5006 if (force_on)
5007 panic("tboot: Failed to initialize DMARs\n");
5008 pr_err("Initialization failed\n");
5009 goto out_free_reserved_range;
5011 up_write(&dmar_global_lock);
5013 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
5015 * If the system has no untrusted device or the user has decided
5016 * to disable the bounce page mechanisms, we don't need swiotlb.
5017 * Mark this and the pre-allocated bounce pages will be released
5018 * later.
5020 if (!has_untrusted_dev() || intel_no_bounce)
5021 swiotlb = 0;
5022 #endif
5023 dma_ops = &intel_dma_ops;
5025 init_iommu_pm_ops();
5027 down_read(&dmar_global_lock);
5028 for_each_active_iommu(iommu, drhd) {
5029 iommu_device_sysfs_add(&iommu->iommu, NULL,
5030 intel_iommu_groups,
5031 "%s", iommu->name);
5032 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5033 iommu_device_register(&iommu->iommu);
5035 up_read(&dmar_global_lock);
5037 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5038 if (si_domain && !hw_pass_through)
5039 register_memory_notifier(&intel_iommu_memory_nb);
5040 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5041 intel_iommu_cpu_dead);
5043 down_read(&dmar_global_lock);
5044 if (probe_acpi_namespace_devices())
5045 pr_warn("ACPI namespace devices didn't probe correctly\n");
5047 /* Finally, we enable the DMA remapping hardware. */
5048 for_each_iommu(iommu, drhd) {
5049 if (!drhd->ignored && !translation_pre_enabled(iommu))
5050 iommu_enable_translation(iommu);
5052 iommu_disable_protect_mem_regions(iommu);
5054 up_read(&dmar_global_lock);
5056 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5058 intel_iommu_enabled = 1;
5060 return 0;
5062 out_free_reserved_range:
5063 put_iova_domain(&reserved_iova_list);
5064 out_free_dmar:
5065 intel_iommu_free_dmars();
5066 up_write(&dmar_global_lock);
5067 iommu_exit_mempool();
5068 return ret;
5071 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5073 struct intel_iommu *iommu = opaque;
5075 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5076 return 0;
5080 * NB - intel-iommu lacks any sort of reference counting for the users of
5081 * dependent devices. If multiple endpoints have intersecting dependent
5082 * devices, unbinding the driver from any one of them will possibly leave
5083 * the others unable to operate.
5085 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5087 if (!iommu || !dev || !dev_is_pci(dev))
5088 return;
5090 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5093 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5095 struct dmar_domain *domain;
5096 struct intel_iommu *iommu;
5097 unsigned long flags;
5099 assert_spin_locked(&device_domain_lock);
5101 if (WARN_ON(!info))
5102 return;
5104 iommu = info->iommu;
5105 domain = info->domain;
5107 if (info->dev) {
5108 if (dev_is_pci(info->dev) && sm_supported(iommu))
5109 intel_pasid_tear_down_entry(iommu, info->dev,
5110 PASID_RID2PASID);
5112 iommu_disable_dev_iotlb(info);
5113 domain_context_clear(iommu, info->dev);
5114 intel_pasid_free_table(info->dev);
5117 unlink_domain_info(info);
5119 spin_lock_irqsave(&iommu->lock, flags);
5120 domain_detach_iommu(domain, iommu);
5121 spin_unlock_irqrestore(&iommu->lock, flags);
5123 /* free the private domain */
5124 if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
5125 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
5126 list_empty(&domain->devices))
5127 domain_exit(info->domain);
5129 free_devinfo_mem(info);
5132 static void dmar_remove_one_dev_info(struct device *dev)
5134 struct device_domain_info *info;
5135 unsigned long flags;
5137 spin_lock_irqsave(&device_domain_lock, flags);
5138 info = dev->archdata.iommu;
5139 if (info && info != DEFER_DEVICE_DOMAIN_INFO
5140 && info != DUMMY_DEVICE_DOMAIN_INFO)
5141 __dmar_remove_one_dev_info(info);
5142 spin_unlock_irqrestore(&device_domain_lock, flags);
5145 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5147 int adjust_width;
5149 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5150 domain_reserve_special_ranges(domain);
5152 /* calculate AGAW */
5153 domain->gaw = guest_width;
5154 adjust_width = guestwidth_to_adjustwidth(guest_width);
5155 domain->agaw = width_to_agaw(adjust_width);
5157 domain->iommu_coherency = 0;
5158 domain->iommu_snooping = 0;
5159 domain->iommu_superpage = 0;
5160 domain->max_addr = 0;
5162 /* always allocate the top pgd */
5163 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5164 if (!domain->pgd)
5165 return -ENOMEM;
5166 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5167 return 0;
5170 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5172 struct dmar_domain *dmar_domain;
5173 struct iommu_domain *domain;
5175 switch (type) {
5176 case IOMMU_DOMAIN_DMA:
5177 /* fallthrough */
5178 case IOMMU_DOMAIN_UNMANAGED:
5179 dmar_domain = alloc_domain(0);
5180 if (!dmar_domain) {
5181 pr_err("Can't allocate dmar_domain\n");
5182 return NULL;
5184 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5185 pr_err("Domain initialization failed\n");
5186 domain_exit(dmar_domain);
5187 return NULL;
5190 if (type == IOMMU_DOMAIN_DMA &&
5191 init_iova_flush_queue(&dmar_domain->iovad,
5192 iommu_flush_iova, iova_entry_free)) {
5193 pr_warn("iova flush queue initialization failed\n");
5194 intel_iommu_strict = 1;
5197 domain_update_iommu_cap(dmar_domain);
5199 domain = &dmar_domain->domain;
5200 domain->geometry.aperture_start = 0;
5201 domain->geometry.aperture_end =
5202 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5203 domain->geometry.force_aperture = true;
5205 return domain;
5206 case IOMMU_DOMAIN_IDENTITY:
5207 return &si_domain->domain;
5208 default:
5209 return NULL;
5212 return NULL;
5215 static void intel_iommu_domain_free(struct iommu_domain *domain)
5217 if (domain != &si_domain->domain)
5218 domain_exit(to_dmar_domain(domain));
5222 * Check whether a @domain could be attached to the @dev through the
5223 * aux-domain attach/detach APIs.
5225 static inline bool
5226 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5228 struct device_domain_info *info = dev->archdata.iommu;
5230 return info && info->auxd_enabled &&
5231 domain->type == IOMMU_DOMAIN_UNMANAGED;
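/*
 * A minimal sketch (not taken from this driver) of how a caller such as a
 * mediated-device parent driver would be expected to exercise the
 * aux-domain attach/detach path implemented below through the generic
 * IOMMU API; error handling is trimmed and 'dev' is assumed to be a
 * PASID-capable PCI function:
 *
 *	if (iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX))
 *		return -ENODEV;
 *
 *	domain = iommu_domain_alloc(dev->bus);
 *	if (!domain || iommu_aux_attach_device(domain, dev))
 *		goto err;
 *
 *	pasid = iommu_aux_get_pasid(domain, dev);
 *	(DMA issued on behalf of the mediated device is now tagged with
 *	 and isolated under 'pasid'.)
 *
 *	iommu_aux_detach_device(domain, dev);
 *	iommu_domain_free(domain);
 *	iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_AUX);
 */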
5234 static void auxiliary_link_device(struct dmar_domain *domain,
5235 struct device *dev)
5237 struct device_domain_info *info = dev->archdata.iommu;
5239 assert_spin_locked(&device_domain_lock);
5240 if (WARN_ON(!info))
5241 return;
5243 domain->auxd_refcnt++;
5244 list_add(&domain->auxd, &info->auxiliary_domains);
5247 static void auxiliary_unlink_device(struct dmar_domain *domain,
5248 struct device *dev)
5250 struct device_domain_info *info = dev->archdata.iommu;
5252 assert_spin_locked(&device_domain_lock);
5253 if (WARN_ON(!info))
5254 return;
5256 list_del(&domain->auxd);
5257 domain->auxd_refcnt--;
5259 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5260 intel_pasid_free_id(domain->default_pasid);
5263 static int aux_domain_add_dev(struct dmar_domain *domain,
5264 struct device *dev)
5266 int ret;
5267 u8 bus, devfn;
5268 unsigned long flags;
5269 struct intel_iommu *iommu;
5271 iommu = device_to_iommu(dev, &bus, &devfn);
5272 if (!iommu)
5273 return -ENODEV;
5275 if (domain->default_pasid <= 0) {
5276 int pasid;
5278 pasid = intel_pasid_alloc_id(domain, PASID_MIN,
5279 pci_max_pasids(to_pci_dev(dev)),
5280 GFP_KERNEL);
5281 if (pasid <= 0) {
5282 pr_err("Can't allocate default pasid\n");
5283 return -ENODEV;
5285 domain->default_pasid = pasid;
5288 spin_lock_irqsave(&device_domain_lock, flags);
5290 * iommu->lock must be held to attach domain to iommu and setup the
5291 * pasid entry for second level translation.
5293 spin_lock(&iommu->lock);
5294 ret = domain_attach_iommu(domain, iommu);
5295 if (ret)
5296 goto attach_failed;
5298 /* Setup the PASID entry for mediated devices: */
5299 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5300 domain->default_pasid);
5301 if (ret)
5302 goto table_failed;
5303 spin_unlock(&iommu->lock);
5305 auxiliary_link_device(domain, dev);
5307 spin_unlock_irqrestore(&device_domain_lock, flags);
5309 return 0;
5311 table_failed:
5312 domain_detach_iommu(domain, iommu);
5313 attach_failed:
5314 spin_unlock(&iommu->lock);
5315 spin_unlock_irqrestore(&device_domain_lock, flags);
5316 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5317 intel_pasid_free_id(domain->default_pasid);
5319 return ret;
5322 static void aux_domain_remove_dev(struct dmar_domain *domain,
5323 struct device *dev)
5325 struct device_domain_info *info;
5326 struct intel_iommu *iommu;
5327 unsigned long flags;
5329 if (!is_aux_domain(dev, &domain->domain))
5330 return;
5332 spin_lock_irqsave(&device_domain_lock, flags);
5333 info = dev->archdata.iommu;
5334 iommu = info->iommu;
5336 auxiliary_unlink_device(domain, dev);
5338 spin_lock(&iommu->lock);
5339 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5340 domain_detach_iommu(domain, iommu);
5341 spin_unlock(&iommu->lock);
5343 spin_unlock_irqrestore(&device_domain_lock, flags);
5346 static int prepare_domain_attach_device(struct iommu_domain *domain,
5347 struct device *dev)
5349 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5350 struct intel_iommu *iommu;
5351 int addr_width;
5352 u8 bus, devfn;
5354 iommu = device_to_iommu(dev, &bus, &devfn);
5355 if (!iommu)
5356 return -ENODEV;
5358 /* check if this iommu agaw is sufficient for max mapped address */
5359 addr_width = agaw_to_width(iommu->agaw);
5360 if (addr_width > cap_mgaw(iommu->cap))
5361 addr_width = cap_mgaw(iommu->cap);
5363 if (dmar_domain->max_addr > (1LL << addr_width)) {
5364 dev_err(dev, "%s: iommu width (%d) is not "
5365 "sufficient for the mapped address (%llx)\n",
5366 __func__, addr_width, dmar_domain->max_addr);
5367 return -EFAULT;
5369 dmar_domain->gaw = addr_width;
5372 * Knock out extra levels of page tables if necessary
5374 while (iommu->agaw < dmar_domain->agaw) {
5375 struct dma_pte *pte;
5377 pte = dmar_domain->pgd;
5378 if (dma_pte_present(pte)) {
5379 dmar_domain->pgd = (struct dma_pte *)
5380 phys_to_virt(dma_pte_addr(pte));
5381 free_pgtable_page(pte);
5383 dmar_domain->agaw--;
5386 return 0;
5389 static int intel_iommu_attach_device(struct iommu_domain *domain,
5390 struct device *dev)
5392 int ret;
5394 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5395 device_is_rmrr_locked(dev)) {
5396 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5397 return -EPERM;
5400 if (is_aux_domain(dev, domain))
5401 return -EPERM;
5403 /* normally dev is not mapped */
5404 if (unlikely(domain_context_mapped(dev))) {
5405 struct dmar_domain *old_domain;
5407 old_domain = find_domain(dev);
5408 if (old_domain)
5409 dmar_remove_one_dev_info(dev);
5412 ret = prepare_domain_attach_device(domain, dev);
5413 if (ret)
5414 return ret;
5416 return domain_add_dev_info(to_dmar_domain(domain), dev);
5419 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5420 struct device *dev)
5422 int ret;
5424 if (!is_aux_domain(dev, domain))
5425 return -EPERM;
5427 ret = prepare_domain_attach_device(domain, dev);
5428 if (ret)
5429 return ret;
5431 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5434 static void intel_iommu_detach_device(struct iommu_domain *domain,
5435 struct device *dev)
5437 dmar_remove_one_dev_info(dev);
5440 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5441 struct device *dev)
5443 aux_domain_remove_dev(to_dmar_domain(domain), dev);
5446 static int intel_iommu_map(struct iommu_domain *domain,
5447 unsigned long iova, phys_addr_t hpa,
5448 size_t size, int iommu_prot)
5450 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5451 u64 max_addr;
5452 int prot = 0;
5453 int ret;
5455 if (iommu_prot & IOMMU_READ)
5456 prot |= DMA_PTE_READ;
5457 if (iommu_prot & IOMMU_WRITE)
5458 prot |= DMA_PTE_WRITE;
5459 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5460 prot |= DMA_PTE_SNP;
5462 max_addr = iova + size;
5463 if (dmar_domain->max_addr < max_addr) {
5464 u64 end;
5466 /* check if minimum agaw is sufficient for mapped address */
5467 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5468 if (end < max_addr) {
5469 pr_err("%s: iommu width (%d) is not "
5470 "sufficient for the mapped address (%llx)\n",
5471 __func__, dmar_domain->gaw, max_addr);
5472 return -EFAULT;
5474 dmar_domain->max_addr = max_addr;
5476 /* Round up size to next multiple of VTD_PAGE_SIZE, if it and
5477 the low bits of hpa would take us onto the next page */
5478 size = aligned_nrpages(hpa, size);
5479 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5480 hpa >> VTD_PAGE_SHIFT, size, prot);
5481 return ret;
5484 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5485 unsigned long iova, size_t size,
5486 struct iommu_iotlb_gather *gather)
5488 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5489 struct page *freelist = NULL;
5490 unsigned long start_pfn, last_pfn;
5491 unsigned int npages;
5492 int iommu_id, level = 0;
5494 /* Cope with horrid API which requires us to unmap more than the
5495 size argument if it happens to be a large-page mapping. */
5496 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5498 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5499 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5501 start_pfn = iova >> VTD_PAGE_SHIFT;
5502 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5504 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5506 npages = last_pfn - start_pfn + 1;
5508 for_each_domain_iommu(iommu_id, dmar_domain)
5509 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5510 start_pfn, npages, !freelist, 0);
5512 dma_free_pagelist(freelist);
5514 if (dmar_domain->max_addr == iova + size)
5515 dmar_domain->max_addr = iova;
5517 return size;
5520 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5521 dma_addr_t iova)
5523 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5524 struct dma_pte *pte;
5525 int level = 0;
5526 u64 phys = 0;
5528 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5529 if (pte && dma_pte_present(pte))
5530 phys = dma_pte_addr(pte) +
5531 (iova & (BIT_MASK(level_to_offset_bits(level) +
5532 VTD_PAGE_SHIFT) - 1));
5534 return phys;
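/*
 * The map/unmap/iova_to_phys callbacks above are only reached through the
 * generic IOMMU API.  A minimal sketch of a caller mapping a single 4KiB
 * page into an unmanaged domain (error handling trimmed, 'dev' assumed to
 * sit behind an active VT-d unit):
 *
 *	struct iommu_domain *domain = iommu_domain_alloc(dev->bus);
 *
 *	iommu_attach_device(domain, dev);
 *	iommu_map(domain, iova, page_to_phys(page), SZ_4K,
 *		  IOMMU_READ | IOMMU_WRITE);
 *	WARN_ON(iommu_iova_to_phys(domain, iova) != page_to_phys(page));
 *	iommu_unmap(domain, iova, SZ_4K);
 *	iommu_detach_device(domain, dev);
 *	iommu_domain_free(domain);
 */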
5537 static inline bool scalable_mode_support(void)
5539 struct dmar_drhd_unit *drhd;
5540 struct intel_iommu *iommu;
5541 bool ret = true;
5543 rcu_read_lock();
5544 for_each_active_iommu(iommu, drhd) {
5545 if (!sm_supported(iommu)) {
5546 ret = false;
5547 break;
5550 rcu_read_unlock();
5552 return ret;
5555 static inline bool iommu_pasid_support(void)
5557 struct dmar_drhd_unit *drhd;
5558 struct intel_iommu *iommu;
5559 bool ret = true;
5561 rcu_read_lock();
5562 for_each_active_iommu(iommu, drhd) {
5563 if (!pasid_supported(iommu)) {
5564 ret = false;
5565 break;
5568 rcu_read_unlock();
5570 return ret;
5573 static bool intel_iommu_capable(enum iommu_cap cap)
5575 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5576 return domain_update_iommu_snooping(NULL) == 1;
5577 if (cap == IOMMU_CAP_INTR_REMAP)
5578 return irq_remapping_enabled == 1;
5580 return false;
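/*
 * intel_iommu_add_device() below reconciles the default domain the IOMMU
 * core picked for the group with what device_def_domain_type() wants for
 * this device: if they disagree it asks the core to switch the group's
 * domain type and, when that fails, falls back to a per-device private
 * identity or DMA domain and marks the original domain with
 * DOMAIN_FLAG_LOSE_CHILDREN.
 */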
5583 static int intel_iommu_add_device(struct device *dev)
5585 struct dmar_domain *dmar_domain;
5586 struct iommu_domain *domain;
5587 struct intel_iommu *iommu;
5588 struct iommu_group *group;
5589 u8 bus, devfn;
5590 int ret;
5592 iommu = device_to_iommu(dev, &bus, &devfn);
5593 if (!iommu)
5594 return -ENODEV;
5596 iommu_device_link(&iommu->iommu, dev);
5598 if (translation_pre_enabled(iommu))
5599 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5601 group = iommu_group_get_for_dev(dev);
5603 if (IS_ERR(group)) {
5604 ret = PTR_ERR(group);
5605 goto unlink;
5608 iommu_group_put(group);
5610 domain = iommu_get_domain_for_dev(dev);
5611 dmar_domain = to_dmar_domain(domain);
5612 if (domain->type == IOMMU_DOMAIN_DMA) {
5613 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5614 ret = iommu_request_dm_for_dev(dev);
5615 if (ret) {
5616 dmar_remove_one_dev_info(dev);
5617 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5618 domain_add_dev_info(si_domain, dev);
5619 dev_info(dev,
5620 "Device uses a private identity domain.\n");
5623 } else {
5624 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5625 ret = iommu_request_dma_domain_for_dev(dev);
5626 if (ret) {
5627 dmar_remove_one_dev_info(dev);
5628 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5629 if (!get_private_domain_for_dev(dev)) {
5630 dev_warn(dev,
5631 "Failed to get a private domain.\n");
5632 ret = -ENOMEM;
5633 goto unlink;
5636 dev_info(dev,
5637 "Device uses a private dma domain.\n");
5642 if (device_needs_bounce(dev)) {
5643 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5644 set_dma_ops(dev, &bounce_dma_ops);
5647 return 0;
5649 unlink:
5650 iommu_device_unlink(&iommu->iommu, dev);
5651 return ret;
5654 static void intel_iommu_remove_device(struct device *dev)
5656 struct intel_iommu *iommu;
5657 u8 bus, devfn;
5659 iommu = device_to_iommu(dev, &bus, &devfn);
5660 if (!iommu)
5661 return;
5663 dmar_remove_one_dev_info(dev);
5665 iommu_group_remove_device(dev);
5667 iommu_device_unlink(&iommu->iommu, dev);
5669 if (device_needs_bounce(dev))
5670 set_dma_ops(dev, NULL);
5673 static void intel_iommu_get_resv_regions(struct device *device,
5674 struct list_head *head)
5676 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5677 struct iommu_resv_region *reg;
5678 struct dmar_rmrr_unit *rmrr;
5679 struct device *i_dev;
5680 int i;
5682 down_read(&dmar_global_lock);
5683 for_each_rmrr_units(rmrr) {
5684 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5685 i, i_dev) {
5686 struct iommu_resv_region *resv;
5687 enum iommu_resv_type type;
5688 size_t length;
5690 if (i_dev != device &&
5691 !is_downstream_to_pci_bridge(device, i_dev))
5692 continue;
5694 length = rmrr->end_address - rmrr->base_address + 1;
5696 type = device_rmrr_is_relaxable(device) ?
5697 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5699 resv = iommu_alloc_resv_region(rmrr->base_address,
5700 length, prot, type);
5701 if (!resv)
5702 break;
5704 list_add_tail(&resv->list, head);
5707 up_read(&dmar_global_lock);
5709 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5710 if (dev_is_pci(device)) {
5711 struct pci_dev *pdev = to_pci_dev(device);
5713 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5714 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5715 IOMMU_RESV_DIRECT_RELAXABLE);
5716 if (reg)
5717 list_add_tail(&reg->list, head);
5720 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5722 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5723 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5724 0, IOMMU_RESV_MSI);
5725 if (!reg)
5726 return;
5727 list_add_tail(&reg->list, head);
5730 static void intel_iommu_put_resv_regions(struct device *dev,
5731 struct list_head *head)
5733 struct iommu_resv_region *entry, *next;
5735 list_for_each_entry_safe(entry, next, head, list)
5736 kfree(entry);
5739 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5741 struct device_domain_info *info;
5742 struct context_entry *context;
5743 struct dmar_domain *domain;
5744 unsigned long flags;
5745 u64 ctx_lo;
5746 int ret;
5748 domain = find_domain(dev);
5749 if (!domain)
5750 return -EINVAL;
5752 spin_lock_irqsave(&device_domain_lock, flags);
5753 spin_lock(&iommu->lock);
5755 ret = -EINVAL;
5756 info = dev->archdata.iommu;
5757 if (!info || !info->pasid_supported)
5758 goto out;
5760 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5761 if (WARN_ON(!context))
5762 goto out;
5764 ctx_lo = context[0].lo;
5766 if (!(ctx_lo & CONTEXT_PASIDE)) {
5767 ctx_lo |= CONTEXT_PASIDE;
5768 context[0].lo = ctx_lo;
5769 wmb();
5770 iommu->flush.flush_context(iommu,
5771 domain->iommu_did[iommu->seq_id],
5772 PCI_DEVID(info->bus, info->devfn),
5773 DMA_CCMD_MASK_NOBIT,
5774 DMA_CCMD_DEVICE_INVL);
5777 /* Enable PASID support in the device, if it wasn't already */
5778 if (!info->pasid_enabled)
5779 iommu_enable_dev_iotlb(info);
5781 ret = 0;
5783 out:
5784 spin_unlock(&iommu->lock);
5785 spin_unlock_irqrestore(&device_domain_lock, flags);
5787 return ret;
5790 static void intel_iommu_apply_resv_region(struct device *dev,
5791 struct iommu_domain *domain,
5792 struct iommu_resv_region *region)
5794 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5795 unsigned long start, end;
5797 start = IOVA_PFN(region->start);
5798 end = IOVA_PFN(region->start + region->length - 1);
5800 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5803 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5805 if (dev_is_pci(dev))
5806 return pci_device_group(dev);
5807 return generic_device_group(dev);
5810 #ifdef CONFIG_INTEL_IOMMU_SVM
5811 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5813 struct intel_iommu *iommu;
5814 u8 bus, devfn;
5816 if (iommu_dummy(dev)) {
5817 dev_warn(dev,
5818 "No IOMMU translation for device; cannot enable SVM\n");
5819 return NULL;
5822 iommu = device_to_iommu(dev, &bus, &devfn);
5823 if (!iommu) {
5824 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5825 return NULL;
5828 return iommu;
5830 #endif /* CONFIG_INTEL_IOMMU_SVM */
5832 static int intel_iommu_enable_auxd(struct device *dev)
5834 struct device_domain_info *info;
5835 struct intel_iommu *iommu;
5836 unsigned long flags;
5837 u8 bus, devfn;
5838 int ret;
5840 iommu = device_to_iommu(dev, &bus, &devfn);
5841 if (!iommu || dmar_disabled)
5842 return -EINVAL;
5844 if (!sm_supported(iommu) || !pasid_supported(iommu))
5845 return -EINVAL;
5847 ret = intel_iommu_enable_pasid(iommu, dev);
5848 if (ret)
5849 return -ENODEV;
5851 spin_lock_irqsave(&device_domain_lock, flags);
5852 info = dev->archdata.iommu;
5853 info->auxd_enabled = 1;
5854 spin_unlock_irqrestore(&device_domain_lock, flags);
5856 return 0;
5859 static int intel_iommu_disable_auxd(struct device *dev)
5861 struct device_domain_info *info;
5862 unsigned long flags;
5864 spin_lock_irqsave(&device_domain_lock, flags);
5865 info = dev->archdata.iommu;
5866 if (!WARN_ON(!info))
5867 info->auxd_enabled = 0;
5868 spin_unlock_irqrestore(&device_domain_lock, flags);
5870 return 0;
5874 * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
5875 * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
5876 * spec so that system software and tools can detect endpoint devices that
5877 * support Intel Scalable I/O Virtualization without a host driver dependency.
5879 * Returns the config space offset of the matching extended capability
5880 * structure within the device's PCI configuration space, or 0 if the
5881 * device does not support it.
5883 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5885 int pos;
5886 u16 vendor, id;
5888 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5889 while (pos) {
5890 pci_read_config_word(pdev, pos + 4, &vendor);
5891 pci_read_config_word(pdev, pos + 8, &id);
5892 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5893 return pos;
5895 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5898 return 0;
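/*
 * Layout assumed above: 0x23 is the PCIe Designated Vendor-Specific
 * Extended Capability (DVSEC) ID; the 16-bit vendor ID lives in DVSEC
 * header 1 at offset 4 and the 16-bit DVSEC ID in header 2 at offset 8,
 * so a DVSEC with vendor 8086h and ID 5 identifies an endpoint that
 * advertises scalable I/O virtualization per the spec referenced above.
 */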
5901 static bool
5902 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5904 if (feat == IOMMU_DEV_FEAT_AUX) {
5905 int ret;
5907 if (!dev_is_pci(dev) || dmar_disabled ||
5908 !scalable_mode_support() || !iommu_pasid_support())
5909 return false;
5911 ret = pci_pasid_features(to_pci_dev(dev));
5912 if (ret < 0)
5913 return false;
5915 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5918 return false;
5921 static int
5922 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5924 if (feat == IOMMU_DEV_FEAT_AUX)
5925 return intel_iommu_enable_auxd(dev);
5927 return -ENODEV;
5930 static int
5931 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5933 if (feat == IOMMU_DEV_FEAT_AUX)
5934 return intel_iommu_disable_auxd(dev);
5936 return -ENODEV;
5939 static bool
5940 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5942 struct device_domain_info *info = dev->archdata.iommu;
5944 if (feat == IOMMU_DEV_FEAT_AUX)
5945 return scalable_mode_support() && info && info->auxd_enabled;
5947 return false;
5950 static int
5951 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5953 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5955 return dmar_domain->default_pasid > 0 ?
5956 dmar_domain->default_pasid : -EINVAL;
5959 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5960 struct device *dev)
5962 return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
5966 * Check that the device does not live on an external facing PCI port that is
5967 * marked as untrusted. Such devices should not be able to apply quirks and
5968 * thus not be able to bypass the IOMMU restrictions.
5970 static bool risky_device(struct pci_dev *pdev)
5972 if (pdev->untrusted) {
5973 pci_info(pdev,
5974 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
5975 pdev->vendor, pdev->device);
5976 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
5977 return true;
5979 return false;
5982 const struct iommu_ops intel_iommu_ops = {
5983 .capable = intel_iommu_capable,
5984 .domain_alloc = intel_iommu_domain_alloc,
5985 .domain_free = intel_iommu_domain_free,
5986 .attach_dev = intel_iommu_attach_device,
5987 .detach_dev = intel_iommu_detach_device,
5988 .aux_attach_dev = intel_iommu_aux_attach_device,
5989 .aux_detach_dev = intel_iommu_aux_detach_device,
5990 .aux_get_pasid = intel_iommu_aux_get_pasid,
5991 .map = intel_iommu_map,
5992 .unmap = intel_iommu_unmap,
5993 .iova_to_phys = intel_iommu_iova_to_phys,
5994 .add_device = intel_iommu_add_device,
5995 .remove_device = intel_iommu_remove_device,
5996 .get_resv_regions = intel_iommu_get_resv_regions,
5997 .put_resv_regions = intel_iommu_put_resv_regions,
5998 .apply_resv_region = intel_iommu_apply_resv_region,
5999 .device_group = intel_iommu_device_group,
6000 .dev_has_feat = intel_iommu_dev_has_feat,
6001 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
6002 .dev_enable_feat = intel_iommu_dev_enable_feat,
6003 .dev_disable_feat = intel_iommu_dev_disable_feat,
6004 .is_attach_deferred = intel_iommu_is_attach_deferred,
6005 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
6008 static void quirk_iommu_igfx(struct pci_dev *dev)
6010 if (risky_device(dev))
6011 return;
6013 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6014 dmar_map_gfx = 0;
6017 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6018 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6019 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6020 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6021 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6022 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6023 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6024 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6026 /* Broadwell igfx malfunctions with dmar */
6027 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6028 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6029 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6030 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6031 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6032 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6033 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6034 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6035 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6036 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6037 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6038 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6039 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6040 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6041 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6042 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6043 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6044 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6045 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6046 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6047 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6048 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6049 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6050 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6052 static void quirk_iommu_rwbf(struct pci_dev *dev)
6054 if (risky_device(dev))
6055 return;
6058 * Mobile 4 Series Chipset neglects to set RWBF capability,
6059 * but needs it. Same seems to hold for the desktop versions.
6061 pci_info(dev, "Forcing write-buffer flush capability\n");
6062 rwbf_quirk = 1;
6065 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6066 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6067 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6068 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6069 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6070 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6071 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6073 #define GGC 0x52
6074 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
6075 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
6076 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
6077 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
6078 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
6079 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
6080 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
6081 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
6083 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6085 unsigned short ggc;
6087 if (risky_device(dev))
6088 return;
6090 if (pci_read_config_word(dev, GGC, &ggc))
6091 return;
6093 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6094 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6095 dmar_map_gfx = 0;
6096 } else if (dmar_map_gfx) {
6097 /* we have to ensure the gfx device is idle before we flush */
6098 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6099 intel_iommu_strict = 1;
6102 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6103 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6104 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6105 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6107 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6108 ISOCH DMAR unit for the Azalia sound device, but not give it any
6109 TLB entries, which causes it to deadlock. Check for that. We do
6110 this in a function called from init_dmars(), instead of in a PCI
6111 quirk, because we don't want to print the obnoxious "BIOS broken"
6112 message if VT-d is actually disabled.
6114 static void __init check_tylersburg_isoch(void)
6116 struct pci_dev *pdev;
6117 uint32_t vtisochctrl;
6119 /* If there's no Azalia in the system anyway, forget it. */
6120 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6121 if (!pdev)
6122 return;
6124 if (risky_device(pdev)) {
6125 pci_dev_put(pdev);
6126 return;
6129 pci_dev_put(pdev);
6131 /* System Management Registers. Might be hidden, in which case
6132 we can't do the sanity check. But that's OK, because the
6133 known-broken BIOSes _don't_ actually hide it, so far. */
6134 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6135 if (!pdev)
6136 return;
6138 if (risky_device(pdev)) {
6139 pci_dev_put(pdev);
6140 return;
6143 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6144 pci_dev_put(pdev);
6145 return;
6148 pci_dev_put(pdev);
6150 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6151 if (vtisochctrl & 1)
6152 return;
6154 /* Drop all bits other than the number of TLB entries */
6155 vtisochctrl &= 0x1c;
6157 /* If we have the recommended number of TLB entries (16), fine. */
6158 if (vtisochctrl == 0x10)
6159 return;
6161 /* Zero TLB entries? You get to ride the short bus to school. */
6162 if (!vtisochctrl) {
6163 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6164 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6165 dmi_get_system_info(DMI_BIOS_VENDOR),
6166 dmi_get_system_info(DMI_BIOS_VERSION),
6167 dmi_get_system_info(DMI_PRODUCT_VERSION));
6168 iommu_identity_mapping |= IDENTMAP_AZALIA;
6169 return;
6172 pr_warn("Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6173 vtisochctrl);