Revert "tty: hvc: Fix data abort due to race in hvc_open"
[linux/fpc-iii.git] / drivers / iommu / intel-iommu.c
blob34b2ed91cf4d9fdcacff8915772fe5a8049fdacc
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright © 2006-2014 Intel Corporation.
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
53 #define ROOT_SIZE VTD_PAGE_SIZE
54 #define CONTEXT_SIZE VTD_PAGE_SIZE
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61 #define IOAPIC_RANGE_START (0xfee00000)
62 #define IOAPIC_RANGE_END (0xfeefffff)
63 #define IOVA_START_ADDR (0x1000)
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
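/*
 * Worked example (illustrative, not part of the original source): with
 * gaw = 48 and VTD_PAGE_SHIFT = 12, __DOMAIN_MAX_PFN(48) = 2^36 - 1 and
 * DOMAIN_MAX_ADDR(48) = (2^36 - 1) << 12 = 2^48 - 4096, i.e. the last
 * 4KiB page of a 48-bit address space.
 */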
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN (1)
82 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
84 /* page table handling */
85 #define LEVEL_STRIDE (9)
86 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
 89  * This bitmap is used to advertise the page sizes our hardware supports
90 * to the IOMMU core, which will then use this information to split
91 * physically contiguous memory regions it is mapping into page sizes
92 * that we support.
94 * Traditionally the IOMMU core just handed us the mappings directly,
95 * after making sure the size is an order of a 4KiB page and that the
96 * mapping has natural alignment.
98 * To retain this behavior, we currently advertise that we support
99 * all page sizes that are an order of 4KiB.
101 * If at some point we'd like to utilize the IOMMU core's new behavior,
102 * we could change this to advertise the real page sizes we support.
104 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
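/*
 * Illustrative note (not in the original source): in the pgsize bitmap a
 * set bit k advertises support for page size 2^k bytes.  ~0xFFFUL clears
 * bits 0-11 and sets every higher bit, so it advertises 4KiB, 8KiB, 16KiB,
 * and every larger power of two an unsigned long can express, matching the
 * "every order of 4KiB" behaviour described above.
 */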
106 static inline int agaw_to_level(int agaw)
108 return agaw + 2;
111 static inline int agaw_to_width(int agaw)
113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
116 static inline int width_to_agaw(int width)
118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
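/*
 * Worked example (illustrative): a 48-bit address width gives
 * width_to_agaw(48) = DIV_ROUND_UP(48 - 30, 9) = 2; agaw_to_width(2)
 * maps back to 30 + 2 * 9 = 48 bits, and agaw_to_level(2) = 4 page-table
 * levels, each level resolving LEVEL_STRIDE = 9 bits of the address.
 */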
121 static inline unsigned int level_to_offset_bits(int level)
123 return (level - 1) * LEVEL_STRIDE;
126 static inline int pfn_level_offset(unsigned long pfn, int level)
128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
131 static inline unsigned long level_mask(int level)
133 return -1UL << level_to_offset_bits(level);
136 static inline unsigned long level_size(int level)
138 return 1UL << level_to_offset_bits(level);
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
143 return (pfn + level_size(level) - 1) & level_mask(level);
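/*
 * Worked example (illustrative): at level 2, level_to_offset_bits() is 9,
 * so level_size(2) = 512 pfns (2MiB with 4KiB pages) and level_mask(2)
 * clears the low 9 bits; align_to_level(1000, 2) therefore rounds pfn 1000
 * up to 1024.
 */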
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
148 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152 are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
164 return mm_to_dma_pfn(page_to_pfn(pg));
166 static inline unsigned long virt_to_dma_pfn(void *p)
168 return page_to_dma_pfn(virt_to_page(p));
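/*
 * Illustrative note (not in the original source): on x86 with 4KiB MM pages
 * PAGE_SHIFT equals VTD_PAGE_SHIFT, so dma_to_mm_pfn() and mm_to_dma_pfn()
 * are identity conversions; they only shift when the MM page size is larger
 * than the 4KiB VT-d page size.
 */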
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
178  * set to 1 to panic the kernel if we can't successfully enable VT-d
179 * (used when kernel is launched w/ TXT)
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189 * if marked present.
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
193 if (!(re->lo & 1))
194 return 0;
196 return re->lo & VTD_PAGE_MASK;
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201 * if marked present.
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
205 if (!(re->hi & 1))
206 return 0;
208 return re->hi & VTD_PAGE_MASK;
211 static inline void context_clear_pasid_enable(struct context_entry *context)
213 context->lo &= ~(1ULL << 11);
216 static inline bool context_pasid_enabled(struct context_entry *context)
218 return !!(context->lo & (1ULL << 11));
221 static inline void context_set_copied(struct context_entry *context)
223 context->hi |= (1ull << 3);
226 static inline bool context_copied(struct context_entry *context)
228 return !!(context->hi & (1ULL << 3));
231 static inline bool __context_present(struct context_entry *context)
233 return (context->lo & 1);
236 bool context_present(struct context_entry *context)
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
243 static inline void context_set_present(struct context_entry *context)
245 context->lo |= 1;
248 static inline void context_set_fault_enable(struct context_entry *context)
250 context->lo &= (((u64)-1) << 2) | 1;
253 static inline void context_set_translation_type(struct context_entry *context,
254 unsigned long value)
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
260 static inline void context_set_address_root(struct context_entry *context,
261 unsigned long value)
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
267 static inline void context_set_address_width(struct context_entry *context,
268 unsigned long value)
270 context->hi |= value & 7;
273 static inline void context_set_domain_id(struct context_entry *context,
274 unsigned long value)
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
279 static inline int context_domain_id(struct context_entry *c)
281 return((c->hi >> 8) & 0xffff);
284 static inline void context_clear_entry(struct context_entry *context)
286 context->lo = 0;
287 context->hi = 0;
291 * This domain is a statically identity mapping domain.
292  * 1. This domain creates a static 1:1 mapping to all usable memory.
293  * 2. It maps to each iommu if successful.
294  * 3. Each iommu maps to this domain if successful.
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
299 /* si_domain contains multiple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY BIT(0)
303 * This is a DMA domain allocated through the iommu domain allocation
304 * interface. But one or more devices belonging to this domain have
305  * been chosen to use a private domain. We should avoid using the
306 * map/unmap/iova_to_phys APIs on it.
308 #define DOMAIN_FLAG_LOSE_CHILDREN BIT(1)
311 * When VT-d works in the scalable mode, it allows DMA translation to
312 * happen through either first level or second level page table. This
313 * bit marks that the DMA translation for the domain goes through the
314 * first level page table, otherwise, it goes through the second level.
316 #define DOMAIN_FLAG_USE_FIRST_LEVEL BIT(2)
319 * Domain represents a virtual machine which demands iommu nested
320 * translation mode support.
322 #define DOMAIN_FLAG_NESTING_MODE BIT(3)
324 #define for_each_domain_iommu(idx, domain) \
325 for (idx = 0; idx < g_num_of_iommus; idx++) \
326 if (domain->iommu_refcnt[idx])
328 struct dmar_rmrr_unit {
329 struct list_head list; /* list of rmrr units */
330 struct acpi_dmar_header *hdr; /* ACPI header */
331 u64 base_address; /* reserved base address*/
332 u64 end_address; /* reserved end address */
333 struct dmar_dev_scope *devices; /* target devices */
334 int devices_cnt; /* target device count */
337 struct dmar_atsr_unit {
338 struct list_head list; /* list of ATSR units */
339 struct acpi_dmar_header *hdr; /* ACPI header */
340 struct dmar_dev_scope *devices; /* target devices */
341 int devices_cnt; /* target device count */
342 u8 include_all:1; /* include all ports */
345 static LIST_HEAD(dmar_atsr_units);
346 static LIST_HEAD(dmar_rmrr_units);
348 #define for_each_rmrr_units(rmrr) \
349 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
351 /* bitmap for indexing intel_iommus */
352 static int g_num_of_iommus;
354 static void domain_exit(struct dmar_domain *domain);
355 static void domain_remove_dev_info(struct dmar_domain *domain);
356 static void dmar_remove_one_dev_info(struct device *dev);
357 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
358 static void domain_context_clear(struct intel_iommu *iommu,
359 struct device *dev);
360 static int domain_detach_iommu(struct dmar_domain *domain,
361 struct intel_iommu *iommu);
362 static bool device_is_rmrr_locked(struct device *dev);
363 static int intel_iommu_attach_device(struct iommu_domain *domain,
364 struct device *dev);
365 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
366 dma_addr_t iova);
368 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
369 int dmar_disabled = 0;
370 #else
371 int dmar_disabled = 1;
372 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
374 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
375 int intel_iommu_sm = 1;
376 #else
377 int intel_iommu_sm;
378 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
380 int intel_iommu_enabled = 0;
381 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
383 static int dmar_map_gfx = 1;
384 static int dmar_forcedac;
385 static int intel_iommu_strict;
386 static int intel_iommu_superpage = 1;
387 static int iommu_identity_mapping;
388 static int intel_no_bounce;
390 #define IDENTMAP_GFX 2
391 #define IDENTMAP_AZALIA 4
393 int intel_iommu_gfx_mapped;
394 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
396 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
397 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
398 DEFINE_SPINLOCK(device_domain_lock);
399 static LIST_HEAD(device_domain_list);
401 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
402 to_pci_dev(d)->untrusted)
405 * Iterate over elements in device_domain_list and call the specified
406 * callback @fn against each element.
408 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
409 void *data), void *data)
411 int ret = 0;
412 unsigned long flags;
413 struct device_domain_info *info;
415 spin_lock_irqsave(&device_domain_lock, flags);
416 list_for_each_entry(info, &device_domain_list, global) {
417 ret = fn(info, data);
418 if (ret) {
419 spin_unlock_irqrestore(&device_domain_lock, flags);
420 return ret;
423 spin_unlock_irqrestore(&device_domain_lock, flags);
425 return 0;
428 const struct iommu_ops intel_iommu_ops;
430 static bool translation_pre_enabled(struct intel_iommu *iommu)
432 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
435 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
437 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
440 static void init_translation_status(struct intel_iommu *iommu)
442 u32 gsts;
444 gsts = readl(iommu->reg + DMAR_GSTS_REG);
445 if (gsts & DMA_GSTS_TES)
446 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
449 /* Convert a generic 'struct iommu_domain' to a private 'struct dmar_domain' */
450 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
452 return container_of(dom, struct dmar_domain, domain);
455 static int __init intel_iommu_setup(char *str)
457 if (!str)
458 return -EINVAL;
459 while (*str) {
460 if (!strncmp(str, "on", 2)) {
461 dmar_disabled = 0;
462 pr_info("IOMMU enabled\n");
463 } else if (!strncmp(str, "off", 3)) {
464 dmar_disabled = 1;
465 no_platform_optin = 1;
466 pr_info("IOMMU disabled\n");
467 } else if (!strncmp(str, "igfx_off", 8)) {
468 dmar_map_gfx = 0;
469 pr_info("Disable GFX device mapping\n");
470 } else if (!strncmp(str, "forcedac", 8)) {
471 pr_info("Forcing DAC for PCI devices\n");
472 dmar_forcedac = 1;
473 } else if (!strncmp(str, "strict", 6)) {
474 pr_info("Disable batched IOTLB flush\n");
475 intel_iommu_strict = 1;
476 } else if (!strncmp(str, "sp_off", 6)) {
477 pr_info("Disable supported super page\n");
478 intel_iommu_superpage = 0;
479 } else if (!strncmp(str, "sm_on", 5)) {
480 pr_info("Intel-IOMMU: scalable mode supported\n");
481 intel_iommu_sm = 1;
482 } else if (!strncmp(str, "tboot_noforce", 13)) {
483 printk(KERN_INFO
484 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
485 intel_iommu_tboot_noforce = 1;
486 } else if (!strncmp(str, "nobounce", 8)) {
487 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
488 intel_no_bounce = 1;
491 str += strcspn(str, ",");
492 while (*str == ',')
493 str++;
495 return 0;
497 __setup("intel_iommu=", intel_iommu_setup);
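/*
 * Usage example (illustrative): the parser above takes a comma-separated
 * list of options on the kernel command line, e.g.
 *
 *     intel_iommu=on,sm_on,strict
 *
 * which enables the IOMMU, turns on scalable mode and disables batched
 * IOTLB flushing.
 */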
499 static struct kmem_cache *iommu_domain_cache;
500 static struct kmem_cache *iommu_devinfo_cache;
502 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
504 struct dmar_domain **domains;
505 int idx = did >> 8;
507 domains = iommu->domains[idx];
508 if (!domains)
509 return NULL;
511 return domains[did & 0xff];
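/*
 * Illustrative note (not in the original source): iommu->domains is a
 * two-level table indexed by domain id.  For did = 0x1234 the lookup above
 * uses top-level slot 0x12 and, within that 256-entry chunk, slot 0x34;
 * set_iommu_domain() below allocates the second-level chunk on demand.
 */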
514 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
515 struct dmar_domain *domain)
517 struct dmar_domain **domains;
518 int idx = did >> 8;
520 if (!iommu->domains[idx]) {
521 size_t size = 256 * sizeof(struct dmar_domain *);
522 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
525 domains = iommu->domains[idx];
526 if (WARN_ON(!domains))
527 return;
528 else
529 domains[did & 0xff] = domain;
532 void *alloc_pgtable_page(int node)
534 struct page *page;
535 void *vaddr = NULL;
537 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
538 if (page)
539 vaddr = page_address(page);
540 return vaddr;
543 void free_pgtable_page(void *vaddr)
545 free_page((unsigned long)vaddr);
548 static inline void *alloc_domain_mem(void)
550 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
553 static void free_domain_mem(void *vaddr)
555 kmem_cache_free(iommu_domain_cache, vaddr);
558 static inline void * alloc_devinfo_mem(void)
560 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
563 static inline void free_devinfo_mem(void *vaddr)
565 kmem_cache_free(iommu_devinfo_cache, vaddr);
568 static inline int domain_type_is_si(struct dmar_domain *domain)
570 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
573 static inline bool domain_use_first_level(struct dmar_domain *domain)
575 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
578 static inline int domain_pfn_supported(struct dmar_domain *domain,
579 unsigned long pfn)
581 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
583 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
586 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
588 unsigned long sagaw;
589 int agaw = -1;
591 sagaw = cap_sagaw(iommu->cap);
592 for (agaw = width_to_agaw(max_gaw);
593 agaw >= 0; agaw--) {
594 if (test_bit(agaw, &sagaw))
595 break;
598 return agaw;
602 * Calculate max SAGAW for each iommu.
604 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
606 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
610 * calculate agaw for each iommu.
611 * "SAGAW" may be different across iommus, use a default agaw, and
612 * get a supported less agaw for iommus that don't support the default agaw.
614 int iommu_calculate_agaw(struct intel_iommu *iommu)
616 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
619 /* This function only returns a single iommu in a domain */
620 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
622 int iommu_id;
624 /* si_domain and vm domain should not get here. */
625 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
626 return NULL;
628 for_each_domain_iommu(iommu_id, domain)
629 break;
631 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
632 return NULL;
634 return g_iommus[iommu_id];
637 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
639 return sm_supported(iommu) ?
640 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
643 static void domain_update_iommu_coherency(struct dmar_domain *domain)
645 struct dmar_drhd_unit *drhd;
646 struct intel_iommu *iommu;
647 bool found = false;
648 int i;
650 domain->iommu_coherency = 1;
652 for_each_domain_iommu(i, domain) {
653 found = true;
654 if (!iommu_paging_structure_coherency(g_iommus[i])) {
655 domain->iommu_coherency = 0;
656 break;
659 if (found)
660 return;
662 /* No hardware attached; use lowest common denominator */
663 rcu_read_lock();
664 for_each_active_iommu(iommu, drhd) {
665 if (!iommu_paging_structure_coherency(iommu)) {
666 domain->iommu_coherency = 0;
667 break;
670 rcu_read_unlock();
673 static int domain_update_iommu_snooping(struct intel_iommu *skip)
675 struct dmar_drhd_unit *drhd;
676 struct intel_iommu *iommu;
677 int ret = 1;
679 rcu_read_lock();
680 for_each_active_iommu(iommu, drhd) {
681 if (iommu != skip) {
682 if (!ecap_sc_support(iommu->ecap)) {
683 ret = 0;
684 break;
688 rcu_read_unlock();
690 return ret;
693 static int domain_update_iommu_superpage(struct dmar_domain *domain,
694 struct intel_iommu *skip)
696 struct dmar_drhd_unit *drhd;
697 struct intel_iommu *iommu;
698 int mask = 0x3;
700 if (!intel_iommu_superpage) {
701 return 0;
704 /* set iommu_superpage to the smallest common denominator */
705 rcu_read_lock();
706 for_each_active_iommu(iommu, drhd) {
707 if (iommu != skip) {
708 if (domain && domain_use_first_level(domain)) {
709 if (!cap_fl1gp_support(iommu->cap))
710 mask = 0x1;
711 } else {
712 mask &= cap_super_page_val(iommu->cap);
715 if (!mask)
716 break;
719 rcu_read_unlock();
721 return fls(mask);
724 /* Some capabilities may be different across iommus */
725 static void domain_update_iommu_cap(struct dmar_domain *domain)
727 domain_update_iommu_coherency(domain);
728 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
729 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
732 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
733 u8 devfn, int alloc)
735 struct root_entry *root = &iommu->root_entry[bus];
736 struct context_entry *context;
737 u64 *entry;
739 entry = &root->lo;
740 if (sm_supported(iommu)) {
741 if (devfn >= 0x80) {
742 devfn -= 0x80;
743 entry = &root->hi;
745 devfn *= 2;
747 if (*entry & 1)
748 context = phys_to_virt(*entry & VTD_PAGE_MASK);
749 else {
750 unsigned long phy_addr;
751 if (!alloc)
752 return NULL;
754 context = alloc_pgtable_page(iommu->node);
755 if (!context)
756 return NULL;
758 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
759 phy_addr = virt_to_phys((void *)context);
760 *entry = phy_addr | 1;
761 __iommu_flush_cache(iommu, entry, sizeof(*entry));
763 return &context[devfn];
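/*
 * Worked example (illustrative): in scalable mode each root entry is split
 * into a lower (devfn 0x00-0x7f) and an upper (devfn 0x80-0xff) context
 * table pointer, and a scalable-mode context entry is twice the size of a
 * legacy one, hence the devfn *= 2 above.  E.g. devfn 0x85 uses root->hi
 * and lands at index (0x85 - 0x80) * 2 = 10 of that context table.
 */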
766 static int iommu_dummy(struct device *dev)
768 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
771 static bool attach_deferred(struct device *dev)
773 return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
777 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
778 * sub-hierarchy of a candidate PCI-PCI bridge
779 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
780 * @bridge: the candidate PCI-PCI bridge
782 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
784 static bool
785 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
787 struct pci_dev *pdev, *pbridge;
789 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
790 return false;
792 pdev = to_pci_dev(dev);
793 pbridge = to_pci_dev(bridge);
795 if (pbridge->subordinate &&
796 pbridge->subordinate->number <= pdev->bus->number &&
797 pbridge->subordinate->busn_res.end >= pdev->bus->number)
798 return true;
800 return false;
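/*
 * Worked example (illustrative): for a PCI-PCI bridge whose secondary bus
 * is 3 and whose downstream bus range extends to 7 (subordinate->number = 3,
 * busn_res.end = 7), a device sitting on bus 5 satisfies the check above
 * and is reported as downstream of that bridge.
 */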
803 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
805 struct dmar_drhd_unit *drhd = NULL;
806 struct intel_iommu *iommu;
807 struct device *tmp;
808 struct pci_dev *pdev = NULL;
809 u16 segment = 0;
810 int i;
812 if (iommu_dummy(dev))
813 return NULL;
815 if (dev_is_pci(dev)) {
816 struct pci_dev *pf_pdev;
818 pdev = pci_real_dma_dev(to_pci_dev(dev));
820 /* VFs aren't listed in scope tables; we need to look up
821 * the PF instead to find the IOMMU. */
822 pf_pdev = pci_physfn(pdev);
823 dev = &pf_pdev->dev;
824 segment = pci_domain_nr(pdev->bus);
825 } else if (has_acpi_companion(dev))
826 dev = &ACPI_COMPANION(dev)->dev;
828 rcu_read_lock();
829 for_each_active_iommu(iommu, drhd) {
830 if (pdev && segment != drhd->segment)
831 continue;
833 for_each_active_dev_scope(drhd->devices,
834 drhd->devices_cnt, i, tmp) {
835 if (tmp == dev) {
836 /* For a VF use its original BDF# not that of the PF
837 * which we used for the IOMMU lookup. Strictly speaking
838 * we could do this for all PCI devices; we only need to
839 * get the BDF# from the scope table for ACPI matches. */
840 if (pdev && pdev->is_virtfn)
841 goto got_pdev;
843 *bus = drhd->devices[i].bus;
844 *devfn = drhd->devices[i].devfn;
845 goto out;
848 if (is_downstream_to_pci_bridge(dev, tmp))
849 goto got_pdev;
852 if (pdev && drhd->include_all) {
853 got_pdev:
854 *bus = pdev->bus->number;
855 *devfn = pdev->devfn;
856 goto out;
859 iommu = NULL;
860 out:
861 rcu_read_unlock();
863 return iommu;
866 static void domain_flush_cache(struct dmar_domain *domain,
867 void *addr, int size)
869 if (!domain->iommu_coherency)
870 clflush_cache_range(addr, size);
873 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
875 struct context_entry *context;
876 int ret = 0;
877 unsigned long flags;
879 spin_lock_irqsave(&iommu->lock, flags);
880 context = iommu_context_addr(iommu, bus, devfn, 0);
881 if (context)
882 ret = context_present(context);
883 spin_unlock_irqrestore(&iommu->lock, flags);
884 return ret;
887 static void free_context_table(struct intel_iommu *iommu)
889 int i;
890 unsigned long flags;
891 struct context_entry *context;
893 spin_lock_irqsave(&iommu->lock, flags);
894 if (!iommu->root_entry) {
895 goto out;
897 for (i = 0; i < ROOT_ENTRY_NR; i++) {
898 context = iommu_context_addr(iommu, i, 0, 0);
899 if (context)
900 free_pgtable_page(context);
902 if (!sm_supported(iommu))
903 continue;
905 context = iommu_context_addr(iommu, i, 0x80, 0);
906 if (context)
907 free_pgtable_page(context);
910 free_pgtable_page(iommu->root_entry);
911 iommu->root_entry = NULL;
912 out:
913 spin_unlock_irqrestore(&iommu->lock, flags);
916 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
917 unsigned long pfn, int *target_level)
919 struct dma_pte *parent, *pte;
920 int level = agaw_to_level(domain->agaw);
921 int offset;
923 BUG_ON(!domain->pgd);
925 if (!domain_pfn_supported(domain, pfn))
926 /* Address beyond IOMMU's addressing capabilities. */
927 return NULL;
929 parent = domain->pgd;
931 while (1) {
932 void *tmp_page;
934 offset = pfn_level_offset(pfn, level);
935 pte = &parent[offset];
936 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
937 break;
938 if (level == *target_level)
939 break;
941 if (!dma_pte_present(pte)) {
942 uint64_t pteval;
944 tmp_page = alloc_pgtable_page(domain->nid);
946 if (!tmp_page)
947 return NULL;
949 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
950 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
951 if (domain_use_first_level(domain))
952 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
953 if (cmpxchg64(&pte->val, 0ULL, pteval))
954 /* Someone else set it while we were thinking; use theirs. */
955 free_pgtable_page(tmp_page);
956 else
957 domain_flush_cache(domain, pte, sizeof(*pte));
959 if (level == 1)
960 break;
962 parent = phys_to_virt(dma_pte_addr(pte));
963 level--;
966 if (!*target_level)
967 *target_level = level;
969 return pte;
972 /* return address's pte at specific level */
973 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
974 unsigned long pfn,
975 int level, int *large_page)
977 struct dma_pte *parent, *pte;
978 int total = agaw_to_level(domain->agaw);
979 int offset;
981 parent = domain->pgd;
982 while (level <= total) {
983 offset = pfn_level_offset(pfn, total);
984 pte = &parent[offset];
985 if (level == total)
986 return pte;
988 if (!dma_pte_present(pte)) {
989 *large_page = total;
990 break;
993 if (dma_pte_superpage(pte)) {
994 *large_page = total;
995 return pte;
998 parent = phys_to_virt(dma_pte_addr(pte));
999 total--;
1001 return NULL;
1004 /* clear last level pte; a tlb flush should follow */
1005 static void dma_pte_clear_range(struct dmar_domain *domain,
1006 unsigned long start_pfn,
1007 unsigned long last_pfn)
1009 unsigned int large_page;
1010 struct dma_pte *first_pte, *pte;
1012 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1013 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1014 BUG_ON(start_pfn > last_pfn);
1016 /* we don't need lock here; nobody else touches the iova range */
1017 do {
1018 large_page = 1;
1019 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1020 if (!pte) {
1021 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1022 continue;
1024 do {
1025 dma_clear_pte(pte);
1026 start_pfn += lvl_to_nr_pages(large_page);
1027 pte++;
1028 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1030 domain_flush_cache(domain, first_pte,
1031 (void *)pte - (void *)first_pte);
1033 } while (start_pfn && start_pfn <= last_pfn);
1036 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1037 int retain_level, struct dma_pte *pte,
1038 unsigned long pfn, unsigned long start_pfn,
1039 unsigned long last_pfn)
1041 pfn = max(start_pfn, pfn);
1042 pte = &pte[pfn_level_offset(pfn, level)];
1044 do {
1045 unsigned long level_pfn;
1046 struct dma_pte *level_pte;
1048 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1049 goto next;
1051 level_pfn = pfn & level_mask(level);
1052 level_pte = phys_to_virt(dma_pte_addr(pte));
1054 if (level > 2) {
1055 dma_pte_free_level(domain, level - 1, retain_level,
1056 level_pte, level_pfn, start_pfn,
1057 last_pfn);
1061 * Free the page table if we're below the level we want to
1062 * retain and the range covers the entire table.
1064 if (level < retain_level && !(start_pfn > level_pfn ||
1065 last_pfn < level_pfn + level_size(level) - 1)) {
1066 dma_clear_pte(pte);
1067 domain_flush_cache(domain, pte, sizeof(*pte));
1068 free_pgtable_page(level_pte);
1070 next:
1071 pfn += level_size(level);
1072 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1076 * clear last level (leaf) ptes and free page table pages below the
1077 * level we wish to keep intact.
1079 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1080 unsigned long start_pfn,
1081 unsigned long last_pfn,
1082 int retain_level)
1084 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1085 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1086 BUG_ON(start_pfn > last_pfn);
1088 dma_pte_clear_range(domain, start_pfn, last_pfn);
1090 /* We don't need lock here; nobody else touches the iova range */
1091 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1092 domain->pgd, 0, start_pfn, last_pfn);
1094 /* free pgd */
1095 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1096 free_pgtable_page(domain->pgd);
1097 domain->pgd = NULL;
1101 /* When a page at a given level is being unlinked from its parent, we don't
1102 need to *modify* it at all. All we need to do is make a list of all the
1103 pages which can be freed just as soon as we've flushed the IOTLB and we
1104 know the hardware page-walk will no longer touch them.
1105 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1106 be freed. */
1107 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1108 int level, struct dma_pte *pte,
1109 struct page *freelist)
1111 struct page *pg;
1113 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1114 pg->freelist = freelist;
1115 freelist = pg;
1117 if (level == 1)
1118 return freelist;
1120 pte = page_address(pg);
1121 do {
1122 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1123 freelist = dma_pte_list_pagetables(domain, level - 1,
1124 pte, freelist);
1125 pte++;
1126 } while (!first_pte_in_page(pte));
1128 return freelist;
1131 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1132 struct dma_pte *pte, unsigned long pfn,
1133 unsigned long start_pfn,
1134 unsigned long last_pfn,
1135 struct page *freelist)
1137 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1139 pfn = max(start_pfn, pfn);
1140 pte = &pte[pfn_level_offset(pfn, level)];
1142 do {
1143 unsigned long level_pfn;
1145 if (!dma_pte_present(pte))
1146 goto next;
1148 level_pfn = pfn & level_mask(level);
1150 /* If range covers entire pagetable, free it */
1151 if (start_pfn <= level_pfn &&
1152 last_pfn >= level_pfn + level_size(level) - 1) {
1153 /* These subordinate page tables are going away entirely. Don't
1154 bother to clear them; we're just going to *free* them. */
1155 if (level > 1 && !dma_pte_superpage(pte))
1156 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1158 dma_clear_pte(pte);
1159 if (!first_pte)
1160 first_pte = pte;
1161 last_pte = pte;
1162 } else if (level > 1) {
1163 /* Recurse down into a level that isn't *entirely* obsolete */
1164 freelist = dma_pte_clear_level(domain, level - 1,
1165 phys_to_virt(dma_pte_addr(pte)),
1166 level_pfn, start_pfn, last_pfn,
1167 freelist);
1169 next:
1170 pfn += level_size(level);
1171 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1173 if (first_pte)
1174 domain_flush_cache(domain, first_pte,
1175 (void *)++last_pte - (void *)first_pte);
1177 return freelist;
1180 /* We can't just free the pages because the IOMMU may still be walking
1181 the page tables, and may have cached the intermediate levels. The
1182 pages can only be freed after the IOTLB flush has been done. */
1183 static struct page *domain_unmap(struct dmar_domain *domain,
1184 unsigned long start_pfn,
1185 unsigned long last_pfn)
1187 struct page *freelist;
1189 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1190 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1191 BUG_ON(start_pfn > last_pfn);
1193 /* we don't need lock here; nobody else touches the iova range */
1194 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1195 domain->pgd, 0, start_pfn, last_pfn, NULL);
1197 /* free pgd */
1198 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1199 struct page *pgd_page = virt_to_page(domain->pgd);
1200 pgd_page->freelist = freelist;
1201 freelist = pgd_page;
1203 domain->pgd = NULL;
1206 return freelist;
1209 static void dma_free_pagelist(struct page *freelist)
1211 struct page *pg;
1213 while ((pg = freelist)) {
1214 freelist = pg->freelist;
1215 free_pgtable_page(page_address(pg));
1219 static void iova_entry_free(unsigned long data)
1221 struct page *freelist = (struct page *)data;
1223 dma_free_pagelist(freelist);
1226 /* iommu handling */
1227 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1229 struct root_entry *root;
1230 unsigned long flags;
1232 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1233 if (!root) {
1234 pr_err("Allocating root entry for %s failed\n",
1235 iommu->name);
1236 return -ENOMEM;
1239 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1241 spin_lock_irqsave(&iommu->lock, flags);
1242 iommu->root_entry = root;
1243 spin_unlock_irqrestore(&iommu->lock, flags);
1245 return 0;
1248 static void iommu_set_root_entry(struct intel_iommu *iommu)
1250 u64 addr;
1251 u32 sts;
1252 unsigned long flag;
1254 addr = virt_to_phys(iommu->root_entry);
1255 if (sm_supported(iommu))
1256 addr |= DMA_RTADDR_SMT;
1258 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1259 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1261 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1263 /* Make sure hardware complete it */
1264 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1265 readl, (sts & DMA_GSTS_RTPS), sts);
1267 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1270 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1272 u32 val;
1273 unsigned long flag;
1275 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1276 return;
1278 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1279 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1281 /* Make sure hardware complete it */
1282 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1283 readl, (!(val & DMA_GSTS_WBFS)), val);
1285 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1288 /* return value determines whether we need a write buffer flush */
1289 static void __iommu_flush_context(struct intel_iommu *iommu,
1290 u16 did, u16 source_id, u8 function_mask,
1291 u64 type)
1293 u64 val = 0;
1294 unsigned long flag;
1296 switch (type) {
1297 case DMA_CCMD_GLOBAL_INVL:
1298 val = DMA_CCMD_GLOBAL_INVL;
1299 break;
1300 case DMA_CCMD_DOMAIN_INVL:
1301 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1302 break;
1303 case DMA_CCMD_DEVICE_INVL:
1304 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1305 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1306 break;
1307 default:
1308 BUG();
1310 val |= DMA_CCMD_ICC;
1312 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1313 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1315 /* Make sure hardware complete it */
1316 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1317 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1319 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1322 /* return value determines whether we need a write buffer flush */
1323 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1324 u64 addr, unsigned int size_order, u64 type)
1326 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1327 u64 val = 0, val_iva = 0;
1328 unsigned long flag;
1330 switch (type) {
1331 case DMA_TLB_GLOBAL_FLUSH:
1332 /* global flush doesn't need to set IVA_REG */
1333 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1334 break;
1335 case DMA_TLB_DSI_FLUSH:
1336 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1337 break;
1338 case DMA_TLB_PSI_FLUSH:
1339 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1340 /* IH bit is passed in as part of address */
1341 val_iva = size_order | addr;
1342 break;
1343 default:
1344 BUG();
1346 /* Note: set drain read/write */
1347 #if 0
1349  * This is probably just to be extra safe. Looks like we can
1350 * ignore it without any impact.
1352 if (cap_read_drain(iommu->cap))
1353 val |= DMA_TLB_READ_DRAIN;
1354 #endif
1355 if (cap_write_drain(iommu->cap))
1356 val |= DMA_TLB_WRITE_DRAIN;
1358 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1359 /* Note: Only uses first TLB reg currently */
1360 if (val_iva)
1361 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1362 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1364 /* Make sure hardware complete it */
1365 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1366 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1368 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1370 /* check IOTLB invalidation granularity */
1371 if (DMA_TLB_IAIG(val) == 0)
1372 pr_err("Flush IOTLB failed\n");
1373 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1374 pr_debug("TLB flush request %Lx, actual %Lx\n",
1375 (unsigned long long)DMA_TLB_IIRG(type),
1376 (unsigned long long)DMA_TLB_IAIG(val));
1379 static struct device_domain_info *
1380 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1381 u8 bus, u8 devfn)
1383 struct device_domain_info *info;
1385 assert_spin_locked(&device_domain_lock);
1387 if (!iommu->qi)
1388 return NULL;
1390 list_for_each_entry(info, &domain->devices, link)
1391 if (info->iommu == iommu && info->bus == bus &&
1392 info->devfn == devfn) {
1393 if (info->ats_supported && info->dev)
1394 return info;
1395 break;
1398 return NULL;
1401 static void domain_update_iotlb(struct dmar_domain *domain)
1403 struct device_domain_info *info;
1404 bool has_iotlb_device = false;
1406 assert_spin_locked(&device_domain_lock);
1408 list_for_each_entry(info, &domain->devices, link) {
1409 struct pci_dev *pdev;
1411 if (!info->dev || !dev_is_pci(info->dev))
1412 continue;
1414 pdev = to_pci_dev(info->dev);
1415 if (pdev->ats_enabled) {
1416 has_iotlb_device = true;
1417 break;
1421 domain->has_iotlb_device = has_iotlb_device;
1424 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1426 struct pci_dev *pdev;
1428 assert_spin_locked(&device_domain_lock);
1430 if (!info || !dev_is_pci(info->dev))
1431 return;
1433 pdev = to_pci_dev(info->dev);
1434 /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1435 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1436 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1437 * reserved, which should be set to 0.
1439 if (!ecap_dit(info->iommu->ecap))
1440 info->pfsid = 0;
1441 else {
1442 struct pci_dev *pf_pdev;
1444 /* pdev will be returned if device is not a vf */
1445 pf_pdev = pci_physfn(pdev);
1446 info->pfsid = pci_dev_id(pf_pdev);
1449 #ifdef CONFIG_INTEL_IOMMU_SVM
1450 /* The PCIe spec, in its wisdom, declares that the behaviour of
1451 the device if you enable PASID support after ATS support is
1452 undefined. So always enable PASID support on devices which
1453 have it, even if we can't yet know if we're ever going to
1454 use it. */
1455 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1456 info->pasid_enabled = 1;
1458 if (info->pri_supported &&
1459 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1460 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1461 info->pri_enabled = 1;
1462 #endif
1463 if (!pdev->untrusted && info->ats_supported &&
1464 pci_ats_page_aligned(pdev) &&
1465 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1466 info->ats_enabled = 1;
1467 domain_update_iotlb(info->domain);
1468 info->ats_qdep = pci_ats_queue_depth(pdev);
1472 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1474 struct pci_dev *pdev;
1476 assert_spin_locked(&device_domain_lock);
1478 if (!dev_is_pci(info->dev))
1479 return;
1481 pdev = to_pci_dev(info->dev);
1483 if (info->ats_enabled) {
1484 pci_disable_ats(pdev);
1485 info->ats_enabled = 0;
1486 domain_update_iotlb(info->domain);
1488 #ifdef CONFIG_INTEL_IOMMU_SVM
1489 if (info->pri_enabled) {
1490 pci_disable_pri(pdev);
1491 info->pri_enabled = 0;
1493 if (info->pasid_enabled) {
1494 pci_disable_pasid(pdev);
1495 info->pasid_enabled = 0;
1497 #endif
1500 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1501 u64 addr, unsigned mask)
1503 u16 sid, qdep;
1504 unsigned long flags;
1505 struct device_domain_info *info;
1507 if (!domain->has_iotlb_device)
1508 return;
1510 spin_lock_irqsave(&device_domain_lock, flags);
1511 list_for_each_entry(info, &domain->devices, link) {
1512 if (!info->ats_enabled)
1513 continue;
1515 sid = info->bus << 8 | info->devfn;
1516 qdep = info->ats_qdep;
1517 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1518 qdep, addr, mask);
1520 spin_unlock_irqrestore(&device_domain_lock, flags);
1523 static void domain_flush_piotlb(struct intel_iommu *iommu,
1524 struct dmar_domain *domain,
1525 u64 addr, unsigned long npages, bool ih)
1527 u16 did = domain->iommu_did[iommu->seq_id];
1529 if (domain->default_pasid)
1530 qi_flush_piotlb(iommu, did, domain->default_pasid,
1531 addr, npages, ih);
1533 if (!list_empty(&domain->devices))
1534 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1537 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1538 struct dmar_domain *domain,
1539 unsigned long pfn, unsigned int pages,
1540 int ih, int map)
1542 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1543 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1544 u16 did = domain->iommu_did[iommu->seq_id];
1546 BUG_ON(pages == 0);
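/*
 * Worked example (illustrative): for pages = 5, __roundup_pow_of_two(5) = 8,
 * so mask = 3 and the page-selective invalidation below covers 8 pages;
 * addr is expected to be naturally aligned to that size.
 */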
1548 if (ih)
1549 ih = 1 << 6;
1551 if (domain_use_first_level(domain)) {
1552 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1553 } else {
1555 * Fallback to domain selective flush if no PSI support or
1556 * the size is too big. PSI requires page size to be 2 ^ x,
1557 * and the base address is naturally aligned to the size.
1559 if (!cap_pgsel_inv(iommu->cap) ||
1560 mask > cap_max_amask_val(iommu->cap))
1561 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1562 DMA_TLB_DSI_FLUSH);
1563 else
1564 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1565 DMA_TLB_PSI_FLUSH);
1569 * In caching mode, changes of pages from non-present to present require
1570 * flush. However, device IOTLB doesn't need to be flushed in this case.
1572 if (!cap_caching_mode(iommu->cap) || !map)
1573 iommu_flush_dev_iotlb(domain, addr, mask);
1576 /* Notification for newly created mappings */
1577 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1578 struct dmar_domain *domain,
1579 unsigned long pfn, unsigned int pages)
1582  * It's a non-present to present mapping. Only flush if caching mode is
1583  * enabled and the domain uses second-level translation.
1585 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1586 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1587 else
1588 iommu_flush_write_buffer(iommu);
1591 static void iommu_flush_iova(struct iova_domain *iovad)
1593 struct dmar_domain *domain;
1594 int idx;
1596 domain = container_of(iovad, struct dmar_domain, iovad);
1598 for_each_domain_iommu(idx, domain) {
1599 struct intel_iommu *iommu = g_iommus[idx];
1600 u16 did = domain->iommu_did[iommu->seq_id];
1602 if (domain_use_first_level(domain))
1603 domain_flush_piotlb(iommu, domain, 0, -1, 0);
1604 else
1605 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1606 DMA_TLB_DSI_FLUSH);
1608 if (!cap_caching_mode(iommu->cap))
1609 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1610 0, MAX_AGAW_PFN_WIDTH);
1614 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1616 u32 pmen;
1617 unsigned long flags;
1619 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1620 return;
1622 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1623 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1624 pmen &= ~DMA_PMEN_EPM;
1625 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1627 /* wait for the protected region status bit to clear */
1628 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1629 readl, !(pmen & DMA_PMEN_PRS), pmen);
1631 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1634 static void iommu_enable_translation(struct intel_iommu *iommu)
1636 u32 sts;
1637 unsigned long flags;
1639 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1640 iommu->gcmd |= DMA_GCMD_TE;
1641 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1643 /* Make sure hardware complete it */
1644 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1645 readl, (sts & DMA_GSTS_TES), sts);
1647 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1650 static void iommu_disable_translation(struct intel_iommu *iommu)
1652 u32 sts;
1653 unsigned long flag;
1655 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1656 iommu->gcmd &= ~DMA_GCMD_TE;
1657 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1659 /* Make sure hardware complete it */
1660 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1661 readl, (!(sts & DMA_GSTS_TES)), sts);
1663 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1666 static int iommu_init_domains(struct intel_iommu *iommu)
1668 u32 ndomains, nlongs;
1669 size_t size;
1671 ndomains = cap_ndoms(iommu->cap);
1672 pr_debug("%s: Number of Domains supported <%d>\n",
1673 iommu->name, ndomains);
1674 nlongs = BITS_TO_LONGS(ndomains);
1676 spin_lock_init(&iommu->lock);
1678 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1679 if (!iommu->domain_ids) {
1680 pr_err("%s: Allocating domain id array failed\n",
1681 iommu->name);
1682 return -ENOMEM;
1685 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1686 iommu->domains = kzalloc(size, GFP_KERNEL);
1688 if (iommu->domains) {
1689 size = 256 * sizeof(struct dmar_domain *);
1690 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1693 if (!iommu->domains || !iommu->domains[0]) {
1694 pr_err("%s: Allocating domain array failed\n",
1695 iommu->name);
1696 kfree(iommu->domain_ids);
1697 kfree(iommu->domains);
1698 iommu->domain_ids = NULL;
1699 iommu->domains = NULL;
1700 return -ENOMEM;
1704 * If Caching mode is set, then invalid translations are tagged
1705 * with domain-id 0, hence we need to pre-allocate it. We also
1706 * use domain-id 0 as a marker for non-allocated domain-id, so
1707 * make sure it is not used for a real domain.
1709 set_bit(0, iommu->domain_ids);
1712 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1713 * entry for first-level or pass-through translation modes should
1714 * be programmed with a domain id different from those used for
1715 * second-level or nested translation. We reserve a domain id for
1716 * this purpose.
1718 if (sm_supported(iommu))
1719 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1721 return 0;
1724 static void disable_dmar_iommu(struct intel_iommu *iommu)
1726 struct device_domain_info *info, *tmp;
1727 unsigned long flags;
1729 if (!iommu->domains || !iommu->domain_ids)
1730 return;
1732 spin_lock_irqsave(&device_domain_lock, flags);
1733 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1734 if (info->iommu != iommu)
1735 continue;
1737 if (!info->dev || !info->domain)
1738 continue;
1740 __dmar_remove_one_dev_info(info);
1742 spin_unlock_irqrestore(&device_domain_lock, flags);
1744 if (iommu->gcmd & DMA_GCMD_TE)
1745 iommu_disable_translation(iommu);
1748 static void free_dmar_iommu(struct intel_iommu *iommu)
1750 if ((iommu->domains) && (iommu->domain_ids)) {
1751 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1752 int i;
1754 for (i = 0; i < elems; i++)
1755 kfree(iommu->domains[i]);
1756 kfree(iommu->domains);
1757 kfree(iommu->domain_ids);
1758 iommu->domains = NULL;
1759 iommu->domain_ids = NULL;
1762 g_iommus[iommu->seq_id] = NULL;
1764 /* free context mapping */
1765 free_context_table(iommu);
1767 #ifdef CONFIG_INTEL_IOMMU_SVM
1768 if (pasid_supported(iommu)) {
1769 if (ecap_prs(iommu->ecap))
1770 intel_svm_finish_prq(iommu);
1772 #endif
1776 * Check and return whether first level is used by default for
1777 * DMA translation.
1779 static bool first_level_by_default(void)
1781 struct dmar_drhd_unit *drhd;
1782 struct intel_iommu *iommu;
1783 static int first_level_support = -1;
1785 if (likely(first_level_support != -1))
1786 return first_level_support;
1788 first_level_support = 1;
1790 rcu_read_lock();
1791 for_each_active_iommu(iommu, drhd) {
1792 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1793 first_level_support = 0;
1794 break;
1797 rcu_read_unlock();
1799 return first_level_support;
1802 static struct dmar_domain *alloc_domain(int flags)
1804 struct dmar_domain *domain;
1806 domain = alloc_domain_mem();
1807 if (!domain)
1808 return NULL;
1810 memset(domain, 0, sizeof(*domain));
1811 domain->nid = NUMA_NO_NODE;
1812 domain->flags = flags;
1813 if (first_level_by_default())
1814 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1815 domain->has_iotlb_device = false;
1816 INIT_LIST_HEAD(&domain->devices);
1818 return domain;
1821 /* Must be called with iommu->lock */
1822 static int domain_attach_iommu(struct dmar_domain *domain,
1823 struct intel_iommu *iommu)
1825 unsigned long ndomains;
1826 int num;
1828 assert_spin_locked(&device_domain_lock);
1829 assert_spin_locked(&iommu->lock);
1831 domain->iommu_refcnt[iommu->seq_id] += 1;
1832 domain->iommu_count += 1;
1833 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1834 ndomains = cap_ndoms(iommu->cap);
1835 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1837 if (num >= ndomains) {
1838 pr_err("%s: No free domain ids\n", iommu->name);
1839 domain->iommu_refcnt[iommu->seq_id] -= 1;
1840 domain->iommu_count -= 1;
1841 return -ENOSPC;
1844 set_bit(num, iommu->domain_ids);
1845 set_iommu_domain(iommu, num, domain);
1847 domain->iommu_did[iommu->seq_id] = num;
1848 domain->nid = iommu->node;
1850 domain_update_iommu_cap(domain);
1853 return 0;
1856 static int domain_detach_iommu(struct dmar_domain *domain,
1857 struct intel_iommu *iommu)
1859 int num, count;
1861 assert_spin_locked(&device_domain_lock);
1862 assert_spin_locked(&iommu->lock);
1864 domain->iommu_refcnt[iommu->seq_id] -= 1;
1865 count = --domain->iommu_count;
1866 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1867 num = domain->iommu_did[iommu->seq_id];
1868 clear_bit(num, iommu->domain_ids);
1869 set_iommu_domain(iommu, num, NULL);
1871 domain_update_iommu_cap(domain);
1872 domain->iommu_did[iommu->seq_id] = 0;
1875 return count;
1878 static struct iova_domain reserved_iova_list;
1879 static struct lock_class_key reserved_rbtree_key;
1881 static int dmar_init_reserved_ranges(void)
1883 struct pci_dev *pdev = NULL;
1884 struct iova *iova;
1885 int i;
1887 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1889 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1890 &reserved_rbtree_key);
1892 /* IOAPIC ranges shouldn't be accessed by DMA */
1893 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1894 IOVA_PFN(IOAPIC_RANGE_END));
1895 if (!iova) {
1896 pr_err("Reserve IOAPIC range failed\n");
1897 return -ENODEV;
1900 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1901 for_each_pci_dev(pdev) {
1902 struct resource *r;
1904 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1905 r = &pdev->resource[i];
1906 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1907 continue;
1908 iova = reserve_iova(&reserved_iova_list,
1909 IOVA_PFN(r->start),
1910 IOVA_PFN(r->end));
1911 if (!iova) {
1912 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1913 return -ENODEV;
1917 return 0;
1920 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1922 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1925 static inline int guestwidth_to_adjustwidth(int gaw)
1927 int agaw;
1928 int r = (gaw - 12) % 9;
1930 if (r == 0)
1931 agaw = gaw;
1932 else
1933 agaw = gaw + 9 - r;
1934 if (agaw > 64)
1935 agaw = 64;
1936 return agaw;
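/*
 * Worked example (illustrative): the function rounds the guest width up so
 * that (gaw - 12) is a multiple of the 9-bit level stride.  gaw = 48 gives
 * r = 0 and agaw = 48; gaw = 36 gives r = (36 - 12) % 9 = 6 and
 * agaw = 36 + 9 - 6 = 39.
 */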
1939 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1940 int guest_width)
1942 int adjust_width, agaw;
1943 unsigned long sagaw;
1944 int ret;
1946 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1948 if (!intel_iommu_strict) {
1949 ret = init_iova_flush_queue(&domain->iovad,
1950 iommu_flush_iova, iova_entry_free);
1951 if (ret)
1952 pr_info("iova flush queue initialization failed\n");
1955 domain_reserve_special_ranges(domain);
1957 /* calculate AGAW */
1958 if (guest_width > cap_mgaw(iommu->cap))
1959 guest_width = cap_mgaw(iommu->cap);
1960 domain->gaw = guest_width;
1961 adjust_width = guestwidth_to_adjustwidth(guest_width);
1962 agaw = width_to_agaw(adjust_width);
1963 sagaw = cap_sagaw(iommu->cap);
1964 if (!test_bit(agaw, &sagaw)) {
1965 /* hardware doesn't support it, choose a bigger one */
1966 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1967 agaw = find_next_bit(&sagaw, 5, agaw);
1968 if (agaw >= 5)
1969 return -ENODEV;
1971 domain->agaw = agaw;
1973 if (ecap_coherent(iommu->ecap))
1974 domain->iommu_coherency = 1;
1975 else
1976 domain->iommu_coherency = 0;
1978 if (ecap_sc_support(iommu->ecap))
1979 domain->iommu_snooping = 1;
1980 else
1981 domain->iommu_snooping = 0;
1983 if (intel_iommu_superpage)
1984 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1985 else
1986 domain->iommu_superpage = 0;
1988 domain->nid = iommu->node;
1990 /* always allocate the top pgd */
1991 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1992 if (!domain->pgd)
1993 return -ENOMEM;
1994 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1995 return 0;
1998 static void domain_exit(struct dmar_domain *domain)
2001 /* Remove associated devices and clear attached or cached domains */
2002 domain_remove_dev_info(domain);
2004 /* destroy iovas */
2005 put_iova_domain(&domain->iovad);
2007 if (domain->pgd) {
2008 struct page *freelist;
2010 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2011 dma_free_pagelist(freelist);
2014 free_domain_mem(domain);
2018 * Get the PASID directory size for scalable mode context entry.
2019 * Value of X in the PDTS field of a scalable mode context entry
2020  * indicates a PASID directory with 2^(X + 7) entries.
2022 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2024 int pds, max_pde;
2026 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2027 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2028 if (pds < 7)
2029 return 0;
2031 return pds - 7;
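/*
 * Worked example (illustrative, assuming PASID_PDE_SHIFT is 6): for
 * table->max_pasid = 1 << 20, max_pde = 1 << 14, find_first_bit() returns
 * 14 and the function returns 14 - 7 = 7.  The context_pdts() macro below
 * then encodes a PASID directory with 2^(7 + 7) = 16384 entries, each
 * directory entry covering 64 PASID table entries, i.e. the full 2^20
 * PASID space.
 */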
2035 * Set the RID_PASID field of a scalable mode context entry. The
2036 * IOMMU hardware will use the PASID value set in this field for
2037 * DMA translations of DMA requests without PASID.
2039 static inline void
2040 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2042 context->hi |= pasid & ((1 << 20) - 1);
2046 * Set the DTE(Device-TLB Enable) field of a scalable mode context
2047 * entry.
2049 static inline void context_set_sm_dte(struct context_entry *context)
2051 context->lo |= (1 << 2);
2055 * Set the PRE(Page Request Enable) field of a scalable mode context
2056 * entry.
2058 static inline void context_set_sm_pre(struct context_entry *context)
2060 context->lo |= (1 << 4);
2063 /* Convert value to context PASID directory size field coding. */
2064 #define context_pdts(pds) (((pds) & 0x7) << 9)
2066 static int domain_context_mapping_one(struct dmar_domain *domain,
2067 struct intel_iommu *iommu,
2068 struct pasid_table *table,
2069 u8 bus, u8 devfn)
2071 u16 did = domain->iommu_did[iommu->seq_id];
2072 int translation = CONTEXT_TT_MULTI_LEVEL;
2073 struct device_domain_info *info = NULL;
2074 struct context_entry *context;
2075 unsigned long flags;
2076 int ret;
2078 WARN_ON(did == 0);
2080 if (hw_pass_through && domain_type_is_si(domain))
2081 translation = CONTEXT_TT_PASS_THROUGH;
2083 pr_debug("Set context mapping for %02x:%02x.%d\n",
2084 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2086 BUG_ON(!domain->pgd);
2088 spin_lock_irqsave(&device_domain_lock, flags);
2089 spin_lock(&iommu->lock);
2091 ret = -ENOMEM;
2092 context = iommu_context_addr(iommu, bus, devfn, 1);
2093 if (!context)
2094 goto out_unlock;
2096 ret = 0;
2097 if (context_present(context))
2098 goto out_unlock;
2101 * For kdump cases, old valid entries may be cached due to the
2102 * in-flight DMA and copied pgtable, but there is no unmapping
2103 * behaviour for them, thus we need an explicit cache flush for
2104 * the newly-mapped device. For kdump, at this point, the device
2105 * is supposed to finish reset at its driver probe stage, so no
2106  * in-flight DMA will exist, and we don't need to worry about it
2107  * hereafter.
2109 if (context_copied(context)) {
2110 u16 did_old = context_domain_id(context);
2112 if (did_old < cap_ndoms(iommu->cap)) {
2113 iommu->flush.flush_context(iommu, did_old,
2114 (((u16)bus) << 8) | devfn,
2115 DMA_CCMD_MASK_NOBIT,
2116 DMA_CCMD_DEVICE_INVL);
2117 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2118 DMA_TLB_DSI_FLUSH);
2122 context_clear_entry(context);
2124 if (sm_supported(iommu)) {
2125 unsigned long pds;
2127 WARN_ON(!table);
2129 /* Setup the PASID DIR pointer: */
2130 pds = context_get_sm_pds(table);
2131 context->lo = (u64)virt_to_phys(table->table) |
2132 context_pdts(pds);
2134 /* Setup the RID_PASID field: */
2135 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2138 * Setup the Device-TLB enable bit and Page request
2139 * Enable bit:
2141 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2142 if (info && info->ats_supported)
2143 context_set_sm_dte(context);
2144 if (info && info->pri_supported)
2145 context_set_sm_pre(context);
2146 } else {
2147 struct dma_pte *pgd = domain->pgd;
2148 int agaw;
2150 context_set_domain_id(context, did);
2152 if (translation != CONTEXT_TT_PASS_THROUGH) {
2154  * Skip top levels of page tables for an IOMMU that has
2155  * a smaller AGAW than the domain's default. Unnecessary for PT mode.
2157 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2158 ret = -ENOMEM;
2159 pgd = phys_to_virt(dma_pte_addr(pgd));
2160 if (!dma_pte_present(pgd))
2161 goto out_unlock;
2164 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2165 if (info && info->ats_supported)
2166 translation = CONTEXT_TT_DEV_IOTLB;
2167 else
2168 translation = CONTEXT_TT_MULTI_LEVEL;
2170 context_set_address_root(context, virt_to_phys(pgd));
2171 context_set_address_width(context, agaw);
2172 } else {
2174 * In pass through mode, AW must be programmed to
2175 * indicate the largest AGAW value supported by
2176 * hardware. And ASR is ignored by hardware.
2178 context_set_address_width(context, iommu->msagaw);
2181 context_set_translation_type(context, translation);
2184 context_set_fault_enable(context);
2185 context_set_present(context);
2186 if (!ecap_coherent(iommu->ecap))
2187 clflush_cache_range(context, sizeof(*context));
2190  * It's a non-present to present mapping. If the hardware doesn't
2191  * cache non-present entries we only need to flush the write-buffer.
2192  * If it _does_ cache non-present entries, then it does so in the special
2193 * domain #0, which we have to flush:
2195 if (cap_caching_mode(iommu->cap)) {
2196 iommu->flush.flush_context(iommu, 0,
2197 (((u16)bus) << 8) | devfn,
2198 DMA_CCMD_MASK_NOBIT,
2199 DMA_CCMD_DEVICE_INVL);
2200 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2201 } else {
2202 iommu_flush_write_buffer(iommu);
2204 iommu_enable_dev_iotlb(info);
2206 ret = 0;
2208 out_unlock:
2209 spin_unlock(&iommu->lock);
2210 spin_unlock_irqrestore(&device_domain_lock, flags);
2212 return ret;
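/*
 * Notes on domain_context_mapping_one() above: device_domain_lock is
 * taken before iommu->lock and both are released in reverse order.
 * When the IOMMU reports caching mode, even a non-present-to-present
 * transition must be followed by explicit context and IOTLB
 * invalidation; otherwise flushing the write buffer is sufficient.
 */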
2215 struct domain_context_mapping_data {
2216 struct dmar_domain *domain;
2217 struct intel_iommu *iommu;
2218 struct pasid_table *table;
2221 static int domain_context_mapping_cb(struct pci_dev *pdev,
2222 u16 alias, void *opaque)
2224 struct domain_context_mapping_data *data = opaque;
2226 return domain_context_mapping_one(data->domain, data->iommu,
2227 data->table, PCI_BUS_NUM(alias),
2228 alias & 0xff);
2231 static int
2232 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2234 struct domain_context_mapping_data data;
2235 struct pasid_table *table;
2236 struct intel_iommu *iommu;
2237 u8 bus, devfn;
2239 iommu = device_to_iommu(dev, &bus, &devfn);
2240 if (!iommu)
2241 return -ENODEV;
2243 table = intel_pasid_get_table(dev);
2245 if (!dev_is_pci(dev))
2246 return domain_context_mapping_one(domain, iommu, table,
2247 bus, devfn);
2249 data.domain = domain;
2250 data.iommu = iommu;
2251 data.table = table;
2253 return pci_for_each_dma_alias(to_pci_dev(dev),
2254 &domain_context_mapping_cb, &data);
2257 static int domain_context_mapped_cb(struct pci_dev *pdev,
2258 u16 alias, void *opaque)
2260 struct intel_iommu *iommu = opaque;
2262 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2265 static int domain_context_mapped(struct device *dev)
2267 struct intel_iommu *iommu;
2268 u8 bus, devfn;
2270 iommu = device_to_iommu(dev, &bus, &devfn);
2271 if (!iommu)
2272 return -ENODEV;
2274 if (!dev_is_pci(dev))
2275 return device_context_mapped(iommu, bus, devfn);
2277 return !pci_for_each_dma_alias(to_pci_dev(dev),
2278 domain_context_mapped_cb, iommu);
2281 /* Returns a number of VTD pages, but aligned to MM page size */
2282 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2283 size_t size)
2285 host_addr &= ~PAGE_MASK;
2286 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
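/*
 * Worked example (assuming 4KiB MM pages and 4KiB VT-d pages): for
 * host_addr = 0x1234 and size = 0x2000, the in-page offset is 0x234,
 * PAGE_ALIGN(0x234 + 0x2000) = 0x3000, so three VT-d pages are needed.
 */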
2289 /* Return largest possible superpage level for a given mapping */
2290 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2291 unsigned long iov_pfn,
2292 unsigned long phy_pfn,
2293 unsigned long pages)
2295 int support, level = 1;
2296 unsigned long pfnmerge;
2298 support = domain->iommu_superpage;
2300 /* To use a large page, the virtual *and* physical addresses
2301 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2302 of them will mean we have to use smaller pages. So just
2303 merge them and check both at once. */
2304 pfnmerge = iov_pfn | phy_pfn;
2306 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2307 pages >>= VTD_STRIDE_SHIFT;
2308 if (!pages)
2309 break;
2310 pfnmerge >>= VTD_STRIDE_SHIFT;
2311 level++;
2312 support--;
2314 return level;
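/*
 * Illustrative example for the loop above: with a 9-bit stride and
 * 4KiB base pages, if both iov_pfn and phy_pfn are multiples of 512
 * and at least 512 pages remain, the merged pfn has no low stride bits
 * set, so level 2 (a 2MiB superpage) becomes usable, provided the
 * domain advertises at least one level of superpage support.
 */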
2317 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2318 struct scatterlist *sg, unsigned long phys_pfn,
2319 unsigned long nr_pages, int prot)
2321 struct dma_pte *first_pte = NULL, *pte = NULL;
2322 phys_addr_t uninitialized_var(pteval);
2323 unsigned long sg_res = 0;
2324 unsigned int largepage_lvl = 0;
2325 unsigned long lvl_pages = 0;
2326 u64 attr;
2328 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2330 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2331 return -EINVAL;
2333 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2334 if (domain_use_first_level(domain))
2335 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2337 if (!sg) {
2338 sg_res = nr_pages;
2339 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2342 while (nr_pages > 0) {
2343 uint64_t tmp;
2345 if (!sg_res) {
2346 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2348 sg_res = aligned_nrpages(sg->offset, sg->length);
2349 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2350 sg->dma_length = sg->length;
2351 pteval = (sg_phys(sg) - pgoff) | attr;
2352 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2355 if (!pte) {
2356 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2358 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2359 if (!pte)
2360 return -ENOMEM;
2361 /* It is a large page */
2362 if (largepage_lvl > 1) {
2363 unsigned long nr_superpages, end_pfn;
2365 pteval |= DMA_PTE_LARGE_PAGE;
2366 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2368 nr_superpages = sg_res / lvl_pages;
2369 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2372 * Ensure that old small page tables are
2373 * removed to make room for superpage(s).
2374 * We're adding new large pages, so make sure
2375 * we don't remove their parent tables.
2377 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2378 largepage_lvl + 1);
2379 } else {
2380 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2384 /* We don't need a lock here; nobody else
2385  * touches the iova range.
2387 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2388 if (tmp) {
2389 static int dumps = 5;
2390 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2391 iov_pfn, tmp, (unsigned long long)pteval);
2392 if (dumps) {
2393 dumps--;
2394 debug_dma_dump_mappings(NULL);
2396 WARN_ON(1);
2399 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2401 BUG_ON(nr_pages < lvl_pages);
2402 BUG_ON(sg_res < lvl_pages);
2404 nr_pages -= lvl_pages;
2405 iov_pfn += lvl_pages;
2406 phys_pfn += lvl_pages;
2407 pteval += lvl_pages * VTD_PAGE_SIZE;
2408 sg_res -= lvl_pages;
2410 /* If the next PTE would be the first in a new page, then we
2411 need to flush the cache on the entries we've just written.
2412 And then we'll need to recalculate 'pte', so clear it and
2413 let it get set again in the if (!pte) block above.
2415 If we're done (!nr_pages) we need to flush the cache too.
2417 Also if we've been setting superpages, we may need to
2418 recalculate 'pte' and switch back to smaller pages for the
2419 end of the mapping, if the trailing size is not enough to
2420 use another superpage (i.e. sg_res < lvl_pages). */
2421 pte++;
2422 if (!nr_pages || first_pte_in_page(pte) ||
2423 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2424 domain_flush_cache(domain, first_pte,
2425 (void *)pte - (void *)first_pte);
2426 pte = NULL;
2429 if (!sg_res && nr_pages)
2430 sg = sg_next(sg);
2432 return 0;
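/*
 * Cache-flush rule used by __domain_mapping() above: accumulated PTEs
 * are flushed whenever the walk crosses into a new page of PTEs, when
 * the mapping is complete, or when a superpage run ends and the code
 * must recalculate 'pte' for smaller pages, so no written PTE page is
 * left unflushed on non-coherent hardware.
 */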
2435 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2436 struct scatterlist *sg, unsigned long phys_pfn,
2437 unsigned long nr_pages, int prot)
2439 int iommu_id, ret;
2440 struct intel_iommu *iommu;
2442 /* Do the real mapping first */
2443 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2444 if (ret)
2445 return ret;
2447 for_each_domain_iommu(iommu_id, domain) {
2448 iommu = g_iommus[iommu_id];
2449 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2452 return 0;
2455 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2456 struct scatterlist *sg, unsigned long nr_pages,
2457 int prot)
2459 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2462 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2463 unsigned long phys_pfn, unsigned long nr_pages,
2464 int prot)
2466 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2469 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2471 unsigned long flags;
2472 struct context_entry *context;
2473 u16 did_old;
2475 if (!iommu)
2476 return;
2478 spin_lock_irqsave(&iommu->lock, flags);
2479 context = iommu_context_addr(iommu, bus, devfn, 0);
2480 if (!context) {
2481 spin_unlock_irqrestore(&iommu->lock, flags);
2482 return;
2484 did_old = context_domain_id(context);
2485 context_clear_entry(context);
2486 __iommu_flush_cache(iommu, context, sizeof(*context));
2487 spin_unlock_irqrestore(&iommu->lock, flags);
2488 iommu->flush.flush_context(iommu,
2489 did_old,
2490 (((u16)bus) << 8) | devfn,
2491 DMA_CCMD_MASK_NOBIT,
2492 DMA_CCMD_DEVICE_INVL);
2493 iommu->flush.flush_iotlb(iommu,
2494 did_old,
2497 DMA_TLB_DSI_FLUSH);
2500 static inline void unlink_domain_info(struct device_domain_info *info)
2502 assert_spin_locked(&device_domain_lock);
2503 list_del(&info->link);
2504 list_del(&info->global);
2505 if (info->dev)
2506 info->dev->archdata.iommu = NULL;
2509 static void domain_remove_dev_info(struct dmar_domain *domain)
2511 struct device_domain_info *info, *tmp;
2512 unsigned long flags;
2514 spin_lock_irqsave(&device_domain_lock, flags);
2515 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2516 __dmar_remove_one_dev_info(info);
2517 spin_unlock_irqrestore(&device_domain_lock, flags);
2520 struct dmar_domain *find_domain(struct device *dev)
2522 struct device_domain_info *info;
2524 if (unlikely(attach_deferred(dev) || iommu_dummy(dev)))
2525 return NULL;
2527 /* No lock here, assumes no domain exit in normal case */
2528 info = dev->archdata.iommu;
2529 if (likely(info))
2530 return info->domain;
2532 return NULL;
2535 static void do_deferred_attach(struct device *dev)
2537 struct iommu_domain *domain;
2539 dev->archdata.iommu = NULL;
2540 domain = iommu_get_domain_for_dev(dev);
2541 if (domain)
2542 intel_iommu_attach_device(domain, dev);
2545 static inline struct device_domain_info *
2546 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2548 struct device_domain_info *info;
2550 list_for_each_entry(info, &device_domain_list, global)
2551 if (info->segment == segment && info->bus == bus &&
2552 info->devfn == devfn)
2553 return info;
2555 return NULL;
2558 static int domain_setup_first_level(struct intel_iommu *iommu,
2559 struct dmar_domain *domain,
2560 struct device *dev,
2561 int pasid)
2563 int flags = PASID_FLAG_SUPERVISOR_MODE;
2564 struct dma_pte *pgd = domain->pgd;
2565 int agaw, level;
2568  * Skip top levels of page tables for an IOMMU that has
2569  * a smaller AGAW than the domain's default. Unnecessary for PT mode.
2571 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2572 pgd = phys_to_virt(dma_pte_addr(pgd));
2573 if (!dma_pte_present(pgd))
2574 return -ENOMEM;
2577 level = agaw_to_level(agaw);
2578 if (level != 4 && level != 5)
2579 return -EINVAL;
2581 flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2583 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2584 domain->iommu_did[iommu->seq_id],
2585 flags);
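/*
 * Note on the level check above: first-level translation is only
 * defined for 4-level and 5-level paging structures (48-bit and 57-bit
 * input addresses respectively); PASID_FLAG_FL5LP asks the PASID entry
 * setup to use the 5-level format.
 */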
2588 static bool dev_is_real_dma_subdevice(struct device *dev)
2590 return dev && dev_is_pci(dev) &&
2591 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2594 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2595 int bus, int devfn,
2596 struct device *dev,
2597 struct dmar_domain *domain)
2599 struct dmar_domain *found = NULL;
2600 struct device_domain_info *info;
2601 unsigned long flags;
2602 int ret;
2604 info = alloc_devinfo_mem();
2605 if (!info)
2606 return NULL;
2608 if (!dev_is_real_dma_subdevice(dev)) {
2609 info->bus = bus;
2610 info->devfn = devfn;
2611 info->segment = iommu->segment;
2612 } else {
2613 struct pci_dev *pdev = to_pci_dev(dev);
2615 info->bus = pdev->bus->number;
2616 info->devfn = pdev->devfn;
2617 info->segment = pci_domain_nr(pdev->bus);
2620 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2621 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2622 info->ats_qdep = 0;
2623 info->dev = dev;
2624 info->domain = domain;
2625 info->iommu = iommu;
2626 info->pasid_table = NULL;
2627 info->auxd_enabled = 0;
2628 INIT_LIST_HEAD(&info->auxiliary_domains);
2630 if (dev && dev_is_pci(dev)) {
2631 struct pci_dev *pdev = to_pci_dev(info->dev);
2633 if (!pdev->untrusted &&
2634 !pci_ats_disabled() &&
2635 ecap_dev_iotlb_support(iommu->ecap) &&
2636 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2637 dmar_find_matched_atsr_unit(pdev))
2638 info->ats_supported = 1;
2640 if (sm_supported(iommu)) {
2641 if (pasid_supported(iommu)) {
2642 int features = pci_pasid_features(pdev);
2643 if (features >= 0)
2644 info->pasid_supported = features | 1;
2647 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2648 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2649 info->pri_supported = 1;
2653 spin_lock_irqsave(&device_domain_lock, flags);
2654 if (dev)
2655 found = find_domain(dev);
2657 if (!found) {
2658 struct device_domain_info *info2;
2659 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2660 info->devfn);
2661 if (info2) {
2662 found = info2->domain;
2663 info2->dev = dev;
2667 if (found) {
2668 spin_unlock_irqrestore(&device_domain_lock, flags);
2669 free_devinfo_mem(info);
2670 /* Caller must free the original domain */
2671 return found;
2674 spin_lock(&iommu->lock);
2675 ret = domain_attach_iommu(domain, iommu);
2676 spin_unlock(&iommu->lock);
2678 if (ret) {
2679 spin_unlock_irqrestore(&device_domain_lock, flags);
2680 free_devinfo_mem(info);
2681 return NULL;
2684 list_add(&info->link, &domain->devices);
2685 list_add(&info->global, &device_domain_list);
2686 if (dev)
2687 dev->archdata.iommu = info;
2688 spin_unlock_irqrestore(&device_domain_lock, flags);
2690 /* PASID table is mandatory for a PCI device in scalable mode. */
2691 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2692 ret = intel_pasid_alloc_table(dev);
2693 if (ret) {
2694 dev_err(dev, "PASID table allocation failed\n");
2695 dmar_remove_one_dev_info(dev);
2696 return NULL;
2699 /* Setup the PASID entry for requests without PASID: */
2700 spin_lock(&iommu->lock);
2701 if (hw_pass_through && domain_type_is_si(domain))
2702 ret = intel_pasid_setup_pass_through(iommu, domain,
2703 dev, PASID_RID2PASID);
2704 else if (domain_use_first_level(domain))
2705 ret = domain_setup_first_level(iommu, domain, dev,
2706 PASID_RID2PASID);
2707 else
2708 ret = intel_pasid_setup_second_level(iommu, domain,
2709 dev, PASID_RID2PASID);
2710 spin_unlock(&iommu->lock);
2711 if (ret) {
2712 dev_err(dev, "Setup RID2PASID failed\n");
2713 dmar_remove_one_dev_info(dev);
2714 return NULL;
2718 if (dev && domain_context_mapping(domain, dev)) {
2719 dev_err(dev, "Domain context map failed\n");
2720 dmar_remove_one_dev_info(dev);
2721 return NULL;
2724 return domain;
2727 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2729 *(u16 *)opaque = alias;
2730 return 0;
2733 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2735 struct device_domain_info *info;
2736 struct dmar_domain *domain = NULL;
2737 struct intel_iommu *iommu;
2738 u16 dma_alias;
2739 unsigned long flags;
2740 u8 bus, devfn;
2742 iommu = device_to_iommu(dev, &bus, &devfn);
2743 if (!iommu)
2744 return NULL;
2746 if (dev_is_pci(dev)) {
2747 struct pci_dev *pdev = to_pci_dev(dev);
2749 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2751 spin_lock_irqsave(&device_domain_lock, flags);
2752 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2753 PCI_BUS_NUM(dma_alias),
2754 dma_alias & 0xff);
2755 if (info) {
2756 iommu = info->iommu;
2757 domain = info->domain;
2759 spin_unlock_irqrestore(&device_domain_lock, flags);
2761 /* DMA alias already has a domain, use it */
2762 if (info)
2763 goto out;
2766 /* Allocate and initialize new domain for the device */
2767 domain = alloc_domain(0);
2768 if (!domain)
2769 return NULL;
2770 if (domain_init(domain, iommu, gaw)) {
2771 domain_exit(domain);
2772 return NULL;
2775 out:
2776 return domain;
2779 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2780 struct dmar_domain *domain)
2782 struct intel_iommu *iommu;
2783 struct dmar_domain *tmp;
2784 u16 req_id, dma_alias;
2785 u8 bus, devfn;
2787 iommu = device_to_iommu(dev, &bus, &devfn);
2788 if (!iommu)
2789 return NULL;
2791 req_id = ((u16)bus << 8) | devfn;
2793 if (dev_is_pci(dev)) {
2794 struct pci_dev *pdev = to_pci_dev(dev);
2796 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2798 /* register PCI DMA alias device */
2799 if (req_id != dma_alias) {
2800 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2801 dma_alias & 0xff, NULL, domain);
2803 if (!tmp || tmp != domain)
2804 return tmp;
2808 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2809 if (!tmp || tmp != domain)
2810 return tmp;
2812 return domain;
2815 static int iommu_domain_identity_map(struct dmar_domain *domain,
2816 unsigned long long start,
2817 unsigned long long end)
2819 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2820 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2822 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2823 dma_to_mm_pfn(last_vpfn))) {
2824 pr_err("Reserving iova failed\n");
2825 return -ENOMEM;
2828 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2830  * The RMRR range might overlap with the physical memory range,
2831  * so clear it first.
2833 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2835 return __domain_mapping(domain, first_vpfn, NULL,
2836 first_vpfn, last_vpfn - first_vpfn + 1,
2837 DMA_PTE_READ|DMA_PTE_WRITE);
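/*
 * Illustrative example for the helper above (assuming 4KiB VT-d
 * pages): an RMRR covering 0xe0000-0xeffff gives first_vpfn = 0xe0 and
 * last_vpfn = 0xef, so sixteen pages are reserved in the iova
 * allocator and mapped 1:1 with read/write permission.
 */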
2840 static int domain_prepare_identity_map(struct device *dev,
2841 struct dmar_domain *domain,
2842 unsigned long long start,
2843 unsigned long long end)
2845 /* For _hardware_ passthrough, don't bother. But for software
2846 passthrough, we do it anyway -- it may indicate a memory
2847 range which is reserved in E820 and therefore didn't get set
2848 up to start with in si_domain */
2849 if (domain == si_domain && hw_pass_through) {
2850 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2851 start, end);
2852 return 0;
2855 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2857 if (end < start) {
2858 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2859 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2860 dmi_get_system_info(DMI_BIOS_VENDOR),
2861 dmi_get_system_info(DMI_BIOS_VERSION),
2862 dmi_get_system_info(DMI_PRODUCT_VERSION));
2863 return -EIO;
2866 if (end >> agaw_to_width(domain->agaw)) {
2867 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2868 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2869 agaw_to_width(domain->agaw),
2870 dmi_get_system_info(DMI_BIOS_VENDOR),
2871 dmi_get_system_info(DMI_BIOS_VERSION),
2872 dmi_get_system_info(DMI_PRODUCT_VERSION));
2873 return -EIO;
2876 return iommu_domain_identity_map(domain, start, end);
2879 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2881 static int __init si_domain_init(int hw)
2883 struct dmar_rmrr_unit *rmrr;
2884 struct device *dev;
2885 int i, nid, ret;
2887 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2888 if (!si_domain)
2889 return -EFAULT;
2891 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2892 domain_exit(si_domain);
2893 return -EFAULT;
2896 if (hw)
2897 return 0;
2899 for_each_online_node(nid) {
2900 unsigned long start_pfn, end_pfn;
2901 int i;
2903 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2904 ret = iommu_domain_identity_map(si_domain,
2905 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2906 if (ret)
2907 return ret;
2912 * Identity map the RMRRs so that devices with RMRRs could also use
2913 * the si_domain.
2915 for_each_rmrr_units(rmrr) {
2916 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2917 i, dev) {
2918 unsigned long long start = rmrr->base_address;
2919 unsigned long long end = rmrr->end_address;
2921 if (WARN_ON(end < start ||
2922 end >> agaw_to_width(si_domain->agaw)))
2923 continue;
2925 ret = iommu_domain_identity_map(si_domain, start, end);
2926 if (ret)
2927 return ret;
2931 return 0;
2934 static int identity_mapping(struct device *dev)
2936 struct device_domain_info *info;
2938 info = dev->archdata.iommu;
2939 if (info)
2940 return (info->domain == si_domain);
2942 return 0;
2945 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2947 struct dmar_domain *ndomain;
2948 struct intel_iommu *iommu;
2949 u8 bus, devfn;
2951 iommu = device_to_iommu(dev, &bus, &devfn);
2952 if (!iommu)
2953 return -ENODEV;
2955 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2956 if (ndomain != domain)
2957 return -EBUSY;
2959 return 0;
2962 static bool device_has_rmrr(struct device *dev)
2964 struct dmar_rmrr_unit *rmrr;
2965 struct device *tmp;
2966 int i;
2968 rcu_read_lock();
2969 for_each_rmrr_units(rmrr) {
2971 * Return TRUE if this RMRR contains the device that
2972 * is passed in.
2974 for_each_active_dev_scope(rmrr->devices,
2975 rmrr->devices_cnt, i, tmp)
2976 if (tmp == dev ||
2977 is_downstream_to_pci_bridge(dev, tmp)) {
2978 rcu_read_unlock();
2979 return true;
2982 rcu_read_unlock();
2983 return false;
2987 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2988  * is relaxable (i.e. is allowed to be not enforced under some conditions)
2989 * @dev: device handle
2991 * We assume that PCI USB devices with RMRRs have them largely
2992 * for historical reasons and that the RMRR space is not actively used post
2993 * boot. This exclusion may change if vendors begin to abuse it.
2995 * The same exception is made for graphics devices, with the requirement that
2996 * any use of the RMRR regions will be torn down before assigning the device
2997 * to a guest.
2999 * Return: true if the RMRR is relaxable, false otherwise
3001 static bool device_rmrr_is_relaxable(struct device *dev)
3003 struct pci_dev *pdev;
3005 if (!dev_is_pci(dev))
3006 return false;
3008 pdev = to_pci_dev(dev);
3009 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
3010 return true;
3011 else
3012 return false;
3016  * There are a couple of cases where we need to restrict the functionality of
3017 * devices associated with RMRRs. The first is when evaluating a device for
3018 * identity mapping because problems exist when devices are moved in and out
3019 * of domains and their respective RMRR information is lost. This means that
3020 * a device with associated RMRRs will never be in a "passthrough" domain.
3021 * The second is use of the device through the IOMMU API. This interface
3022 * expects to have full control of the IOVA space for the device. We cannot
3023 * satisfy both the requirement that RMRR access is maintained and have an
3024 * unencumbered IOVA space. We also have no ability to quiesce the device's
3025 * use of the RMRR space or even inform the IOMMU API user of the restriction.
3026 * We therefore prevent devices associated with an RMRR from participating in
3027 * the IOMMU API, which eliminates them from device assignment.
3029 * In both cases, devices which have relaxable RMRRs are not concerned by this
3030 * restriction. See device_rmrr_is_relaxable comment.
3032 static bool device_is_rmrr_locked(struct device *dev)
3034 if (!device_has_rmrr(dev))
3035 return false;
3037 if (device_rmrr_is_relaxable(dev))
3038 return false;
3040 return true;
3044 * Return the required default domain type for a specific device.
3046  * @dev: the device in question
3049  * Returns:
3050  * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
3051  * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
3052 * - 0: both identity and dynamic domains work for this device
3054 static int device_def_domain_type(struct device *dev)
3056 if (dev_is_pci(dev)) {
3057 struct pci_dev *pdev = to_pci_dev(dev);
3060 * Prevent any device marked as untrusted from getting
3061  * placed into the static identity mapping domain.
3063 if (pdev->untrusted)
3064 return IOMMU_DOMAIN_DMA;
3066 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
3067 return IOMMU_DOMAIN_IDENTITY;
3069 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
3070 return IOMMU_DOMAIN_IDENTITY;
3073 * We want to start off with all devices in the 1:1 domain, and
3074 * take them out later if we find they can't access all of memory.
3076 * However, we can't do this for PCI devices behind bridges,
3077 * because all PCI devices behind the same bridge will end up
3078 * with the same source-id on their transactions.
3080 * Practically speaking, we can't change things around for these
3081 * devices at run-time, because we can't be sure there'll be no
3082 * DMA transactions in flight for any of their siblings.
3084 * So PCI devices (unless they're on the root bus) as well as
3085 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
3086 * the 1:1 domain, just in _case_ one of their siblings turns out
3087 * not to be able to map all of memory.
3089 if (!pci_is_pcie(pdev)) {
3090 if (!pci_is_root_bus(pdev->bus))
3091 return IOMMU_DOMAIN_DMA;
3092 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
3093 return IOMMU_DOMAIN_DMA;
3094 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
3095 return IOMMU_DOMAIN_DMA;
3098 return 0;
3101 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3104  * Start from a sane iommu hardware state.
3105  * If queued invalidation was already initialized by us
3106  * (for example, while enabling interrupt remapping) then
3107  * things are already rolling from a sane state.
3109 if (!iommu->qi) {
3111 * Clear any previous faults.
3113 dmar_fault(-1, iommu);
3115 * Disable queued invalidation if supported and already enabled
3116 * before OS handover.
3118 dmar_disable_qi(iommu);
3121 if (dmar_enable_qi(iommu)) {
3123  * Queued invalidation is not enabled; fall back to register-based invalidation
3125 iommu->flush.flush_context = __iommu_flush_context;
3126 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3127 pr_info("%s: Using Register based invalidation\n",
3128 iommu->name);
3129 } else {
3130 iommu->flush.flush_context = qi_flush_context;
3131 iommu->flush.flush_iotlb = qi_flush_iotlb;
3132 pr_info("%s: Using Queued invalidation\n", iommu->name);
3136 static int copy_context_table(struct intel_iommu *iommu,
3137 struct root_entry *old_re,
3138 struct context_entry **tbl,
3139 int bus, bool ext)
3141 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3142 struct context_entry *new_ce = NULL, ce;
3143 struct context_entry *old_ce = NULL;
3144 struct root_entry re;
3145 phys_addr_t old_ce_phys;
3147 tbl_idx = ext ? bus * 2 : bus;
3148 memcpy(&re, old_re, sizeof(re));
3150 for (devfn = 0; devfn < 256; devfn++) {
3151 /* First calculate the correct index */
3152 idx = (ext ? devfn * 2 : devfn) % 256;
3154 if (idx == 0) {
3155 /* First save what we may have and clean up */
3156 if (new_ce) {
3157 tbl[tbl_idx] = new_ce;
3158 __iommu_flush_cache(iommu, new_ce,
3159 VTD_PAGE_SIZE);
3160 pos = 1;
3163 if (old_ce)
3164 memunmap(old_ce);
3166 ret = 0;
3167 if (devfn < 0x80)
3168 old_ce_phys = root_entry_lctp(&re);
3169 else
3170 old_ce_phys = root_entry_uctp(&re);
3172 if (!old_ce_phys) {
3173 if (ext && devfn == 0) {
3174 /* No LCTP, try UCTP */
3175 devfn = 0x7f;
3176 continue;
3177 } else {
3178 goto out;
3182 ret = -ENOMEM;
3183 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3184 MEMREMAP_WB);
3185 if (!old_ce)
3186 goto out;
3188 new_ce = alloc_pgtable_page(iommu->node);
3189 if (!new_ce)
3190 goto out_unmap;
3192 ret = 0;
3195 /* Now copy the context entry */
3196 memcpy(&ce, old_ce + idx, sizeof(ce));
3198 if (!__context_present(&ce))
3199 continue;
3201 did = context_domain_id(&ce);
3202 if (did >= 0 && did < cap_ndoms(iommu->cap))
3203 set_bit(did, iommu->domain_ids);
3206 * We need a marker for copied context entries. This
3207 * marker needs to work for the old format as well as
3208 * for extended context entries.
3210 * Bit 67 of the context entry is used. In the old
3211 * format this bit is available to software, in the
3212 * extended format it is the PGE bit, but PGE is ignored
3213 * by HW if PASIDs are disabled (and thus still
3214 * available).
3216 * So disable PASIDs first and then mark the entry
3217 * copied. This means that we don't copy PASID
3218 * translations from the old kernel, but this is fine as
3219 * faults there are not fatal.
3221 context_clear_pasid_enable(&ce);
3222 context_set_copied(&ce);
3224 new_ce[idx] = ce;
3227 tbl[tbl_idx + pos] = new_ce;
3229 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3231 out_unmap:
3232 memunmap(old_ce);
3234 out:
3235 return ret;
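/*
 * Note on the indexing in copy_context_table() above: in extended
 * (ECS) mode each context entry is twice the legacy size and each bus
 * has both a lower and an upper context table, hence tbl_idx = bus * 2
 * and idx = (devfn * 2) % 256, with devfns of 0x80 and above reached
 * via the upper context-table pointer (UCTP).
 */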
3238 static int copy_translation_tables(struct intel_iommu *iommu)
3240 struct context_entry **ctxt_tbls;
3241 struct root_entry *old_rt;
3242 phys_addr_t old_rt_phys;
3243 int ctxt_table_entries;
3244 unsigned long flags;
3245 u64 rtaddr_reg;
3246 int bus, ret;
3247 bool new_ext, ext;
3249 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3250 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3251 new_ext = !!ecap_ecs(iommu->ecap);
3254 * The RTT bit can only be changed when translation is disabled,
3255  * but disabling translation would open a window for data
3256 * corruption. So bail out and don't copy anything if we would
3257 * have to change the bit.
3259 if (new_ext != ext)
3260 return -EINVAL;
3262 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3263 if (!old_rt_phys)
3264 return -EINVAL;
3266 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3267 if (!old_rt)
3268 return -ENOMEM;
3270 /* This is too big for the stack - allocate it from slab */
3271 ctxt_table_entries = ext ? 512 : 256;
3272 ret = -ENOMEM;
3273 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3274 if (!ctxt_tbls)
3275 goto out_unmap;
3277 for (bus = 0; bus < 256; bus++) {
3278 ret = copy_context_table(iommu, &old_rt[bus],
3279 ctxt_tbls, bus, ext);
3280 if (ret) {
3281 pr_err("%s: Failed to copy context table for bus %d\n",
3282 iommu->name, bus);
3283 continue;
3287 spin_lock_irqsave(&iommu->lock, flags);
3289 /* Context tables are copied, now write them to the root_entry table */
3290 for (bus = 0; bus < 256; bus++) {
3291 int idx = ext ? bus * 2 : bus;
3292 u64 val;
3294 if (ctxt_tbls[idx]) {
3295 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3296 iommu->root_entry[bus].lo = val;
3299 if (!ext || !ctxt_tbls[idx + 1])
3300 continue;
3302 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3303 iommu->root_entry[bus].hi = val;
3306 spin_unlock_irqrestore(&iommu->lock, flags);
3308 kfree(ctxt_tbls);
3310 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3312 ret = 0;
3314 out_unmap:
3315 memunmap(old_rt);
3317 return ret;
3320 static int __init init_dmars(void)
3322 struct dmar_drhd_unit *drhd;
3323 struct intel_iommu *iommu;
3324 int ret;
3327 * for each drhd
3328 * allocate root
3329 * initialize and program root entry to not present
3330 * endfor
3332 for_each_drhd_unit(drhd) {
3334  * No lock is needed, as this is only incremented in the
3335  * single-threaded kernel __init code path; all other accesses
3336  * are read-only.
3338 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3339 g_num_of_iommus++;
3340 continue;
3342 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3345 /* Preallocate enough resources for IOMMU hot-addition */
3346 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3347 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3349 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3350 GFP_KERNEL);
3351 if (!g_iommus) {
3352 pr_err("Allocating global iommu array failed\n");
3353 ret = -ENOMEM;
3354 goto error;
3357 for_each_iommu(iommu, drhd) {
3358 if (drhd->ignored) {
3359 iommu_disable_translation(iommu);
3360 continue;
3364  * Find the max pasid size of all IOMMUs in the system.
3365  * We need to ensure the system pasid table is no bigger
3366  * than the smallest supported size.
3368 if (pasid_supported(iommu)) {
3369 u32 temp = 2 << ecap_pss(iommu->ecap);
3371 intel_pasid_max_id = min_t(u32, temp,
3372 intel_pasid_max_id);
3375 g_iommus[iommu->seq_id] = iommu;
3377 intel_iommu_init_qi(iommu);
3379 ret = iommu_init_domains(iommu);
3380 if (ret)
3381 goto free_iommu;
3383 init_translation_status(iommu);
3385 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3386 iommu_disable_translation(iommu);
3387 clear_translation_pre_enabled(iommu);
3388 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3389 iommu->name);
3393 * TBD:
3394 * we could share the same root & context tables
3395  * among all IOMMUs. Need to split it later.
3397 ret = iommu_alloc_root_entry(iommu);
3398 if (ret)
3399 goto free_iommu;
3401 if (translation_pre_enabled(iommu)) {
3402 pr_info("Translation already enabled - trying to copy translation structures\n");
3404 ret = copy_translation_tables(iommu);
3405 if (ret) {
3407 * We found the IOMMU with translation
3408 * enabled - but failed to copy over the
3409 * old root-entry table. Try to proceed
3410 * by disabling translation now and
3411 * allocating a clean root-entry table.
3412 * This might cause DMAR faults, but
3413 * probably the dump will still succeed.
3415 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3416 iommu->name);
3417 iommu_disable_translation(iommu);
3418 clear_translation_pre_enabled(iommu);
3419 } else {
3420 pr_info("Copied translation tables from previous kernel for %s\n",
3421 iommu->name);
3425 if (!ecap_pass_through(iommu->ecap))
3426 hw_pass_through = 0;
3427 intel_svm_check(iommu);
3431 * Now that qi is enabled on all iommus, set the root entry and flush
3432 * caches. This is required on some Intel X58 chipsets, otherwise the
3433 * flush_context function will loop forever and the boot hangs.
3435 for_each_active_iommu(iommu, drhd) {
3436 iommu_flush_write_buffer(iommu);
3437 iommu_set_root_entry(iommu);
3438 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3439 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3442 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3443 dmar_map_gfx = 0;
3444 #endif
3446 if (!dmar_map_gfx)
3447 iommu_identity_mapping |= IDENTMAP_GFX;
3449 check_tylersburg_isoch();
3451 ret = si_domain_init(hw_pass_through);
3452 if (ret)
3453 goto free_iommu;
3456 * for each drhd
3457 * enable fault log
3458 * global invalidate context cache
3459 * global invalidate iotlb
3460 * enable translation
3462 for_each_iommu(iommu, drhd) {
3463 if (drhd->ignored) {
3465 * we always have to disable PMRs or DMA may fail on
3466 * this device
3468 if (force_on)
3469 iommu_disable_protect_mem_regions(iommu);
3470 continue;
3473 iommu_flush_write_buffer(iommu);
3475 #ifdef CONFIG_INTEL_IOMMU_SVM
3476 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3478  * Calling dmar_alloc_hwirq() with dmar_global_lock held
3479  * could cause a lock race, so drop the lock around it.
3481 up_write(&dmar_global_lock);
3482 ret = intel_svm_enable_prq(iommu);
3483 down_write(&dmar_global_lock);
3484 if (ret)
3485 goto free_iommu;
3487 #endif
3488 ret = dmar_set_interrupt(iommu);
3489 if (ret)
3490 goto free_iommu;
3493 return 0;
3495 free_iommu:
3496 for_each_active_iommu(iommu, drhd) {
3497 disable_dmar_iommu(iommu);
3498 free_dmar_iommu(iommu);
3501 kfree(g_iommus);
3503 error:
3504 return ret;
3507 /* This takes a number of _MM_ pages, not VTD pages */
3508 static unsigned long intel_alloc_iova(struct device *dev,
3509 struct dmar_domain *domain,
3510 unsigned long nrpages, uint64_t dma_mask)
3512 unsigned long iova_pfn;
3515 * Restrict dma_mask to the width that the iommu can handle.
3516 * First-level translation restricts the input-address to a
3517 * canonical address (i.e., address bits 63:N have the same
3518 * value as address bit [N-1], where N is 48-bits with 4-level
3519 * paging and 57-bits with 5-level paging). Hence, skip bit
3520 * [N-1].
3522 if (domain_use_first_level(domain))
3523 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3524 dma_mask);
3525 else
3526 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3527 dma_mask);
3529 /* Ensure we reserve the whole size-aligned region */
3530 nrpages = __roundup_pow_of_two(nrpages);
3532 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3534 * First try to allocate an io virtual address in
3535 * DMA_BIT_MASK(32) and if that fails then try allocating
3536  * from the higher range.
3538 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3539 IOVA_PFN(DMA_BIT_MASK(32)), false);
3540 if (iova_pfn)
3541 return iova_pfn;
3543 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3544 IOVA_PFN(dma_mask), true);
3545 if (unlikely(!iova_pfn)) {
3546 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3547 nrpages);
3548 return 0;
3551 return iova_pfn;
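/*
 * Example of the size-alignment rule above: a request for five MM
 * pages is rounded up to eight before calling alloc_iova_fast(), so
 * the reservation always covers a whole power-of-two-sized region.
 */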
3554 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3556 struct dmar_domain *domain, *tmp;
3557 struct dmar_rmrr_unit *rmrr;
3558 struct device *i_dev;
3559 int i, ret;
3561 /* Device shouldn't be attached to any domain. */
3562 domain = find_domain(dev);
3563 if (domain)
3564 return NULL;
3566 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3567 if (!domain)
3568 goto out;
3570 /* We have a new domain - setup possible RMRRs for the device */
3571 rcu_read_lock();
3572 for_each_rmrr_units(rmrr) {
3573 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3574 i, i_dev) {
3575 if (i_dev != dev)
3576 continue;
3578 ret = domain_prepare_identity_map(dev, domain,
3579 rmrr->base_address,
3580 rmrr->end_address);
3581 if (ret)
3582 dev_err(dev, "Mapping reserved region failed\n");
3585 rcu_read_unlock();
3587 tmp = set_domain_for_dev(dev, domain);
3588 if (!tmp || domain != tmp) {
3589 domain_exit(domain);
3590 domain = tmp;
3593 out:
3594 if (!domain)
3595 dev_err(dev, "Allocating domain failed\n");
3596 else
3597 domain->domain.type = IOMMU_DOMAIN_DMA;
3599 return domain;
3602 /* Check if the dev needs to go through the non-identity map and unmap process. */
3603 static bool iommu_need_mapping(struct device *dev)
3605 int ret;
3607 if (iommu_dummy(dev))
3608 return false;
3610 if (unlikely(attach_deferred(dev)))
3611 do_deferred_attach(dev);
3613 ret = identity_mapping(dev);
3614 if (ret) {
3615 u64 dma_mask = *dev->dma_mask;
3617 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3618 dma_mask = dev->coherent_dma_mask;
3620 if (dma_mask >= dma_direct_get_required_mask(dev))
3621 return false;
3624  * The device is removed from si_domain and falls back to
3625  * non-identity mapping for 32-bit DMA.
3627 dmar_remove_one_dev_info(dev);
3628 ret = iommu_request_dma_domain_for_dev(dev);
3629 if (ret) {
3630 struct iommu_domain *domain;
3631 struct dmar_domain *dmar_domain;
3633 domain = iommu_get_domain_for_dev(dev);
3634 if (domain) {
3635 dmar_domain = to_dmar_domain(domain);
3636 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3638 dmar_remove_one_dev_info(dev);
3639 get_private_domain_for_dev(dev);
3642 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3645 return true;
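/*
 * Summary of the decision above: an identity-mapped device keeps using
 * the direct path only while its DMA mask covers all of memory; once a
 * narrower mask is seen, the device is moved out of si_domain into a
 * dynamic DMA domain and all further DMA is translated.
 */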
3648 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3649 size_t size, int dir, u64 dma_mask)
3651 struct dmar_domain *domain;
3652 phys_addr_t start_paddr;
3653 unsigned long iova_pfn;
3654 int prot = 0;
3655 int ret;
3656 struct intel_iommu *iommu;
3657 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3659 BUG_ON(dir == DMA_NONE);
3661 domain = find_domain(dev);
3662 if (!domain)
3663 return DMA_MAPPING_ERROR;
3665 iommu = domain_get_iommu(domain);
3666 size = aligned_nrpages(paddr, size);
3668 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3669 if (!iova_pfn)
3670 goto error;
3673  * Check if DMAR supports zero-length reads on write-only
3674  * mappings.
3676 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3677 !cap_zlr(iommu->cap))
3678 prot |= DMA_PTE_READ;
3679 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3680 prot |= DMA_PTE_WRITE;
3682  * paddr - (paddr + size) might be a partial page, so we map the whole
3683  * page. Note: if two parts of one page are separately mapped, we
3684  * might have two guest addresses mapping to the same host paddr, but
3685  * this is not a big problem.
3687 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3688 mm_to_dma_pfn(paddr_pfn), size, prot);
3689 if (ret)
3690 goto error;
3692 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3693 start_paddr += paddr & ~PAGE_MASK;
3695 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3697 return start_paddr;
3699 error:
3700 if (iova_pfn)
3701 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3702 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3703 size, (unsigned long long)paddr, dir);
3704 return DMA_MAPPING_ERROR;
3707 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3708 unsigned long offset, size_t size,
3709 enum dma_data_direction dir,
3710 unsigned long attrs)
3712 if (iommu_need_mapping(dev))
3713 return __intel_map_single(dev, page_to_phys(page) + offset,
3714 size, dir, *dev->dma_mask);
3715 return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3718 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3719 size_t size, enum dma_data_direction dir,
3720 unsigned long attrs)
3722 if (iommu_need_mapping(dev))
3723 return __intel_map_single(dev, phys_addr, size, dir,
3724 *dev->dma_mask);
3725 return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3728 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3730 struct dmar_domain *domain;
3731 unsigned long start_pfn, last_pfn;
3732 unsigned long nrpages;
3733 unsigned long iova_pfn;
3734 struct intel_iommu *iommu;
3735 struct page *freelist;
3736 struct pci_dev *pdev = NULL;
3738 domain = find_domain(dev);
3739 BUG_ON(!domain);
3741 iommu = domain_get_iommu(domain);
3743 iova_pfn = IOVA_PFN(dev_addr);
3745 nrpages = aligned_nrpages(dev_addr, size);
3746 start_pfn = mm_to_dma_pfn(iova_pfn);
3747 last_pfn = start_pfn + nrpages - 1;
3749 if (dev_is_pci(dev))
3750 pdev = to_pci_dev(dev);
3752 freelist = domain_unmap(domain, start_pfn, last_pfn);
3753 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3754 !has_iova_flush_queue(&domain->iovad)) {
3755 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3756 nrpages, !freelist, 0);
3757 /* free iova */
3758 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3759 dma_free_pagelist(freelist);
3760 } else {
3761 queue_iova(&domain->iovad, iova_pfn, nrpages,
3762 (unsigned long)freelist);
3764  * queue up the release of the unmap to save the 1/6th of the
3765  * CPU time otherwise used up by the iotlb flush operation...
3769 trace_unmap_single(dev, dev_addr, size);
3772 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3773 size_t size, enum dma_data_direction dir,
3774 unsigned long attrs)
3776 if (iommu_need_mapping(dev))
3777 intel_unmap(dev, dev_addr, size);
3778 else
3779 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3782 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3783 size_t size, enum dma_data_direction dir, unsigned long attrs)
3785 if (iommu_need_mapping(dev))
3786 intel_unmap(dev, dev_addr, size);
3789 static void *intel_alloc_coherent(struct device *dev, size_t size,
3790 dma_addr_t *dma_handle, gfp_t flags,
3791 unsigned long attrs)
3793 struct page *page = NULL;
3794 int order;
3796 if (!iommu_need_mapping(dev))
3797 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3799 size = PAGE_ALIGN(size);
3800 order = get_order(size);
3802 if (gfpflags_allow_blocking(flags)) {
3803 unsigned int count = size >> PAGE_SHIFT;
3805 page = dma_alloc_from_contiguous(dev, count, order,
3806 flags & __GFP_NOWARN);
3809 if (!page)
3810 page = alloc_pages(flags, order);
3811 if (!page)
3812 return NULL;
3813 memset(page_address(page), 0, size);
3815 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3816 DMA_BIDIRECTIONAL,
3817 dev->coherent_dma_mask);
3818 if (*dma_handle != DMA_MAPPING_ERROR)
3819 return page_address(page);
3820 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3821 __free_pages(page, order);
3823 return NULL;
3826 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3827 dma_addr_t dma_handle, unsigned long attrs)
3829 int order;
3830 struct page *page = virt_to_page(vaddr);
3832 if (!iommu_need_mapping(dev))
3833 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3835 size = PAGE_ALIGN(size);
3836 order = get_order(size);
3838 intel_unmap(dev, dma_handle, size);
3839 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3840 __free_pages(page, order);
3843 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3844 int nelems, enum dma_data_direction dir,
3845 unsigned long attrs)
3847 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3848 unsigned long nrpages = 0;
3849 struct scatterlist *sg;
3850 int i;
3852 if (!iommu_need_mapping(dev))
3853 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3855 for_each_sg(sglist, sg, nelems, i) {
3856 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3859 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3861 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3864 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3865 enum dma_data_direction dir, unsigned long attrs)
3867 int i;
3868 struct dmar_domain *domain;
3869 size_t size = 0;
3870 int prot = 0;
3871 unsigned long iova_pfn;
3872 int ret;
3873 struct scatterlist *sg;
3874 unsigned long start_vpfn;
3875 struct intel_iommu *iommu;
3877 BUG_ON(dir == DMA_NONE);
3878 if (!iommu_need_mapping(dev))
3879 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3881 domain = find_domain(dev);
3882 if (!domain)
3883 return 0;
3885 iommu = domain_get_iommu(domain);
3887 for_each_sg(sglist, sg, nelems, i)
3888 size += aligned_nrpages(sg->offset, sg->length);
3890 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3891 *dev->dma_mask);
3892 if (!iova_pfn) {
3893 sglist->dma_length = 0;
3894 return 0;
3898  * Check if DMAR supports zero-length reads on write-only
3899  * mappings.
3901 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3902 !cap_zlr(iommu->cap))
3903 prot |= DMA_PTE_READ;
3904 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3905 prot |= DMA_PTE_WRITE;
3907 start_vpfn = mm_to_dma_pfn(iova_pfn);
3909 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3910 if (unlikely(ret)) {
3911 dma_pte_free_pagetable(domain, start_vpfn,
3912 start_vpfn + size - 1,
3913 agaw_to_level(domain->agaw) + 1);
3914 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3915 return 0;
3918 for_each_sg(sglist, sg, nelems, i)
3919 trace_map_sg(dev, i + 1, nelems, sg);
3921 return nelems;
3924 static u64 intel_get_required_mask(struct device *dev)
3926 if (!iommu_need_mapping(dev))
3927 return dma_direct_get_required_mask(dev);
3928 return DMA_BIT_MASK(32);
3931 static const struct dma_map_ops intel_dma_ops = {
3932 .alloc = intel_alloc_coherent,
3933 .free = intel_free_coherent,
3934 .map_sg = intel_map_sg,
3935 .unmap_sg = intel_unmap_sg,
3936 .map_page = intel_map_page,
3937 .unmap_page = intel_unmap_page,
3938 .map_resource = intel_map_resource,
3939 .unmap_resource = intel_unmap_resource,
3940 .dma_supported = dma_direct_supported,
3941 .mmap = dma_common_mmap,
3942 .get_sgtable = dma_common_get_sgtable,
3943 .get_required_mask = intel_get_required_mask,
3946 static void
3947 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3948 enum dma_data_direction dir, enum dma_sync_target target)
3950 struct dmar_domain *domain;
3951 phys_addr_t tlb_addr;
3953 domain = find_domain(dev);
3954 if (WARN_ON(!domain))
3955 return;
3957 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3958 if (is_swiotlb_buffer(tlb_addr))
3959 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3962 static dma_addr_t
3963 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3964 enum dma_data_direction dir, unsigned long attrs,
3965 u64 dma_mask)
3967 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3968 struct dmar_domain *domain;
3969 struct intel_iommu *iommu;
3970 unsigned long iova_pfn;
3971 unsigned long nrpages;
3972 phys_addr_t tlb_addr;
3973 int prot = 0;
3974 int ret;
3976 if (unlikely(attach_deferred(dev)))
3977 do_deferred_attach(dev);
3979 domain = find_domain(dev);
3981 if (WARN_ON(dir == DMA_NONE || !domain))
3982 return DMA_MAPPING_ERROR;
3984 iommu = domain_get_iommu(domain);
3985 if (WARN_ON(!iommu))
3986 return DMA_MAPPING_ERROR;
3988 nrpages = aligned_nrpages(0, size);
3989 iova_pfn = intel_alloc_iova(dev, domain,
3990 dma_to_mm_pfn(nrpages), dma_mask);
3991 if (!iova_pfn)
3992 return DMA_MAPPING_ERROR;
3995  * Check if DMAR supports zero-length reads on write-only
3996  * mappings.
3998 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3999 !cap_zlr(iommu->cap))
4000 prot |= DMA_PTE_READ;
4001 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
4002 prot |= DMA_PTE_WRITE;
4005 * If both the physical buffer start address and size are
4006 * page aligned, we don't need to use a bounce page.
4008 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
4009 tlb_addr = swiotlb_tbl_map_single(dev,
4010 __phys_to_dma(dev, io_tlb_start),
4011 paddr, size, aligned_size, dir, attrs);
4012 if (tlb_addr == DMA_MAPPING_ERROR) {
4013 goto swiotlb_error;
4014 } else {
4015 /* Cleanup the padding area. */
4016 void *padding_start = phys_to_virt(tlb_addr);
4017 size_t padding_size = aligned_size;
4019 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
4020 (dir == DMA_TO_DEVICE ||
4021 dir == DMA_BIDIRECTIONAL)) {
4022 padding_start += size;
4023 padding_size -= size;
4026 memset(padding_start, 0, padding_size);
4028 } else {
4029 tlb_addr = paddr;
4032 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
4033 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
4034 if (ret)
4035 goto mapping_error;
4037 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
4039 return (phys_addr_t)iova_pfn << PAGE_SHIFT;
4041 mapping_error:
4042 if (is_swiotlb_buffer(tlb_addr))
4043 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
4044 aligned_size, dir, attrs);
4045 swiotlb_error:
4046 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
4047 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
4048 size, (unsigned long long)paddr, dir);
4050 return DMA_MAPPING_ERROR;
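/*
 * Rationale for the bounce path above: only buffers whose start or
 * length is not VTD_PAGE_SIZE aligned go through swiotlb; fully
 * page-aligned buffers are mapped in place, so the bounce cost is paid
 * only where other data could share the mapped page with the device.
 */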
4053 static void
4054 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
4055 enum dma_data_direction dir, unsigned long attrs)
4057 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
4058 struct dmar_domain *domain;
4059 phys_addr_t tlb_addr;
4061 domain = find_domain(dev);
4062 if (WARN_ON(!domain))
4063 return;
4065 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
4066 if (WARN_ON(!tlb_addr))
4067 return;
4069 intel_unmap(dev, dev_addr, size);
4070 if (is_swiotlb_buffer(tlb_addr))
4071 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
4072 aligned_size, dir, attrs);
4074 trace_bounce_unmap_single(dev, dev_addr, size);
4077 static dma_addr_t
4078 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
4079 size_t size, enum dma_data_direction dir, unsigned long attrs)
4081 return bounce_map_single(dev, page_to_phys(page) + offset,
4082 size, dir, attrs, *dev->dma_mask);
4085 static dma_addr_t
4086 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
4087 enum dma_data_direction dir, unsigned long attrs)
4089 return bounce_map_single(dev, phys_addr, size,
4090 dir, attrs, *dev->dma_mask);
4093 static void
4094 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
4095 enum dma_data_direction dir, unsigned long attrs)
4097 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4100 static void
4101 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
4102 enum dma_data_direction dir, unsigned long attrs)
4104 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4107 static void
4108 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4109 enum dma_data_direction dir, unsigned long attrs)
4111 struct scatterlist *sg;
4112 int i;
4114 for_each_sg(sglist, sg, nelems, i)
4115 bounce_unmap_page(dev, sg->dma_address,
4116 sg_dma_len(sg), dir, attrs);
4119 static int
4120 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4121 enum dma_data_direction dir, unsigned long attrs)
4123 int i;
4124 struct scatterlist *sg;
4126 for_each_sg(sglist, sg, nelems, i) {
4127 sg->dma_address = bounce_map_page(dev, sg_page(sg),
4128 sg->offset, sg->length,
4129 dir, attrs);
4130 if (sg->dma_address == DMA_MAPPING_ERROR)
4131 goto out_unmap;
4132 sg_dma_len(sg) = sg->length;
4135 for_each_sg(sglist, sg, nelems, i)
4136 trace_bounce_map_sg(dev, i + 1, nelems, sg);
4138 return nelems;
4140 out_unmap:
4141 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
4142 return 0;
4145 static void
4146 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
4147 size_t size, enum dma_data_direction dir)
4149 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
4152 static void
4153 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4154 size_t size, enum dma_data_direction dir)
4156 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4159 static void
4160 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4161 int nelems, enum dma_data_direction dir)
4163 struct scatterlist *sg;
4164 int i;
4166 for_each_sg(sglist, sg, nelems, i)
4167 bounce_sync_single(dev, sg_dma_address(sg),
4168 sg_dma_len(sg), dir, SYNC_FOR_CPU);
4171 static void
4172 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4173 int nelems, enum dma_data_direction dir)
4175 struct scatterlist *sg;
4176 int i;
4178 for_each_sg(sglist, sg, nelems, i)
4179 bounce_sync_single(dev, sg_dma_address(sg),
4180 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4183 static const struct dma_map_ops bounce_dma_ops = {
4184 .alloc = intel_alloc_coherent,
4185 .free = intel_free_coherent,
4186 .map_sg = bounce_map_sg,
4187 .unmap_sg = bounce_unmap_sg,
4188 .map_page = bounce_map_page,
4189 .unmap_page = bounce_unmap_page,
4190 .sync_single_for_cpu = bounce_sync_single_for_cpu,
4191 .sync_single_for_device = bounce_sync_single_for_device,
4192 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
4193 .sync_sg_for_device = bounce_sync_sg_for_device,
4194 .map_resource = bounce_map_resource,
4195 .unmap_resource = bounce_unmap_resource,
4196 .dma_supported = dma_direct_supported,
4199 static inline int iommu_domain_cache_init(void)
4201 int ret = 0;
4203 iommu_domain_cache = kmem_cache_create("iommu_domain",
4204 sizeof(struct dmar_domain),
4206 SLAB_HWCACHE_ALIGN,
4208 NULL);
4209 if (!iommu_domain_cache) {
4210 pr_err("Couldn't create iommu_domain cache\n");
4211 ret = -ENOMEM;
4214 return ret;
4217 static inline int iommu_devinfo_cache_init(void)
4219 int ret = 0;
4221 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4222 sizeof(struct device_domain_info),
4224 SLAB_HWCACHE_ALIGN,
4225 NULL);
4226 if (!iommu_devinfo_cache) {
4227 pr_err("Couldn't create devinfo cache\n");
4228 ret = -ENOMEM;
4231 return ret;
4234 static int __init iommu_init_mempool(void)
4236 int ret;
4237 ret = iova_cache_get();
4238 if (ret)
4239 return ret;
4241 ret = iommu_domain_cache_init();
4242 if (ret)
4243 goto domain_error;
4245 ret = iommu_devinfo_cache_init();
4246 if (!ret)
4247 return ret;
4249 kmem_cache_destroy(iommu_domain_cache);
4250 domain_error:
4251 iova_cache_put();
4253 return -ENOMEM;
4256 static void __init iommu_exit_mempool(void)
4258 kmem_cache_destroy(iommu_devinfo_cache);
4259 kmem_cache_destroy(iommu_domain_cache);
4260 iova_cache_put();
4263 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4265 struct dmar_drhd_unit *drhd;
4266 u32 vtbar;
4267 int rc;
4269 /* We know that this device on this chipset has its own IOMMU.
4270 * If we find it under a different IOMMU, then the BIOS is lying
4271 * to us. Hope that the IOMMU for this device is actually
4272 * disabled, and it needs no translation...
4274 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4275 if (rc) {
4276 /* "can't" happen */
4277 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4278 return;
4280 vtbar &= 0xffff0000;
4282 /* we know that this IOMMU should be at offset 0xa000 from vtbar */
4283 drhd = dmar_find_matched_drhd_unit(pdev);
4284 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
4285 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
4286 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4287 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4290 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4292 static void __init init_no_remapping_devices(void)
4294 struct dmar_drhd_unit *drhd;
4295 struct device *dev;
4296 int i;
4298 for_each_drhd_unit(drhd) {
4299 if (!drhd->include_all) {
4300 for_each_active_dev_scope(drhd->devices,
4301 drhd->devices_cnt, i, dev)
4302 break;
4303 /* ignore DMAR unit if no devices exist */
4304 if (i == drhd->devices_cnt)
4305 drhd->ignored = 1;
4309 for_each_active_drhd_unit(drhd) {
4310 if (drhd->include_all)
4311 continue;
4313 for_each_active_dev_scope(drhd->devices,
4314 drhd->devices_cnt, i, dev)
4315 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4316 break;
4317 if (i < drhd->devices_cnt)
4318 continue;
4320 /* This IOMMU has *only* gfx devices. Either bypass it or
4321 set the gfx_mapped flag, as appropriate */
4322 if (!dmar_map_gfx) {
4323 drhd->ignored = 1;
4324 for_each_active_dev_scope(drhd->devices,
4325 drhd->devices_cnt, i, dev)
4326 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4331 #ifdef CONFIG_SUSPEND
4332 static int init_iommu_hw(void)
4334 struct dmar_drhd_unit *drhd;
4335 struct intel_iommu *iommu = NULL;
4337 for_each_active_iommu(iommu, drhd)
4338 if (iommu->qi)
4339 dmar_reenable_qi(iommu);
4341 for_each_iommu(iommu, drhd) {
4342 if (drhd->ignored) {
4344 * we always have to disable PMRs or DMA may fail on
4345 * this device
4347 if (force_on)
4348 iommu_disable_protect_mem_regions(iommu);
4349 continue;
4352 iommu_flush_write_buffer(iommu);
4354 iommu_set_root_entry(iommu);
4356 iommu->flush.flush_context(iommu, 0, 0, 0,
4357 DMA_CCMD_GLOBAL_INVL);
4358 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4359 iommu_enable_translation(iommu);
4360 iommu_disable_protect_mem_regions(iommu);
4363 return 0;
4366 static void iommu_flush_all(void)
4368 struct dmar_drhd_unit *drhd;
4369 struct intel_iommu *iommu;
4371 for_each_active_iommu(iommu, drhd) {
4372 iommu->flush.flush_context(iommu, 0, 0, 0,
4373 DMA_CCMD_GLOBAL_INVL);
4374 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4375 DMA_TLB_GLOBAL_FLUSH);
4379 static int iommu_suspend(void)
4381 struct dmar_drhd_unit *drhd;
4382 struct intel_iommu *iommu = NULL;
4383 unsigned long flag;
4385 for_each_active_iommu(iommu, drhd) {
4386 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4387 GFP_ATOMIC);
4388 if (!iommu->iommu_state)
4389 goto nomem;
4392 iommu_flush_all();
4394 for_each_active_iommu(iommu, drhd) {
4395 iommu_disable_translation(iommu);
4397 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4399 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4400 readl(iommu->reg + DMAR_FECTL_REG);
4401 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4402 readl(iommu->reg + DMAR_FEDATA_REG);
4403 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4404 readl(iommu->reg + DMAR_FEADDR_REG);
4405 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4406 readl(iommu->reg + DMAR_FEUADDR_REG);
4408 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4410 return 0;
4412 nomem:
4413 for_each_active_iommu(iommu, drhd)
4414 kfree(iommu->iommu_state);
4416 return -ENOMEM;
4419 static void iommu_resume(void)
4421 struct dmar_drhd_unit *drhd;
4422 struct intel_iommu *iommu = NULL;
4423 unsigned long flag;
4425 if (init_iommu_hw()) {
4426 if (force_on)
4427 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4428 else
4429 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4430 return;
4433 for_each_active_iommu(iommu, drhd) {
4435 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4437 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4438 iommu->reg + DMAR_FECTL_REG);
4439 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4440 iommu->reg + DMAR_FEDATA_REG);
4441 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4442 iommu->reg + DMAR_FEADDR_REG);
4443 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4444 iommu->reg + DMAR_FEUADDR_REG);
4446 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4449 for_each_active_iommu(iommu, drhd)
4450 kfree(iommu->iommu_state);
4453 static struct syscore_ops iommu_syscore_ops = {
4454 .resume = iommu_resume,
4455 .suspend = iommu_suspend,
4458 static void __init init_iommu_pm_ops(void)
4460 register_syscore_ops(&iommu_syscore_ops);
4463 #else
4464 static inline void init_iommu_pm_ops(void) {}
4465 #endif /* CONFIG_SUSPEND */
4467 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4469 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4470 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4471 rmrr->end_address <= rmrr->base_address ||
4472 arch_rmrr_sanity_check(rmrr))
4473 return -EINVAL;
4475 return 0;
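/*
 * Illustrative sketch only (not part of the driver): a hypothetical RMRR
 * descriptor that passes the alignment and ordering checks above -- the
 * base and (end + 1) are both 4KiB aligned and the range is non-empty.
 * On x86, arch_rmrr_sanity_check() additionally requires the range to be
 * covered by the platform's reserved (E820) memory map.
 */
static const struct acpi_dmar_reserved_memory example_rmrr __maybe_unused = {
	.base_address	= 0x7a000000ULL,	/* hypothetical 2MiB region */
	.end_address	= 0x7a1fffffULL,	/* inclusive end address */
};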
4478 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4480 struct acpi_dmar_reserved_memory *rmrr;
4481 struct dmar_rmrr_unit *rmrru;
4483 rmrr = (struct acpi_dmar_reserved_memory *)header;
4484 if (rmrr_sanity_check(rmrr)) {
4485 pr_warn(FW_BUG
4486 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4487 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4488 rmrr->base_address, rmrr->end_address,
4489 dmi_get_system_info(DMI_BIOS_VENDOR),
4490 dmi_get_system_info(DMI_BIOS_VERSION),
4491 dmi_get_system_info(DMI_PRODUCT_VERSION));
4492 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4495 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4496 if (!rmrru)
4497 goto out;
4499 rmrru->hdr = header;
4501 rmrru->base_address = rmrr->base_address;
4502 rmrru->end_address = rmrr->end_address;
4504 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4505 ((void *)rmrr) + rmrr->header.length,
4506 &rmrru->devices_cnt);
4507 if (rmrru->devices_cnt && rmrru->devices == NULL)
4508 goto free_rmrru;
4510 list_add(&rmrru->list, &dmar_rmrr_units);
4512 return 0;
4513 free_rmrru:
4514 kfree(rmrru);
4515 out:
4516 return -ENOMEM;
4519 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4521 struct dmar_atsr_unit *atsru;
4522 struct acpi_dmar_atsr *tmp;
4524 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4525 dmar_rcu_check()) {
4526 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4527 if (atsr->segment != tmp->segment)
4528 continue;
4529 if (atsr->header.length != tmp->header.length)
4530 continue;
4531 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4532 return atsru;
4535 return NULL;
4538 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4540 struct acpi_dmar_atsr *atsr;
4541 struct dmar_atsr_unit *atsru;
4543 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4544 return 0;
4546 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4547 atsru = dmar_find_atsr(atsr);
4548 if (atsru)
4549 return 0;
4551 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4552 if (!atsru)
4553 return -ENOMEM;
4556 * If memory is allocated from slab by ACPI _DSM method, we need to
4557 * copy the memory content because the memory buffer will be freed
4558 * on return.
4560 atsru->hdr = (void *)(atsru + 1);
4561 memcpy(atsru->hdr, hdr, hdr->length);
4562 atsru->include_all = atsr->flags & 0x1;
4563 if (!atsru->include_all) {
4564 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4565 (void *)atsr + atsr->header.length,
4566 &atsru->devices_cnt);
4567 if (atsru->devices_cnt && atsru->devices == NULL) {
4568 kfree(atsru);
4569 return -ENOMEM;
4573 list_add_rcu(&atsru->list, &dmar_atsr_units);
4575 return 0;
4578 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4580 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4581 kfree(atsru);
4584 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4586 struct acpi_dmar_atsr *atsr;
4587 struct dmar_atsr_unit *atsru;
4589 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4590 atsru = dmar_find_atsr(atsr);
4591 if (atsru) {
4592 list_del_rcu(&atsru->list);
4593 synchronize_rcu();
4594 intel_iommu_free_atsr(atsru);
4597 return 0;
4600 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4602 int i;
4603 struct device *dev;
4604 struct acpi_dmar_atsr *atsr;
4605 struct dmar_atsr_unit *atsru;
4607 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4608 atsru = dmar_find_atsr(atsr);
4609 if (!atsru)
4610 return 0;
4612 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4613 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4614 i, dev)
4615 return -EBUSY;
4618 return 0;
4621 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4623 int sp, ret;
4624 struct intel_iommu *iommu = dmaru->iommu;
4626 if (g_iommus[iommu->seq_id])
4627 return 0;
4629 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4630 pr_warn("%s: Doesn't support hardware pass through.\n",
4631 iommu->name);
4632 return -ENXIO;
4634 if (!ecap_sc_support(iommu->ecap) &&
4635 domain_update_iommu_snooping(iommu)) {
4636 pr_warn("%s: Doesn't support snooping.\n",
4637 iommu->name);
4638 return -ENXIO;
4640 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4641 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4642 pr_warn("%s: Doesn't support large page.\n",
4643 iommu->name);
4644 return -ENXIO;
4648 * Disable translation if already enabled prior to OS handover.
4650 if (iommu->gcmd & DMA_GCMD_TE)
4651 iommu_disable_translation(iommu);
4653 g_iommus[iommu->seq_id] = iommu;
4654 ret = iommu_init_domains(iommu);
4655 if (ret == 0)
4656 ret = iommu_alloc_root_entry(iommu);
4657 if (ret)
4658 goto out;
4660 intel_svm_check(iommu);
4662 if (dmaru->ignored) {
4664 * we always have to disable PMRs or DMA may fail on this device
4666 if (force_on)
4667 iommu_disable_protect_mem_regions(iommu);
4668 return 0;
4671 intel_iommu_init_qi(iommu);
4672 iommu_flush_write_buffer(iommu);
4674 #ifdef CONFIG_INTEL_IOMMU_SVM
4675 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4676 ret = intel_svm_enable_prq(iommu);
4677 if (ret)
4678 goto disable_iommu;
4680 #endif
4681 ret = dmar_set_interrupt(iommu);
4682 if (ret)
4683 goto disable_iommu;
4685 iommu_set_root_entry(iommu);
4686 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4687 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4688 iommu_enable_translation(iommu);
4690 iommu_disable_protect_mem_regions(iommu);
4691 return 0;
4693 disable_iommu:
4694 disable_dmar_iommu(iommu);
4695 out:
4696 free_dmar_iommu(iommu);
4697 return ret;
4700 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4702 int ret = 0;
4703 struct intel_iommu *iommu = dmaru->iommu;
4705 if (!intel_iommu_enabled)
4706 return 0;
4707 if (iommu == NULL)
4708 return -EINVAL;
4710 if (insert) {
4711 ret = intel_iommu_add(dmaru);
4712 } else {
4713 disable_dmar_iommu(iommu);
4714 free_dmar_iommu(iommu);
4717 return ret;
4720 static void intel_iommu_free_dmars(void)
4722 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4723 struct dmar_atsr_unit *atsru, *atsr_n;
4725 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4726 list_del(&rmrru->list);
4727 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4728 kfree(rmrru);
4731 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4732 list_del(&atsru->list);
4733 intel_iommu_free_atsr(atsru);
4737 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4739 int i, ret = 1;
4740 struct pci_bus *bus;
4741 struct pci_dev *bridge = NULL;
4742 struct device *tmp;
4743 struct acpi_dmar_atsr *atsr;
4744 struct dmar_atsr_unit *atsru;
4746 dev = pci_physfn(dev);
4747 for (bus = dev->bus; bus; bus = bus->parent) {
4748 bridge = bus->self;
4749 /* If it's an integrated device, allow ATS */
4750 if (!bridge)
4751 return 1;
4752 /* Connected via non-PCIe: no ATS */
4753 if (!pci_is_pcie(bridge) ||
4754 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4755 return 0;
4756 /* If we found the root port, look it up in the ATSR */
4757 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4758 break;
4761 rcu_read_lock();
4762 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4763 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4764 if (atsr->segment != pci_domain_nr(dev->bus))
4765 continue;
4767 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4768 if (tmp == &bridge->dev)
4769 goto out;
4771 if (atsru->include_all)
4772 goto out;
4774 ret = 0;
4775 out:
4776 rcu_read_unlock();
4778 return ret;
4781 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4783 int ret;
4784 struct dmar_rmrr_unit *rmrru;
4785 struct dmar_atsr_unit *atsru;
4786 struct acpi_dmar_atsr *atsr;
4787 struct acpi_dmar_reserved_memory *rmrr;
4789 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4790 return 0;
4792 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4793 rmrr = container_of(rmrru->hdr,
4794 struct acpi_dmar_reserved_memory, header);
4795 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4796 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4797 ((void *)rmrr) + rmrr->header.length,
4798 rmrr->segment, rmrru->devices,
4799 rmrru->devices_cnt);
4800 if (ret < 0)
4801 return ret;
4802 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4803 dmar_remove_dev_scope(info, rmrr->segment,
4804 rmrru->devices, rmrru->devices_cnt);
4808 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4809 if (atsru->include_all)
4810 continue;
4812 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4813 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4814 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4815 (void *)atsr + atsr->header.length,
4816 atsr->segment, atsru->devices,
4817 atsru->devices_cnt);
4818 if (ret > 0)
4819 break;
4820 else if (ret < 0)
4821 return ret;
4822 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4823 if (dmar_remove_dev_scope(info, atsr->segment,
4824 atsru->devices, atsru->devices_cnt))
4825 break;
4829 return 0;
4832 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4833 unsigned long val, void *v)
4835 struct memory_notify *mhp = v;
4836 unsigned long long start, end;
4837 unsigned long start_vpfn, last_vpfn;
4839 switch (val) {
4840 case MEM_GOING_ONLINE:
4841 start = mhp->start_pfn << PAGE_SHIFT;
4842 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4843 if (iommu_domain_identity_map(si_domain, start, end)) {
4844 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4845 start, end);
4846 return NOTIFY_BAD;
4848 break;
4850 case MEM_OFFLINE:
4851 case MEM_CANCEL_ONLINE:
4852 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4853 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4854 while (start_vpfn <= last_vpfn) {
4855 struct iova *iova;
4856 struct dmar_drhd_unit *drhd;
4857 struct intel_iommu *iommu;
4858 struct page *freelist;
4860 iova = find_iova(&si_domain->iovad, start_vpfn);
4861 if (iova == NULL) {
4862 pr_debug("Failed get IOVA for PFN %lx\n",
4863 start_vpfn);
4864 break;
4867 iova = split_and_remove_iova(&si_domain->iovad, iova,
4868 start_vpfn, last_vpfn);
4869 if (iova == NULL) {
4870 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4871 start_vpfn, last_vpfn);
4872 return NOTIFY_BAD;
4875 freelist = domain_unmap(si_domain, iova->pfn_lo,
4876 iova->pfn_hi);
4878 rcu_read_lock();
4879 for_each_active_iommu(iommu, drhd)
4880 iommu_flush_iotlb_psi(iommu, si_domain,
4881 iova->pfn_lo, iova_size(iova),
4882 !freelist, 0);
4883 rcu_read_unlock();
4884 dma_free_pagelist(freelist);
4886 start_vpfn = iova->pfn_hi + 1;
4887 free_iova_mem(iova);
4889 break;
4892 return NOTIFY_OK;
4895 static struct notifier_block intel_iommu_memory_nb = {
4896 .notifier_call = intel_iommu_memory_notifier,
4897 .priority = 0
4900 static void free_all_cpu_cached_iovas(unsigned int cpu)
4902 int i;
4904 for (i = 0; i < g_num_of_iommus; i++) {
4905 struct intel_iommu *iommu = g_iommus[i];
4906 struct dmar_domain *domain;
4907 int did;
4909 if (!iommu)
4910 continue;
4912 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4913 domain = get_iommu_domain(iommu, (u16)did);
4915 if (!domain)
4916 continue;
4917 free_cpu_cached_iovas(cpu, &domain->iovad);
4922 static int intel_iommu_cpu_dead(unsigned int cpu)
4924 free_all_cpu_cached_iovas(cpu);
4925 return 0;
4928 static void intel_disable_iommus(void)
4930 struct intel_iommu *iommu = NULL;
4931 struct dmar_drhd_unit *drhd;
4933 for_each_iommu(iommu, drhd)
4934 iommu_disable_translation(iommu);
4937 void intel_iommu_shutdown(void)
4939 struct dmar_drhd_unit *drhd;
4940 struct intel_iommu *iommu = NULL;
4942 if (no_iommu || dmar_disabled)
4943 return;
4945 down_write(&dmar_global_lock);
4947 /* Disable PMRs explicitly here. */
4948 for_each_iommu(iommu, drhd)
4949 iommu_disable_protect_mem_regions(iommu);
4951 /* Make sure the IOMMUs are switched off */
4952 intel_disable_iommus();
4954 up_write(&dmar_global_lock);
4957 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4959 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4961 return container_of(iommu_dev, struct intel_iommu, iommu);
4964 static ssize_t intel_iommu_show_version(struct device *dev,
4965 struct device_attribute *attr,
4966 char *buf)
4968 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4969 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4970 return sprintf(buf, "%d:%d\n",
4971 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4973 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4975 static ssize_t intel_iommu_show_address(struct device *dev,
4976 struct device_attribute *attr,
4977 char *buf)
4979 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4980 return sprintf(buf, "%llx\n", iommu->reg_phys);
4982 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4984 static ssize_t intel_iommu_show_cap(struct device *dev,
4985 struct device_attribute *attr,
4986 char *buf)
4988 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4989 return sprintf(buf, "%llx\n", iommu->cap);
4991 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4993 static ssize_t intel_iommu_show_ecap(struct device *dev,
4994 struct device_attribute *attr,
4995 char *buf)
4997 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4998 return sprintf(buf, "%llx\n", iommu->ecap);
5000 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
5002 static ssize_t intel_iommu_show_ndoms(struct device *dev,
5003 struct device_attribute *attr,
5004 char *buf)
5006 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
5007 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
5009 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
5011 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
5012 struct device_attribute *attr,
5013 char *buf)
5015 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
5016 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
5017 cap_ndoms(iommu->cap)));
5019 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
5021 static struct attribute *intel_iommu_attrs[] = {
5022 &dev_attr_version.attr,
5023 &dev_attr_address.attr,
5024 &dev_attr_cap.attr,
5025 &dev_attr_ecap.attr,
5026 &dev_attr_domains_supported.attr,
5027 &dev_attr_domains_used.attr,
5028 NULL,
5031 static struct attribute_group intel_iommu_group = {
5032 .name = "intel-iommu",
5033 .attrs = intel_iommu_attrs,
5036 const struct attribute_group *intel_iommu_groups[] = {
5037 &intel_iommu_group,
5038 NULL,
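/*
 * Note: these read-only attributes are registered per remapping unit via
 * iommu_device_sysfs_add() in intel_iommu_init() below, so they are
 * expected to show up as
 * /sys/class/iommu/dmar<N>/intel-iommu/{version,address,cap,ecap,
 * domains_supported,domains_used}.
 */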
5041 static inline bool has_untrusted_dev(void)
5043 struct pci_dev *pdev = NULL;
5045 for_each_pci_dev(pdev)
5046 if (pdev->untrusted)
5047 return true;
5049 return false;
5052 static int __init platform_optin_force_iommu(void)
5054 if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
5055 return 0;
5057 if (no_iommu || dmar_disabled)
5058 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
5061 * If Intel-IOMMU is disabled by default, we will apply identity
5062 * map for all devices except those marked as being untrusted.
5064 if (dmar_disabled)
5065 iommu_set_default_passthrough(false);
5067 dmar_disabled = 0;
5068 no_iommu = 0;
5070 return 1;
5073 static int __init probe_acpi_namespace_devices(void)
5075 struct dmar_drhd_unit *drhd;
5076 /* To avoid a -Wunused-but-set-variable warning. */
5077 struct intel_iommu *iommu __maybe_unused;
5078 struct device *dev;
5079 int i, ret = 0;
5081 for_each_active_iommu(iommu, drhd) {
5082 for_each_active_dev_scope(drhd->devices,
5083 drhd->devices_cnt, i, dev) {
5084 struct acpi_device_physical_node *pn;
5085 struct iommu_group *group;
5086 struct acpi_device *adev;
5088 if (dev->bus != &acpi_bus_type)
5089 continue;
5091 adev = to_acpi_device(dev);
5092 mutex_lock(&adev->physical_node_lock);
5093 list_for_each_entry(pn,
5094 &adev->physical_node_list, node) {
5095 group = iommu_group_get(pn->dev);
5096 if (group) {
5097 iommu_group_put(group);
5098 continue;
5101 pn->dev->bus->iommu_ops = &intel_iommu_ops;
5102 ret = iommu_probe_device(pn->dev);
5103 if (ret)
5104 break;
5106 mutex_unlock(&adev->physical_node_lock);
5108 if (ret)
5109 return ret;
5113 return 0;
5116 int __init intel_iommu_init(void)
5118 int ret = -ENODEV;
5119 struct dmar_drhd_unit *drhd;
5120 struct intel_iommu *iommu;
5123 * Intel IOMMU is required for a TXT/tboot launch or platform
5124 * opt in, so enforce that.
5126 force_on = tboot_force_iommu() || platform_optin_force_iommu();
5128 if (iommu_init_mempool()) {
5129 if (force_on)
5130 panic("tboot: Failed to initialize iommu memory\n");
5131 return -ENOMEM;
5134 down_write(&dmar_global_lock);
5135 if (dmar_table_init()) {
5136 if (force_on)
5137 panic("tboot: Failed to initialize DMAR table\n");
5138 goto out_free_dmar;
5141 if (dmar_dev_scope_init() < 0) {
5142 if (force_on)
5143 panic("tboot: Failed to initialize DMAR device scope\n");
5144 goto out_free_dmar;
5147 up_write(&dmar_global_lock);
5150 * The bus notifier takes the dmar_global_lock, so lockdep will
5151 * complain later when we register it under the lock.
5153 dmar_register_bus_notifier();
5155 down_write(&dmar_global_lock);
5157 if (!no_iommu)
5158 intel_iommu_debugfs_init();
5160 if (no_iommu || dmar_disabled) {
5162 * We exit the function here to ensure IOMMU's remapping and
5163 * mempool aren't set up, which means that the IOMMU's PMRs
5164 * won't be disabled via the call to init_dmars(). So disable
5165 * them explicitly here. The PMRs were set up by tboot prior to
5166 * calling SENTER, but the kernel is expected to reset/tear
5167 * down the PMRs.
5169 if (intel_iommu_tboot_noforce) {
5170 for_each_iommu(iommu, drhd)
5171 iommu_disable_protect_mem_regions(iommu);
5175 * Make sure the IOMMUs are switched off, even when we
5176 * boot into a kexec kernel and the previous kernel left
5177 * them enabled
5179 intel_disable_iommus();
5180 goto out_free_dmar;
5183 if (list_empty(&dmar_rmrr_units))
5184 pr_info("No RMRR found\n");
5186 if (list_empty(&dmar_atsr_units))
5187 pr_info("No ATSR found\n");
5189 if (dmar_init_reserved_ranges()) {
5190 if (force_on)
5191 panic("tboot: Failed to reserve iommu ranges\n");
5192 goto out_free_reserved_range;
5195 if (dmar_map_gfx)
5196 intel_iommu_gfx_mapped = 1;
5198 init_no_remapping_devices();
5200 ret = init_dmars();
5201 if (ret) {
5202 if (force_on)
5203 panic("tboot: Failed to initialize DMARs\n");
5204 pr_err("Initialization failed\n");
5205 goto out_free_reserved_range;
5207 up_write(&dmar_global_lock);
5209 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
5211 * If the system has no untrusted device or the user has decided
5212 * to disable the bounce page mechanisms, we don't need swiotlb.
5213 * Mark this and the pre-allocated bounce pages will be released
5214 * later.
5216 if (!has_untrusted_dev() || intel_no_bounce)
5217 swiotlb = 0;
5218 #endif
5219 dma_ops = &intel_dma_ops;
5221 init_iommu_pm_ops();
5223 down_read(&dmar_global_lock);
5224 for_each_active_iommu(iommu, drhd) {
5225 iommu_device_sysfs_add(&iommu->iommu, NULL,
5226 intel_iommu_groups,
5227 "%s", iommu->name);
5228 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5229 iommu_device_register(&iommu->iommu);
5231 up_read(&dmar_global_lock);
5233 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5234 if (si_domain && !hw_pass_through)
5235 register_memory_notifier(&intel_iommu_memory_nb);
5236 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5237 intel_iommu_cpu_dead);
5239 down_read(&dmar_global_lock);
5240 if (probe_acpi_namespace_devices())
5241 pr_warn("ACPI name space devices didn't probe correctly\n");
5243 /* Finally, we enable the DMA remapping hardware. */
5244 for_each_iommu(iommu, drhd) {
5245 if (!drhd->ignored && !translation_pre_enabled(iommu))
5246 iommu_enable_translation(iommu);
5248 iommu_disable_protect_mem_regions(iommu);
5250 up_read(&dmar_global_lock);
5252 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5254 intel_iommu_enabled = 1;
5256 return 0;
5258 out_free_reserved_range:
5259 put_iova_domain(&reserved_iova_list);
5260 out_free_dmar:
5261 intel_iommu_free_dmars();
5262 up_write(&dmar_global_lock);
5263 iommu_exit_mempool();
5264 return ret;
5267 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5269 struct intel_iommu *iommu = opaque;
5271 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5272 return 0;
5276 * NB - intel-iommu lacks any sort of reference counting for the users of
5277 * dependent devices. If multiple endpoints have intersecting dependent
5278 * devices, unbinding the driver from any one of them will possibly leave
5279 * the others unable to operate.
5281 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5283 if (!iommu || !dev || !dev_is_pci(dev))
5284 return;
5286 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5289 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5291 struct dmar_domain *domain;
5292 struct intel_iommu *iommu;
5293 unsigned long flags;
5295 assert_spin_locked(&device_domain_lock);
5297 if (WARN_ON(!info))
5298 return;
5300 iommu = info->iommu;
5301 domain = info->domain;
5303 if (info->dev) {
5304 if (dev_is_pci(info->dev) && sm_supported(iommu))
5305 intel_pasid_tear_down_entry(iommu, info->dev,
5306 PASID_RID2PASID);
5308 iommu_disable_dev_iotlb(info);
5309 if (!dev_is_real_dma_subdevice(info->dev))
5310 domain_context_clear(iommu, info->dev);
5311 intel_pasid_free_table(info->dev);
5314 unlink_domain_info(info);
5316 spin_lock_irqsave(&iommu->lock, flags);
5317 domain_detach_iommu(domain, iommu);
5318 spin_unlock_irqrestore(&iommu->lock, flags);
5320 /* free the private domain */
5321 if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
5322 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
5323 list_empty(&domain->devices))
5324 domain_exit(info->domain);
5326 free_devinfo_mem(info);
5329 static void dmar_remove_one_dev_info(struct device *dev)
5331 struct device_domain_info *info;
5332 unsigned long flags;
5334 spin_lock_irqsave(&device_domain_lock, flags);
5335 info = dev->archdata.iommu;
5336 if (info && info != DEFER_DEVICE_DOMAIN_INFO
5337 && info != DUMMY_DEVICE_DOMAIN_INFO)
5338 __dmar_remove_one_dev_info(info);
5339 spin_unlock_irqrestore(&device_domain_lock, flags);
5342 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5344 int adjust_width;
5346 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5347 domain_reserve_special_ranges(domain);
5349 /* calculate AGAW */
5350 domain->gaw = guest_width;
5351 adjust_width = guestwidth_to_adjustwidth(guest_width);
5352 domain->agaw = width_to_agaw(adjust_width);
5354 domain->iommu_coherency = 0;
5355 domain->iommu_snooping = 0;
5356 domain->iommu_superpage = 0;
5357 domain->max_addr = 0;
5359 /* always allocate the top pgd */
5360 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5361 if (!domain->pgd)
5362 return -ENOMEM;
5363 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5364 return 0;
5367 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5369 struct dmar_domain *dmar_domain;
5370 struct iommu_domain *domain;
5371 int ret;
5373 switch (type) {
5374 case IOMMU_DOMAIN_DMA:
5375 /* fallthrough */
5376 case IOMMU_DOMAIN_UNMANAGED:
5377 dmar_domain = alloc_domain(0);
5378 if (!dmar_domain) {
5379 pr_err("Can't allocate dmar_domain\n");
5380 return NULL;
5382 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5383 pr_err("Domain initialization failed\n");
5384 domain_exit(dmar_domain);
5385 return NULL;
5388 if (!intel_iommu_strict && type == IOMMU_DOMAIN_DMA) {
5389 ret = init_iova_flush_queue(&dmar_domain->iovad,
5390 iommu_flush_iova,
5391 iova_entry_free);
5392 if (ret)
5393 pr_info("iova flush queue initialization failed\n");
5396 domain_update_iommu_cap(dmar_domain);
5398 domain = &dmar_domain->domain;
5399 domain->geometry.aperture_start = 0;
5400 domain->geometry.aperture_end =
5401 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5402 domain->geometry.force_aperture = true;
5404 return domain;
5405 case IOMMU_DOMAIN_IDENTITY:
5406 return &si_domain->domain;
5407 default:
5408 return NULL;
5411 return NULL;
5414 static void intel_iommu_domain_free(struct iommu_domain *domain)
5416 if (domain != &si_domain->domain)
5417 domain_exit(to_dmar_domain(domain));
5421 * Check whether a @domain could be attached to the @dev through the
5422 * aux-domain attach/detach APIs.
5424 static inline bool
5425 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5427 struct device_domain_info *info = dev->archdata.iommu;
5429 return info && info->auxd_enabled &&
5430 domain->type == IOMMU_DOMAIN_UNMANAGED;
5433 static void auxiliary_link_device(struct dmar_domain *domain,
5434 struct device *dev)
5436 struct device_domain_info *info = dev->archdata.iommu;
5438 assert_spin_locked(&device_domain_lock);
5439 if (WARN_ON(!info))
5440 return;
5442 domain->auxd_refcnt++;
5443 list_add(&domain->auxd, &info->auxiliary_domains);
5446 static void auxiliary_unlink_device(struct dmar_domain *domain,
5447 struct device *dev)
5449 struct device_domain_info *info = dev->archdata.iommu;
5451 assert_spin_locked(&device_domain_lock);
5452 if (WARN_ON(!info))
5453 return;
5455 list_del(&domain->auxd);
5456 domain->auxd_refcnt--;
5458 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5459 ioasid_free(domain->default_pasid);
5462 static int aux_domain_add_dev(struct dmar_domain *domain,
5463 struct device *dev)
5465 int ret;
5466 u8 bus, devfn;
5467 unsigned long flags;
5468 struct intel_iommu *iommu;
5470 iommu = device_to_iommu(dev, &bus, &devfn);
5471 if (!iommu)
5472 return -ENODEV;
5474 if (domain->default_pasid <= 0) {
5475 int pasid;
5477 /* No private data needed for the default pasid */
5478 pasid = ioasid_alloc(NULL, PASID_MIN,
5479 pci_max_pasids(to_pci_dev(dev)) - 1,
5480 NULL);
5481 if (pasid == INVALID_IOASID) {
5482 pr_err("Can't allocate default pasid\n");
5483 return -ENODEV;
5485 domain->default_pasid = pasid;
5488 spin_lock_irqsave(&device_domain_lock, flags);
5490 * iommu->lock must be held to attach domain to iommu and setup the
5491 * pasid entry for second level translation.
5493 spin_lock(&iommu->lock);
5494 ret = domain_attach_iommu(domain, iommu);
5495 if (ret)
5496 goto attach_failed;
5498 /* Setup the PASID entry for mediated devices: */
5499 if (domain_use_first_level(domain))
5500 ret = domain_setup_first_level(iommu, domain, dev,
5501 domain->default_pasid);
5502 else
5503 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5504 domain->default_pasid);
5505 if (ret)
5506 goto table_failed;
5507 spin_unlock(&iommu->lock);
5509 auxiliary_link_device(domain, dev);
5511 spin_unlock_irqrestore(&device_domain_lock, flags);
5513 return 0;
5515 table_failed:
5516 domain_detach_iommu(domain, iommu);
5517 attach_failed:
5518 spin_unlock(&iommu->lock);
5519 spin_unlock_irqrestore(&device_domain_lock, flags);
5520 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5521 ioasid_free(domain->default_pasid);
5523 return ret;
5526 static void aux_domain_remove_dev(struct dmar_domain *domain,
5527 struct device *dev)
5529 struct device_domain_info *info;
5530 struct intel_iommu *iommu;
5531 unsigned long flags;
5533 if (!is_aux_domain(dev, &domain->domain))
5534 return;
5536 spin_lock_irqsave(&device_domain_lock, flags);
5537 info = dev->archdata.iommu;
5538 iommu = info->iommu;
5540 auxiliary_unlink_device(domain, dev);
5542 spin_lock(&iommu->lock);
5543 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5544 domain_detach_iommu(domain, iommu);
5545 spin_unlock(&iommu->lock);
5547 spin_unlock_irqrestore(&device_domain_lock, flags);
5550 static int prepare_domain_attach_device(struct iommu_domain *domain,
5551 struct device *dev)
5553 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5554 struct intel_iommu *iommu;
5555 int addr_width;
5556 u8 bus, devfn;
5558 iommu = device_to_iommu(dev, &bus, &devfn);
5559 if (!iommu)
5560 return -ENODEV;
5562 /* check if this iommu agaw is sufficient for max mapped address */
5563 addr_width = agaw_to_width(iommu->agaw);
5564 if (addr_width > cap_mgaw(iommu->cap))
5565 addr_width = cap_mgaw(iommu->cap);
5567 if (dmar_domain->max_addr > (1LL << addr_width)) {
5568 dev_err(dev, "%s: iommu width (%d) is not "
5569 "sufficient for the mapped address (%llx)\n",
5570 __func__, addr_width, dmar_domain->max_addr);
5571 return -EFAULT;
5573 dmar_domain->gaw = addr_width;
5576 * Knock out extra levels of page tables if necessary
5578 while (iommu->agaw < dmar_domain->agaw) {
5579 struct dma_pte *pte;
5581 pte = dmar_domain->pgd;
5582 if (dma_pte_present(pte)) {
5583 dmar_domain->pgd = (struct dma_pte *)
5584 phys_to_virt(dma_pte_addr(pte));
5585 free_pgtable_page(pte);
5587 dmar_domain->agaw--;
5590 return 0;
5593 static int intel_iommu_attach_device(struct iommu_domain *domain,
5594 struct device *dev)
5596 int ret;
5598 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5599 device_is_rmrr_locked(dev)) {
5600 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5601 return -EPERM;
5604 if (is_aux_domain(dev, domain))
5605 return -EPERM;
5607 /* normally dev is not mapped */
5608 if (unlikely(domain_context_mapped(dev))) {
5609 struct dmar_domain *old_domain;
5611 old_domain = find_domain(dev);
5612 if (old_domain)
5613 dmar_remove_one_dev_info(dev);
5616 ret = prepare_domain_attach_device(domain, dev);
5617 if (ret)
5618 return ret;
5620 return domain_add_dev_info(to_dmar_domain(domain), dev);
5623 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5624 struct device *dev)
5626 int ret;
5628 if (!is_aux_domain(dev, domain))
5629 return -EPERM;
5631 ret = prepare_domain_attach_device(domain, dev);
5632 if (ret)
5633 return ret;
5635 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5638 static void intel_iommu_detach_device(struct iommu_domain *domain,
5639 struct device *dev)
5641 dmar_remove_one_dev_info(dev);
5644 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5645 struct device *dev)
5647 aux_domain_remove_dev(to_dmar_domain(domain), dev);
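/*
 * Illustrative sketch only (not part of the driver): how a hypothetical
 * mediated-device backend would exercise the aux-domain path above through
 * the generic IOMMU API.  'dev' is the parent PCI device; names and error
 * handling are simplified for illustration.
 */
static int example_aux_attach(struct device *dev)
{
	struct iommu_domain *domain;
	int pasid, ret;

	ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX);
	if (ret)
		return ret;

	domain = iommu_domain_alloc(dev->bus);
	if (!domain)
		return -ENOMEM;

	ret = iommu_aux_attach_device(domain, dev);
	if (ret) {
		iommu_domain_free(domain);
		return ret;
	}

	/* The default PASID tags this domain's DMA on the wire. */
	pasid = iommu_aux_get_pasid(domain, dev);

	/* ... use iommu_map()/iommu_unmap() on 'domain', then tear down: */
	iommu_aux_detach_device(domain, dev);
	iommu_domain_free(domain);
	return pasid < 0 ? pasid : 0;
}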
5650 static int intel_iommu_map(struct iommu_domain *domain,
5651 unsigned long iova, phys_addr_t hpa,
5652 size_t size, int iommu_prot, gfp_t gfp)
5654 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5655 u64 max_addr;
5656 int prot = 0;
5657 int ret;
5659 if (iommu_prot & IOMMU_READ)
5660 prot |= DMA_PTE_READ;
5661 if (iommu_prot & IOMMU_WRITE)
5662 prot |= DMA_PTE_WRITE;
5663 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5664 prot |= DMA_PTE_SNP;
5666 max_addr = iova + size;
5667 if (dmar_domain->max_addr < max_addr) {
5668 u64 end;
5670 /* check if minimum agaw is sufficient for mapped address */
5671 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5672 if (end < max_addr) {
5673 pr_err("%s: iommu width (%d) is not "
5674 "sufficient for the mapped address (%llx)\n",
5675 __func__, dmar_domain->gaw, max_addr);
5676 return -EFAULT;
5678 dmar_domain->max_addr = max_addr;
5680 /* Round up size to next multiple of PAGE_SIZE, if it and
5681 the low bits of hpa would take us onto the next page */
5682 size = aligned_nrpages(hpa, size);
5683 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5684 hpa >> VTD_PAGE_SHIFT, size, prot);
5685 return ret;
5688 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5689 unsigned long iova, size_t size,
5690 struct iommu_iotlb_gather *gather)
5692 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5693 struct page *freelist = NULL;
5694 unsigned long start_pfn, last_pfn;
5695 unsigned int npages;
5696 int iommu_id, level = 0;
5698 /* Cope with horrid API which requires us to unmap more than the
5699 size argument if it happens to be a large-page mapping. */
5700 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5702 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5703 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5705 start_pfn = iova >> VTD_PAGE_SHIFT;
5706 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5708 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5710 npages = last_pfn - start_pfn + 1;
5712 for_each_domain_iommu(iommu_id, dmar_domain)
5713 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5714 start_pfn, npages, !freelist, 0);
5716 dma_free_pagelist(freelist);
5718 if (dmar_domain->max_addr == iova + size)
5719 dmar_domain->max_addr = iova;
5721 return size;
5724 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5725 dma_addr_t iova)
5727 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5728 struct dma_pte *pte;
5729 int level = 0;
5730 u64 phys = 0;
5732 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5733 if (pte && dma_pte_present(pte))
5734 phys = dma_pte_addr(pte) +
5735 (iova & (BIT_MASK(level_to_offset_bits(level) +
5736 VTD_PAGE_SHIFT) - 1));
5738 return phys;
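/*
 * Illustrative sketch only (not part of the driver): a hypothetical caller
 * exercising the map/unmap/iova_to_phys callbacks above through the generic
 * IOMMU API on an unmanaged domain.  The IOVA and physical address are made
 * up for the example; both must be VTD_PAGE_SIZE aligned.
 */
static int example_map_one_page(struct device *dev, phys_addr_t paddr)
{
	struct iommu_domain *domain;
	const unsigned long iova = 0x100000;	/* arbitrary, page aligned */
	int ret;

	domain = iommu_domain_alloc(dev->bus);
	if (!domain)
		return -ENOMEM;

	ret = iommu_attach_device(domain, dev);
	if (ret)
		goto out_free;

	ret = iommu_map(domain, iova, paddr, VTD_PAGE_SIZE,
			IOMMU_READ | IOMMU_WRITE);
	if (!ret) {
		/* A present mapping should translate back to 'paddr'. */
		WARN_ON(iommu_iova_to_phys(domain, iova) != paddr);
		iommu_unmap(domain, iova, VTD_PAGE_SIZE);
	}

	iommu_detach_device(domain, dev);
out_free:
	iommu_domain_free(domain);
	return ret;
}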
5741 static inline bool scalable_mode_support(void)
5743 struct dmar_drhd_unit *drhd;
5744 struct intel_iommu *iommu;
5745 bool ret = true;
5747 rcu_read_lock();
5748 for_each_active_iommu(iommu, drhd) {
5749 if (!sm_supported(iommu)) {
5750 ret = false;
5751 break;
5754 rcu_read_unlock();
5756 return ret;
5759 static inline bool iommu_pasid_support(void)
5761 struct dmar_drhd_unit *drhd;
5762 struct intel_iommu *iommu;
5763 bool ret = true;
5765 rcu_read_lock();
5766 for_each_active_iommu(iommu, drhd) {
5767 if (!pasid_supported(iommu)) {
5768 ret = false;
5769 break;
5772 rcu_read_unlock();
5774 return ret;
5777 static inline bool nested_mode_support(void)
5779 struct dmar_drhd_unit *drhd;
5780 struct intel_iommu *iommu;
5781 bool ret = true;
5783 rcu_read_lock();
5784 for_each_active_iommu(iommu, drhd) {
5785 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5786 ret = false;
5787 break;
5790 rcu_read_unlock();
5792 return ret;
5795 static bool intel_iommu_capable(enum iommu_cap cap)
5797 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5798 return domain_update_iommu_snooping(NULL) == 1;
5799 if (cap == IOMMU_CAP_INTR_REMAP)
5800 return irq_remapping_enabled == 1;
5802 return false;
5805 static int intel_iommu_add_device(struct device *dev)
5807 struct dmar_domain *dmar_domain;
5808 struct iommu_domain *domain;
5809 struct intel_iommu *iommu;
5810 struct iommu_group *group;
5811 u8 bus, devfn;
5812 int ret;
5814 iommu = device_to_iommu(dev, &bus, &devfn);
5815 if (!iommu)
5816 return -ENODEV;
5818 iommu_device_link(&iommu->iommu, dev);
5820 if (translation_pre_enabled(iommu))
5821 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5823 group = iommu_group_get_for_dev(dev);
5825 if (IS_ERR(group)) {
5826 ret = PTR_ERR(group);
5827 goto unlink;
5830 iommu_group_put(group);
5832 domain = iommu_get_domain_for_dev(dev);
5833 dmar_domain = to_dmar_domain(domain);
5834 if (domain->type == IOMMU_DOMAIN_DMA) {
5835 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5836 ret = iommu_request_dm_for_dev(dev);
5837 if (ret) {
5838 dmar_remove_one_dev_info(dev);
5839 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5840 domain_add_dev_info(si_domain, dev);
5841 dev_info(dev,
5842 "Device uses a private identity domain.\n");
5845 } else {
5846 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5847 ret = iommu_request_dma_domain_for_dev(dev);
5848 if (ret) {
5849 dmar_remove_one_dev_info(dev);
5850 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5851 if (!get_private_domain_for_dev(dev)) {
5852 dev_warn(dev,
5853 "Failed to get a private domain.\n");
5854 ret = -ENOMEM;
5855 goto unlink;
5858 dev_info(dev,
5859 "Device uses a private dma domain.\n");
5864 if (device_needs_bounce(dev)) {
5865 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5866 set_dma_ops(dev, &bounce_dma_ops);
5869 return 0;
5871 unlink:
5872 iommu_device_unlink(&iommu->iommu, dev);
5873 return ret;
5876 static void intel_iommu_remove_device(struct device *dev)
5878 struct intel_iommu *iommu;
5879 u8 bus, devfn;
5881 iommu = device_to_iommu(dev, &bus, &devfn);
5882 if (!iommu)
5883 return;
5885 dmar_remove_one_dev_info(dev);
5887 iommu_group_remove_device(dev);
5889 iommu_device_unlink(&iommu->iommu, dev);
5891 if (device_needs_bounce(dev))
5892 set_dma_ops(dev, NULL);
5895 static void intel_iommu_get_resv_regions(struct device *device,
5896 struct list_head *head)
5898 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5899 struct iommu_resv_region *reg;
5900 struct dmar_rmrr_unit *rmrr;
5901 struct device *i_dev;
5902 int i;
5904 down_read(&dmar_global_lock);
5905 for_each_rmrr_units(rmrr) {
5906 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5907 i, i_dev) {
5908 struct iommu_resv_region *resv;
5909 enum iommu_resv_type type;
5910 size_t length;
5912 if (i_dev != device &&
5913 !is_downstream_to_pci_bridge(device, i_dev))
5914 continue;
5916 length = rmrr->end_address - rmrr->base_address + 1;
5918 type = device_rmrr_is_relaxable(device) ?
5919 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5921 resv = iommu_alloc_resv_region(rmrr->base_address,
5922 length, prot, type);
5923 if (!resv)
5924 break;
5926 list_add_tail(&resv->list, head);
5929 up_read(&dmar_global_lock);
5931 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5932 if (dev_is_pci(device)) {
5933 struct pci_dev *pdev = to_pci_dev(device);
5935 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5936 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5937 IOMMU_RESV_DIRECT_RELAXABLE);
5938 if (reg)
5939 list_add_tail(&reg->list, head);
5942 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5944 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5945 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5946 0, IOMMU_RESV_MSI);
5947 if (!reg)
5948 return;
5949 list_add_tail(&reg->list, head);
5952 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5954 struct device_domain_info *info;
5955 struct context_entry *context;
5956 struct dmar_domain *domain;
5957 unsigned long flags;
5958 u64 ctx_lo;
5959 int ret;
5961 domain = find_domain(dev);
5962 if (!domain)
5963 return -EINVAL;
5965 spin_lock_irqsave(&device_domain_lock, flags);
5966 spin_lock(&iommu->lock);
5968 ret = -EINVAL;
5969 info = dev->archdata.iommu;
5970 if (!info || !info->pasid_supported)
5971 goto out;
5973 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5974 if (WARN_ON(!context))
5975 goto out;
5977 ctx_lo = context[0].lo;
5979 if (!(ctx_lo & CONTEXT_PASIDE)) {
5980 ctx_lo |= CONTEXT_PASIDE;
5981 context[0].lo = ctx_lo;
5982 wmb();
5983 iommu->flush.flush_context(iommu,
5984 domain->iommu_did[iommu->seq_id],
5985 PCI_DEVID(info->bus, info->devfn),
5986 DMA_CCMD_MASK_NOBIT,
5987 DMA_CCMD_DEVICE_INVL);
5990 /* Enable PASID support in the device, if it wasn't already */
5991 if (!info->pasid_enabled)
5992 iommu_enable_dev_iotlb(info);
5994 ret = 0;
5996 out:
5997 spin_unlock(&iommu->lock);
5998 spin_unlock_irqrestore(&device_domain_lock, flags);
6000 return ret;
6003 static void intel_iommu_apply_resv_region(struct device *dev,
6004 struct iommu_domain *domain,
6005 struct iommu_resv_region *region)
6007 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6008 unsigned long start, end;
6010 start = IOVA_PFN(region->start);
6011 end = IOVA_PFN(region->start + region->length - 1);
6013 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
6016 static struct iommu_group *intel_iommu_device_group(struct device *dev)
6018 if (dev_is_pci(dev))
6019 return pci_device_group(dev);
6020 return generic_device_group(dev);
6023 #ifdef CONFIG_INTEL_IOMMU_SVM
6024 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
6026 struct intel_iommu *iommu;
6027 u8 bus, devfn;
6029 if (iommu_dummy(dev)) {
6030 dev_warn(dev,
6031 "No IOMMU translation for device; cannot enable SVM\n");
6032 return NULL;
6035 iommu = device_to_iommu(dev, &bus, &devfn);
6036 if (!iommu) {
6037 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
6038 return NULL;
6041 return iommu;
6043 #endif /* CONFIG_INTEL_IOMMU_SVM */
6045 static int intel_iommu_enable_auxd(struct device *dev)
6047 struct device_domain_info *info;
6048 struct intel_iommu *iommu;
6049 unsigned long flags;
6050 u8 bus, devfn;
6051 int ret;
6053 iommu = device_to_iommu(dev, &bus, &devfn);
6054 if (!iommu || dmar_disabled)
6055 return -EINVAL;
6057 if (!sm_supported(iommu) || !pasid_supported(iommu))
6058 return -EINVAL;
6060 ret = intel_iommu_enable_pasid(iommu, dev);
6061 if (ret)
6062 return -ENODEV;
6064 spin_lock_irqsave(&device_domain_lock, flags);
6065 info = dev->archdata.iommu;
6066 info->auxd_enabled = 1;
6067 spin_unlock_irqrestore(&device_domain_lock, flags);
6069 return 0;
6072 static int intel_iommu_disable_auxd(struct device *dev)
6074 struct device_domain_info *info;
6075 unsigned long flags;
6077 spin_lock_irqsave(&device_domain_lock, flags);
6078 info = dev->archdata.iommu;
6079 if (!WARN_ON(!info))
6080 info->auxd_enabled = 0;
6081 spin_unlock_irqrestore(&device_domain_lock, flags);
6083 return 0;
6087 * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
6088 * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
6089 * spec so that system software and tools can detect endpoint devices that
6090 * support Intel Scalable I/O Virtualization without a host driver dependency.
6092 * Returns the offset of the matching extended capability structure within
6093 * the device's PCI configuration space, or 0 if the device does not
6094 * support it.
6096 static int siov_find_pci_dvsec(struct pci_dev *pdev)
6098 int pos;
6099 u16 vendor, id;
6101 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
6102 while (pos) {
6103 pci_read_config_word(pdev, pos + 4, &vendor);
6104 pci_read_config_word(pdev, pos + 8, &id);
6105 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
6106 return pos;
6108 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
6111 return 0;
6114 static bool
6115 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
6117 if (feat == IOMMU_DEV_FEAT_AUX) {
6118 int ret;
6120 if (!dev_is_pci(dev) || dmar_disabled ||
6121 !scalable_mode_support() || !iommu_pasid_support())
6122 return false;
6124 ret = pci_pasid_features(to_pci_dev(dev));
6125 if (ret < 0)
6126 return false;
6128 return !!siov_find_pci_dvsec(to_pci_dev(dev));
6131 return false;
6134 static int
6135 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
6137 if (feat == IOMMU_DEV_FEAT_AUX)
6138 return intel_iommu_enable_auxd(dev);
6140 return -ENODEV;
6143 static int
6144 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
6146 if (feat == IOMMU_DEV_FEAT_AUX)
6147 return intel_iommu_disable_auxd(dev);
6149 return -ENODEV;
6152 static bool
6153 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
6155 struct device_domain_info *info = dev->archdata.iommu;
6157 if (feat == IOMMU_DEV_FEAT_AUX)
6158 return scalable_mode_support() && info && info->auxd_enabled;
6160 return false;
6163 static int
6164 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
6166 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6168 return dmar_domain->default_pasid > 0 ?
6169 dmar_domain->default_pasid : -EINVAL;
6172 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
6173 struct device *dev)
6175 return attach_deferred(dev);
6178 static int
6179 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6180 enum iommu_attr attr, void *data)
6182 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6183 unsigned long flags;
6184 int ret = 0;
6186 if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6187 return -EINVAL;
6189 switch (attr) {
6190 case DOMAIN_ATTR_NESTING:
6191 spin_lock_irqsave(&device_domain_lock, flags);
6192 if (nested_mode_support() &&
6193 list_empty(&dmar_domain->devices)) {
6194 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6195 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6196 } else {
6197 ret = -ENODEV;
6199 spin_unlock_irqrestore(&device_domain_lock, flags);
6200 break;
6201 default:
6202 ret = -EINVAL;
6203 break;
6206 return ret;
6209 const struct iommu_ops intel_iommu_ops = {
6210 .capable = intel_iommu_capable,
6211 .domain_alloc = intel_iommu_domain_alloc,
6212 .domain_free = intel_iommu_domain_free,
6213 .domain_set_attr = intel_iommu_domain_set_attr,
6214 .attach_dev = intel_iommu_attach_device,
6215 .detach_dev = intel_iommu_detach_device,
6216 .aux_attach_dev = intel_iommu_aux_attach_device,
6217 .aux_detach_dev = intel_iommu_aux_detach_device,
6218 .aux_get_pasid = intel_iommu_aux_get_pasid,
6219 .map = intel_iommu_map,
6220 .unmap = intel_iommu_unmap,
6221 .iova_to_phys = intel_iommu_iova_to_phys,
6222 .add_device = intel_iommu_add_device,
6223 .remove_device = intel_iommu_remove_device,
6224 .get_resv_regions = intel_iommu_get_resv_regions,
6225 .put_resv_regions = generic_iommu_put_resv_regions,
6226 .apply_resv_region = intel_iommu_apply_resv_region,
6227 .device_group = intel_iommu_device_group,
6228 .dev_has_feat = intel_iommu_dev_has_feat,
6229 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
6230 .dev_enable_feat = intel_iommu_dev_enable_feat,
6231 .dev_disable_feat = intel_iommu_dev_disable_feat,
6232 .is_attach_deferred = intel_iommu_is_attach_deferred,
6233 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
6236 static void quirk_iommu_igfx(struct pci_dev *dev)
6238 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6239 dmar_map_gfx = 0;
6242 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6243 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6244 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6245 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6246 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6247 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6248 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6249 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6251 /* Broadwell igfx malfunctions with dmar */
6252 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6253 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6254 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6255 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6256 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6257 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6258 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6259 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6260 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6261 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6262 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6263 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6264 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6265 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6266 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6267 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6268 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6269 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6270 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6271 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6272 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6273 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6274 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6275 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6277 static void quirk_iommu_rwbf(struct pci_dev *dev)
6280 * Mobile 4 Series Chipset neglects to set RWBF capability,
6281 * but needs it. Same seems to hold for the desktop versions.
6283 pci_info(dev, "Forcing write-buffer flush capability\n");
6284 rwbf_quirk = 1;
6287 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6288 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6289 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6290 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6291 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6292 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6293 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6295 #define GGC 0x52
6296 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
6297 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
6298 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
6299 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
6300 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
6301 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
6302 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
6303 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
6305 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6307 unsigned short ggc;
6309 if (pci_read_config_word(dev, GGC, &ggc))
6310 return;
6312 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6313 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6314 dmar_map_gfx = 0;
6315 } else if (dmar_map_gfx) {
6316 /* we have to ensure the gfx device is idle before we flush */
6317 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6318 intel_iommu_strict = 1;
6321 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6322 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6323 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6324 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6326 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6327 ISOCH DMAR unit for the Azalia sound device, but not give it any
6328 TLB entries, which causes it to deadlock. Check for that. We do
6329 this in a function called from init_dmars(), instead of in a PCI
6330 quirk, because we don't want to print the obnoxious "BIOS broken"
6331 message if VT-d is actually disabled.
6333 static void __init check_tylersburg_isoch(void)
6335 struct pci_dev *pdev;
6336 uint32_t vtisochctrl;
6338 /* If there's no Azalia in the system anyway, forget it. */
6339 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6340 if (!pdev)
6341 return;
6342 pci_dev_put(pdev);
6344 /* System Management Registers. Might be hidden, in which case
6345 we can't do the sanity check. But that's OK, because the
6346 known-broken BIOSes _don't_ actually hide it, so far. */
6347 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6348 if (!pdev)
6349 return;
6351 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6352 pci_dev_put(pdev);
6353 return;
6356 pci_dev_put(pdev);
6358 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6359 if (vtisochctrl & 1)
6360 return;
6362 /* Drop all bits other than the number of TLB entries */
6363 vtisochctrl &= 0x1c;
6365 /* If we have the recommended number of TLB entries (16), fine. */
6366 if (vtisochctrl == 0x10)
6367 return;
6369 /* Zero TLB entries? That is clearly broken. */
6370 if (!vtisochctrl) {
6371 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6372 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6373 dmi_get_system_info(DMI_BIOS_VENDOR),
6374 dmi_get_system_info(DMI_BIOS_VERSION),
6375 dmi_get_system_info(DMI_PRODUCT_VERSION));
6376 iommu_identity_mapping |= IDENTMAP_AZALIA;
6377 return;
6380 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6381 vtisochctrl);