drivers/iommu/intel-iommu.c

   1 /*
   2  * Copyright © 2006-2014 Intel Corporation.
   3  *
   4  * This program is free software; you can redistribute it and/or modify it
   5  * under the terms and conditions of the GNU General Public License,
   6  * version 2, as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope it will be useful, but WITHOUT
   9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11  * more details.
  12  *
  13  * Authors: David Woodhouse <dwmw2@infradead.org>,
  14  *          Ashok Raj <ashok.raj@intel.com>,
  15  *          Shaohua Li <shaohua.li@intel.com>,
  16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
  17  *          Fenghua Yu <fenghua.yu@intel.com>
  18  *          Joerg Roedel <jroedel@suse.de>
  19  */
  20
  21 #define pr_fmt(fmt)     "DMAR: " fmt
  22
  23 #include <linux/init.h>
  24 #include <linux/bitmap.h>
  25 #include <linux/debugfs.h>
  26 #include <linux/export.h>
  27 #include <linux/slab.h>
  28 #include <linux/irq.h>
  29 #include <linux/interrupt.h>
  30 #include <linux/spinlock.h>
  31 #include <linux/pci.h>
  32 #include <linux/dmar.h>
  33 #include <linux/dma-mapping.h>
  34 #include <linux/mempool.h>
  35 #include <linux/memory.h>
  36 #include <linux/cpu.h>
  37 #include <linux/timer.h>
  38 #include <linux/io.h>
  39 #include <linux/iova.h>
  40 #include <linux/iommu.h>
  41 #include <linux/intel-iommu.h>
  42 #include <linux/syscore_ops.h>
  43 #include <linux/tboot.h>
  44 #include <linux/dmi.h>
  45 #include <linux/pci-ats.h>
  46 #include <linux/memblock.h>
  47 #include <linux/dma-contiguous.h>
  48 #include <linux/dma-direct.h>
  49 #include <linux/crash_dump.h>
  50 #include <asm/irq_remapping.h>
  51 #include <asm/cacheflush.h>
  52 #include <asm/iommu.h>
  53
  54 #include "irq_remapping.h"
  55 #include "intel-pasid.h"
  56
  57 #define ROOT_SIZE               VTD_PAGE_SIZE
  58 #define CONTEXT_SIZE            VTD_PAGE_SIZE
  59
  60 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  61 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
  62 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  63 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  64
  65 #define IOAPIC_RANGE_START      (0xfee00000)
  66 #define IOAPIC_RANGE_END        (0xfeefffff)
  67 #define IOVA_START_ADDR         (0x1000)
  68
  69 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
  70
  71 #define MAX_AGAW_WIDTH 64
  72 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
  73
  74 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  75 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  76
  77 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  78    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  79 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
  80                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  81 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  82
  83 /* IO virtual address start page frame number */
  84 #define IOVA_START_PFN          (1)
  85
  86 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
  87
  88 /* page table handling */
  89 #define LEVEL_STRIDE            (9)
  90 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
  91
/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is an order of a 4KiB page and that the
 * mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are an order of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
 108 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
 109
 110 static inline int agaw_to_level(int agaw)
 111 {
 112         return agaw + 2;
 113 }
 114
 115 static inline int agaw_to_width(int agaw)
 116 {
 117         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
 118 }
 119
 120 static inline int width_to_agaw(int width)
 121 {
 122         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
 123 }
 124
 125 static inline unsigned int level_to_offset_bits(int level)
 126 {
 127         return (level - 1) * LEVEL_STRIDE;
 128 }
 129
 130 static inline int pfn_level_offset(unsigned long pfn, int level)
 131 {
 132         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 133 }
 134
 135 static inline unsigned long level_mask(int level)
 136 {
 137         return -1UL << level_to_offset_bits(level);
 138 }
 139
 140 static inline unsigned long level_size(int level)
 141 {
 142         return 1UL << level_to_offset_bits(level);
 143 }
 144
 145 static inline unsigned long align_to_level(unsigned long pfn, int level)
 146 {
 147         return (pfn + level_size(level) - 1) & level_mask(level);
 148 }
 149
 150 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 151 {
 152         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
 153 }
 154
 155 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
 156    are never going to work. */
 157 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 158 {
 159         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 160 }
 161
 162 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 163 {
 164         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 165 }
 166 static inline unsigned long page_to_dma_pfn(struct page *pg)
 167 {
 168         return mm_to_dma_pfn(page_to_pfn(pg));
 169 }
 170 static inline unsigned long virt_to_dma_pfn(void *p)
 171 {
 172         return page_to_dma_pfn(virt_to_page(p));
 173 }
 174
 175 /* global iommu list, set NULL for ignored DMAR units */
 176 static struct intel_iommu **g_iommus;
 177
 178 static void __init check_tylersburg_isoch(void);
 179 static int rwbf_quirk;
 180
 181 /*
 182  * set to 1 to panic kernel if can't successfully enable VT-d
 183  * (used when kernel is launched w/ TXT)
 184  */
 185 static int force_on = 0;
 186 int intel_iommu_tboot_noforce;
 187
/*
 * Root-table entry, stored as two 64-bit words.
 *
 * 0: Present
 * 1-11: Reserved
 * 12-63: Context Ptr (12 - (haw-1))
 * 64-127: Reserved
 *
 * NOTE(review): root_entry_uctp() in this file also interprets bit 0 of
 * @hi as a present flag with a page-aligned pointer above it, so the
 * "64-127: Reserved" line presumably describes only the non-extended
 * layout -- confirm against the VT-d specification.
 */
struct root_entry {
	u64	lo;	/* bits 0-63 */
	u64	hi;	/* bits 64-127 */
};
 198 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 199
 200 /*
 201  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 202  * if marked present.
 203  */
 204 static phys_addr_t root_entry_lctp(struct root_entry *re)
 205 {
 206         if (!(re->lo & 1))
 207                 return 0;
 208
 209         return re->lo & VTD_PAGE_MASK;
 210 }
 211
 212 /*
 213  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 214  * if marked present.
 215  */
 216 static phys_addr_t root_entry_uctp(struct root_entry *re)
 217 {
 218         if (!(re->hi & 1))
 219                 return 0;
 220
 221         return re->hi & VTD_PAGE_MASK;
 222 }
/*
 * Context-table entry, stored as two 64-bit words.
 *
 * low 64 bits:
 * 0: present
 * 1: fault processing disable
 * 2-3: translation type
 * 12-63: address space root
 * high 64 bits:
 * 0-2: address width
 * 3-6: aval
 * 8-23: domain id
 *
 * The helpers below additionally use bit 11 of the low word as the
 * PASID-enable flag and bit 3 of the high word as a "copied" marker.
 */
struct context_entry {
	u64 lo;	/* low 64 bits */
	u64 hi;	/* high 64 bits */
};
 238
 239 static inline void context_clear_pasid_enable(struct context_entry *context)
 240 {
 241         context->lo &= ~(1ULL << 11);
 242 }
 243
 244 static inline bool context_pasid_enabled(struct context_entry *context)
 245 {
 246         return !!(context->lo & (1ULL << 11));
 247 }
 248
 249 static inline void context_set_copied(struct context_entry *context)
 250 {
 251         context->hi |= (1ull << 3);
 252 }
 253
 254 static inline bool context_copied(struct context_entry *context)
 255 {
 256         return !!(context->hi & (1ULL << 3));
 257 }
 258
 259 static inline bool __context_present(struct context_entry *context)
 260 {
 261         return (context->lo & 1);
 262 }
 263
 264 static inline bool context_present(struct context_entry *context)
 265 {
 266         return context_pasid_enabled(context) ?
 267              __context_present(context) :
 268              __context_present(context) && !context_copied(context);
 269 }
 270
 271 static inline void context_set_present(struct context_entry *context)
 272 {
 273         context->lo |= 1;
 274 }
 275
 276 static inline void context_set_fault_enable(struct context_entry *context)
 277 {
 278         context->lo &= (((u64)-1) << 2) | 1;
 279 }
 280
 281 static inline void context_set_translation_type(struct context_entry *context,
 282                                                 unsigned long value)
 283 {
 284         context->lo &= (((u64)-1) << 4) | 3;
 285         context->lo |= (value & 3) << 2;
 286 }
 287
 288 static inline void context_set_address_root(struct context_entry *context,
 289                                             unsigned long value)
 290 {
 291         context->lo &= ~VTD_PAGE_MASK;
 292         context->lo |= value & VTD_PAGE_MASK;
 293 }
 294
 295 static inline void context_set_address_width(struct context_entry *context,
 296                                              unsigned long value)
 297 {
 298         context->hi |= value & 7;
 299 }
 300
 301 static inline void context_set_domain_id(struct context_entry *context,
 302                                          unsigned long value)
 303 {
 304         context->hi |= (value & ((1 << 16) - 1)) << 8;
 305 }
 306
 307 static inline int context_domain_id(struct context_entry *c)
 308 {
 309         return((c->hi >> 8) & 0xffff);
 310 }
 311
 312 static inline void context_clear_entry(struct context_entry *context)
 313 {
 314         context->lo = 0;
 315         context->hi = 0;
 316 }
 317
/*
 * DMA page-table entry, a single 64-bit word:
 *
 * 0: readable
 * 1: writable
 * 2-6: reserved
 * 7: super page
 * 8-10: available
 * 11: snoop behavior
 * 12-63: Host physical address
 */
struct dma_pte {
	u64 val;	/* raw entry value, laid out as described above */
};
 330
 331 static inline void dma_clear_pte(struct dma_pte *pte)
 332 {
 333         pte->val = 0;
 334 }
 335
 336 static inline u64 dma_pte_addr(struct dma_pte *pte)
 337 {
 338 #ifdef CONFIG_64BIT
 339         return pte->val & VTD_PAGE_MASK;
 340 #else
 341         /* Must have a full atomic 64-bit read */
 342         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
 343 #endif
 344 }
 345
 346 static inline bool dma_pte_present(struct dma_pte *pte)
 347 {
 348         return (pte->val & 3) != 0;
 349 }
 350
 351 static inline bool dma_pte_superpage(struct dma_pte *pte)
 352 {
 353         return (pte->val & DMA_PTE_LARGE_PAGE);
 354 }
 355
 356 static inline int first_pte_in_page(struct dma_pte *pte)
 357 {
 358         return !((unsigned long)pte & ~VTD_PAGE_MASK);
 359 }
 360
/*
 * This domain is a static identity-mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
 367 static struct dmar_domain *si_domain;
 368 static int hw_pass_through = 1;
 369
 370 /*
 371  * Domain represents a virtual machine, more than one devices
 372  * across iommus may be owned in one domain, e.g. kvm guest.
 373  */
 374 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
 375
/* si_domain contains multiple devices */
 377 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
 378
 379 #define for_each_domain_iommu(idx, domain)                      \
 380         for (idx = 0; idx < g_num_of_iommus; idx++)             \
 381                 if (domain->iommu_refcnt[idx])
 382
/*
 * One RMRR unit; instances are linked into the dmar_rmrr_units list and
 * walked with for_each_rmrr_units().
 */
struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};
 391
/* One ATSR unit; instances are linked into the dmar_atsr_units list. */
struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};
 399
 400 static LIST_HEAD(dmar_atsr_units);
 401 static LIST_HEAD(dmar_rmrr_units);
 402
 403 #define for_each_rmrr_units(rmrr) \
 404         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
 405
 406 /* bitmap for indexing intel_iommus */
 407 static int g_num_of_iommus;
 408
 409 static void domain_exit(struct dmar_domain *domain);
 410 static void domain_remove_dev_info(struct dmar_domain *domain);
 411 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
 412                                      struct device *dev);
 413 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
 414 static void domain_context_clear(struct intel_iommu *iommu,
 415                                  struct device *dev);
 416 static int domain_detach_iommu(struct dmar_domain *domain,
 417                                struct intel_iommu *iommu);
 418
 419 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
 420 int dmar_disabled = 0;
 421 #else
 422 int dmar_disabled = 1;
 423 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
 424
 425 int intel_iommu_enabled = 0;
 426 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 427
 428 static int dmar_map_gfx = 1;
 429 static int dmar_forcedac;
 430 static int intel_iommu_strict;
 431 static int intel_iommu_superpage = 1;
 432 static int intel_iommu_ecs = 1;
 433 static int intel_iommu_pasid28;
 434 static int iommu_identity_mapping;
 435
 436 #define IDENTMAP_ALL            1
 437 #define IDENTMAP_GFX            2
 438 #define IDENTMAP_AZALIA         4
 439
/* Broadwell and Skylake have broken ECS support — normal so-called "second
 * level" translation of DMA requests-without-PASID doesn't actually happen
 * unless you also set the NESTE bit in an extended context-entry. Which of
 * course means that SVM doesn't work because it's trying to do nested
 * translation of the physical addresses it finds in the process page tables,
 * through the IOVA->phys mapping found in the "second level" page tables.
 *
 * The VT-d specification was retroactively changed to change the definition
 * of the capability bits and pretend that Broadwell/Skylake never happened...
 * but unfortunately the wrong bit was changed. It's ECS which is broken, but
 * for some reason it was the PASID capability bit which was redefined (from
 * bit 28 on BDW/SKL to bit 40 in future).
 *
 * So our test for ECS needs to eschew those implementations which set the old
 * PASID capability bit 28, since those are the ones on which ECS is broken.
 * Unless we are working around the 'pasid28' limitations, that is, by putting
 * the device into passthrough mode for normal DMA and thus masking the bug.
 */
 458 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
 459                             (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap)))
 460 /* PASID support is thus enabled if ECS is enabled and *either* of the old
 461  * or new capability bits are set. */
 462 #define pasid_enabled(iommu) (ecs_enabled(iommu) &&                     \
 463                               (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap)))
 464
 465 int intel_iommu_gfx_mapped;
 466 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 467
 468 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 469 static DEFINE_SPINLOCK(device_domain_lock);
 470 static LIST_HEAD(device_domain_list);
 471
 472 /*
 473  * Iterate over elements in device_domain_list and call the specified
 474  * callback @fn against each element. This helper should only be used
 475  * in the context where the device_domain_lock has already been holden.
 476  */
 477 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
 478                                      void *data), void *data)
 479 {
 480         int ret = 0;
 481         struct device_domain_info *info;
 482
 483         assert_spin_locked(&device_domain_lock);
 484         list_for_each_entry(info, &device_domain_list, global) {
 485                 ret = fn(info, data);
 486                 if (ret)
 487                         return ret;
 488         }
 489
 490         return 0;
 491 }
 492
 493 const struct iommu_ops intel_iommu_ops;
 494
 495 static bool translation_pre_enabled(struct intel_iommu *iommu)
 496 {
 497         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
 498 }
 499
 500 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
 501 {
 502         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
 503 }
 504
 505 static void init_translation_status(struct intel_iommu *iommu)
 506 {
 507         u32 gsts;
 508
 509         gsts = readl(iommu->reg + DMAR_GSTS_REG);
 510         if (gsts & DMA_GSTS_TES)
 511                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
 512 }
 513
 514 /* Convert generic 'struct iommu_domain to private struct dmar_domain */
 515 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
 516 {
 517         return container_of(dom, struct dmar_domain, domain);
 518 }
 519
 520 static int __init intel_iommu_setup(char *str)
 521 {
 522         if (!str)
 523                 return -EINVAL;
 524         while (*str) {
 525                 if (!strncmp(str, "on", 2)) {
 526                         dmar_disabled = 0;
 527                         pr_info("IOMMU enabled\n");
 528                 } else if (!strncmp(str, "off", 3)) {
 529                         dmar_disabled = 1;
 530                         pr_info("IOMMU disabled\n");
 531                 } else if (!strncmp(str, "igfx_off", 8)) {
 532                         dmar_map_gfx = 0;
 533                         pr_info("Disable GFX device mapping\n");
 534                 } else if (!strncmp(str, "forcedac", 8)) {
 535                         pr_info("Forcing DAC for PCI devices\n");
 536                         dmar_forcedac = 1;
 537                 } else if (!strncmp(str, "strict", 6)) {
 538                         pr_info("Disable batched IOTLB flush\n");
 539                         intel_iommu_strict = 1;
 540                 } else if (!strncmp(str, "sp_off", 6)) {
 541                         pr_info("Disable supported super page\n");
 542                         intel_iommu_superpage = 0;
 543                 } else if (!strncmp(str, "ecs_off", 7)) {
 544                         printk(KERN_INFO
 545                                 "Intel-IOMMU: disable extended context table support\n");
 546                         intel_iommu_ecs = 0;
 547                 } else if (!strncmp(str, "pasid28", 7)) {
 548                         printk(KERN_INFO
 549                                 "Intel-IOMMU: enable pre-production PASID support\n");
 550                         intel_iommu_pasid28 = 1;
 551                         iommu_identity_mapping |= IDENTMAP_GFX;
 552                 } else if (!strncmp(str, "tboot_noforce", 13)) {
 553                         printk(KERN_INFO
 554                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
 555                         intel_iommu_tboot_noforce = 1;
 556                 }
 557
 558                 str += strcspn(str, ",");
 559                 while (*str == ',')
 560                         str++;
 561         }
 562         return 0;
 563 }
 564 __setup("intel_iommu=", intel_iommu_setup);
 565
 566 static struct kmem_cache *iommu_domain_cache;
 567 static struct kmem_cache *iommu_devinfo_cache;
 568
 569 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
 570 {
 571         struct dmar_domain **domains;
 572         int idx = did >> 8;
 573
 574         domains = iommu->domains[idx];
 575         if (!domains)
 576                 return NULL;
 577
 578         return domains[did & 0xff];
 579 }
 580
 581 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
 582                              struct dmar_domain *domain)
 583 {
 584         struct dmar_domain **domains;
 585         int idx = did >> 8;
 586
 587         if (!iommu->domains[idx]) {
 588                 size_t size = 256 * sizeof(struct dmar_domain *);
 589                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
 590         }
 591
 592         domains = iommu->domains[idx];
 593         if (WARN_ON(!domains))
 594                 return;
 595         else
 596                 domains[did & 0xff] = domain;
 597 }
 598
 599 void *alloc_pgtable_page(int node)
 600 {
 601         struct page *page;
 602         void *vaddr = NULL;
 603
 604         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 605         if (page)
 606                 vaddr = page_address(page);
 607         return vaddr;
 608 }
 609
 610 void free_pgtable_page(void *vaddr)
 611 {
 612         free_page((unsigned long)vaddr);
 613 }
 614
 615 static inline void *alloc_domain_mem(void)
 616 {
 617         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 618 }
 619
 620 static void free_domain_mem(void *vaddr)
 621 {
 622         kmem_cache_free(iommu_domain_cache, vaddr);
 623 }
 624
 625 static inline void * alloc_devinfo_mem(void)
 626 {
 627         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 628 }
 629
 630 static inline void free_devinfo_mem(void *vaddr)
 631 {
 632         kmem_cache_free(iommu_devinfo_cache, vaddr);
 633 }
 634
 635 static inline int domain_type_is_vm(struct dmar_domain *domain)
 636 {
 637         return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
 638 }
 639
 640 static inline int domain_type_is_si(struct dmar_domain *domain)
 641 {
 642         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
 643 }
 644
 645 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
 646 {
 647         return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
 648                                 DOMAIN_FLAG_STATIC_IDENTITY);
 649 }
 650
 651 static inline int domain_pfn_supported(struct dmar_domain *domain,
 652                                        unsigned long pfn)
 653 {
 654         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 655
 656         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
 657 }
 658
 659 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 660 {
 661         unsigned long sagaw;
 662         int agaw = -1;
 663
 664         sagaw = cap_sagaw(iommu->cap);
 665         for (agaw = width_to_agaw(max_gaw);
 666              agaw >= 0; agaw--) {
 667                 if (test_bit(agaw, &sagaw))
 668                         break;
 669         }
 670
 671         return agaw;
 672 }
 673
 674 /*
 675  * Calculate max SAGAW for each iommu.
 676  */
 677 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 678 {
 679         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 680 }
 681
 682 /*
 683  * calculate agaw for each iommu.
 684  * "SAGAW" may be different across iommus, use a default agaw, and
 685  * get a supported less agaw for iommus that don't support the default agaw.
 686  */
 687 int iommu_calculate_agaw(struct intel_iommu *iommu)
 688 {
 689         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 690 }
 691
 692 /* This functionin only returns single iommu in a domain */
 693 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 694 {
 695         int iommu_id;
 696
 697         /* si_domain and vm domain should not get here. */
 698         BUG_ON(domain_type_is_vm_or_si(domain));
 699         for_each_domain_iommu(iommu_id, domain)
 700                 break;
 701
 702         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 703                 return NULL;
 704
 705         return g_iommus[iommu_id];
 706 }
 707
 708 static void domain_update_iommu_coherency(struct dmar_domain *domain)
 709 {
 710         struct dmar_drhd_unit *drhd;
 711         struct intel_iommu *iommu;
 712         bool found = false;
 713         int i;
 714
 715         domain->iommu_coherency = 1;
 716
 717         for_each_domain_iommu(i, domain) {
 718                 found = true;
 719                 if (!ecap_coherent(g_iommus[i]->ecap)) {
 720                         domain->iommu_coherency = 0;
 721                         break;
 722                 }
 723         }
 724         if (found)
 725                 return;
 726
 727         /* No hardware attached; use lowest common denominator */
 728         rcu_read_lock();
 729         for_each_active_iommu(iommu, drhd) {
 730                 if (!ecap_coherent(iommu->ecap)) {
 731                         domain->iommu_coherency = 0;
 732                         break;
 733                 }
 734         }
 735         rcu_read_unlock();
 736 }
 737
 738 static int domain_update_iommu_snooping(struct intel_iommu *skip)
 739 {
 740         struct dmar_drhd_unit *drhd;
 741         struct intel_iommu *iommu;
 742         int ret = 1;
 743
 744         rcu_read_lock();
 745         for_each_active_iommu(iommu, drhd) {
 746                 if (iommu != skip) {
 747                         if (!ecap_sc_support(iommu->ecap)) {
 748                                 ret = 0;
 749                                 break;
 750                         }
 751                 }
 752         }
 753         rcu_read_unlock();
 754
 755         return ret;
 756 }
 757
 758 static int domain_update_iommu_superpage(struct intel_iommu *skip)
 759 {
 760         struct dmar_drhd_unit *drhd;
 761         struct intel_iommu *iommu;
 762         int mask = 0xf;
 763
 764         if (!intel_iommu_superpage) {
 765                 return 0;
 766         }
 767
 768         /* set iommu_superpage to the smallest common denominator */
 769         rcu_read_lock();
 770         for_each_active_iommu(iommu, drhd) {
 771                 if (iommu != skip) {
 772                         mask &= cap_super_page_val(iommu->cap);
 773                         if (!mask)
 774                                 break;
 775                 }
 776         }
 777         rcu_read_unlock();
 778
 779         return fls(mask);
 780 }
 781
 782 /* Some capabilities may be different across iommus */
 783 static void domain_update_iommu_cap(struct dmar_domain *domain)
 784 {
 785         domain_update_iommu_coherency(domain);
 786         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
 787         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
 788 }
 789
 790 static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
 791                                                        u8 bus, u8 devfn, int alloc)
 792 {
 793         struct root_entry *root = &iommu->root_entry[bus];
 794         struct context_entry *context;
 795         u64 *entry;
 796
 797         entry = &root->lo;
 798         if (ecs_enabled(iommu)) {
 799                 if (devfn >= 0x80) {
 800                         devfn -= 0x80;
 801                         entry = &root->hi;
 802                 }
 803                 devfn *= 2;
 804         }
 805         if (*entry & 1)
 806                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
 807         else {
 808                 unsigned long phy_addr;
 809                 if (!alloc)
 810                         return NULL;
 811
 812                 context = alloc_pgtable_page(iommu->node);
 813                 if (!context)
 814                         return NULL;
 815
 816                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 817                 phy_addr = virt_to_phys((void *)context);
 818                 *entry = phy_addr | 1;
 819                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
 820         }
 821         return &context[devfn];
 822 }
 823
 824 static int iommu_dummy(struct device *dev)
 825 {
 826         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
 827 }
 828
 829 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
 830 {
 831         struct dmar_drhd_unit *drhd = NULL;
 832         struct intel_iommu *iommu;
 833         struct device *tmp;
 834         struct pci_dev *ptmp, *pdev = NULL;
 835         u16 segment = 0;
 836         int i;
 837
 838         if (iommu_dummy(dev))
 839                 return NULL;
 840
 841         if (dev_is_pci(dev)) {
 842                 struct pci_dev *pf_pdev;
 843
 844                 pdev = to_pci_dev(dev);
 845
 846 #ifdef CONFIG_X86
 847                 /* VMD child devices currently cannot be handled individually */
 848                 if (is_vmd(pdev->bus))
 849                         return NULL;
 850 #endif
 851
 852                 /* VFs aren't listed in scope tables; we need to look up
 853                  * the PF instead to find the IOMMU. */
 854                 pf_pdev = pci_physfn(pdev);
 855                 dev = &pf_pdev->dev;
 856                 segment = pci_domain_nr(pdev->bus);
 857         } else if (has_acpi_companion(dev))
 858                 dev = &ACPI_COMPANION(dev)->dev;
 859
 860         rcu_read_lock();
 861         for_each_active_iommu(iommu, drhd) {
 862                 if (pdev && segment != drhd->segment)
 863                         continue;
 864
 865                 for_each_active_dev_scope(drhd->devices,
 866                                           drhd->devices_cnt, i, tmp) {
 867                         if (tmp == dev) {
 868                                 /* For a VF use its original BDF# not that of the PF
 869                                  * which we used for the IOMMU lookup. Strictly speaking
 870                                  * we could do this for all PCI devices; we only need to
 871                                  * get the BDF# from the scope table for ACPI matches. */
 872                                 if (pdev && pdev->is_virtfn)
 873                                         goto got_pdev;
 874
 875                                 *bus = drhd->devices[i].bus;
 876                                 *devfn = drhd->devices[i].devfn;
 877                                 goto out;
 878                         }
 879
 880                         if (!pdev || !dev_is_pci(tmp))
 881                                 continue;
 882
 883                         ptmp = to_pci_dev(tmp);
 884                         if (ptmp->subordinate &&
 885                             ptmp->subordinate->number <= pdev->bus->number &&
 886                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
 887                                 goto got_pdev;
 888                 }
 889
 890                 if (pdev && drhd->include_all) {
 891                 got_pdev:
 892                         *bus = pdev->bus->number;
 893                         *devfn = pdev->devfn;
 894                         goto out;
 895                 }
 896         }
 897         iommu = NULL;
 898  out:
 899         rcu_read_unlock();
 900
 901         return iommu;
 902 }
 903
 904 static void domain_flush_cache(struct dmar_domain *domain,
 905                                void *addr, int size)
 906 {
 907         if (!domain->iommu_coherency)
 908                 clflush_cache_range(addr, size);
 909 }
 910
 911 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 912 {
 913         struct context_entry *context;
 914         int ret = 0;
 915         unsigned long flags;
 916
 917         spin_lock_irqsave(&iommu->lock, flags);
 918         context = iommu_context_addr(iommu, bus, devfn, 0);
 919         if (context)
 920                 ret = context_present(context);
 921         spin_unlock_irqrestore(&iommu->lock, flags);
 922         return ret;
 923 }
 924
 925 static void free_context_table(struct intel_iommu *iommu)
 926 {
 927         int i;
 928         unsigned long flags;
 929         struct context_entry *context;
 930
 931         spin_lock_irqsave(&iommu->lock, flags);
 932         if (!iommu->root_entry) {
 933                 goto out;
 934         }
 935         for (i = 0; i < ROOT_ENTRY_NR; i++) {
 936                 context = iommu_context_addr(iommu, i, 0, 0);
 937                 if (context)
 938                         free_pgtable_page(context);
 939
 940                 if (!ecs_enabled(iommu))
 941                         continue;
 942
 943                 context = iommu_context_addr(iommu, i, 0x80, 0);
 944                 if (context)
 945                         free_pgtable_page(context);
 946
 947         }
 948         free_pgtable_page(iommu->root_entry);
 949         iommu->root_entry = NULL;
 950 out:
 951         spin_unlock_irqrestore(&iommu->lock, flags);
 952 }
 953
 954 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 955                                       unsigned long pfn, int *target_level)
 956 {
 957         struct dma_pte *parent, *pte = NULL;
 958         int level = agaw_to_level(domain->agaw);
 959         int offset;
 960
 961         BUG_ON(!domain->pgd);
 962
 963         if (!domain_pfn_supported(domain, pfn))
 964                 /* Address beyond IOMMU's addressing capabilities. */
 965                 return NULL;
 966
 967         parent = domain->pgd;
 968
 969         while (1) {
 970                 void *tmp_page;
 971
 972                 offset = pfn_level_offset(pfn, level);
 973                 pte = &parent[offset];
 974                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
 975                         break;
 976                 if (level == *target_level)
 977                         break;
 978
 979                 if (!dma_pte_present(pte)) {
 980                         uint64_t pteval;
 981
 982                         tmp_page = alloc_pgtable_page(domain->nid);
 983
 984                         if (!tmp_page)
 985                                 return NULL;
 986
 987                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
 988                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
 989                         if (cmpxchg64(&pte->val, 0ULL, pteval))
 990                                 /* Someone else set it while we were thinking; use theirs. */
 991                                 free_pgtable_page(tmp_page);
 992                         else
 993                                 domain_flush_cache(domain, pte, sizeof(*pte));
 994                 }
 995                 if (level == 1)
 996                         break;
 997
 998                 parent = phys_to_virt(dma_pte_addr(pte));
 999                 level--;
1000         }
1001
1002         if (!*target_level)
1003                 *target_level = level;
1004
1005         return pte;
1006 }
1007
1008
1009 /* return address's pte at specific level */
1010 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1011                                          unsigned long pfn,
1012                                          int level, int *large_page)
1013 {
1014         struct dma_pte *parent, *pte = NULL;
1015         int total = agaw_to_level(domain->agaw);
1016         int offset;
1017
1018         parent = domain->pgd;
1019         while (level <= total) {
1020                 offset = pfn_level_offset(pfn, total);
1021                 pte = &parent[offset];
1022                 if (level == total)
1023                         return pte;
1024
1025                 if (!dma_pte_present(pte)) {
1026                         *large_page = total;
1027                         break;
1028                 }
1029
1030                 if (dma_pte_superpage(pte)) {
1031                         *large_page = total;
1032                         return pte;
1033                 }
1034
1035                 parent = phys_to_virt(dma_pte_addr(pte));
1036                 total--;
1037         }
1038         return NULL;
1039 }
1040
1041 /* clear last level pte, a tlb flush should be followed */
1042 static void dma_pte_clear_range(struct dmar_domain *domain,
1043                                 unsigned long start_pfn,
1044                                 unsigned long last_pfn)
1045 {
1046         unsigned int large_page = 1;
1047         struct dma_pte *first_pte, *pte;
1048
1049         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1050         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1051         BUG_ON(start_pfn > last_pfn);
1052
1053         /* we don't need lock here; nobody else touches the iova range */
1054         do {
1055                 large_page = 1;
1056                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1057                 if (!pte) {
1058                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1059                         continue;
1060                 }
1061                 do {
1062                         dma_clear_pte(pte);
1063                         start_pfn += lvl_to_nr_pages(large_page);
1064                         pte++;
1065                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1066
1067                 domain_flush_cache(domain, first_pte,
1068                                    (void *)pte - (void *)first_pte);
1069
1070         } while (start_pfn && start_pfn <= last_pfn);
1071 }
1072
1073 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1074                                int retain_level, struct dma_pte *pte,
1075                                unsigned long pfn, unsigned long start_pfn,
1076                                unsigned long last_pfn)
1077 {
1078         pfn = max(start_pfn, pfn);
1079         pte = &pte[pfn_level_offset(pfn, level)];
1080
1081         do {
1082                 unsigned long level_pfn;
1083                 struct dma_pte *level_pte;
1084
1085                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1086                         goto next;
1087
1088                 level_pfn = pfn & level_mask(level);
1089                 level_pte = phys_to_virt(dma_pte_addr(pte));
1090
1091                 if (level > 2) {
1092                         dma_pte_free_level(domain, level - 1, retain_level,
1093                                            level_pte, level_pfn, start_pfn,
1094                                            last_pfn);
1095                 }
1096
1097                 /*
1098                  * Free the page table if we're below the level we want to
1099                  * retain and the range covers the entire table.
1100                  */
1101                 if (level < retain_level && !(start_pfn > level_pfn ||
1102                       last_pfn < level_pfn + level_size(level) - 1)) {
1103                         dma_clear_pte(pte);
1104                         domain_flush_cache(domain, pte, sizeof(*pte));
1105                         free_pgtable_page(level_pte);
1106                 }
1107 next:
1108                 pfn += level_size(level);
1109         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1110 }
1111
1112 /*
1113  * clear last level (leaf) ptes and free page table pages below the
1114  * level we wish to keep intact.
1115  */
1116 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1117                                    unsigned long start_pfn,
1118                                    unsigned long last_pfn,
1119                                    int retain_level)
1120 {
1121         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1122         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1123         BUG_ON(start_pfn > last_pfn);
1124
1125         dma_pte_clear_range(domain, start_pfn, last_pfn);
1126
1127         /* We don't need lock here; nobody else touches the iova range */
1128         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1129                            domain->pgd, 0, start_pfn, last_pfn);
1130
1131         /* free pgd */
1132         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1133                 free_pgtable_page(domain->pgd);
1134                 domain->pgd = NULL;
1135         }
1136 }
1137
1138 /* When a page at a given level is being unlinked from its parent, we don't
1139    need to *modify* it at all. All we need to do is make a list of all the
1140    pages which can be freed just as soon as we've flushed the IOTLB and we
1141    know the hardware page-walk will no longer touch them.
1142    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1143    be freed. */
1144 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1145                                             int level, struct dma_pte *pte,
1146                                             struct page *freelist)
1147 {
1148         struct page *pg;
1149
1150         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1151         pg->freelist = freelist;
1152         freelist = pg;
1153
1154         if (level == 1)
1155                 return freelist;
1156
1157         pte = page_address(pg);
1158         do {
1159                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1160                         freelist = dma_pte_list_pagetables(domain, level - 1,
1161                                                            pte, freelist);
1162                 pte++;
1163         } while (!first_pte_in_page(pte));
1164
1165         return freelist;
1166 }
1167
1168 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1169                                         struct dma_pte *pte, unsigned long pfn,
1170                                         unsigned long start_pfn,
1171                                         unsigned long last_pfn,
1172                                         struct page *freelist)
1173 {
1174         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1175
1176         pfn = max(start_pfn, pfn);
1177         pte = &pte[pfn_level_offset(pfn, level)];
1178
1179         do {
1180                 unsigned long level_pfn;
1181
1182                 if (!dma_pte_present(pte))
1183                         goto next;
1184
1185                 level_pfn = pfn & level_mask(level);
1186
1187                 /* If range covers entire pagetable, free it */
1188                 if (start_pfn <= level_pfn &&
1189                     last_pfn >= level_pfn + level_size(level) - 1) {
1190                         /* These suborbinate page tables are going away entirely. Don't
1191                            bother to clear them; we're just going to *free* them. */
1192                         if (level > 1 && !dma_pte_superpage(pte))
1193                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1194
1195                         dma_clear_pte(pte);
1196                         if (!first_pte)
1197                                 first_pte = pte;
1198                         last_pte = pte;
1199                 } else if (level > 1) {
1200                         /* Recurse down into a level that isn't *entirely* obsolete */
1201                         freelist = dma_pte_clear_level(domain, level - 1,
1202                                                        phys_to_virt(dma_pte_addr(pte)),
1203                                                        level_pfn, start_pfn, last_pfn,
1204                                                        freelist);
1205                 }
1206 next:
1207                 pfn += level_size(level);
1208         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1209
1210         if (first_pte)
1211                 domain_flush_cache(domain, first_pte,
1212                                    (void *)++last_pte - (void *)first_pte);
1213
1214         return freelist;
1215 }
1216
1217 /* We can't just free the pages because the IOMMU may still be walking
1218    the page tables, and may have cached the intermediate levels. The
1219    pages can only be freed after the IOTLB flush has been done. */
1220 static struct page *domain_unmap(struct dmar_domain *domain,
1221                                  unsigned long start_pfn,
1222                                  unsigned long last_pfn)
1223 {
1224         struct page *freelist = NULL;
1225
1226         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1227         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1228         BUG_ON(start_pfn > last_pfn);
1229
1230         /* we don't need lock here; nobody else touches the iova range */
1231         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1232                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1233
1234         /* free pgd */
1235         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1236                 struct page *pgd_page = virt_to_page(domain->pgd);
1237                 pgd_page->freelist = freelist;
1238                 freelist = pgd_page;
1239
1240                 domain->pgd = NULL;
1241         }
1242
1243         return freelist;
1244 }
1245
1246 static void dma_free_pagelist(struct page *freelist)
1247 {
1248         struct page *pg;
1249
1250         while ((pg = freelist)) {
1251                 freelist = pg->freelist;
1252                 free_pgtable_page(page_address(pg));
1253         }
1254 }
1255
/*
 * Callback taking an opaque unsigned long: @data is interpreted as the
 * head of a page freelist and handed to dma_free_pagelist().
 */
static void iova_entry_free(unsigned long data)
{
	dma_free_pagelist((struct page *)data);
}
1262
1263 /* iommu handling */
1264 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1265 {
1266         struct root_entry *root;
1267         unsigned long flags;
1268
1269         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1270         if (!root) {
1271                 pr_err("Allocating root entry for %s failed\n",
1272                         iommu->name);
1273                 return -ENOMEM;
1274         }
1275
1276         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1277
1278         spin_lock_irqsave(&iommu->lock, flags);
1279         iommu->root_entry = root;
1280         spin_unlock_irqrestore(&iommu->lock, flags);
1281
1282         return 0;
1283 }
1284
1285 static void iommu_set_root_entry(struct intel_iommu *iommu)
1286 {
1287         u64 addr;
1288         u32 sts;
1289         unsigned long flag;
1290
1291         addr = virt_to_phys(iommu->root_entry);
1292         if (ecs_enabled(iommu))
1293                 addr |= DMA_RTADDR_RTT;
1294
1295         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1296         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1297
1298         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1299
1300         /* Make sure hardware complete it */
1301         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1302                       readl, (sts & DMA_GSTS_RTPS), sts);
1303
1304         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1305 }
1306
1307 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1308 {
1309         u32 val;
1310         unsigned long flag;
1311
1312         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1313                 return;
1314
1315         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1316         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1317
1318         /* Make sure hardware complete it */
1319         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1320                       readl, (!(val & DMA_GSTS_WBFS)), val);
1321
1322         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1323 }
1324
1325 /* return value determine if we need a write buffer flush */
1326 static void __iommu_flush_context(struct intel_iommu *iommu,
1327                                   u16 did, u16 source_id, u8 function_mask,
1328                                   u64 type)
1329 {
1330         u64 val = 0;
1331         unsigned long flag;
1332
1333         switch (type) {
1334         case DMA_CCMD_GLOBAL_INVL:
1335                 val = DMA_CCMD_GLOBAL_INVL;
1336                 break;
1337         case DMA_CCMD_DOMAIN_INVL:
1338                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1339                 break;
1340         case DMA_CCMD_DEVICE_INVL:
1341                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1342                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1343                 break;
1344         default:
1345                 BUG();
1346         }
1347         val |= DMA_CCMD_ICC;
1348
1349         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1350         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1351
1352         /* Make sure hardware complete it */
1353         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1354                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1355
1356         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1357 }
1358
1359 /* return value determine if we need a write buffer flush */
1360 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1361                                 u64 addr, unsigned int size_order, u64 type)
1362 {
1363         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1364         u64 val = 0, val_iva = 0;
1365         unsigned long flag;
1366
1367         switch (type) {
1368         case DMA_TLB_GLOBAL_FLUSH:
1369                 /* global flush doesn't need set IVA_REG */
1370                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1371                 break;
1372         case DMA_TLB_DSI_FLUSH:
1373                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1374                 break;
1375         case DMA_TLB_PSI_FLUSH:
1376                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1377                 /* IH bit is passed in as part of address */
1378                 val_iva = size_order | addr;
1379                 break;
1380         default:
1381                 BUG();
1382         }
1383         /* Note: set drain read/write */
1384 #if 0
1385         /*
1386          * This is probably to be super secure.. Looks like we can
1387          * ignore it without any impact.
1388          */
1389         if (cap_read_drain(iommu->cap))
1390                 val |= DMA_TLB_READ_DRAIN;
1391 #endif
1392         if (cap_write_drain(iommu->cap))
1393                 val |= DMA_TLB_WRITE_DRAIN;
1394
1395         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1396         /* Note: Only uses first TLB reg currently */
1397         if (val_iva)
1398                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1399         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1400
1401         /* Make sure hardware complete it */
1402         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1403                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1404
1405         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1406
1407         /* check IOTLB invalidation granularity */
1408         if (DMA_TLB_IAIG(val) == 0)
1409                 pr_err("Flush IOTLB failed\n");
1410         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1411                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1412                         (unsigned long long)DMA_TLB_IIRG(type),
1413                         (unsigned long long)DMA_TLB_IAIG(val));
1414 }
1415
1416 static struct device_domain_info *
1417 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1418                          u8 bus, u8 devfn)
1419 {
1420         struct device_domain_info *info;
1421
1422         assert_spin_locked(&device_domain_lock);
1423
1424         if (!iommu->qi)
1425                 return NULL;
1426
1427         list_for_each_entry(info, &domain->devices, link)
1428                 if (info->iommu == iommu && info->bus == bus &&
1429                     info->devfn == devfn) {
1430                         if (info->ats_supported && info->dev)
1431                                 return info;
1432                         break;
1433                 }
1434
1435         return NULL;
1436 }
1437
1438 static void domain_update_iotlb(struct dmar_domain *domain)
1439 {
1440         struct device_domain_info *info;
1441         bool has_iotlb_device = false;
1442
1443         assert_spin_locked(&device_domain_lock);
1444
1445         list_for_each_entry(info, &domain->devices, link) {
1446                 struct pci_dev *pdev;
1447
1448                 if (!info->dev || !dev_is_pci(info->dev))
1449                         continue;
1450
1451                 pdev = to_pci_dev(info->dev);
1452                 if (pdev->ats_enabled) {
1453                         has_iotlb_device = true;
1454                         break;
1455                 }
1456         }
1457
1458         domain->has_iotlb_device = has_iotlb_device;
1459 }
1460
1461 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1462 {
1463         struct pci_dev *pdev;
1464
1465         assert_spin_locked(&device_domain_lock);
1466
1467         if (!info || !dev_is_pci(info->dev))
1468                 return;
1469
1470         pdev = to_pci_dev(info->dev);
1471         /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1472          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1473          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1474          * reserved, which should be set to 0.
1475          */
1476         if (!ecap_dit(info->iommu->ecap))
1477                 info->pfsid = 0;
1478         else {
1479                 struct pci_dev *pf_pdev;
1480
1481                 /* pdev will be returned if device is not a vf */
1482                 pf_pdev = pci_physfn(pdev);
1483                 info->pfsid = PCI_DEVID(pf_pdev->bus->number, pf_pdev->devfn);
1484         }
1485
1486 #ifdef CONFIG_INTEL_IOMMU_SVM
1487         /* The PCIe spec, in its wisdom, declares that the behaviour of
1488            the device if you enable PASID support after ATS support is
1489            undefined. So always enable PASID support on devices which
1490            have it, even if we can't yet know if we're ever going to
1491            use it. */
1492         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1493                 info->pasid_enabled = 1;
1494
1495         if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1496                 info->pri_enabled = 1;
1497 #endif
1498         if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1499                 info->ats_enabled = 1;
1500                 domain_update_iotlb(info->domain);
1501                 info->ats_qdep = pci_ats_queue_depth(pdev);
1502         }
1503 }
1504
1505 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1506 {
1507         struct pci_dev *pdev;
1508
1509         assert_spin_locked(&device_domain_lock);
1510
1511         if (!dev_is_pci(info->dev))
1512                 return;
1513
1514         pdev = to_pci_dev(info->dev);
1515
1516         if (info->ats_enabled) {
1517                 pci_disable_ats(pdev);
1518                 info->ats_enabled = 0;
1519                 domain_update_iotlb(info->domain);
1520         }
1521 #ifdef CONFIG_INTEL_IOMMU_SVM
1522         if (info->pri_enabled) {
1523                 pci_disable_pri(pdev);
1524                 info->pri_enabled = 0;
1525         }
1526         if (info->pasid_enabled) {
1527                 pci_disable_pasid(pdev);
1528                 info->pasid_enabled = 0;
1529         }
1530 #endif
1531 }
1532
1533 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1534                                   u64 addr, unsigned mask)
1535 {
1536         u16 sid, qdep;
1537         unsigned long flags;
1538         struct device_domain_info *info;
1539
1540         if (!domain->has_iotlb_device)
1541                 return;
1542
1543         spin_lock_irqsave(&device_domain_lock, flags);
1544         list_for_each_entry(info, &domain->devices, link) {
1545                 if (!info->ats_enabled)
1546                         continue;
1547
1548                 sid = info->bus << 8 | info->devfn;
1549                 qdep = info->ats_qdep;
1550                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1551                                 qdep, addr, mask);
1552         }
1553         spin_unlock_irqrestore(&device_domain_lock, flags);
1554 }
1555
1556 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1557                                   struct dmar_domain *domain,
1558                                   unsigned long pfn, unsigned int pages,
1559                                   int ih, int map)
1560 {
1561         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1562         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1563         u16 did = domain->iommu_did[iommu->seq_id];
1564
1565         BUG_ON(pages == 0);
1566
1567         if (ih)
1568                 ih = 1 << 6;
1569         /*
1570          * Fallback to domain selective flush if no PSI support or the size is
1571          * too big.
1572          * PSI requires page size to be 2 ^ x, and the base address is naturally
1573          * aligned to the size
1574          */
1575         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1576                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1577                                                 DMA_TLB_DSI_FLUSH);
1578         else
1579                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1580                                                 DMA_TLB_PSI_FLUSH);
1581
1582         /*
1583          * In caching mode, changes of pages from non-present to present require
1584          * flush. However, device IOTLB doesn't need to be flushed in this case.
1585          */
1586         if (!cap_caching_mode(iommu->cap) || !map)
1587                 iommu_flush_dev_iotlb(domain, addr, mask);
1588 }
1589
1590 /* Notification for newly created mappings */
1591 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1592                                         struct dmar_domain *domain,
1593                                         unsigned long pfn, unsigned int pages)
1594 {
1595         /* It's a non-present to present mapping. Only flush if caching mode */
1596         if (cap_caching_mode(iommu->cap))
1597                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1598         else
1599                 iommu_flush_write_buffer(iommu);
1600 }
1601
1602 static void iommu_flush_iova(struct iova_domain *iovad)
1603 {
1604         struct dmar_domain *domain;
1605         int idx;
1606
1607         domain = container_of(iovad, struct dmar_domain, iovad);
1608
1609         for_each_domain_iommu(idx, domain) {
1610                 struct intel_iommu *iommu = g_iommus[idx];
1611                 u16 did = domain->iommu_did[iommu->seq_id];
1612
1613                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1614
1615                 if (!cap_caching_mode(iommu->cap))
1616                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1617                                               0, MAX_AGAW_PFN_WIDTH);
1618         }
1619 }
1620
1621 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1622 {
1623         u32 pmen;
1624         unsigned long flags;
1625
1626         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1627                 return;
1628
1629         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1630         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1631         pmen &= ~DMA_PMEN_EPM;
1632         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1633
1634         /* wait for the protected region status bit to clear */
1635         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1636                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1637
1638         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1639 }
1640
1641 static void iommu_enable_translation(struct intel_iommu *iommu)
1642 {
1643         u32 sts;
1644         unsigned long flags;
1645
1646         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1647         iommu->gcmd |= DMA_GCMD_TE;
1648         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1649
1650         /* Make sure hardware complete it */
1651         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1652                       readl, (sts & DMA_GSTS_TES), sts);
1653
1654         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1655 }
1656
1657 static void iommu_disable_translation(struct intel_iommu *iommu)
1658 {
1659         u32 sts;
1660         unsigned long flag;
1661
1662         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1663         iommu->gcmd &= ~DMA_GCMD_TE;
1664         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1665
1666         /* Make sure hardware complete it */
1667         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1668                       readl, (!(sts & DMA_GSTS_TES)), sts);
1669
1670         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1671 }
1672
1673
1674 static int iommu_init_domains(struct intel_iommu *iommu)
1675 {
1676         u32 ndomains, nlongs;
1677         size_t size;
1678
1679         ndomains = cap_ndoms(iommu->cap);
1680         pr_debug("%s: Number of Domains supported <%d>\n",
1681                  iommu->name, ndomains);
1682         nlongs = BITS_TO_LONGS(ndomains);
1683
1684         spin_lock_init(&iommu->lock);
1685
1686         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1687         if (!iommu->domain_ids) {
1688                 pr_err("%s: Allocating domain id array failed\n",
1689                        iommu->name);
1690                 return -ENOMEM;
1691         }
1692
1693         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1694         iommu->domains = kzalloc(size, GFP_KERNEL);
1695
1696         if (iommu->domains) {
1697                 size = 256 * sizeof(struct dmar_domain *);
1698                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1699         }
1700
1701         if (!iommu->domains || !iommu->domains[0]) {
1702                 pr_err("%s: Allocating domain array failed\n",
1703                        iommu->name);
1704                 kfree(iommu->domain_ids);
1705                 kfree(iommu->domains);
1706                 iommu->domain_ids = NULL;
1707                 iommu->domains    = NULL;
1708                 return -ENOMEM;
1709         }
1710
1711
1712
1713         /*
1714          * If Caching mode is set, then invalid translations are tagged
1715          * with domain-id 0, hence we need to pre-allocate it. We also
1716          * use domain-id 0 as a marker for non-allocated domain-id, so
1717          * make sure it is not used for a real domain.
1718          */
1719         set_bit(0, iommu->domain_ids);
1720
1721         return 0;
1722 }
1723
1724 static void disable_dmar_iommu(struct intel_iommu *iommu)
1725 {
1726         struct device_domain_info *info, *tmp;
1727         unsigned long flags;
1728
1729         if (!iommu->domains || !iommu->domain_ids)
1730                 return;
1731
1732 again:
1733         spin_lock_irqsave(&device_domain_lock, flags);
1734         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1735                 struct dmar_domain *domain;
1736
1737                 if (info->iommu != iommu)
1738                         continue;
1739
1740                 if (!info->dev || !info->domain)
1741                         continue;
1742
1743                 domain = info->domain;
1744
1745                 __dmar_remove_one_dev_info(info);
1746
1747                 if (!domain_type_is_vm_or_si(domain)) {
1748                         /*
1749                          * The domain_exit() function  can't be called under
1750                          * device_domain_lock, as it takes this lock itself.
1751                          * So release the lock here and re-run the loop
1752                          * afterwards.
1753                          */
1754                         spin_unlock_irqrestore(&device_domain_lock, flags);
1755                         domain_exit(domain);
1756                         goto again;
1757                 }
1758         }
1759         spin_unlock_irqrestore(&device_domain_lock, flags);
1760
1761         if (iommu->gcmd & DMA_GCMD_TE)
1762                 iommu_disable_translation(iommu);
1763 }
1764
1765 static void free_dmar_iommu(struct intel_iommu *iommu)
1766 {
1767         if ((iommu->domains) && (iommu->domain_ids)) {
1768                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1769                 int i;
1770
1771                 for (i = 0; i < elems; i++)
1772                         kfree(iommu->domains[i]);
1773                 kfree(iommu->domains);
1774                 kfree(iommu->domain_ids);
1775                 iommu->domains = NULL;
1776                 iommu->domain_ids = NULL;
1777         }
1778
1779         g_iommus[iommu->seq_id] = NULL;
1780
1781         /* free context mapping */
1782         free_context_table(iommu);
1783
1784 #ifdef CONFIG_INTEL_IOMMU_SVM
1785         if (pasid_enabled(iommu)) {
1786                 if (ecap_prs(iommu->ecap))
1787                         intel_svm_finish_prq(iommu);
1788                 intel_svm_exit(iommu);
1789         }
1790 #endif
1791 }
1792
1793 static struct dmar_domain *alloc_domain(int flags)
1794 {
1795         struct dmar_domain *domain;
1796
1797         domain = alloc_domain_mem();
1798         if (!domain)
1799                 return NULL;
1800
1801         memset(domain, 0, sizeof(*domain));
1802         domain->nid = -1;
1803         domain->flags = flags;
1804         domain->has_iotlb_device = false;
1805         INIT_LIST_HEAD(&domain->devices);
1806
1807         return domain;
1808 }
1809
1810 /* Must be called with iommu->lock */
1811 static int domain_attach_iommu(struct dmar_domain *domain,
1812                                struct intel_iommu *iommu)
1813 {
1814         unsigned long ndomains;
1815         int num;
1816
1817         assert_spin_locked(&device_domain_lock);
1818         assert_spin_locked(&iommu->lock);
1819
1820         domain->iommu_refcnt[iommu->seq_id] += 1;
1821         domain->iommu_count += 1;
1822         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1823                 ndomains = cap_ndoms(iommu->cap);
1824                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1825
1826                 if (num >= ndomains) {
1827                         pr_err("%s: No free domain ids\n", iommu->name);
1828                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1829                         domain->iommu_count -= 1;
1830                         return -ENOSPC;
1831                 }
1832
1833                 set_bit(num, iommu->domain_ids);
1834                 set_iommu_domain(iommu, num, domain);
1835
1836                 domain->iommu_did[iommu->seq_id] = num;
1837                 domain->nid                      = iommu->node;
1838
1839                 domain_update_iommu_cap(domain);
1840         }
1841
1842         return 0;
1843 }
1844
1845 static int domain_detach_iommu(struct dmar_domain *domain,
1846                                struct intel_iommu *iommu)
1847 {
1848         int num, count = INT_MAX;
1849
1850         assert_spin_locked(&device_domain_lock);
1851         assert_spin_locked(&iommu->lock);
1852
1853         domain->iommu_refcnt[iommu->seq_id] -= 1;
1854         count = --domain->iommu_count;
1855         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1856                 num = domain->iommu_did[iommu->seq_id];
1857                 clear_bit(num, iommu->domain_ids);
1858                 set_iommu_domain(iommu, num, NULL);
1859
1860                 domain_update_iommu_cap(domain);
1861                 domain->iommu_did[iommu->seq_id] = 0;
1862         }
1863
1864         return count;
1865 }
1866
1867 static struct iova_domain reserved_iova_list;
1868 static struct lock_class_key reserved_rbtree_key;
1869
1870 static int dmar_init_reserved_ranges(void)
1871 {
1872         struct pci_dev *pdev = NULL;
1873         struct iova *iova;
1874         int i;
1875
1876         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1877
1878         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1879                 &reserved_rbtree_key);
1880
1881         /* IOAPIC ranges shouldn't be accessed by DMA */
1882         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1883                 IOVA_PFN(IOAPIC_RANGE_END));
1884         if (!iova) {
1885                 pr_err("Reserve IOAPIC range failed\n");
1886                 return -ENODEV;
1887         }
1888
1889         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1890         for_each_pci_dev(pdev) {
1891                 struct resource *r;
1892
1893                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1894                         r = &pdev->resource[i];
1895                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1896                                 continue;
1897                         iova = reserve_iova(&reserved_iova_list,
1898                                             IOVA_PFN(r->start),
1899                                             IOVA_PFN(r->end));
1900                         if (!iova) {
1901                                 pr_err("Reserve iova failed\n");
1902                                 return -ENODEV;
1903                         }
1904                 }
1905         }
1906         return 0;
1907 }
1908
1909 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1910 {
1911         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1912 }
1913
/*
 * Round a guest address width up to the next value of the form
 * 12 + 9 * n, capped at 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int width = rem ? gaw + 9 - rem : gaw;

	return width > 64 ? 64 : width;
}
1927
1928 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1929                        int guest_width)
1930 {
1931         int adjust_width, agaw;
1932         unsigned long sagaw;
1933         int err;
1934
1935         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1936
1937         err = init_iova_flush_queue(&domain->iovad,
1938                                     iommu_flush_iova, iova_entry_free);
1939         if (err)
1940                 return err;
1941
1942         domain_reserve_special_ranges(domain);
1943
1944         /* calculate AGAW */
1945         if (guest_width > cap_mgaw(iommu->cap))
1946                 guest_width = cap_mgaw(iommu->cap);
1947         domain->gaw = guest_width;
1948         adjust_width = guestwidth_to_adjustwidth(guest_width);
1949         agaw = width_to_agaw(adjust_width);
1950         sagaw = cap_sagaw(iommu->cap);
1951         if (!test_bit(agaw, &sagaw)) {
1952                 /* hardware doesn't support it, choose a bigger one */
1953                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1954                 agaw = find_next_bit(&sagaw, 5, agaw);
1955                 if (agaw >= 5)
1956                         return -ENODEV;
1957         }
1958         domain->agaw = agaw;
1959
1960         if (ecap_coherent(iommu->ecap))
1961                 domain->iommu_coherency = 1;
1962         else
1963                 domain->iommu_coherency = 0;
1964
1965         if (ecap_sc_support(iommu->ecap))
1966                 domain->iommu_snooping = 1;
1967         else
1968                 domain->iommu_snooping = 0;
1969
1970         if (intel_iommu_superpage)
1971                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1972         else
1973                 domain->iommu_superpage = 0;
1974
1975         domain->nid = iommu->node;
1976
1977         /* always allocate the top pgd */
1978         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1979         if (!domain->pgd)
1980                 return -ENOMEM;
1981         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1982         return 0;
1983 }
1984
1985 static void domain_exit(struct dmar_domain *domain)
1986 {
1987         struct page *freelist = NULL;
1988
1989         /* Domain 0 is reserved, so dont process it */
1990         if (!domain)
1991                 return;
1992
1993         /* Remove associated devices and clear attached or cached domains */
1994         rcu_read_lock();
1995         domain_remove_dev_info(domain);
1996         rcu_read_unlock();
1997
1998         /* destroy iovas */
1999         put_iova_domain(&domain->iovad);
2000
2001         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2002
2003         dma_free_pagelist(freelist);
2004
2005         free_domain_mem(domain);
2006 }
2007
2008 static int domain_context_mapping_one(struct dmar_domain *domain,
2009                                       struct intel_iommu *iommu,
2010                                       u8 bus, u8 devfn)
2011 {
2012         u16 did = domain->iommu_did[iommu->seq_id];
2013         int translation = CONTEXT_TT_MULTI_LEVEL;
2014         struct device_domain_info *info = NULL;
2015         struct context_entry *context;
2016         unsigned long flags;
2017         struct dma_pte *pgd;
2018         int ret, agaw;
2019
2020         WARN_ON(did == 0);
2021
2022         if (hw_pass_through && domain_type_is_si(domain))
2023                 translation = CONTEXT_TT_PASS_THROUGH;
2024
2025         pr_debug("Set context mapping for %02x:%02x.%d\n",
2026                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2027
2028         BUG_ON(!domain->pgd);
2029
2030         spin_lock_irqsave(&device_domain_lock, flags);
2031         spin_lock(&iommu->lock);
2032
2033         ret = -ENOMEM;
2034         context = iommu_context_addr(iommu, bus, devfn, 1);
2035         if (!context)
2036                 goto out_unlock;
2037
2038         ret = 0;
2039         if (context_present(context))
2040                 goto out_unlock;
2041
2042         /*
2043          * For kdump cases, old valid entries may be cached due to the
2044          * in-flight DMA and copied pgtable, but there is no unmapping
2045          * behaviour for them, thus we need an explicit cache flush for
2046          * the newly-mapped device. For kdump, at this point, the device
2047          * is supposed to finish reset at its driver probe stage, so no
2048          * in-flight DMA will exist, and we don't need to worry anymore
2049          * hereafter.
2050          */
2051         if (context_copied(context)) {
2052                 u16 did_old = context_domain_id(context);
2053
2054                 if (did_old < cap_ndoms(iommu->cap)) {
2055                         iommu->flush.flush_context(iommu, did_old,
2056                                                    (((u16)bus) << 8) | devfn,
2057                                                    DMA_CCMD_MASK_NOBIT,
2058                                                    DMA_CCMD_DEVICE_INVL);
2059                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2060                                                  DMA_TLB_DSI_FLUSH);
2061                 }
2062         }
2063
2064         pgd = domain->pgd;
2065
2066         context_clear_entry(context);
2067         context_set_domain_id(context, did);
2068
2069         /*
2070          * Skip top levels of page tables for iommu which has less agaw
2071          * than default.  Unnecessary for PT mode.
2072          */
2073         if (translation != CONTEXT_TT_PASS_THROUGH) {
2074                 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2075                         ret = -ENOMEM;
2076                         pgd = phys_to_virt(dma_pte_addr(pgd));
2077                         if (!dma_pte_present(pgd))
2078                                 goto out_unlock;
2079                 }
2080
2081                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2082                 if (info && info->ats_supported)
2083                         translation = CONTEXT_TT_DEV_IOTLB;
2084                 else
2085                         translation = CONTEXT_TT_MULTI_LEVEL;
2086
2087                 context_set_address_root(context, virt_to_phys(pgd));
2088                 context_set_address_width(context, agaw);
2089         } else {
2090                 /*
2091                  * In pass through mode, AW must be programmed to
2092                  * indicate the largest AGAW value supported by
2093                  * hardware. And ASR is ignored by hardware.
2094                  */
2095                 context_set_address_width(context, iommu->msagaw);
2096         }
2097
2098         context_set_translation_type(context, translation);
2099         context_set_fault_enable(context);
2100         context_set_present(context);
2101         domain_flush_cache(domain, context, sizeof(*context));
2102
2103         /*
2104          * It's a non-present to present mapping. If hardware doesn't cache
2105          * non-present entry we only need to flush the write-buffer. If the
2106          * _does_ cache non-present entries, then it does so in the special
2107          * domain #0, which we have to flush:
2108          */
2109         if (cap_caching_mode(iommu->cap)) {
2110                 iommu->flush.flush_context(iommu, 0,
2111                                            (((u16)bus) << 8) | devfn,
2112                                            DMA_CCMD_MASK_NOBIT,
2113                                            DMA_CCMD_DEVICE_INVL);
2114                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2115         } else {
2116                 iommu_flush_write_buffer(iommu);
2117         }
2118         iommu_enable_dev_iotlb(info);
2119
2120         ret = 0;
2121
2122 out_unlock:
2123         spin_unlock(&iommu->lock);
2124         spin_unlock_irqrestore(&device_domain_lock, flags);
2125
2126         return ret;
2127 }
2128
2129 struct domain_context_mapping_data {
2130         struct dmar_domain *domain;
2131         struct intel_iommu *iommu;
2132 };
2133
2134 static int domain_context_mapping_cb(struct pci_dev *pdev,
2135                                      u16 alias, void *opaque)
2136 {
2137         struct domain_context_mapping_data *data = opaque;
2138
2139         return domain_context_mapping_one(data->domain, data->iommu,
2140                                           PCI_BUS_NUM(alias), alias & 0xff);
2141 }
2142
2143 static int
2144 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2145 {
2146         struct intel_iommu *iommu;
2147         u8 bus, devfn;
2148         struct domain_context_mapping_data data;
2149
2150         iommu = device_to_iommu(dev, &bus, &devfn);
2151         if (!iommu)
2152                 return -ENODEV;
2153
2154         if (!dev_is_pci(dev))
2155                 return domain_context_mapping_one(domain, iommu, bus, devfn);
2156
2157         data.domain = domain;
2158         data.iommu = iommu;
2159
2160         return pci_for_each_dma_alias(to_pci_dev(dev),
2161                                       &domain_context_mapping_cb, &data);
2162 }
2163
2164 static int domain_context_mapped_cb(struct pci_dev *pdev,
2165                                     u16 alias, void *opaque)
2166 {
2167         struct intel_iommu *iommu = opaque;
2168
2169         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2170 }
2171
2172 static int domain_context_mapped(struct device *dev)
2173 {
2174         struct intel_iommu *iommu;
2175         u8 bus, devfn;
2176
2177         iommu = device_to_iommu(dev, &bus, &devfn);
2178         if (!iommu)
2179                 return -ENODEV;
2180
2181         if (!dev_is_pci(dev))
2182                 return device_context_mapped(iommu, bus, devfn);
2183
2184         return !pci_for_each_dma_alias(to_pci_dev(dev),
2185                                        domain_context_mapped_cb, iommu);
2186 }
2187
2188 /* Returns a number of VTD pages, but aligned to MM page size */
2189 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2190                                             size_t size)
2191 {
2192         host_addr &= ~PAGE_MASK;
2193         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2194 }
2195
2196 /* Return largest possible superpage level for a given mapping */
2197 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2198                                           unsigned long iov_pfn,
2199                                           unsigned long phy_pfn,
2200                                           unsigned long pages)
2201 {
2202         int support, level = 1;
2203         unsigned long pfnmerge;
2204
2205         support = domain->iommu_superpage;
2206
2207         /* To use a large page, the virtual *and* physical addresses
2208            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2209            of them will mean we have to use smaller pages. So just
2210            merge them and check both at once. */
2211         pfnmerge = iov_pfn | phy_pfn;
2212
2213         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2214                 pages >>= VTD_STRIDE_SHIFT;
2215                 if (!pages)
2216                         break;
2217                 pfnmerge >>= VTD_STRIDE_SHIFT;
2218                 level++;
2219                 support--;
2220         }
2221         return level;
2222 }
2223
2224 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2225                             struct scatterlist *sg, unsigned long phys_pfn,
2226                             unsigned long nr_pages, int prot)
2227 {
2228         struct dma_pte *first_pte = NULL, *pte = NULL;
2229         phys_addr_t uninitialized_var(pteval);
2230         unsigned long sg_res = 0;
2231         unsigned int largepage_lvl = 0;
2232         unsigned long lvl_pages = 0;
2233
2234         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2235
2236         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2237                 return -EINVAL;
2238
2239         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2240
2241         if (!sg) {
2242                 sg_res = nr_pages;
2243                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2244         }
2245
2246         while (nr_pages > 0) {
2247                 uint64_t tmp;
2248
2249                 if (!sg_res) {
2250                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2251
2252                         sg_res = aligned_nrpages(sg->offset, sg->length);
2253                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2254                         sg->dma_length = sg->length;
2255                         pteval = (sg_phys(sg) - pgoff) | prot;
2256                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2257                 }
2258
2259                 if (!pte) {
2260                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2261
2262                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2263                         if (!pte)
2264                                 return -ENOMEM;
2265                         /* It is large page*/
2266                         if (largepage_lvl > 1) {
2267                                 unsigned long nr_superpages, end_pfn;
2268
2269                                 pteval |= DMA_PTE_LARGE_PAGE;
2270                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2271
2272                                 nr_superpages = sg_res / lvl_pages;
2273                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2274
2275                                 /*
2276                                  * Ensure that old small page tables are
2277                                  * removed to make room for superpage(s).
2278                                  * We're adding new large pages, so make sure
2279                                  * we don't remove their parent tables.
2280                                  */
2281                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2282                                                        largepage_lvl + 1);
2283                         } else {
2284                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2285                         }
2286
2287                 }
2288                 /* We don't need lock here, nobody else
2289                  * touches the iova range
2290                  */
2291                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2292                 if (tmp) {
2293                         static int dumps = 5;
2294                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2295                                 iov_pfn, tmp, (unsigned long long)pteval);
2296                         if (dumps) {
2297                                 dumps--;
2298                                 debug_dma_dump_mappings(NULL);
2299                         }
2300                         WARN_ON(1);
2301                 }
2302
2303                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2304
2305                 BUG_ON(nr_pages < lvl_pages);
2306                 BUG_ON(sg_res < lvl_pages);
2307
2308                 nr_pages -= lvl_pages;
2309                 iov_pfn += lvl_pages;
2310                 phys_pfn += lvl_pages;
2311                 pteval += lvl_pages * VTD_PAGE_SIZE;
2312                 sg_res -= lvl_pages;
2313
2314                 /* If the next PTE would be the first in a new page, then we
2315                    need to flush the cache on the entries we've just written.
2316                    And then we'll need to recalculate 'pte', so clear it and
2317                    let it get set again in the if (!pte) block above.
2318
2319                    If we're done (!nr_pages) we need to flush the cache too.
2320
2321                    Also if we've been setting superpages, we may need to
2322                    recalculate 'pte' and switch back to smaller pages for the
2323                    end of the mapping, if the trailing size is not enough to
2324                    use another superpage (i.e. sg_res < lvl_pages). */
2325                 pte++;
2326                 if (!nr_pages || first_pte_in_page(pte) ||
2327                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2328                         domain_flush_cache(domain, first_pte,
2329                                            (void *)pte - (void *)first_pte);
2330                         pte = NULL;
2331                 }
2332
2333                 if (!sg_res && nr_pages)
2334                         sg = sg_next(sg);
2335         }
2336         return 0;
2337 }
2338
2339 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2340                          struct scatterlist *sg, unsigned long phys_pfn,
2341                          unsigned long nr_pages, int prot)
2342 {
2343        int ret;
2344        struct intel_iommu *iommu;
2345
2346        /* Do the real mapping first */
2347        ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2348        if (ret)
2349                return ret;
2350
2351        /* Notify about the new mapping */
2352        if (domain_type_is_vm(domain)) {
2353                /* VM typed domains can have more than one IOMMUs */
2354                int iommu_id;
2355                for_each_domain_iommu(iommu_id, domain) {
2356                        iommu = g_iommus[iommu_id];
2357                        __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2358                }
2359        } else {
2360                /* General domains only have one IOMMU */
2361                iommu = domain_get_iommu(domain);
2362                __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2363        }
2364
2365        return 0;
2366 }
2367
/* Map a scatterlist: domain_mapping() with no contiguous physical range. */
static inline int domain_sg_mapping(struct dmar_domain *domain,
				    unsigned long iov_pfn,
				    struct scatterlist *sg,
				    unsigned long nr_pages, int prot)
{
	const unsigned long no_phys_pfn = 0;

	return domain_mapping(domain, iov_pfn, sg, no_phys_pfn, nr_pages, prot);
}
2374
2375 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2376                                      unsigned long phys_pfn, unsigned long nr_pages,
2377                                      int prot)
2378 {
2379         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2380 }
2381
2382 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2383 {
2384         unsigned long flags;
2385         struct context_entry *context;
2386         u16 did_old;
2387
2388         if (!iommu)
2389                 return;
2390
2391         spin_lock_irqsave(&iommu->lock, flags);
2392         context = iommu_context_addr(iommu, bus, devfn, 0);
2393         if (!context) {
2394                 spin_unlock_irqrestore(&iommu->lock, flags);
2395                 return;
2396         }
2397         did_old = context_domain_id(context);
2398         context_clear_entry(context);
2399         __iommu_flush_cache(iommu, context, sizeof(*context));
2400         spin_unlock_irqrestore(&iommu->lock, flags);
2401         iommu->flush.flush_context(iommu,
2402                                    did_old,
2403                                    (((u16)bus) << 8) | devfn,
2404                                    DMA_CCMD_MASK_NOBIT,
2405                                    DMA_CCMD_DEVICE_INVL);
2406         iommu->flush.flush_iotlb(iommu,
2407                                  did_old,
2408                                  0,
2409                                  0,
2410                                  DMA_TLB_DSI_FLUSH);
2411 }
2412
2413 static inline void unlink_domain_info(struct device_domain_info *info)
2414 {
2415         assert_spin_locked(&device_domain_lock);
2416         list_del(&info->link);
2417         list_del(&info->global);
2418         if (info->dev)
2419                 info->dev->archdata.iommu = NULL;
2420 }
2421
2422 static void domain_remove_dev_info(struct dmar_domain *domain)
2423 {
2424         struct device_domain_info *info, *tmp;
2425         unsigned long flags;
2426
2427         spin_lock_irqsave(&device_domain_lock, flags);
2428         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2429                 __dmar_remove_one_dev_info(info);
2430         spin_unlock_irqrestore(&device_domain_lock, flags);
2431 }
2432
2433 /*
2434  * find_domain
2435  * Note: we use struct device->archdata.iommu stores the info
2436  */
2437 static struct dmar_domain *find_domain(struct device *dev)
2438 {
2439         struct device_domain_info *info;
2440
2441         /* No lock here, assumes no domain exit in normal case */
2442         info = dev->archdata.iommu;
2443         if (likely(info))
2444                 return info->domain;
2445         return NULL;
2446 }
2447
2448 static inline struct device_domain_info *
2449 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2450 {
2451         struct device_domain_info *info;
2452
2453         list_for_each_entry(info, &device_domain_list, global)
2454                 if (info->iommu->segment == segment && info->bus == bus &&
2455                     info->devfn == devfn)
2456                         return info;
2457
2458         return NULL;
2459 }
2460
2461 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2462                                                     int bus, int devfn,
2463                                                     struct device *dev,
2464                                                     struct dmar_domain *domain)
2465 {
2466         struct dmar_domain *found = NULL;
2467         struct device_domain_info *info;
2468         unsigned long flags;
2469         int ret;
2470
2471         info = alloc_devinfo_mem();
2472         if (!info)
2473                 return NULL;
2474
2475         info->bus = bus;
2476         info->devfn = devfn;
2477         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2478         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2479         info->ats_qdep = 0;
2480         info->dev = dev;
2481         info->domain = domain;
2482         info->iommu = iommu;
2483         info->pasid_table = NULL;
2484
2485         if (dev && dev_is_pci(dev)) {
2486                 struct pci_dev *pdev = to_pci_dev(info->dev);
2487
2488                 if (!pci_ats_disabled() &&
2489                     ecap_dev_iotlb_support(iommu->ecap) &&
2490                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2491                     dmar_find_matched_atsr_unit(pdev))
2492                         info->ats_supported = 1;
2493
2494                 if (ecs_enabled(iommu)) {
2495                         if (pasid_enabled(iommu)) {
2496                                 int features = pci_pasid_features(pdev);
2497                                 if (features >= 0)
2498                                         info->pasid_supported = features | 1;
2499                         }
2500
2501                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2502                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2503                                 info->pri_supported = 1;
2504                 }
2505         }
2506
2507         spin_lock_irqsave(&device_domain_lock, flags);
2508         if (dev)
2509                 found = find_domain(dev);
2510
2511         if (!found) {
2512                 struct device_domain_info *info2;
2513                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2514                 if (info2) {
2515                         found      = info2->domain;
2516                         info2->dev = dev;
2517                 }
2518         }
2519
2520         if (found) {
2521                 spin_unlock_irqrestore(&device_domain_lock, flags);
2522                 free_devinfo_mem(info);
2523                 /* Caller must free the original domain */
2524                 return found;
2525         }
2526
2527         spin_lock(&iommu->lock);
2528         ret = domain_attach_iommu(domain, iommu);
2529         spin_unlock(&iommu->lock);
2530
2531         if (ret) {
2532                 spin_unlock_irqrestore(&device_domain_lock, flags);
2533                 free_devinfo_mem(info);
2534                 return NULL;
2535         }
2536
2537         list_add(&info->link, &domain->devices);
2538         list_add(&info->global, &device_domain_list);
2539         if (dev)
2540                 dev->archdata.iommu = info;
2541
2542         if (dev && dev_is_pci(dev) && info->pasid_supported) {
2543                 ret = intel_pasid_alloc_table(dev);
2544                 if (ret) {
2545                         pr_warn("No pasid table for %s, pasid disabled\n",
2546                                 dev_name(dev));
2547                         info->pasid_supported = 0;
2548                 }
2549         }
2550         spin_unlock_irqrestore(&device_domain_lock, flags);
2551
2552         if (dev && domain_context_mapping(domain, dev)) {
2553                 pr_err("Domain context map for %s failed\n", dev_name(dev));
2554                 dmar_remove_one_dev_info(domain, dev);
2555                 return NULL;
2556         }
2557
2558         return domain;
2559 }
2560
2561 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2562 {
2563         *(u16 *)opaque = alias;
2564         return 0;
2565 }
2566
2567 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2568 {
2569         struct device_domain_info *info = NULL;
2570         struct dmar_domain *domain = NULL;
2571         struct intel_iommu *iommu;
2572         u16 dma_alias;
2573         unsigned long flags;
2574         u8 bus, devfn;
2575
2576         iommu = device_to_iommu(dev, &bus, &devfn);
2577         if (!iommu)
2578                 return NULL;
2579
2580         if (dev_is_pci(dev)) {
2581                 struct pci_dev *pdev = to_pci_dev(dev);
2582
2583                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2584
2585                 spin_lock_irqsave(&device_domain_lock, flags);
2586                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2587                                                       PCI_BUS_NUM(dma_alias),
2588                                                       dma_alias & 0xff);
2589                 if (info) {
2590                         iommu = info->iommu;
2591                         domain = info->domain;
2592                 }
2593                 spin_unlock_irqrestore(&device_domain_lock, flags);
2594
2595                 /* DMA alias already has a domain, use it */
2596                 if (info)
2597                         goto out;
2598         }
2599
2600         /* Allocate and initialize new domain for the device */
2601         domain = alloc_domain(0);
2602         if (!domain)
2603                 return NULL;
2604         if (domain_init(domain, iommu, gaw)) {
2605                 domain_exit(domain);
2606                 return NULL;
2607         }
2608
2609 out:
2610
2611         return domain;
2612 }
2613
2614 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2615                                               struct dmar_domain *domain)
2616 {
2617         struct intel_iommu *iommu;
2618         struct dmar_domain *tmp;
2619         u16 req_id, dma_alias;
2620         u8 bus, devfn;
2621
2622         iommu = device_to_iommu(dev, &bus, &devfn);
2623         if (!iommu)
2624                 return NULL;
2625
2626         req_id = ((u16)bus << 8) | devfn;
2627
2628         if (dev_is_pci(dev)) {
2629                 struct pci_dev *pdev = to_pci_dev(dev);
2630
2631                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2632
2633                 /* register PCI DMA alias device */
2634                 if (req_id != dma_alias) {
2635                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2636                                         dma_alias & 0xff, NULL, domain);
2637
2638                         if (!tmp || tmp != domain)
2639                                 return tmp;
2640                 }
2641         }
2642
2643         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2644         if (!tmp || tmp != domain)
2645                 return tmp;
2646
2647         return domain;
2648 }
2649
2650 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2651 {
2652         struct dmar_domain *domain, *tmp;
2653
2654         domain = find_domain(dev);
2655         if (domain)
2656                 goto out;
2657
2658         domain = find_or_alloc_domain(dev, gaw);
2659         if (!domain)
2660                 goto out;
2661
2662         tmp = set_domain_for_dev(dev, domain);
2663         if (!tmp || domain != tmp) {
2664                 domain_exit(domain);
2665                 domain = tmp;
2666         }
2667
2668 out:
2669
2670         return domain;
2671 }
2672
2673 static int iommu_domain_identity_map(struct dmar_domain *domain,
2674                                      unsigned long long start,
2675                                      unsigned long long end)
2676 {
2677         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2678         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2679
2680         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2681                           dma_to_mm_pfn(last_vpfn))) {
2682                 pr_err("Reserving iova failed\n");
2683                 return -ENOMEM;
2684         }
2685
2686         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2687         /*
2688          * RMRR range might have overlap with physical memory range,
2689          * clear it first
2690          */
2691         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2692
2693         return __domain_mapping(domain, first_vpfn, NULL,
2694                                 first_vpfn, last_vpfn - first_vpfn + 1,
2695                                 DMA_PTE_READ|DMA_PTE_WRITE);
2696 }
2697
2698 static int domain_prepare_identity_map(struct device *dev,
2699                                        struct dmar_domain *domain,
2700                                        unsigned long long start,
2701                                        unsigned long long end)
2702 {
2703         /* For _hardware_ passthrough, don't bother. But for software
2704            passthrough, we do it anyway -- it may indicate a memory
2705            range which is reserved in E820, so which didn't get set
2706            up to start with in si_domain */
2707         if (domain == si_domain && hw_pass_through) {
2708                 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2709                         dev_name(dev), start, end);
2710                 return 0;
2711         }
2712
2713         pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2714                 dev_name(dev), start, end);
2715
2716         if (end < start) {
2717                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2718                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2719                         dmi_get_system_info(DMI_BIOS_VENDOR),
2720                         dmi_get_system_info(DMI_BIOS_VERSION),
2721                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2722                 return -EIO;
2723         }
2724
2725         if (end >> agaw_to_width(domain->agaw)) {
2726                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2727                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2728                      agaw_to_width(domain->agaw),
2729                      dmi_get_system_info(DMI_BIOS_VENDOR),
2730                      dmi_get_system_info(DMI_BIOS_VERSION),
2731                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2732                 return -EIO;
2733         }
2734
2735         return iommu_domain_identity_map(domain, start, end);
2736 }
2737
2738 static int iommu_prepare_identity_map(struct device *dev,
2739                                       unsigned long long start,
2740                                       unsigned long long end)
2741 {
2742         struct dmar_domain *domain;
2743         int ret;
2744
2745         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2746         if (!domain)
2747                 return -ENOMEM;
2748
2749         ret = domain_prepare_identity_map(dev, domain, start, end);
2750         if (ret)
2751                 domain_exit(domain);
2752
2753         return ret;
2754 }
2755
2756 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2757                                          struct device *dev)
2758 {
2759         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2760                 return 0;
2761         return iommu_prepare_identity_map(dev, rmrr->base_address,
2762                                           rmrr->end_address);
2763 }
2764
#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
/*
 * Set up a 0-16MiB identity mapping for the first ISA bridge found, if any.
 * Failure is only logged; the reference taken by pci_get_class() is dropped
 * before returning.
 */
static inline void iommu_prepare_isa(void)
{
        struct pci_dev *isa_bridge;

        isa_bridge = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
        if (!isa_bridge)
                return;

        pr_info("Prepare 0-16MiB unity mapping for LPC\n");

        if (iommu_prepare_identity_map(&isa_bridge->dev, 0,
                                       16*1024*1024 - 1))
                pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");

        pci_dev_put(isa_bridge);
}
#else
/* Floppy workaround not configured: nothing to prepare. */
static inline void iommu_prepare_isa(void)
{
}
#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
2789
2790 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2791
2792 static int __init si_domain_init(int hw)
2793 {
2794         int nid, ret = 0;
2795
2796         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2797         if (!si_domain)
2798                 return -EFAULT;
2799
2800         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2801                 domain_exit(si_domain);
2802                 return -EFAULT;
2803         }
2804
2805         pr_debug("Identity mapping domain allocated\n");
2806
2807         if (hw)
2808                 return 0;
2809
2810         for_each_online_node(nid) {
2811                 unsigned long start_pfn, end_pfn;
2812                 int i;
2813
2814                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2815                         ret = iommu_domain_identity_map(si_domain,
2816                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2817                         if (ret)
2818                                 return ret;
2819                 }
2820         }
2821
2822         return 0;
2823 }
2824
2825 static int identity_mapping(struct device *dev)
2826 {
2827         struct device_domain_info *info;
2828
2829         if (likely(!iommu_identity_mapping))
2830                 return 0;
2831
2832         info = dev->archdata.iommu;
2833         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2834                 return (info->domain == si_domain);
2835
2836         return 0;
2837 }
2838
2839 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2840 {
2841         struct dmar_domain *ndomain;
2842         struct intel_iommu *iommu;
2843         u8 bus, devfn;
2844
2845         iommu = device_to_iommu(dev, &bus, &devfn);
2846         if (!iommu)
2847                 return -ENODEV;
2848
2849         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2850         if (ndomain != domain)
2851                 return -EBUSY;
2852
2853         return 0;
2854 }
2855
2856 static bool device_has_rmrr(struct device *dev)
2857 {
2858         struct dmar_rmrr_unit *rmrr;
2859         struct device *tmp;
2860         int i;
2861
2862         rcu_read_lock();
2863         for_each_rmrr_units(rmrr) {
2864                 /*
2865                  * Return TRUE if this RMRR contains the device that
2866                  * is passed in.
2867                  */
2868                 for_each_active_dev_scope(rmrr->devices,
2869                                           rmrr->devices_cnt, i, tmp)
2870                         if (tmp == dev) {
2871                                 rcu_read_unlock();
2872                                 return true;
2873                         }
2874         }
2875         rcu_read_unlock();
2876         return false;
2877 }
2878
2879 /*
2880  * There are a couple cases where we need to restrict the functionality of
2881  * devices associated with RMRRs.  The first is when evaluating a device for
2882  * identity mapping because problems exist when devices are moved in and out
2883  * of domains and their respective RMRR information is lost.  This means that
2884  * a device with associated RMRRs will never be in a "passthrough" domain.
2885  * The second is use of the device through the IOMMU API.  This interface
2886  * expects to have full control of the IOVA space for the device.  We cannot
2887  * satisfy both the requirement that RMRR access is maintained and have an
2888  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2889  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2890  * We therefore prevent devices associated with an RMRR from participating in
2891  * the IOMMU API, which eliminates them from device assignment.
2892  *
2893  * In both cases we assume that PCI USB devices with RMRRs have them largely
2894  * for historical reasons and that the RMRR space is not actively used post
2895  * boot.  This exclusion may change if vendors begin to abuse it.
2896  *
2897  * The same exception is made for graphics devices, with the requirement that
2898  * any use of the RMRR regions will be torn down before assigning the device
2899  * to a guest.
2900  */
2901 static bool device_is_rmrr_locked(struct device *dev)
2902 {
2903         if (!device_has_rmrr(dev))
2904                 return false;
2905
2906         if (dev_is_pci(dev)) {
2907                 struct pci_dev *pdev = to_pci_dev(dev);
2908
2909                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2910                         return false;
2911         }
2912
2913         return true;
2914 }
2915
2916 static int iommu_should_identity_map(struct device *dev, int startup)
2917 {
2918
2919         if (dev_is_pci(dev)) {
2920                 struct pci_dev *pdev = to_pci_dev(dev);
2921
2922                 if (device_is_rmrr_locked(dev))
2923                         return 0;
2924
2925                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2926                         return 1;
2927
2928                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2929                         return 1;
2930
2931                 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2932                         return 0;
2933
2934                 /*
2935                  * We want to start off with all devices in the 1:1 domain, and
2936                  * take them out later if we find they can't access all of memory.
2937                  *
2938                  * However, we can't do this for PCI devices behind bridges,
2939                  * because all PCI devices behind the same bridge will end up
2940                  * with the same source-id on their transactions.
2941                  *
2942                  * Practically speaking, we can't change things around for these
2943                  * devices at run-time, because we can't be sure there'll be no
2944                  * DMA transactions in flight for any of their siblings.
2945                  *
2946                  * So PCI devices (unless they're on the root bus) as well as
2947                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2948                  * the 1:1 domain, just in _case_ one of their siblings turns out
2949                  * not to be able to map all of memory.
2950                  */
2951                 if (!pci_is_pcie(pdev)) {
2952                         if (!pci_is_root_bus(pdev->bus))
2953                                 return 0;
2954                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2955                                 return 0;
2956                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2957                         return 0;
2958         } else {
2959                 if (device_has_rmrr(dev))
2960                         return 0;
2961         }
2962
2963         /*
2964          * At boot time, we don't yet know if devices will be 64-bit capable.
2965          * Assume that they will — if they turn out not to be, then we can
2966          * take them out of the 1:1 domain later.
2967          */
2968         if (!startup) {
2969                 /*
2970                  * If the device's dma_mask is less than the system's memory
2971                  * size then this is not a candidate for identity mapping.
2972                  */
2973                 u64 dma_mask = *dev->dma_mask;
2974
2975                 if (dev->coherent_dma_mask &&
2976                     dev->coherent_dma_mask < dma_mask)
2977                         dma_mask = dev->coherent_dma_mask;
2978
2979                 return dma_mask >= dma_get_required_mask(dev);
2980         }
2981
2982         return 1;
2983 }
2984
2985 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2986 {
2987         int ret;
2988
2989         if (!iommu_should_identity_map(dev, 1))
2990                 return 0;
2991
2992         ret = domain_add_dev_info(si_domain, dev);
2993         if (!ret)
2994                 pr_info("%s identity mapping for device %s\n",
2995                         hw ? "Hardware" : "Software", dev_name(dev));
2996         else if (ret == -ENODEV)
2997                 /* device not associated with an iommu */
2998                 ret = 0;
2999
3000         return ret;
3001 }
3002
3003
3004 static int __init iommu_prepare_static_identity_mapping(int hw)
3005 {
3006         struct pci_dev *pdev = NULL;
3007         struct dmar_drhd_unit *drhd;
3008         struct intel_iommu *iommu;
3009         struct device *dev;
3010         int i;
3011         int ret = 0;
3012
3013         for_each_pci_dev(pdev) {
3014                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
3015                 if (ret)
3016                         return ret;
3017         }
3018
3019         for_each_active_iommu(iommu, drhd)
3020                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
3021                         struct acpi_device_physical_node *pn;
3022                         struct acpi_device *adev;
3023
3024                         if (dev->bus != &acpi_bus_type)
3025                                 continue;
3026
3027                         adev= to_acpi_device(dev);
3028                         mutex_lock(&adev->physical_node_lock);
3029                         list_for_each_entry(pn, &adev->physical_node_list, node) {
3030                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
3031                                 if (ret)
3032                                         break;
3033                         }
3034                         mutex_unlock(&adev->physical_node_lock);
3035                         if (ret)
3036                                 return ret;
3037                 }
3038
3039         return 0;
3040 }
3041
3042 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3043 {
3044         /*
3045          * Start from the sane iommu hardware state.
3046          * If the queued invalidation is already initialized by us
3047          * (for example, while enabling interrupt-remapping) then
3048          * we got the things already rolling from a sane state.
3049          */
3050         if (!iommu->qi) {
3051                 /*
3052                  * Clear any previous faults.
3053                  */
3054                 dmar_fault(-1, iommu);
3055                 /*
3056                  * Disable queued invalidation if supported and already enabled
3057                  * before OS handover.
3058                  */
3059                 dmar_disable_qi(iommu);
3060         }
3061
3062         if (dmar_enable_qi(iommu)) {
3063                 /*
3064                  * Queued Invalidate not enabled, use Register Based Invalidate
3065                  */
3066                 iommu->flush.flush_context = __iommu_flush_context;
3067                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3068                 pr_info("%s: Using Register based invalidation\n",
3069                         iommu->name);
3070         } else {
3071                 iommu->flush.flush_context = qi_flush_context;
3072                 iommu->flush.flush_iotlb = qi_flush_iotlb;
3073                 pr_info("%s: Using Queued invalidation\n", iommu->name);
3074         }
3075 }
3076
3077 static int copy_context_table(struct intel_iommu *iommu,
3078                               struct root_entry *old_re,
3079                               struct context_entry **tbl,
3080                               int bus, bool ext)
3081 {
3082         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3083         struct context_entry *new_ce = NULL, ce;
3084         struct context_entry *old_ce = NULL;
3085         struct root_entry re;
3086         phys_addr_t old_ce_phys;
3087
3088         tbl_idx = ext ? bus * 2 : bus;
3089         memcpy(&re, old_re, sizeof(re));
3090
3091         for (devfn = 0; devfn < 256; devfn++) {
3092                 /* First calculate the correct index */
3093                 idx = (ext ? devfn * 2 : devfn) % 256;
3094
3095                 if (idx == 0) {
3096                         /* First save what we may have and clean up */
3097                         if (new_ce) {
3098                                 tbl[tbl_idx] = new_ce;
3099                                 __iommu_flush_cache(iommu, new_ce,
3100                                                     VTD_PAGE_SIZE);
3101                                 pos = 1;
3102                         }
3103
3104                         if (old_ce)
3105                                 memunmap(old_ce);
3106
3107                         ret = 0;
3108                         if (devfn < 0x80)
3109                                 old_ce_phys = root_entry_lctp(&re);
3110                         else
3111                                 old_ce_phys = root_entry_uctp(&re);
3112
3113                         if (!old_ce_phys) {
3114                                 if (ext && devfn == 0) {
3115                                         /* No LCTP, try UCTP */
3116                                         devfn = 0x7f;
3117                                         continue;
3118                                 } else {
3119                                         goto out;
3120                                 }
3121                         }
3122
3123                         ret = -ENOMEM;
3124                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3125                                         MEMREMAP_WB);
3126                         if (!old_ce)
3127                                 goto out;
3128
3129                         new_ce = alloc_pgtable_page(iommu->node);
3130                         if (!new_ce)
3131                                 goto out_unmap;
3132
3133                         ret = 0;
3134                 }
3135
3136                 /* Now copy the context entry */
3137                 memcpy(&ce, old_ce + idx, sizeof(ce));
3138
3139                 if (!__context_present(&ce))
3140                         continue;
3141
3142                 did = context_domain_id(&ce);
3143                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3144                         set_bit(did, iommu->domain_ids);
3145
3146                 /*
3147                  * We need a marker for copied context entries. This
3148                  * marker needs to work for the old format as well as
3149                  * for extended context entries.
3150                  *
3151                  * Bit 67 of the context entry is used. In the old
3152                  * format this bit is available to software, in the
3153                  * extended format it is the PGE bit, but PGE is ignored
3154                  * by HW if PASIDs are disabled (and thus still
3155                  * available).
3156                  *
3157                  * So disable PASIDs first and then mark the entry
3158                  * copied. This means that we don't copy PASID
3159                  * translations from the old kernel, but this is fine as
3160                  * faults there are not fatal.
3161                  */
3162                 context_clear_pasid_enable(&ce);
3163                 context_set_copied(&ce);
3164
3165                 new_ce[idx] = ce;
3166         }
3167
3168         tbl[tbl_idx + pos] = new_ce;
3169
3170         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3171
3172 out_unmap:
3173         memunmap(old_ce);
3174
3175 out:
3176         return ret;
3177 }
3178
3179 static int copy_translation_tables(struct intel_iommu *iommu)
3180 {
3181         struct context_entry **ctxt_tbls;
3182         struct root_entry *old_rt;
3183         phys_addr_t old_rt_phys;
3184         int ctxt_table_entries;
3185         unsigned long flags;
3186         u64 rtaddr_reg;
3187         int bus, ret;
3188         bool new_ext, ext;
3189
3190         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3191         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3192         new_ext    = !!ecap_ecs(iommu->ecap);
3193
3194         /*
3195          * The RTT bit can only be changed when translation is disabled,
3196          * but disabling translation means to open a window for data
3197          * corruption. So bail out and don't copy anything if we would
3198          * have to change the bit.
3199          */
3200         if (new_ext != ext)
3201                 return -EINVAL;
3202
3203         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3204         if (!old_rt_phys)
3205                 return -EINVAL;
3206
3207         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3208         if (!old_rt)
3209                 return -ENOMEM;
3210
3211         /* This is too big for the stack - allocate it from slab */
3212         ctxt_table_entries = ext ? 512 : 256;
3213         ret = -ENOMEM;
3214         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3215         if (!ctxt_tbls)
3216                 goto out_unmap;
3217
3218         for (bus = 0; bus < 256; bus++) {
3219                 ret = copy_context_table(iommu, &old_rt[bus],
3220                                          ctxt_tbls, bus, ext);
3221                 if (ret) {
3222                         pr_err("%s: Failed to copy context table for bus %d\n",
3223                                 iommu->name, bus);
3224                         continue;
3225                 }
3226         }
3227
3228         spin_lock_irqsave(&iommu->lock, flags);
3229
3230         /* Context tables are copied, now write them to the root_entry table */
3231         for (bus = 0; bus < 256; bus++) {
3232                 int idx = ext ? bus * 2 : bus;
3233                 u64 val;
3234
3235                 if (ctxt_tbls[idx]) {
3236                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3237                         iommu->root_entry[bus].lo = val;
3238                 }
3239
3240                 if (!ext || !ctxt_tbls[idx + 1])
3241                         continue;
3242
3243                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3244                 iommu->root_entry[bus].hi = val;
3245         }
3246
3247         spin_unlock_irqrestore(&iommu->lock, flags);
3248
3249         kfree(ctxt_tbls);
3250
3251         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3252
3253         ret = 0;
3254
3255 out_unmap:
3256         memunmap(old_rt);
3257
3258         return ret;
3259 }
3260
3261 static int __init init_dmars(void)
3262 {
3263         struct dmar_drhd_unit *drhd;
3264         struct dmar_rmrr_unit *rmrr;
3265         bool copied_tables = false;
3266         struct device *dev;
3267         struct intel_iommu *iommu;
3268         int i, ret;
3269
3270         /*
3271          * for each drhd
3272          *    allocate root
3273          *    initialize and program root entry to not present
3274          * endfor
3275          */
3276         for_each_drhd_unit(drhd) {
3277                 /*
3278                  * lock not needed as this is only incremented in the single
3279                  * threaded kernel __init code path all other access are read
3280                  * only
3281                  */
3282                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3283                         g_num_of_iommus++;
3284                         continue;
3285                 }
3286                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3287         }
3288
3289         /* Preallocate enough resources for IOMMU hot-addition */
3290         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3291                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3292
3293         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3294                         GFP_KERNEL);
3295         if (!g_iommus) {
3296                 pr_err("Allocating global iommu array failed\n");
3297                 ret = -ENOMEM;
3298                 goto error;
3299         }
3300
3301         for_each_active_iommu(iommu, drhd) {
3302                 /*
3303                  * Find the max pasid size of all IOMMU's in the system.
3304                  * We need to ensure the system pasid table is no bigger
3305                  * than the smallest supported.
3306                  */
3307                 if (pasid_enabled(iommu)) {
3308                         u32 temp = 2 << ecap_pss(iommu->ecap);
3309
3310                         intel_pasid_max_id = min_t(u32, temp,
3311                                                    intel_pasid_max_id);
3312                 }
3313
3314                 g_iommus[iommu->seq_id] = iommu;
3315
3316                 intel_iommu_init_qi(iommu);
3317
3318                 ret = iommu_init_domains(iommu);
3319                 if (ret)
3320                         goto free_iommu;
3321
3322                 init_translation_status(iommu);
3323
3324                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3325                         iommu_disable_translation(iommu);
3326                         clear_translation_pre_enabled(iommu);
3327                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3328                                 iommu->name);
3329                 }
3330
3331                 /*
3332                  * TBD:
3333                  * we could share the same root & context tables
3334                  * among all IOMMU's. Need to Split it later.
3335                  */
3336                 ret = iommu_alloc_root_entry(iommu);
3337                 if (ret)
3338                         goto free_iommu;
3339
3340                 if (translation_pre_enabled(iommu)) {
3341                         pr_info("Translation already enabled - trying to copy translation structures\n");
3342
3343                         ret = copy_translation_tables(iommu);
3344                         if (ret) {
3345                                 /*
3346                                  * We found the IOMMU with translation
3347                                  * enabled - but failed to copy over the
3348                                  * old root-entry table. Try to proceed
3349                                  * by disabling translation now and
3350                                  * allocating a clean root-entry table.
3351                                  * This might cause DMAR faults, but
3352                                  * probably the dump will still succeed.
3353                                  */
3354                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3355                                        iommu->name);
3356                                 iommu_disable_translation(iommu);
3357                                 clear_translation_pre_enabled(iommu);
3358                         } else {
3359                                 pr_info("Copied translation tables from previous kernel for %s\n",
3360                                         iommu->name);
3361                                 copied_tables = true;
3362                         }
3363                 }
3364
3365                 if (!ecap_pass_through(iommu->ecap))
3366                         hw_pass_through = 0;
3367 #ifdef CONFIG_INTEL_IOMMU_SVM
3368                 if (pasid_enabled(iommu))
3369                         intel_svm_init(iommu);
3370 #endif
3371         }
3372
3373         /*
3374          * Now that qi is enabled on all iommus, set the root entry and flush
3375          * caches. This is required on some Intel X58 chipsets, otherwise the
3376          * flush_context function will loop forever and the boot hangs.
3377          */
3378         for_each_active_iommu(iommu, drhd) {
3379                 iommu_flush_write_buffer(iommu);
3380                 iommu_set_root_entry(iommu);
3381                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3382                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3383         }
3384
3385         if (iommu_pass_through)
3386                 iommu_identity_mapping |= IDENTMAP_ALL;
3387
3388 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3389         dmar_map_gfx = 0;
3390 #endif
3391
3392         if (!dmar_map_gfx)
3393                 iommu_identity_mapping |= IDENTMAP_GFX;
3394
3395         check_tylersburg_isoch();
3396
3397         if (iommu_identity_mapping) {
3398                 ret = si_domain_init(hw_pass_through);
3399                 if (ret)
3400                         goto free_iommu;
3401         }
3402
3403
3404         /*
3405          * If we copied translations from a previous kernel in the kdump
3406          * case, we can not assign the devices to domains now, as that
3407          * would eliminate the old mappings. So skip this part and defer
3408          * the assignment to device driver initialization time.
3409          */
3410         if (copied_tables)
3411                 goto domains_done;
3412
3413         /*
3414          * If pass through is not set or not enabled, setup context entries for
3415          * identity mappings for rmrr, gfx, and isa and may fall back to static
3416          * identity mapping if iommu_identity_mapping is set.
3417          */
3418         if (iommu_identity_mapping) {
3419                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3420                 if (ret) {
3421                         pr_crit("Failed to setup IOMMU pass-through\n");
3422                         goto free_iommu;
3423                 }
3424         }
3425         /*
3426          * For each rmrr
3427          *   for each dev attached to rmrr
3428          *   do
3429          *     locate drhd for dev, alloc domain for dev
3430          *     allocate free domain
3431          *     allocate page table entries for rmrr
3432          *     if context not allocated for bus
3433          *           allocate and init context
3434          *           set present in root table for this bus
3435          *     init context with domain, translation etc
3436          *    endfor
3437          * endfor
3438          */
3439         pr_info("Setting RMRR:\n");
3440         for_each_rmrr_units(rmrr) {
3441                 /* some BIOS lists non-exist devices in DMAR table. */
3442                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3443                                           i, dev) {
3444                         ret = iommu_prepare_rmrr_dev(rmrr, dev);
3445                         if (ret)
3446                                 pr_err("Mapping reserved region failed\n");
3447                 }
3448         }
3449
3450         iommu_prepare_isa();
3451
3452 domains_done:
3453
3454         /*
3455          * for each drhd
3456          *   enable fault log
3457          *   global invalidate context cache
3458          *   global invalidate iotlb
3459          *   enable translation
3460          */
3461         for_each_iommu(iommu, drhd) {
3462                 if (drhd->ignored) {
3463                         /*
3464                          * we always have to disable PMRs or DMA may fail on
3465                          * this device
3466                          */
3467                         if (force_on)
3468                                 iommu_disable_protect_mem_regions(iommu);
3469                         continue;
3470                 }
3471
3472                 iommu_flush_write_buffer(iommu);
3473
3474 #ifdef CONFIG_INTEL_IOMMU_SVM
3475                 if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
3476                         ret = intel_svm_enable_prq(iommu);
3477                         if (ret)
3478                                 goto free_iommu;
3479                 }
3480 #endif
3481                 ret = dmar_set_interrupt(iommu);
3482                 if (ret)
3483                         goto free_iommu;
3484
3485                 if (!translation_pre_enabled(iommu))
3486                         iommu_enable_translation(iommu);
3487
3488                 iommu_disable_protect_mem_regions(iommu);
3489         }
3490
3491         return 0;
3492
3493 free_iommu:
3494         for_each_active_iommu(iommu, drhd) {
3495                 disable_dmar_iommu(iommu);
3496                 free_dmar_iommu(iommu);
3497         }
3498
3499         kfree(g_iommus);
3500
3501 error:
3502         return ret;
3503 }
3504
3505 /* This takes a number of _MM_ pages, not VTD pages */
3506 static unsigned long intel_alloc_iova(struct device *dev,
3507                                      struct dmar_domain *domain,
3508                                      unsigned long nrpages, uint64_t dma_mask)
3509 {
3510         unsigned long iova_pfn = 0;
3511
3512         /* Restrict dma_mask to the width that the iommu can handle */
3513         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3514         /* Ensure we reserve the whole size-aligned region */
3515         nrpages = __roundup_pow_of_two(nrpages);
3516
3517         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3518                 /*
3519                  * First try to allocate an io virtual address in
3520                  * DMA_BIT_MASK(32) and if that fails then try allocating
3521                  * from higher range
3522                  */
3523                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3524                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3525                 if (iova_pfn)
3526                         return iova_pfn;
3527         }
3528         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3529                                    IOVA_PFN(dma_mask), true);
3530         if (unlikely(!iova_pfn)) {
3531                 pr_err("Allocating %ld-page iova for %s failed",
3532                        nrpages, dev_name(dev));
3533                 return 0;
3534         }
3535
3536         return iova_pfn;
3537 }
3538
3539 struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3540 {
3541         struct dmar_domain *domain, *tmp;
3542         struct dmar_rmrr_unit *rmrr;
3543         struct device *i_dev;
3544         int i, ret;
3545
3546         domain = find_domain(dev);
3547         if (domain)
3548                 goto out;
3549
3550         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3551         if (!domain)
3552                 goto out;
3553
3554         /* We have a new domain - setup possible RMRRs for the device */
3555         rcu_read_lock();
3556         for_each_rmrr_units(rmrr) {
3557                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3558                                           i, i_dev) {
3559                         if (i_dev != dev)
3560                                 continue;
3561
3562                         ret = domain_prepare_identity_map(dev, domain,
3563                                                           rmrr->base_address,
3564                                                           rmrr->end_address);
3565                         if (ret)
3566                                 dev_err(dev, "Mapping reserved region failed\n");
3567                 }
3568         }
3569         rcu_read_unlock();
3570
3571         tmp = set_domain_for_dev(dev, domain);
3572         if (!tmp || domain != tmp) {
3573                 domain_exit(domain);
3574                 domain = tmp;
3575         }
3576
3577 out:
3578
3579         if (!domain)
3580                 pr_err("Allocating domain for %s failed\n", dev_name(dev));
3581
3582
3583         return domain;
3584 }
3585
3586 /* Check if the dev needs to go through non-identity map and unmap process.*/
3587 static int iommu_no_mapping(struct device *dev)
3588 {
3589         int found;
3590
3591         if (iommu_dummy(dev))
3592                 return 1;
3593
3594         if (!iommu_identity_mapping)
3595                 return 0;
3596
3597         found = identity_mapping(dev);
3598         if (found) {
3599                 if (iommu_should_identity_map(dev, 0))
3600                         return 1;
3601                 else {
3602                         /*
3603                          * 32 bit DMA is removed from si_domain and fall back
3604                          * to non-identity mapping.
3605                          */
3606                         dmar_remove_one_dev_info(si_domain, dev);
3607                         pr_info("32bit %s uses non-identity mapping\n",
3608                                 dev_name(dev));
3609                         return 0;
3610                 }
3611         } else {
3612                 /*
3613                  * In case of a detached 64 bit DMA device from vm, the device
3614                  * is put into si_domain for identity mapping.
3615                  */
3616                 if (iommu_should_identity_map(dev, 0)) {
3617                         int ret;
3618                         ret = domain_add_dev_info(si_domain, dev);
3619                         if (!ret) {
3620                                 pr_info("64bit %s uses identity mapping\n",
3621                                         dev_name(dev));
3622                                 return 1;
3623                         }
3624                 }
3625         }
3626
3627         return 0;
3628 }
3629
3630 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3631                                      size_t size, int dir, u64 dma_mask)
3632 {
3633         struct dmar_domain *domain;
3634         phys_addr_t start_paddr;
3635         unsigned long iova_pfn;
3636         int prot = 0;
3637         int ret;
3638         struct intel_iommu *iommu;
3639         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3640
3641         BUG_ON(dir == DMA_NONE);
3642
3643         if (iommu_no_mapping(dev))
3644                 return paddr;
3645
3646         domain = get_valid_domain_for_dev(dev);
3647         if (!domain)
3648                 return 0;
3649
3650         iommu = domain_get_iommu(domain);
3651         size = aligned_nrpages(paddr, size);
3652
3653         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3654         if (!iova_pfn)
3655                 goto error;
3656
3657         /*
3658          * Check if DMAR supports zero-length reads on write only
3659          * mappings..
3660          */
3661         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3662                         !cap_zlr(iommu->cap))
3663                 prot |= DMA_PTE_READ;
3664         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3665                 prot |= DMA_PTE_WRITE;
3666         /*
3667          * paddr - (paddr + size) might be partial page, we should map the whole
3668          * page.  Note: if two part of one page are separately mapped, we
3669          * might have two guest_addr mapping to the same host paddr, but this
3670          * is not a big problem
3671          */
3672         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3673                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3674         if (ret)
3675                 goto error;
3676
3677         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3678         start_paddr += paddr & ~PAGE_MASK;
3679         return start_paddr;
3680
3681 error:
3682         if (iova_pfn)
3683                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3684         pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3685                 dev_name(dev), size, (unsigned long long)paddr, dir);
3686         return 0;
3687 }
3688
3689 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3690                                  unsigned long offset, size_t size,
3691                                  enum dma_data_direction dir,
3692                                  unsigned long attrs)
3693 {
3694         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3695                                   dir, *dev->dma_mask);
3696 }
3697
3698 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3699 {
3700         struct dmar_domain *domain;
3701         unsigned long start_pfn, last_pfn;
3702         unsigned long nrpages;
3703         unsigned long iova_pfn;
3704         struct intel_iommu *iommu;
3705         struct page *freelist;
3706
3707         if (iommu_no_mapping(dev))
3708                 return;
3709
3710         domain = find_domain(dev);
3711         BUG_ON(!domain);
3712
3713         iommu = domain_get_iommu(domain);
3714
3715         iova_pfn = IOVA_PFN(dev_addr);
3716
3717         nrpages = aligned_nrpages(dev_addr, size);
3718         start_pfn = mm_to_dma_pfn(iova_pfn);
3719         last_pfn = start_pfn + nrpages - 1;
3720
3721         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3722                  dev_name(dev), start_pfn, last_pfn);
3723
3724         freelist = domain_unmap(domain, start_pfn, last_pfn);
3725
3726         if (intel_iommu_strict || !has_iova_flush_queue(&domain->iovad)) {
3727                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3728                                       nrpages, !freelist, 0);
3729                 /* free iova */
3730                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3731                 dma_free_pagelist(freelist);
3732         } else {
3733                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3734                            (unsigned long)freelist);
3735                 /*
3736                  * queue up the release of the unmap to save the 1/6th of the
3737                  * cpu used up by the iotlb flush operation...
3738                  */
3739         }
3740 }
3741
3742 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3743                              size_t size, enum dma_data_direction dir,
3744                              unsigned long attrs)
3745 {
3746         intel_unmap(dev, dev_addr, size);
3747 }
3748
3749 static void *intel_alloc_coherent(struct device *dev, size_t size,
3750                                   dma_addr_t *dma_handle, gfp_t flags,
3751                                   unsigned long attrs)
3752 {
3753         struct page *page = NULL;
3754         int order;
3755
3756         size = PAGE_ALIGN(size);
3757         order = get_order(size);
3758
3759         if (!iommu_no_mapping(dev))
3760                 flags &= ~(GFP_DMA | GFP_DMA32);
3761         else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3762                 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3763                         flags |= GFP_DMA;
3764                 else
3765                         flags |= GFP_DMA32;
3766         }
3767
3768         if (gfpflags_allow_blocking(flags)) {
3769                 unsigned int count = size >> PAGE_SHIFT;
3770
3771                 page = dma_alloc_from_contiguous(dev, count, order,
3772                                                  flags & __GFP_NOWARN);
3773                 if (page && iommu_no_mapping(dev) &&
3774                     page_to_phys(page) + size > dev->coherent_dma_mask) {
3775                         dma_release_from_contiguous(dev, page, count);
3776                         page = NULL;
3777                 }
3778         }
3779
3780         if (!page)
3781                 page = alloc_pages(flags, order);
3782         if (!page)
3783                 return NULL;
3784         memset(page_address(page), 0, size);
3785
3786         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3787                                          DMA_BIDIRECTIONAL,
3788                                          dev->coherent_dma_mask);
3789         if (*dma_handle)
3790                 return page_address(page);
3791         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3792                 __free_pages(page, order);
3793
3794         return NULL;
3795 }
3796
3797 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3798                                 dma_addr_t dma_handle, unsigned long attrs)
3799 {
3800         int order;
3801         struct page *page = virt_to_page(vaddr);
3802
3803         size = PAGE_ALIGN(size);
3804         order = get_order(size);
3805
3806         intel_unmap(dev, dma_handle, size);
3807         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3808                 __free_pages(page, order);
3809 }
3810
3811 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3812                            int nelems, enum dma_data_direction dir,
3813                            unsigned long attrs)
3814 {
3815         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3816         unsigned long nrpages = 0;
3817         struct scatterlist *sg;
3818         int i;
3819
3820         for_each_sg(sglist, sg, nelems, i) {
3821                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3822         }
3823
3824         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3825 }
3826
3827 static int intel_nontranslate_map_sg(struct device *hddev,
3828         struct scatterlist *sglist, int nelems, int dir)
3829 {
3830         int i;
3831         struct scatterlist *sg;
3832
3833         for_each_sg(sglist, sg, nelems, i) {
3834                 BUG_ON(!sg_page(sg));
3835                 sg->dma_address = sg_phys(sg);
3836                 sg->dma_length = sg->length;
3837         }
3838         return nelems;
3839 }
3840
3841 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3842                         enum dma_data_direction dir, unsigned long attrs)
3843 {
3844         int i;
3845         struct dmar_domain *domain;
3846         size_t size = 0;
3847         int prot = 0;
3848         unsigned long iova_pfn;
3849         int ret;
3850         struct scatterlist *sg;
3851         unsigned long start_vpfn;
3852         struct intel_iommu *iommu;
3853
3854         BUG_ON(dir == DMA_NONE);
3855         if (iommu_no_mapping(dev))
3856                 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3857
3858         domain = get_valid_domain_for_dev(dev);
3859         if (!domain)
3860                 return 0;
3861
3862         iommu = domain_get_iommu(domain);
3863
3864         for_each_sg(sglist, sg, nelems, i)
3865                 size += aligned_nrpages(sg->offset, sg->length);
3866
3867         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3868                                 *dev->dma_mask);
3869         if (!iova_pfn) {
3870                 sglist->dma_length = 0;
3871                 return 0;
3872         }
3873
3874         /*
3875          * Check if DMAR supports zero-length reads on write only
3876          * mappings..
3877          */
3878         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3879                         !cap_zlr(iommu->cap))
3880                 prot |= DMA_PTE_READ;
3881         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3882                 prot |= DMA_PTE_WRITE;
3883
3884         start_vpfn = mm_to_dma_pfn(iova_pfn);
3885
3886         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3887         if (unlikely(ret)) {
3888                 dma_pte_free_pagetable(domain, start_vpfn,
3889                                        start_vpfn + size - 1,
3890                                        agaw_to_level(domain->agaw) + 1);
3891                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3892                 return 0;
3893         }
3894
3895         return nelems;
3896 }
3897
3898 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3899 {
3900         return !dma_addr;
3901 }
3902
3903 const struct dma_map_ops intel_dma_ops = {
3904         .alloc = intel_alloc_coherent,
3905         .free = intel_free_coherent,
3906         .map_sg = intel_map_sg,
3907         .unmap_sg = intel_unmap_sg,
3908         .map_page = intel_map_page,
3909         .unmap_page = intel_unmap_page,
3910         .mapping_error = intel_mapping_error,
3911 #ifdef CONFIG_X86
3912         .dma_supported = dma_direct_supported,
3913 #endif
3914 };
3915
3916 static inline int iommu_domain_cache_init(void)
3917 {
3918         int ret = 0;
3919
3920         iommu_domain_cache = kmem_cache_create("iommu_domain",
3921                                          sizeof(struct dmar_domain),
3922                                          0,
3923                                          SLAB_HWCACHE_ALIGN,
3924
3925                                          NULL);
3926         if (!iommu_domain_cache) {
3927                 pr_err("Couldn't create iommu_domain cache\n");
3928                 ret = -ENOMEM;
3929         }
3930
3931         return ret;
3932 }
3933
3934 static inline int iommu_devinfo_cache_init(void)
3935 {
3936         int ret = 0;
3937
3938         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3939                                          sizeof(struct device_domain_info),
3940                                          0,
3941                                          SLAB_HWCACHE_ALIGN,
3942                                          NULL);
3943         if (!iommu_devinfo_cache) {
3944                 pr_err("Couldn't create devinfo cache\n");
3945                 ret = -ENOMEM;
3946         }
3947
3948         return ret;
3949 }
3950
3951 static int __init iommu_init_mempool(void)
3952 {
3953         int ret;
3954         ret = iova_cache_get();
3955         if (ret)
3956                 return ret;
3957
3958         ret = iommu_domain_cache_init();
3959         if (ret)
3960                 goto domain_error;
3961
3962         ret = iommu_devinfo_cache_init();
3963         if (!ret)
3964                 return ret;
3965
3966         kmem_cache_destroy(iommu_domain_cache);
3967 domain_error:
3968         iova_cache_put();
3969
3970         return -ENOMEM;
3971 }
3972
3973 static void __init iommu_exit_mempool(void)
3974 {
3975         kmem_cache_destroy(iommu_devinfo_cache);
3976         kmem_cache_destroy(iommu_domain_cache);
3977         iova_cache_put();
3978 }
3979
3980 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3981 {
3982         struct dmar_drhd_unit *drhd;
3983         u32 vtbar;
3984         int rc;
3985
3986         /* We know that this device on this chipset has its own IOMMU.
3987          * If we find it under a different IOMMU, then the BIOS is lying
3988          * to us. Hope that the IOMMU for this device is actually
3989          * disabled, and it needs no translation...
3990          */
3991         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3992         if (rc) {
3993                 /* "can't" happen */
3994                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3995                 return;
3996         }
3997         vtbar &= 0xffff0000;
3998
3999         /* we know that the this iommu should be at offset 0xa000 from vtbar */
4000         drhd = dmar_find_matched_drhd_unit(pdev);
4001         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
4002                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
4003                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4004                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4005         }
4006 }
4007 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4008
4009 static void __init init_no_remapping_devices(void)
4010 {
4011         struct dmar_drhd_unit *drhd;
4012         struct device *dev;
4013         int i;
4014
4015         for_each_drhd_unit(drhd) {
4016                 if (!drhd->include_all) {
4017                         for_each_active_dev_scope(drhd->devices,
4018                                                   drhd->devices_cnt, i, dev)
4019                                 break;
4020                         /* ignore DMAR unit if no devices exist */
4021                         if (i == drhd->devices_cnt)
4022                                 drhd->ignored = 1;
4023                 }
4024         }
4025
4026         for_each_active_drhd_unit(drhd) {
4027                 if (drhd->include_all)
4028                         continue;
4029
4030                 for_each_active_dev_scope(drhd->devices,
4031                                           drhd->devices_cnt, i, dev)
4032                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4033                                 break;
4034                 if (i < drhd->devices_cnt)
4035                         continue;
4036
4037                 /* This IOMMU has *only* gfx devices. Either bypass it or
4038                    set the gfx_mapped flag, as appropriate */
4039                 if (!dmar_map_gfx) {
4040                         drhd->ignored = 1;
4041                         for_each_active_dev_scope(drhd->devices,
4042                                                   drhd->devices_cnt, i, dev)
4043                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4044                 }
4045         }
4046 }
4047
4048 #ifdef CONFIG_SUSPEND
4049 static int init_iommu_hw(void)
4050 {
4051         struct dmar_drhd_unit *drhd;
4052         struct intel_iommu *iommu = NULL;
4053
4054         for_each_active_iommu(iommu, drhd)
4055                 if (iommu->qi)
4056                         dmar_reenable_qi(iommu);
4057
4058         for_each_iommu(iommu, drhd) {
4059                 if (drhd->ignored) {
4060                         /*
4061                          * we always have to disable PMRs or DMA may fail on
4062                          * this device
4063                          */
4064                         if (force_on)
4065                                 iommu_disable_protect_mem_regions(iommu);
4066                         continue;
4067                 }
4068
4069                 iommu_flush_write_buffer(iommu);
4070
4071                 iommu_set_root_entry(iommu);
4072
4073                 iommu->flush.flush_context(iommu, 0, 0, 0,
4074                                            DMA_CCMD_GLOBAL_INVL);
4075                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4076                 iommu_enable_translation(iommu);
4077                 iommu_disable_protect_mem_regions(iommu);
4078         }
4079
4080         return 0;
4081 }
4082
4083 static void iommu_flush_all(void)
4084 {
4085         struct dmar_drhd_unit *drhd;
4086         struct intel_iommu *iommu;
4087
4088         for_each_active_iommu(iommu, drhd) {
4089                 iommu->flush.flush_context(iommu, 0, 0, 0,
4090                                            DMA_CCMD_GLOBAL_INVL);
4091                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4092                                          DMA_TLB_GLOBAL_FLUSH);
4093         }
4094 }
4095
4096 static int iommu_suspend(void)
4097 {
4098         struct dmar_drhd_unit *drhd;
4099         struct intel_iommu *iommu = NULL;
4100         unsigned long flag;
4101
4102         for_each_active_iommu(iommu, drhd) {
4103                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4104                                                  GFP_ATOMIC);
4105                 if (!iommu->iommu_state)
4106                         goto nomem;
4107         }
4108
4109         iommu_flush_all();
4110
4111         for_each_active_iommu(iommu, drhd) {
4112                 iommu_disable_translation(iommu);
4113
4114                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4115
4116                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4117                         readl(iommu->reg + DMAR_FECTL_REG);
4118                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4119                         readl(iommu->reg + DMAR_FEDATA_REG);
4120                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4121                         readl(iommu->reg + DMAR_FEADDR_REG);
4122                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4123                         readl(iommu->reg + DMAR_FEUADDR_REG);
4124
4125                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4126         }
4127         return 0;
4128
4129 nomem:
4130         for_each_active_iommu(iommu, drhd)
4131                 kfree(iommu->iommu_state);
4132
4133         return -ENOMEM;
4134 }
4135
4136 static void iommu_resume(void)
4137 {
4138         struct dmar_drhd_unit *drhd;
4139         struct intel_iommu *iommu = NULL;
4140         unsigned long flag;
4141
4142         if (init_iommu_hw()) {
4143                 if (force_on)
4144                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4145                 else
4146                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4147                 return;
4148         }
4149
4150         for_each_active_iommu(iommu, drhd) {
4151
4152                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4153
4154                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4155                         iommu->reg + DMAR_FECTL_REG);
4156                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4157                         iommu->reg + DMAR_FEDATA_REG);
4158                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4159                         iommu->reg + DMAR_FEADDR_REG);
4160                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4161                         iommu->reg + DMAR_FEUADDR_REG);
4162
4163                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4164         }
4165
4166         for_each_active_iommu(iommu, drhd)
4167                 kfree(iommu->iommu_state);
4168 }
4169
4170 static struct syscore_ops iommu_syscore_ops = {
4171         .resume         = iommu_resume,
4172         .suspend        = iommu_suspend,
4173 };
4174
4175 static void __init init_iommu_pm_ops(void)
4176 {
4177         register_syscore_ops(&iommu_syscore_ops);
4178 }
4179
4180 #else
/* Without CONFIG_PM there are no suspend/resume callbacks to register. */
static inline void init_iommu_pm_ops(void)
{
}
4182 #endif  /* CONFIG_PM */
4183
4184
4185 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4186 {
4187         struct acpi_dmar_reserved_memory *rmrr;
4188         struct dmar_rmrr_unit *rmrru;
4189         size_t length;
4190
4191         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4192         if (!rmrru)
4193                 goto out;
4194
4195         rmrru->hdr = header;
4196         rmrr = (struct acpi_dmar_reserved_memory *)header;
4197         rmrru->base_address = rmrr->base_address;
4198         rmrru->end_address = rmrr->end_address;
4199
4200         length = rmrr->end_address - rmrr->base_address + 1;
4201
4202         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4203                                 ((void *)rmrr) + rmrr->header.length,
4204                                 &rmrru->devices_cnt);
4205         if (rmrru->devices_cnt && rmrru->devices == NULL)
4206                 goto free_rmrru;
4207
4208         list_add(&rmrru->list, &dmar_rmrr_units);
4209
4210         return 0;
4211 free_rmrru:
4212         kfree(rmrru);
4213 out:
4214         return -ENOMEM;
4215 }
4216
4217 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4218 {
4219         struct dmar_atsr_unit *atsru;
4220         struct acpi_dmar_atsr *tmp;
4221
4222         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4223                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4224                 if (atsr->segment != tmp->segment)
4225                         continue;
4226                 if (atsr->header.length != tmp->header.length)
4227                         continue;
4228                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4229                         return atsru;
4230         }
4231
4232         return NULL;
4233 }
4234
4235 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4236 {
4237         struct acpi_dmar_atsr *atsr;
4238         struct dmar_atsr_unit *atsru;
4239
4240         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4241                 return 0;
4242
4243         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4244         atsru = dmar_find_atsr(atsr);
4245         if (atsru)
4246                 return 0;
4247
4248         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4249         if (!atsru)
4250                 return -ENOMEM;
4251
4252         /*
4253          * If memory is allocated from slab by ACPI _DSM method, we need to
4254          * copy the memory content because the memory buffer will be freed
4255          * on return.
4256          */
4257         atsru->hdr = (void *)(atsru + 1);
4258         memcpy(atsru->hdr, hdr, hdr->length);
4259         atsru->include_all = atsr->flags & 0x1;
4260         if (!atsru->include_all) {
4261                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4262                                 (void *)atsr + atsr->header.length,
4263                                 &atsru->devices_cnt);
4264                 if (atsru->devices_cnt && atsru->devices == NULL) {
4265                         kfree(atsru);
4266                         return -ENOMEM;
4267                 }
4268         }
4269
4270         list_add_rcu(&atsru->list, &dmar_atsr_units);
4271
4272         return 0;
4273 }
4274
4275 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4276 {
4277         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4278         kfree(atsru);
4279 }
4280
4281 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4282 {
4283         struct acpi_dmar_atsr *atsr;
4284         struct dmar_atsr_unit *atsru;
4285
4286         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4287         atsru = dmar_find_atsr(atsr);
4288         if (atsru) {
4289                 list_del_rcu(&atsru->list);
4290                 synchronize_rcu();
4291                 intel_iommu_free_atsr(atsru);
4292         }
4293
4294         return 0;
4295 }
4296
4297 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4298 {
4299         int i;
4300         struct device *dev;
4301         struct acpi_dmar_atsr *atsr;
4302         struct dmar_atsr_unit *atsru;
4303
4304         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4305         atsru = dmar_find_atsr(atsr);
4306         if (!atsru)
4307                 return 0;
4308
4309         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4310                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4311                                           i, dev)
4312                         return -EBUSY;
4313         }
4314
4315         return 0;
4316 }
4317
4318 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4319 {
4320         int sp, ret = 0;
4321         struct intel_iommu *iommu = dmaru->iommu;
4322
4323         if (g_iommus[iommu->seq_id])
4324                 return 0;
4325
4326         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4327                 pr_warn("%s: Doesn't support hardware pass through.\n",
4328                         iommu->name);
4329                 return -ENXIO;
4330         }
4331         if (!ecap_sc_support(iommu->ecap) &&
4332             domain_update_iommu_snooping(iommu)) {
4333                 pr_warn("%s: Doesn't support snooping.\n",
4334                         iommu->name);
4335                 return -ENXIO;
4336         }
4337         sp = domain_update_iommu_superpage(iommu) - 1;
4338         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4339                 pr_warn("%s: Doesn't support large page.\n",
4340                         iommu->name);
4341                 return -ENXIO;
4342         }
4343
4344         /*
4345          * Disable translation if already enabled prior to OS handover.
4346          */
4347         if (iommu->gcmd & DMA_GCMD_TE)
4348                 iommu_disable_translation(iommu);
4349
4350         g_iommus[iommu->seq_id] = iommu;
4351         ret = iommu_init_domains(iommu);
4352         if (ret == 0)
4353                 ret = iommu_alloc_root_entry(iommu);
4354         if (ret)
4355                 goto out;
4356
4357 #ifdef CONFIG_INTEL_IOMMU_SVM
4358         if (pasid_enabled(iommu))
4359                 intel_svm_init(iommu);
4360 #endif
4361
4362         if (dmaru->ignored) {
4363                 /*
4364                  * we always have to disable PMRs or DMA may fail on this device
4365                  */
4366                 if (force_on)
4367                         iommu_disable_protect_mem_regions(iommu);
4368                 return 0;
4369         }
4370
4371         intel_iommu_init_qi(iommu);
4372         iommu_flush_write_buffer(iommu);
4373
4374 #ifdef CONFIG_INTEL_IOMMU_SVM
4375         if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
4376                 ret = intel_svm_enable_prq(iommu);
4377                 if (ret)
4378                         goto disable_iommu;
4379         }
4380 #endif
4381         ret = dmar_set_interrupt(iommu);
4382         if (ret)
4383                 goto disable_iommu;
4384
4385         iommu_set_root_entry(iommu);
4386         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4387         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4388         iommu_enable_translation(iommu);
4389
4390         iommu_disable_protect_mem_regions(iommu);
4391         return 0;
4392
4393 disable_iommu:
4394         disable_dmar_iommu(iommu);
4395 out:
4396         free_dmar_iommu(iommu);
4397         return ret;
4398 }
4399
4400 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4401 {
4402         int ret = 0;
4403         struct intel_iommu *iommu = dmaru->iommu;
4404
4405         if (!intel_iommu_enabled)
4406                 return 0;
4407         if (iommu == NULL)
4408                 return -EINVAL;
4409
4410         if (insert) {
4411                 ret = intel_iommu_add(dmaru);
4412         } else {
4413                 disable_dmar_iommu(iommu);
4414                 free_dmar_iommu(iommu);
4415         }
4416
4417         return ret;
4418 }
4419
4420 static void intel_iommu_free_dmars(void)
4421 {
4422         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4423         struct dmar_atsr_unit *atsru, *atsr_n;
4424
4425         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4426                 list_del(&rmrru->list);
4427                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4428                 kfree(rmrru);
4429         }
4430
4431         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4432                 list_del(&atsru->list);
4433                 intel_iommu_free_atsr(atsru);
4434         }
4435 }
4436
4437 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4438 {
4439         int i, ret = 1;
4440         struct pci_bus *bus;
4441         struct pci_dev *bridge = NULL;
4442         struct device *tmp;
4443         struct acpi_dmar_atsr *atsr;
4444         struct dmar_atsr_unit *atsru;
4445
4446         dev = pci_physfn(dev);
4447         for (bus = dev->bus; bus; bus = bus->parent) {
4448                 bridge = bus->self;
4449                 /* If it's an integrated device, allow ATS */
4450                 if (!bridge)
4451                         return 1;
4452                 /* Connected via non-PCIe: no ATS */
4453                 if (!pci_is_pcie(bridge) ||
4454                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4455                         return 0;
4456                 /* If we found the root port, look it up in the ATSR */
4457                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4458                         break;
4459         }
4460
4461         rcu_read_lock();
4462         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4463                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4464                 if (atsr->segment != pci_domain_nr(dev->bus))
4465                         continue;
4466
4467                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4468                         if (tmp == &bridge->dev)
4469                                 goto out;
4470
4471                 if (atsru->include_all)
4472                         goto out;
4473         }
4474         ret = 0;
4475 out:
4476         rcu_read_unlock();
4477
4478         return ret;
4479 }
4480
4481 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4482 {
4483         int ret = 0;
4484         struct dmar_rmrr_unit *rmrru;
4485         struct dmar_atsr_unit *atsru;
4486         struct acpi_dmar_atsr *atsr;
4487         struct acpi_dmar_reserved_memory *rmrr;
4488
4489         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4490                 return 0;
4491
4492         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4493                 rmrr = container_of(rmrru->hdr,
4494                                     struct acpi_dmar_reserved_memory, header);
4495                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4496                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4497                                 ((void *)rmrr) + rmrr->header.length,
4498                                 rmrr->segment, rmrru->devices,
4499                                 rmrru->devices_cnt);
4500                         if(ret < 0)
4501                                 return ret;
4502                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4503                         dmar_remove_dev_scope(info, rmrr->segment,
4504                                 rmrru->devices, rmrru->devices_cnt);
4505                 }
4506         }
4507
4508         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4509                 if (atsru->include_all)
4510                         continue;
4511
4512                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4513                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4514                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4515                                         (void *)atsr + atsr->header.length,
4516                                         atsr->segment, atsru->devices,
4517                                         atsru->devices_cnt);
4518                         if (ret > 0)
4519                                 break;
4520                         else if(ret < 0)
4521                                 return ret;
4522                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4523                         if (dmar_remove_dev_scope(info, atsr->segment,
4524                                         atsru->devices, atsru->devices_cnt))
4525                                 break;
4526                 }
4527         }
4528
4529         return 0;
4530 }
4531
4532 /*
4533  * Here we only respond to action of unbound device from driver.
4534  *
4535  * Added device is not attached to its DMAR domain here yet. That will happen
4536  * when mapping the device to iova.
4537  */
4538 static int device_notifier(struct notifier_block *nb,
4539                                   unsigned long action, void *data)
4540 {
4541         struct device *dev = data;
4542         struct dmar_domain *domain;
4543
4544         if (iommu_dummy(dev))
4545                 return 0;
4546
4547         if (action != BUS_NOTIFY_REMOVED_DEVICE)
4548                 return 0;
4549
4550         domain = find_domain(dev);
4551         if (!domain)
4552                 return 0;
4553
4554         dmar_remove_one_dev_info(domain, dev);
4555         if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4556                 domain_exit(domain);
4557
4558         return 0;
4559 }
4560
4561 static struct notifier_block device_nb = {
4562         .notifier_call = device_notifier,
4563 };
4564
4565 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4566                                        unsigned long val, void *v)
4567 {
4568         struct memory_notify *mhp = v;
4569         unsigned long long start, end;
4570         unsigned long start_vpfn, last_vpfn;
4571
4572         switch (val) {
4573         case MEM_GOING_ONLINE:
4574                 start = mhp->start_pfn << PAGE_SHIFT;
4575                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4576                 if (iommu_domain_identity_map(si_domain, start, end)) {
4577                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4578                                 start, end);
4579                         return NOTIFY_BAD;
4580                 }
4581                 break;
4582
4583         case MEM_OFFLINE:
4584         case MEM_CANCEL_ONLINE:
4585                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4586                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4587                 while (start_vpfn <= last_vpfn) {
4588                         struct iova *iova;
4589                         struct dmar_drhd_unit *drhd;
4590                         struct intel_iommu *iommu;
4591                         struct page *freelist;
4592
4593                         iova = find_iova(&si_domain->iovad, start_vpfn);
4594                         if (iova == NULL) {
4595                                 pr_debug("Failed get IOVA for PFN %lx\n",
4596                                          start_vpfn);
4597                                 break;
4598                         }
4599
4600                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4601                                                      start_vpfn, last_vpfn);
4602                         if (iova == NULL) {
4603                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4604                                         start_vpfn, last_vpfn);
4605                                 return NOTIFY_BAD;
4606                         }
4607
4608                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4609                                                iova->pfn_hi);
4610
4611                         rcu_read_lock();
4612                         for_each_active_iommu(iommu, drhd)
4613                                 iommu_flush_iotlb_psi(iommu, si_domain,
4614                                         iova->pfn_lo, iova_size(iova),
4615                                         !freelist, 0);
4616                         rcu_read_unlock();
4617                         dma_free_pagelist(freelist);
4618
4619                         start_vpfn = iova->pfn_hi + 1;
4620                         free_iova_mem(iova);
4621                 }
4622                 break;
4623         }
4624
4625         return NOTIFY_OK;
4626 }
4627
4628 static struct notifier_block intel_iommu_memory_nb = {
4629         .notifier_call = intel_iommu_memory_notifier,
4630         .priority = 0
4631 };
4632
4633 static void free_all_cpu_cached_iovas(unsigned int cpu)
4634 {
4635         int i;
4636
4637         for (i = 0; i < g_num_of_iommus; i++) {
4638                 struct intel_iommu *iommu = g_iommus[i];
4639                 struct dmar_domain *domain;
4640                 int did;
4641
4642                 if (!iommu)
4643                         continue;
4644
4645                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4646                         domain = get_iommu_domain(iommu, (u16)did);
4647
4648                         if (!domain)
4649                                 continue;
4650                         free_cpu_cached_iovas(cpu, &domain->iovad);
4651                 }
4652         }
4653 }
4654
/*
 * CPU hotplug callback: drop the IOVAs cached for @cpu.
 * Always returns 0.
 */
static int intel_iommu_cpu_dead(unsigned int cpu)
{
	free_all_cpu_cached_iovas(cpu);

	return 0;
}
4660
4661 static void intel_disable_iommus(void)
4662 {
4663         struct intel_iommu *iommu = NULL;
4664         struct dmar_drhd_unit *drhd;
4665
4666         for_each_iommu(iommu, drhd)
4667                 iommu_disable_translation(iommu);
4668 }
4669
4670 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4671 {
4672         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4673
4674         return container_of(iommu_dev, struct intel_iommu, iommu);
4675 }
4676
4677 static ssize_t intel_iommu_show_version(struct device *dev,
4678                                         struct device_attribute *attr,
4679                                         char *buf)
4680 {
4681         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4682         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4683         return sprintf(buf, "%d:%d\n",
4684                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4685 }
4686 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4687
4688 static ssize_t intel_iommu_show_address(struct device *dev,
4689                                         struct device_attribute *attr,
4690                                         char *buf)
4691 {
4692         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4693         return sprintf(buf, "%llx\n", iommu->reg_phys);
4694 }
4695 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4696
4697 static ssize_t intel_iommu_show_cap(struct device *dev,
4698                                     struct device_attribute *attr,
4699                                     char *buf)
4700 {
4701         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4702         return sprintf(buf, "%llx\n", iommu->cap);
4703 }
4704 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4705
4706 static ssize_t intel_iommu_show_ecap(struct device *dev,
4707                                     struct device_attribute *attr,
4708                                     char *buf)
4709 {
4710         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4711         return sprintf(buf, "%llx\n", iommu->ecap);
4712 }
4713 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4714
4715 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4716                                       struct device_attribute *attr,
4717                                       char *buf)
4718 {
4719         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4720         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4721 }
4722 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4723
4724 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4725                                            struct device_attribute *attr,
4726                                            char *buf)
4727 {
4728         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4729         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4730                                                   cap_ndoms(iommu->cap)));
4731 }
4732 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4733
4734 static struct attribute *intel_iommu_attrs[] = {
4735         &dev_attr_version.attr,
4736         &dev_attr_address.attr,
4737         &dev_attr_cap.attr,
4738         &dev_attr_ecap.attr,
4739         &dev_attr_domains_supported.attr,
4740         &dev_attr_domains_used.attr,
4741         NULL,
4742 };
4743
4744 static struct attribute_group intel_iommu_group = {
4745         .name = "intel-iommu",
4746         .attrs = intel_iommu_attrs,
4747 };
4748
4749 const struct attribute_group *intel_iommu_groups[] = {
4750         &intel_iommu_group,
4751         NULL,
4752 };
4753
4754 int __init intel_iommu_init(void)
4755 {
4756         int ret = -ENODEV;
4757         struct dmar_drhd_unit *drhd;
4758         struct intel_iommu *iommu;
4759
4760         /* VT-d is required for a TXT/tboot launch, so enforce that */
4761         force_on = tboot_force_iommu();
4762
4763         if (iommu_init_mempool()) {
4764                 if (force_on)
4765                         panic("tboot: Failed to initialize iommu memory\n");
4766                 return -ENOMEM;
4767         }
4768
4769         down_write(&dmar_global_lock);
4770         if (dmar_table_init()) {
4771                 if (force_on)
4772                         panic("tboot: Failed to initialize DMAR table\n");
4773                 goto out_free_dmar;
4774         }
4775
4776         if (dmar_dev_scope_init() < 0) {
4777                 if (force_on)
4778                         panic("tboot: Failed to initialize DMAR device scope\n");
4779                 goto out_free_dmar;
4780         }
4781
4782         up_write(&dmar_global_lock);
4783
4784         /*
4785          * The bus notifier takes the dmar_global_lock, so lockdep will
4786          * complain later when we register it under the lock.
4787          */
4788         dmar_register_bus_notifier();
4789
4790         down_write(&dmar_global_lock);
4791
4792         if (no_iommu || dmar_disabled) {
4793                 /*
4794                  * We exit the function here to ensure IOMMU's remapping and
4795                  * mempool aren't setup, which means that the IOMMU's PMRs
4796                  * won't be disabled via the call to init_dmars(). So disable
4797                  * it explicitly here. The PMRs were setup by tboot prior to
4798                  * calling SENTER, but the kernel is expected to reset/tear
4799                  * down the PMRs.
4800                  */
4801                 if (intel_iommu_tboot_noforce) {
4802                         for_each_iommu(iommu, drhd)
4803                                 iommu_disable_protect_mem_regions(iommu);
4804                 }
4805
4806                 /*
4807                  * Make sure the IOMMUs are switched off, even when we
4808                  * boot into a kexec kernel and the previous kernel left
4809                  * them enabled
4810                  */
4811                 intel_disable_iommus();
4812                 goto out_free_dmar;
4813         }
4814
4815         if (list_empty(&dmar_rmrr_units))
4816                 pr_info("No RMRR found\n");
4817
4818         if (list_empty(&dmar_atsr_units))
4819                 pr_info("No ATSR found\n");
4820
4821         if (dmar_init_reserved_ranges()) {
4822                 if (force_on)
4823                         panic("tboot: Failed to reserve iommu ranges\n");
4824                 goto out_free_reserved_range;
4825         }
4826
4827         if (dmar_map_gfx)
4828                 intel_iommu_gfx_mapped = 1;
4829
4830         init_no_remapping_devices();
4831
4832         ret = init_dmars();
4833         if (ret) {
4834                 if (force_on)
4835                         panic("tboot: Failed to initialize DMARs\n");
4836                 pr_err("Initialization failed\n");
4837                 goto out_free_reserved_range;
4838         }
4839         up_write(&dmar_global_lock);
4840         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4841
4842 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4843         swiotlb = 0;
4844 #endif
4845         dma_ops = &intel_dma_ops;
4846
4847         init_iommu_pm_ops();
4848
4849         for_each_active_iommu(iommu, drhd) {
4850                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4851                                        intel_iommu_groups,
4852                                        "%s", iommu->name);
4853                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4854                 iommu_device_register(&iommu->iommu);
4855         }
4856
4857         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4858         bus_register_notifier(&pci_bus_type, &device_nb);
4859         if (si_domain && !hw_pass_through)
4860                 register_memory_notifier(&intel_iommu_memory_nb);
4861         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4862                           intel_iommu_cpu_dead);
4863         intel_iommu_enabled = 1;
4864
4865         return 0;
4866
4867 out_free_reserved_range:
4868         put_iova_domain(&reserved_iova_list);
4869 out_free_dmar:
4870         intel_iommu_free_dmars();
4871         up_write(&dmar_global_lock);
4872         iommu_exit_mempool();
4873         return ret;
4874 }
4875
4876 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4877 {
4878         struct intel_iommu *iommu = opaque;
4879
4880         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4881         return 0;
4882 }
4883
4884 /*
4885  * NB - intel-iommu lacks any sort of reference counting for the users of
4886  * dependent devices.  If multiple endpoints have intersecting dependent
4887  * devices, unbinding the driver from any one of them will possibly leave
4888  * the others unable to operate.
4889  */
4890 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4891 {
4892         if (!iommu || !dev || !dev_is_pci(dev))
4893                 return;
4894
4895         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4896 }
4897
4898 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4899 {
4900         struct intel_iommu *iommu;
4901         unsigned long flags;
4902
4903         assert_spin_locked(&device_domain_lock);
4904
4905         if (WARN_ON(!info))
4906                 return;
4907
4908         iommu = info->iommu;
4909
4910         if (info->dev) {
4911                 iommu_disable_dev_iotlb(info);
4912                 domain_context_clear(iommu, info->dev);
4913                 intel_pasid_free_table(info->dev);
4914         }
4915
4916         unlink_domain_info(info);
4917
4918         spin_lock_irqsave(&iommu->lock, flags);
4919         domain_detach_iommu(info->domain, iommu);
4920         spin_unlock_irqrestore(&iommu->lock, flags);
4921
4922         free_devinfo_mem(info);
4923 }
4924
4925 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4926                                      struct device *dev)
4927 {
4928         struct device_domain_info *info;
4929         unsigned long flags;
4930
4931         spin_lock_irqsave(&device_domain_lock, flags);
4932         info = dev->archdata.iommu;
4933         __dmar_remove_one_dev_info(info);
4934         spin_unlock_irqrestore(&device_domain_lock, flags);
4935 }
4936
4937 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4938 {
4939         int adjust_width;
4940
4941         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
4942         domain_reserve_special_ranges(domain);
4943
4944         /* calculate AGAW */
4945         domain->gaw = guest_width;
4946         adjust_width = guestwidth_to_adjustwidth(guest_width);
4947         domain->agaw = width_to_agaw(adjust_width);
4948
4949         domain->iommu_coherency = 0;
4950         domain->iommu_snooping = 0;
4951         domain->iommu_superpage = 0;
4952         domain->max_addr = 0;
4953
4954         /* always allocate the top pgd */
4955         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4956         if (!domain->pgd)
4957                 return -ENOMEM;
4958         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4959         return 0;
4960 }
4961
4962 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4963 {
4964         struct dmar_domain *dmar_domain;
4965         struct iommu_domain *domain;
4966
4967         if (type != IOMMU_DOMAIN_UNMANAGED)
4968                 return NULL;
4969
4970         dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4971         if (!dmar_domain) {
4972                 pr_err("Can't allocate dmar_domain\n");
4973                 return NULL;
4974         }
4975         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4976                 pr_err("Domain initialization failed\n");
4977                 domain_exit(dmar_domain);
4978                 return NULL;
4979         }
4980         domain_update_iommu_cap(dmar_domain);
4981
4982         domain = &dmar_domain->domain;
4983         domain->geometry.aperture_start = 0;
4984         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4985         domain->geometry.force_aperture = true;
4986
4987         return domain;
4988 }
4989
/* Destroy the dmar_domain that embeds @domain. */
static void intel_iommu_domain_free(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);

	domain_exit(dmar_domain);
}
4994
4995 static int intel_iommu_attach_device(struct iommu_domain *domain,
4996                                      struct device *dev)
4997 {
4998         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4999         struct intel_iommu *iommu;
5000         int addr_width;
5001         u8 bus, devfn;
5002
5003         if (device_is_rmrr_locked(dev)) {
5004                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5005                 return -EPERM;
5006         }
5007
5008         /* normally dev is not mapped */
5009         if (unlikely(domain_context_mapped(dev))) {
5010                 struct dmar_domain *old_domain;
5011
5012                 old_domain = find_domain(dev);
5013                 if (old_domain) {
5014                         rcu_read_lock();
5015                         dmar_remove_one_dev_info(old_domain, dev);
5016                         rcu_read_unlock();
5017
5018                         if (!domain_type_is_vm_or_si(old_domain) &&
5019                              list_empty(&old_domain->devices))
5020                                 domain_exit(old_domain);
5021                 }
5022         }
5023
5024         iommu = device_to_iommu(dev, &bus, &devfn);
5025         if (!iommu)
5026                 return -ENODEV;
5027
5028         /* check if this iommu agaw is sufficient for max mapped address */
5029         addr_width = agaw_to_width(iommu->agaw);
5030         if (addr_width > cap_mgaw(iommu->cap))
5031                 addr_width = cap_mgaw(iommu->cap);
5032
5033         if (dmar_domain->max_addr > (1LL << addr_width)) {
5034                 pr_err("%s: iommu width (%d) is not "
5035                        "sufficient for the mapped address (%llx)\n",
5036                        __func__, addr_width, dmar_domain->max_addr);
5037                 return -EFAULT;
5038         }
5039         dmar_domain->gaw = addr_width;
5040
5041         /*
5042          * Knock out extra levels of page tables if necessary
5043          */
5044         while (iommu->agaw < dmar_domain->agaw) {
5045                 struct dma_pte *pte;
5046
5047                 pte = dmar_domain->pgd;
5048                 if (dma_pte_present(pte)) {
5049                         dmar_domain->pgd = (struct dma_pte *)
5050                                 phys_to_virt(dma_pte_addr(pte));
5051                         free_pgtable_page(pte);
5052                 }
5053                 dmar_domain->agaw--;
5054         }
5055
5056         return domain_add_dev_info(dmar_domain, dev);
5057 }
5058
/* iommu_ops ->detach_dev: remove @dev's info from the domain embedding @domain. */
static void intel_iommu_detach_device(struct iommu_domain *domain,
                                      struct device *dev)
{
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);

        dmar_remove_one_dev_info(dmar_domain, dev);
}
5064
5065 static int intel_iommu_map(struct iommu_domain *domain,
5066                            unsigned long iova, phys_addr_t hpa,
5067                            size_t size, int iommu_prot)
5068 {
5069         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5070         u64 max_addr;
5071         int prot = 0;
5072         int ret;
5073
5074         if (iommu_prot & IOMMU_READ)
5075                 prot |= DMA_PTE_READ;
5076         if (iommu_prot & IOMMU_WRITE)
5077                 prot |= DMA_PTE_WRITE;
5078         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5079                 prot |= DMA_PTE_SNP;
5080
5081         max_addr = iova + size;
5082         if (dmar_domain->max_addr < max_addr) {
5083                 u64 end;
5084
5085                 /* check if minimum agaw is sufficient for mapped address */
5086                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5087                 if (end < max_addr) {
5088                         pr_err("%s: iommu width (%d) is not "
5089                                "sufficient for the mapped address (%llx)\n",
5090                                __func__, dmar_domain->gaw, max_addr);
5091                         return -EFAULT;
5092                 }
5093                 dmar_domain->max_addr = max_addr;
5094         }
5095         /* Round up size to next multiple of PAGE_SIZE, if it and
5096            the low bits of hpa would take us onto the next page */
5097         size = aligned_nrpages(hpa, size);
5098         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5099                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5100         return ret;
5101 }
5102
5103 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5104                                 unsigned long iova, size_t size)
5105 {
5106         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5107         struct page *freelist = NULL;
5108         unsigned long start_pfn, last_pfn;
5109         unsigned int npages;
5110         int iommu_id, level = 0;
5111
5112         /* Cope with horrid API which requires us to unmap more than the
5113            size argument if it happens to be a large-page mapping. */
5114         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5115
5116         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5117                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5118
5119         start_pfn = iova >> VTD_PAGE_SHIFT;
5120         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5121
5122         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5123
5124         npages = last_pfn - start_pfn + 1;
5125
5126         for_each_domain_iommu(iommu_id, dmar_domain)
5127                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5128                                       start_pfn, npages, !freelist, 0);
5129
5130         dma_free_pagelist(freelist);
5131
5132         if (dmar_domain->max_addr == iova + size)
5133                 dmar_domain->max_addr = iova;
5134
5135         return size;
5136 }
5137
5138 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5139                                             dma_addr_t iova)
5140 {
5141         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5142         struct dma_pte *pte;
5143         int level = 0;
5144         u64 phys = 0;
5145
5146         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5147         if (pte && dma_pte_present(pte))
5148                 phys = dma_pte_addr(pte) +
5149                         (iova & (BIT_MASK(level_to_offset_bits(level) +
5150                                                 VTD_PAGE_SHIFT) - 1));
5151
5152         return phys;
5153 }
5154
5155 static bool intel_iommu_capable(enum iommu_cap cap)
5156 {
5157         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5158                 return domain_update_iommu_snooping(NULL) == 1;
5159         if (cap == IOMMU_CAP_INTR_REMAP)
5160                 return irq_remapping_enabled == 1;
5161
5162         return false;
5163 }
5164
5165 static int intel_iommu_add_device(struct device *dev)
5166 {
5167         struct intel_iommu *iommu;
5168         struct iommu_group *group;
5169         u8 bus, devfn;
5170
5171         iommu = device_to_iommu(dev, &bus, &devfn);
5172         if (!iommu)
5173                 return -ENODEV;
5174
5175         iommu_device_link(&iommu->iommu, dev);
5176
5177         group = iommu_group_get_for_dev(dev);
5178
5179         if (IS_ERR(group))
5180                 return PTR_ERR(group);
5181
5182         iommu_group_put(group);
5183         return 0;
5184 }
5185
5186 static void intel_iommu_remove_device(struct device *dev)
5187 {
5188         struct intel_iommu *iommu;
5189         u8 bus, devfn;
5190
5191         iommu = device_to_iommu(dev, &bus, &devfn);
5192         if (!iommu)
5193                 return;
5194
5195         iommu_group_remove_device(dev);
5196
5197         iommu_device_unlink(&iommu->iommu, dev);
5198 }
5199
5200 static void intel_iommu_get_resv_regions(struct device *device,
5201                                          struct list_head *head)
5202 {
5203         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5204         struct iommu_resv_region *reg;
5205         struct dmar_rmrr_unit *rmrr;
5206         struct device *i_dev;
5207         int i;
5208
5209         down_read(&dmar_global_lock);
5210         for_each_rmrr_units(rmrr) {
5211                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5212                                           i, i_dev) {
5213                         struct iommu_resv_region *resv;
5214                         size_t length;
5215
5216                         if (i_dev != device)
5217                                 continue;
5218
5219                         length = rmrr->end_address - rmrr->base_address + 1;
5220                         resv = iommu_alloc_resv_region(rmrr->base_address,
5221                                                        length, prot,
5222                                                        IOMMU_RESV_DIRECT);
5223                         if (!resv)
5224                                 break;
5225
5226                         list_add_tail(&resv->list, head);
5227                 }
5228         }
5229         up_read(&dmar_global_lock);
5230
5231         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5232                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5233                                       0, IOMMU_RESV_MSI);
5234         if (!reg)
5235                 return;
5236         list_add_tail(&reg->list, head);
5237 }
5238
5239 static void intel_iommu_put_resv_regions(struct device *dev,
5240                                          struct list_head *head)
5241 {
5242         struct iommu_resv_region *entry, *next;
5243
5244         list_for_each_entry_safe(entry, next, head, list)
5245                 kfree(entry);
5246 }
5247
5248 #ifdef CONFIG_INTEL_IOMMU_SVM
5249 #define MAX_NR_PASID_BITS (20)
5250 static inline unsigned long intel_iommu_get_pts(struct device *dev)
5251 {
5252         int pts, max_pasid;
5253
5254         max_pasid = intel_pasid_get_dev_max_id(dev);
5255         pts = find_first_bit((unsigned long *)&max_pasid, MAX_NR_PASID_BITS);
5256         if (pts < 5)
5257                 return 0;
5258
5259         return pts - 5;
5260 }
5261
/*
 * intel_iommu_enable_pasid - set up @sdev's device for requests-with-PASID.
 * @iommu: IOMMU whose context table holds the device's context entry
 * @sdev:  SVM device state; did and sid are filled in here, and dev_iotlb
 *         and qdep as well when ATS is enabled on the device
 *
 * If the device's context entry does not yet have CONTEXT_PASIDE set, the
 * entry is extended with the PASID (and, if present, PASID-state) table
 * pointers, updated, and the context cache is flushed for the device.
 *
 * Returns 0 on success, or -EINVAL when no domain can be found for the
 * device, when it has no device_domain_info or lacks PASID support, or
 * when its context entry cannot be located.
 */
int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
{
        struct device_domain_info *info;
        struct context_entry *context;
        struct dmar_domain *domain;
        unsigned long flags;
        u64 ctx_lo;
        int ret;

        domain = get_valid_domain_for_dev(sdev->dev);
        if (!domain)
                return -EINVAL;

        /* Lock order: device_domain_lock (irqs off) outside iommu->lock. */
        spin_lock_irqsave(&device_domain_lock, flags);
        spin_lock(&iommu->lock);

        /* Every "goto out" below reports -EINVAL unless ret is cleared. */
        ret = -EINVAL;
        info = sdev->dev->archdata.iommu;
        if (!info || !info->pasid_supported)
                goto out;

        context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
        if (WARN_ON(!context))
                goto out;

        ctx_lo = context[0].lo;

        sdev->did = domain->iommu_did[iommu->seq_id];
        sdev->sid = PCI_DEVID(info->bus, info->devfn);

        if (!(ctx_lo & CONTEXT_PASIDE)) {
                /* Fill in the table pointers before PASIDE becomes visible. */
                if (iommu->pasid_state_table)
                        context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
                context[1].lo = (u64)virt_to_phys(info->pasid_table->table) |
                        intel_iommu_get_pts(sdev->dev);

                wmb();
                /* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
                 * extended to permit requests-with-PASID if the PASIDE bit
                 * is set. which makes sense. For CONTEXT_TT_PASS_THROUGH,
                 * however, the PASIDE bit is ignored and requests-with-PASID
                 * are unconditionally blocked. Which makes less sense.
                 * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
                 * "guest mode" translation types depending on whether ATS
                 * is available or not. Annoyingly, we can't use the new
                 * modes *unless* PASIDE is set. */
                if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
                        ctx_lo &= ~CONTEXT_TT_MASK;
                        if (info->ats_supported)
                                ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
                        else
                                ctx_lo |= CONTEXT_TT_PT_PASID << 2;
                }
                ctx_lo |= CONTEXT_PASIDE;
                if (iommu->pasid_state_table)
                        ctx_lo |= CONTEXT_DINVE;
                if (info->pri_supported)
                        ctx_lo |= CONTEXT_PRS;
                /* Write the updated low word, then flush the context cache. */
                context[0].lo = ctx_lo;
                wmb();
                iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
                                           DMA_CCMD_MASK_NOBIT,
                                           DMA_CCMD_DEVICE_INVL);
        }

        /* Enable PASID support in the device, if it wasn't already */
        if (!info->pasid_enabled)
                iommu_enable_dev_iotlb(info);

        if (info->ats_enabled) {
                sdev->dev_iotlb = 1;
                sdev->qdep = info->ats_qdep;
                /* A queue depth at or above the maximum is recorded as 0. */
                if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
                        sdev->qdep = 0;
        }
        ret = 0;

 out:
        spin_unlock(&iommu->lock);
        spin_unlock_irqrestore(&device_domain_lock, flags);

        return ret;
}
5345
5346 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5347 {
5348         struct intel_iommu *iommu;
5349         u8 bus, devfn;
5350
5351         if (iommu_dummy(dev)) {
5352                 dev_warn(dev,
5353                          "No IOMMU translation for device; cannot enable SVM\n");
5354                 return NULL;
5355         }
5356
5357         iommu = device_to_iommu(dev, &bus, &devfn);
5358         if ((!iommu)) {
5359                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5360                 return NULL;
5361         }
5362
5363         return iommu;
5364 }
5365 #endif /* CONFIG_INTEL_IOMMU_SVM */
5366
/*
 * IOMMU operations table for this driver.  Each callback is the
 * intel_iommu_* function of the matching name defined in this file,
 * except device_group, which uses the generic pci_device_group helper.
 */
const struct iommu_ops intel_iommu_ops = {
        .capable                = intel_iommu_capable,
        .domain_alloc           = intel_iommu_domain_alloc,
        .domain_free            = intel_iommu_domain_free,
        .attach_dev             = intel_iommu_attach_device,
        .detach_dev             = intel_iommu_detach_device,
        .map                    = intel_iommu_map,
        .unmap                  = intel_iommu_unmap,
        .iova_to_phys           = intel_iommu_iova_to_phys,
        .add_device             = intel_iommu_add_device,
        .remove_device          = intel_iommu_remove_device,
        .get_resv_regions       = intel_iommu_get_resv_regions,
        .put_resv_regions       = intel_iommu_put_resv_regions,
        .device_group           = pci_device_group,
        .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
};
5383
5384 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5385 {
5386         /* G4x/GM45 integrated gfx dmar support is totally busted. */
5387         pr_info("Disabling IOMMU for graphics on this chipset\n");
5388         dmar_map_gfx = 0;
5389 }
5390
5391 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5392 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5393 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5394 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5395 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5396 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5397 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5398
5399 static void quirk_iommu_rwbf(struct pci_dev *dev)
5400 {
5401         /*
5402          * Mobile 4 Series Chipset neglects to set RWBF capability,
5403          * but needs it. Same seems to hold for the desktop versions.
5404          */
5405         pr_info("Forcing write-buffer flush capability\n");
5406         rwbf_quirk = 1;
5407 }
5408
5409 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5410 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5411 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5412 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5413 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5414 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5415 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5416
/*
 * GGC: PCI config-space word read by quirk_calpella_no_shadow_gtt().
 * The GGC_MEMORY_* values are encodings of the field in bits 11:8
 * (GGC_MEMORY_SIZE_MASK); GGC_MEMORY_VT_ENABLED is the top bit of that
 * field and is set in every *_VT encoding below.
 */
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK    (0xf << 8)
#define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
#define GGC_MEMORY_SIZE_1M      (0x1 << 8)
#define GGC_MEMORY_SIZE_2M      (0x3 << 8)
#define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5426
5427 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5428 {
5429         unsigned short ggc;
5430
5431         if (pci_read_config_word(dev, GGC, &ggc))
5432                 return;
5433
5434         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5435                 pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5436                 dmar_map_gfx = 0;
5437         } else if (dmar_map_gfx) {
5438                 /* we have to ensure the gfx device is idle before we flush */
5439                 pr_info("Disabling batched IOTLB flush on Ironlake\n");
5440                 intel_iommu_strict = 1;
5441        }
5442 }
5443 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5444 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5445 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5446 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5447
5448 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5449    ISOCH DMAR unit for the Azalia sound device, but not give it any
5450    TLB entries, which causes it to deadlock. Check for that.  We do
5451    this in a function called from init_dmars(), instead of in a PCI
5452    quirk, because we don't want to print the obnoxious "BIOS broken"
5453    message if VT-d is actually disabled.
5454 */
5455 static void __init check_tylersburg_isoch(void)
5456 {
5457         struct pci_dev *pdev;
5458         uint32_t vtisochctrl;
5459
5460         /* If there's no Azalia in the system anyway, forget it. */
5461         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5462         if (!pdev)
5463                 return;
5464         pci_dev_put(pdev);
5465
5466         /* System Management Registers. Might be hidden, in which case
5467            we can't do the sanity check. But that's OK, because the
5468            known-broken BIOSes _don't_ actually hide it, so far. */
5469         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5470         if (!pdev)
5471                 return;
5472
5473         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5474                 pci_dev_put(pdev);
5475                 return;
5476         }
5477
5478         pci_dev_put(pdev);
5479
5480         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5481         if (vtisochctrl & 1)
5482                 return;
5483
5484         /* Drop all bits other than the number of TLB entries */
5485         vtisochctrl &= 0x1c;
5486
5487         /* If we have the recommended number of TLB entries (16), fine. */
5488         if (vtisochctrl == 0x10)
5489                 return;
5490
5491         /* Zero TLB entries? You get to ride the short bus to school. */
5492         if (!vtisochctrl) {
5493                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5494                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5495                      dmi_get_system_info(DMI_BIOS_VENDOR),
5496                      dmi_get_system_info(DMI_BIOS_VERSION),
5497                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5498                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5499                 return;
5500         }
5501
5502         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5503                vtisochctrl);
5504 }