// SPDX-License-Identifier: GPL-2.0+
// Copyright 2017 IBM Corp.
#include <linux/sched/mm.h>
#include <linux/mutex.h>
#include <linux/mm_types.h>
#include <linux/mmu_context.h>
#include <linux/mmu_notifier.h>
#include <asm/copro.h>
#include <asm/pnv-ocxl.h>
#include <asm/xive.h>
#include <misc/ocxl.h>
#include "ocxl_internal.h"
#include "trace.h"

#define SPA_PASID_BITS		15
#define SPA_PASID_MAX		((1 << SPA_PASID_BITS) - 1)
#define SPA_PE_MASK		SPA_PASID_MAX
#define SPA_SPA_SIZE_LOG	22 /* Each SPA is 4MB */

#define SPA_CFG_SF		(1ull << (63-0))
#define SPA_CFG_TA		(1ull << (63-1))
#define SPA_CFG_HV		(1ull << (63-3))
#define SPA_CFG_UV		(1ull << (63-4))
#define SPA_CFG_XLAT_hpt	(0ull << (63-6)) /* Hashed page table (HPT) mode */
#define SPA_CFG_XLAT_roh	(2ull << (63-6)) /* Radix on HPT mode */
#define SPA_CFG_XLAT_ror	(3ull << (63-6)) /* Radix on Radix mode */
#define SPA_CFG_PR		(1ull << (63-49))
#define SPA_CFG_TC		(1ull << (63-54))
#define SPA_CFG_DR		(1ull << (63-59))

#define SPA_XSL_TF		(1ull << (63-3))  /* Translation fault */
#define SPA_XSL_S		(1ull << (63-38)) /* Store operation */

#define SPA_PE_VALID		0x80000000
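
/*
 * The Shared Process Area (SPA) is a 4MB block of memory, shared with
 * the device and made of 128-byte process elements indexed by PE
 * handle, i.e. the low SPA_PASID_BITS bits of the PASID. Each valid
 * entry describes one attached context: its PID/TID, LPID, AMR and
 * translation configuration.
 */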

struct pe_data {
	struct mm_struct *mm;
	/* callback to trigger when a translation fault occurs */
	void (*xsl_err_cb)(void *data, u64 addr, u64 dsisr);
	/* opaque pointer to be passed to the above callback */
	void *xsl_err_data;
	struct rcu_head rcu;
	struct ocxl_link *link;
	struct mmu_notifier mmu_notifier;
};

struct spa {
	struct ocxl_process_element *spa_mem;
	int spa_order;
	struct mutex spa_lock;
	struct radix_tree_root pe_tree; /* Maps PE handles to pe_data */
	char *irq_name;
	int virq;
	void __iomem *reg_dsisr;
	void __iomem *reg_dar;
	void __iomem *reg_tfc;
	void __iomem *reg_pe_handle;
	/*
	 * The following fields are used by the memory fault
	 * interrupt handler. We can only have one interrupt at a
	 * time. The NPU won't raise another interrupt until the
	 * previous one has been ack'd by writing to the TFC register
	 */
	struct xsl_fault {
		struct work_struct fault_work;
		u64 pe;
		u64 dsisr;
		u64 dar;
		struct pe_data pe_data;
	} xsl_fault;
};

/*
 * An opencapi link can be used by several PCI functions. We have
 * one link per device slot.
 *
 * A linked list of opencapi links should suffice, as there's a
 * limited number of opencapi slots on a system and lookup is only
 * done when the device is probed
 */
struct ocxl_link {
	struct list_head list;
	struct kref ref;
	int domain;
	int bus;
	int dev;
	void __iomem *arva; /* ATSD register virtual address */
	spinlock_t atsd_lock; /* to serialize shootdowns */
	atomic_t irq_available;
	struct spa *spa;
	void *platform_data;
};
static struct list_head links_list = LIST_HEAD_INIT(links_list);
static DEFINE_MUTEX(links_list_lock);

enum xsl_response {
	CONTINUE,
	ADDRESS_ERROR,
	RESTART,
};

static void read_irq(struct spa *spa, u64 *dsisr, u64 *dar, u64 *pe)
{
	u64 reg;

	*dsisr = in_be64(spa->reg_dsisr);
	*dar = in_be64(spa->reg_dar);
	reg = in_be64(spa->reg_pe_handle);
	*pe = reg & SPA_PE_MASK;
}
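
/*
 * Acknowledge a translation fault by writing to the TFC register,
 * either asking for the translation to be restarted (RESTART) or
 * reporting an address error back to the AFU (ADDRESS_ERROR). The
 * NPU won't raise a new translation interrupt until this write.
 */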
static void ack_irq(struct spa *spa, enum xsl_response r)
{
	u64 reg = 0;

	/* continue is not supported */
	if (r == RESTART)
		reg = PPC_BIT(31);
	else if (r == ADDRESS_ERROR)
		reg = PPC_BIT(30);
	else
		WARN(1, "Invalid irq response %d\n", r);

	if (reg) {
		trace_ocxl_fault_ack(spa->spa_mem, spa->xsl_fault.pe,
				spa->xsl_fault.dsisr, spa->xsl_fault.dar, reg);
		out_be64(spa->reg_tfc, reg);
	}
}
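
/*
 * Bottom half of the translation fault handler: resolve the fault in
 * the context's mm with copro_handle_mm_fault(), notify the AFU driver
 * through its xsl_err callback if that fails, then ack the interrupt.
 * Runs from a workqueue because fault resolution can sleep.
 */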
static void xsl_fault_handler_bh(struct work_struct *fault_work)
{
	vm_fault_t flt = 0;
	unsigned long access, flags, inv_flags = 0;
	enum xsl_response r;
	struct xsl_fault *fault = container_of(fault_work, struct xsl_fault,
					fault_work);
	struct spa *spa = container_of(fault, struct spa, xsl_fault);
	int rc;

	/*
	 * We must release a reference on mm_users whenever exiting this
	 * function (taken in the memory fault interrupt handler)
	 */
	rc = copro_handle_mm_fault(fault->pe_data.mm, fault->dar, fault->dsisr,
				&flt);
	if (rc) {
		pr_debug("copro_handle_mm_fault failed: %d\n", rc);
		if (fault->pe_data.xsl_err_cb) {
			fault->pe_data.xsl_err_cb(
				fault->pe_data.xsl_err_data,
				fault->dar, fault->dsisr);
		}
		r = ADDRESS_ERROR;
		goto ack;
	}

	if (!radix_enabled()) {
		/*
		 * update_mmu_cache() will not have loaded the hash
		 * since current->trap is not a 0x400 or 0x300, so
		 * just call hash_page_mm() here.
		 */
		access = _PAGE_PRESENT | _PAGE_READ;
		if (fault->dsisr & SPA_XSL_S)
			access |= _PAGE_WRITE;

		if (get_region_id(fault->dar) != USER_REGION_ID)
			access |= _PAGE_PRIVILEGED;

		local_irq_save(flags);
		hash_page_mm(fault->pe_data.mm, fault->dar, access, 0x300,
			inv_flags);
		local_irq_restore(flags);
	}
	r = RESTART;
ack:
	mmput(fault->pe_data.mm);
	ack_irq(spa, r);
}
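
/*
 * Hard interrupt handler for the XSL translation fault: read the fault
 * registers, look up the faulting context in the PE radix tree and, if
 * a valid mm is found, hand the fault off to the bottom half. Otherwise
 * the fault is ack'd with an address error.
 */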
static irqreturn_t xsl_fault_handler(int irq, void *data)
{
	struct ocxl_link *link = (struct ocxl_link *) data;
	struct spa *spa = link->spa;
	u64 dsisr, dar, pe_handle;
	struct pe_data *pe_data;
	struct ocxl_process_element *pe;
	int pid;
	bool schedule = false;

	read_irq(spa, &dsisr, &dar, &pe_handle);
	trace_ocxl_fault(spa->spa_mem, pe_handle, dsisr, dar, -1);

	WARN_ON(pe_handle > SPA_PE_MASK);
	pe = spa->spa_mem + pe_handle;
	pid = be32_to_cpu(pe->pid);
	/* We could be reading all null values here if the PE is being
	 * removed while an interrupt kicks in. It's not supposed to
	 * happen if the driver notified the AFU to terminate the
	 * PASID, and the AFU waited for pending operations before
	 * acknowledging. But even if it happens, we won't find a
	 * memory context below and fail silently, so it should be ok.
	 */
	if (!(dsisr & SPA_XSL_TF)) {
		WARN(1, "Invalid xsl interrupt fault register %#llx\n", dsisr);
		ack_irq(spa, ADDRESS_ERROR);
		return IRQ_HANDLED;
	}

	rcu_read_lock();
	pe_data = radix_tree_lookup(&spa->pe_tree, pe_handle);
	if (!pe_data) {
		/*
		 * Could only happen if the driver didn't notify the
		 * AFU about PASID termination before removing the PE,
		 * or the AFU didn't wait for all memory access to
		 * have completed.
		 *
		 * Either way, we fail early, but we shouldn't log an
		 * error message, as it is a valid (if unexpected)
		 * scenario
		 */
		rcu_read_unlock();
		pr_debug("Unknown mm context for xsl interrupt\n");
		ack_irq(spa, ADDRESS_ERROR);
		return IRQ_HANDLED;
	}

	if (!pe_data->mm) {
		/*
		 * translation fault from a kernel context - an OpenCAPI
		 * device tried to access a bad kernel address
		 */
		rcu_read_unlock();
		pr_warn("Unresolved OpenCAPI xsl fault in kernel context\n");
		ack_irq(spa, ADDRESS_ERROR);
		return IRQ_HANDLED;
	}
	WARN_ON(pe_data->mm->context.id != pid);

	if (mmget_not_zero(pe_data->mm)) {
		spa->xsl_fault.pe = pe_handle;
		spa->xsl_fault.dar = dar;
		spa->xsl_fault.dsisr = dsisr;
		spa->xsl_fault.pe_data = *pe_data;
		schedule = true;
		/* mm_users count released by bottom half */
	}
	rcu_read_unlock();
	if (schedule)
		schedule_work(&spa->xsl_fault.fault_work);
	else
		ack_irq(spa, ADDRESS_ERROR);
	return IRQ_HANDLED;
}

static void unmap_irq_registers(struct spa *spa)
{
	pnv_ocxl_unmap_xsl_regs(spa->reg_dsisr, spa->reg_dar, spa->reg_tfc,
				spa->reg_pe_handle);
}

static int map_irq_registers(struct pci_dev *dev, struct spa *spa)
{
	return pnv_ocxl_map_xsl_regs(dev, &spa->reg_dsisr, &spa->reg_dar,
				&spa->reg_tfc, &spa->reg_pe_handle);
}
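
/*
 * Map the XSL fault registers and wire up the translation fault
 * interrupt for this link: get the hardware interrupt from the
 * platform, map it into the Linux IRQ space and register
 * xsl_fault_handler() on it.
 */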
static int setup_xsl_irq(struct pci_dev *dev, struct ocxl_link *link)
{
	struct spa *spa = link->spa;
	int rc;
	int hwirq;

	rc = pnv_ocxl_get_xsl_irq(dev, &hwirq);
	if (rc)
		return rc;

	rc = map_irq_registers(dev, spa);
	if (rc)
		return rc;

	spa->irq_name = kasprintf(GFP_KERNEL, "ocxl-xsl-%x-%x-%x",
				link->domain, link->bus, link->dev);
	if (!spa->irq_name) {
		dev_err(&dev->dev, "Can't allocate name for xsl interrupt\n");
		rc = -ENOMEM;
		goto err_xsl;
	}
	/*
	 * At some point, we'll need to look into allowing a higher
	 * number of interrupts. Could we have an IRQ domain per link?
	 */
	spa->virq = irq_create_mapping(NULL, hwirq);
	if (!spa->virq) {
		dev_err(&dev->dev,
			"irq_create_mapping failed for translation interrupt\n");
		rc = -EINVAL;
		goto err_name;
	}

	dev_dbg(&dev->dev, "hwirq %d mapped to virq %d\n", hwirq, spa->virq);

	rc = request_irq(spa->virq, xsl_fault_handler, 0, spa->irq_name,
			link);
	if (rc) {
		dev_err(&dev->dev,
			"request_irq failed for translation interrupt: %d\n",
			rc);
		rc = -EINVAL;
		goto err_mapping;
	}
	return 0;

err_mapping:
	irq_dispose_mapping(spa->virq);
err_name:
	kfree(spa->irq_name);
err_xsl:
	unmap_irq_registers(spa);
	return rc;
}

static void release_xsl_irq(struct ocxl_link *link)
{
	struct spa *spa = link->spa;

	if (spa->virq) {
		free_irq(spa->virq, link);
		irq_dispose_mapping(spa->virq);
	}
	kfree(spa->irq_name);
	unmap_irq_registers(spa);
}

static int alloc_spa(struct pci_dev *dev, struct ocxl_link *link)
{
	struct spa *spa;

	spa = kzalloc(sizeof(struct spa), GFP_KERNEL);
	if (!spa)
		return -ENOMEM;

	mutex_init(&spa->spa_lock);
	INIT_RADIX_TREE(&spa->pe_tree, GFP_KERNEL);
	INIT_WORK(&spa->xsl_fault.fault_work, xsl_fault_handler_bh);

	spa->spa_order = SPA_SPA_SIZE_LOG - PAGE_SHIFT;
	spa->spa_mem = (struct ocxl_process_element *)
		__get_free_pages(GFP_KERNEL | __GFP_ZERO, spa->spa_order);
	if (!spa->spa_mem) {
		dev_err(&dev->dev, "Can't allocate Shared Process Area\n");
		kfree(spa);
		return -ENOMEM;
	}
	pr_debug("Allocated SPA for %x:%x:%x at %p\n", link->domain, link->bus,
		link->dev, spa->spa_mem);

	link->spa = spa;
	return 0;
}

static void free_spa(struct ocxl_link *link)
{
	struct spa *spa = link->spa;

	pr_debug("Freeing SPA for %x:%x:%x\n", link->domain, link->bus,
		link->dev);

	if (spa && spa->spa_mem) {
		free_pages((unsigned long) spa->spa_mem, spa->spa_order);
		kfree(spa);
		link->spa = NULL;
	}
}

static int alloc_link(struct pci_dev *dev, int PE_mask, struct ocxl_link **out_link)
{
	struct ocxl_link *link;
	int rc;

	link = kzalloc(sizeof(struct ocxl_link), GFP_KERNEL);
	if (!link)
		return -ENOMEM;

	kref_init(&link->ref);
	link->domain = pci_domain_nr(dev->bus);
	link->bus = dev->bus->number;
	link->dev = PCI_SLOT(dev->devfn);
	atomic_set(&link->irq_available, MAX_IRQ_PER_LINK);
	spin_lock_init(&link->atsd_lock);

	rc = alloc_spa(dev, link);
	if (rc)
		goto err_free;

	rc = setup_xsl_irq(dev, link);
	if (rc)
		goto err_spa;

	/* platform specific hook */
	rc = pnv_ocxl_spa_setup(dev, link->spa->spa_mem, PE_mask,
				&link->platform_data);
	if (rc)
		goto err_xsl_irq;

	/* if link->arva is not defined, MMIO registers are not used to
	 * generate TLB invalidate. PowerBus snooping is enabled.
	 * Otherwise, PowerBus snooping is disabled. TLB Invalidates are
	 * initiated using MMIO registers.
	 */
	pnv_ocxl_map_lpar(dev, mfspr(SPRN_LPID), 0, &link->arva);

	*out_link = link;
	return 0;

err_xsl_irq:
	release_xsl_irq(link);
err_spa:
	free_spa(link);
err_free:
	kfree(link);
	return rc;
}

static void free_link(struct ocxl_link *link)
{
	release_xsl_irq(link);
	free_spa(link);
	kfree(link);
}
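
/*
 * Find or create the ocxl_link for a device slot and return it as an
 * opaque handle. All PCI functions of an OpenCAPI device share the
 * same link; the first caller allocates it, later callers just take a
 * reference.
 */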
int ocxl_link_setup(struct pci_dev *dev, int PE_mask, void **link_handle)
{
	int rc = 0;
	struct ocxl_link *link;

	mutex_lock(&links_list_lock);
	list_for_each_entry(link, &links_list, list) {
		/* The functions of a device all share the same link */
		if (link->domain == pci_domain_nr(dev->bus) &&
			link->bus == dev->bus->number &&
			link->dev == PCI_SLOT(dev->devfn)) {
			kref_get(&link->ref);
			*link_handle = link;
			goto unlock;
		}
	}
	rc = alloc_link(dev, PE_mask, &link);
	if (rc)
		goto unlock;

	list_add(&link->list, &links_list);
	*link_handle = link;
unlock:
	mutex_unlock(&links_list_lock);
	return rc;
}
EXPORT_SYMBOL_GPL(ocxl_link_setup);

static void release_xsl(struct kref *ref)
{
	struct ocxl_link *link = container_of(ref, struct ocxl_link, ref);

	if (link->arva) {
		pnv_ocxl_unmap_lpar(link->arva);
		link->arva = NULL;
	}

	list_del(&link->list);
	/* call platform code before releasing data */
	pnv_ocxl_spa_release(link->platform_data);
	free_link(link);
}

void ocxl_link_release(struct pci_dev *dev, void *link_handle)
{
	struct ocxl_link *link = (struct ocxl_link *) link_handle;

	mutex_lock(&links_list_lock);
	kref_put(&link->ref, release_xsl);
	mutex_unlock(&links_list_lock);
}
EXPORT_SYMBOL_GPL(ocxl_link_release);
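
/*
 * When MMIO-based TLB invalidation is in use (link->arva is mapped),
 * this mmu notifier mirrors CPU-side invalidations to the device by
 * issuing an ATSD for each page in the invalidated range.
 */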
static void invalidate_range(struct mmu_notifier *mn,
			     struct mm_struct *mm,
			     unsigned long start, unsigned long end)
{
	struct pe_data *pe_data = container_of(mn, struct pe_data, mmu_notifier);
	struct ocxl_link *link = pe_data->link;
	unsigned long addr, pid, page_size = PAGE_SIZE;

	pid = mm->context.id;
	trace_ocxl_mmu_notifier_range(start, end, pid);

	spin_lock(&link->atsd_lock);
	for (addr = start; addr < end; addr += page_size)
		pnv_ocxl_tlb_invalidate(link->arva, pid, addr, page_size);
	spin_unlock(&link->atsd_lock);
}

static const struct mmu_notifier_ops ocxl_mmu_notifier_ops = {
	.invalidate_range = invalidate_range,
};
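
/*
 * Build the config_state word of a process element from the current
 * translation mode (hash or radix) and from whether the context is a
 * kernel or user one.
 */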
static u64 calculate_cfg_state(bool kernel)
{
	u64 state;

	state = SPA_CFG_DR;
	if (mfspr(SPRN_LPCR) & LPCR_TC)
		state |= SPA_CFG_TC;
	if (radix_enabled())
		state |= SPA_CFG_XLAT_ror;
	else
		state |= SPA_CFG_XLAT_hpt;
	state |= SPA_CFG_HV;
	if (kernel) {
		if (mfmsr() & MSR_SF)
			state |= SPA_CFG_SF;
	} else {
		state |= SPA_CFG_PR;
		if (!test_tsk_thread_flag(current, TIF_32BIT))
			state |= SPA_CFG_SF;
	}
	return state;
}
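
/*
 * Add a process element to the SPA for the given PASID and mark it
 * valid, so the device can issue translated accesses on behalf of
 * that context. For user contexts, a copro is registered (and, when
 * ATSD registers are mapped, an mmu notifier) so the device sees TLB
 * invalidations.
 */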
int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
		u64 amr, u16 bdf, struct mm_struct *mm,
		void (*xsl_err_cb)(void *data, u64 addr, u64 dsisr),
		void *xsl_err_data)
{
	struct ocxl_link *link = (struct ocxl_link *) link_handle;
	struct spa *spa = link->spa;
	struct ocxl_process_element *pe;
	int pe_handle, rc = 0;
	struct pe_data *pe_data;

	BUILD_BUG_ON(sizeof(struct ocxl_process_element) != 128);
	if (pasid > SPA_PASID_MAX)
		return -EINVAL;

	mutex_lock(&spa->spa_lock);
	pe_handle = pasid & SPA_PE_MASK;
	pe = spa->spa_mem + pe_handle;

	if (pe->software_state) {
		rc = -EBUSY;
		goto unlock;
	}

	pe_data = kmalloc(sizeof(*pe_data), GFP_KERNEL);
	if (!pe_data) {
		rc = -ENOMEM;
		goto unlock;
	}

	pe_data->mm = mm;
	pe_data->xsl_err_cb = xsl_err_cb;
	pe_data->xsl_err_data = xsl_err_data;
	pe_data->link = link;
	pe_data->mmu_notifier.ops = &ocxl_mmu_notifier_ops;

	memset(pe, 0, sizeof(struct ocxl_process_element));
	pe->config_state = cpu_to_be64(calculate_cfg_state(pidr == 0));
	pe->pasid = cpu_to_be32(pasid << (31 - 19));
	pe->bdf = cpu_to_be16(bdf);
	pe->lpid = cpu_to_be32(mfspr(SPRN_LPID));
	pe->pid = cpu_to_be32(pidr);
	pe->tid = cpu_to_be32(tidr);
	pe->amr = cpu_to_be64(amr);
	pe->software_state = cpu_to_be32(SPA_PE_VALID);

	/*
	 * For user contexts, register a copro so that TLBIs are seen
	 * by the nest MMU. If we have a kernel context, TLBIs are
	 * already global.
	 */
	if (mm) {
		mm_context_add_copro(mm);
		if (link->arva) {
			/* Use MMIO registers for the TLB Invalidate
			 * operations
			 */
			trace_ocxl_init_mmu_notifier(pasid, mm->context.id);
			mmu_notifier_register(&pe_data->mmu_notifier, mm);
		}
	}

	/*
	 * Barrier is to make sure PE is visible in the SPA before it
	 * is used by the device. It also helps with the global TLBI
	 * invalidation
	 */
	mb();
	radix_tree_insert(&spa->pe_tree, pe_handle, pe_data);

	/*
	 * The mm must stay valid for as long as the device uses it. We
	 * lower the count when the context is removed from the SPA.
	 *
	 * We grab mm_count (and not mm_users), as we don't want to
	 * end up in a circular dependency if a process mmaps its
	 * mmio, therefore incrementing the file ref count when
	 * calling mmap(), and forgets to unmap before exiting. In
	 * that scenario, when the kernel handles the death of the
	 * process, the file is not cleaned because unmap was not
	 * called, and the mm wouldn't be freed because we would still
	 * have a reference on mm_users. Incrementing mm_count solves
	 * the problem.
	 */
	if (mm)
		mmgrab(mm);
	trace_ocxl_context_add(current->pid, spa->spa_mem, pasid, pidr, tidr);
unlock:
	mutex_unlock(&spa->spa_lock);
	return rc;
}
EXPORT_SYMBOL_GPL(ocxl_link_add_pe);
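
/*
 * Update the thread id (TID) of an existing process element and clear
 * the NPU context cache entry so the device reloads the updated PE.
 */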
int ocxl_link_update_pe(void *link_handle, int pasid, __u16 tid)
{
	struct ocxl_link *link = (struct ocxl_link *) link_handle;
	struct spa *spa = link->spa;
	struct ocxl_process_element *pe;
	int pe_handle, rc;

	if (pasid > SPA_PASID_MAX)
		return -EINVAL;

	pe_handle = pasid & SPA_PE_MASK;
	pe = spa->spa_mem + pe_handle;

	mutex_lock(&spa->spa_lock);

	pe->tid = cpu_to_be32(tid);

	/*
	 * The barrier makes sure the PE is updated
	 * before we clear the NPU context cache below, so that the
	 * old PE cannot be reloaded erroneously.
	 */
	mb();

	/*
	 * hook to platform code
	 * On powerpc, the entry needs to be cleared from the context
	 * cache of the NPU.
	 */
	rc = pnv_ocxl_spa_remove_pe_from_cache(link->platform_data, pe_handle);
	WARN_ON(rc);

	mutex_unlock(&spa->spa_lock);
	return rc;
}
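
/*
 * Remove the process element for a PASID: invalidate it in the SPA,
 * flush the NPU context cache, tear down the mmu notifier and copro
 * registration, and drop the reference taken on the mm in
 * ocxl_link_add_pe().
 */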
int ocxl_link_remove_pe(void *link_handle, int pasid)
{
	struct ocxl_link *link = (struct ocxl_link *) link_handle;
	struct spa *spa = link->spa;
	struct ocxl_process_element *pe;
	struct pe_data *pe_data;
	int pe_handle, rc;

	if (pasid > SPA_PASID_MAX)
		return -EINVAL;

	/*
	 * About synchronization with our memory fault handler:
	 *
	 * Before removing the PE, the driver is supposed to have
	 * notified the AFU, which should have cleaned up and made
	 * sure the PASID is no longer in use, including pending
	 * interrupts. However, there's no way to be sure...
	 *
	 * We clear the PE and remove the context from our radix
	 * tree. From that point on, any new interrupt for that
	 * context will fail silently, which is ok. As mentioned
	 * above, that's not expected, but it could happen if the
	 * driver or AFU didn't do the right thing.
	 *
	 * There could still be a bottom half running, but we don't
	 * need to wait/flush, as it is managing a reference count on
	 * the mm it reads from the radix tree.
	 */
	pe_handle = pasid & SPA_PE_MASK;
	pe = spa->spa_mem + pe_handle;

	mutex_lock(&spa->spa_lock);

	if (!(be32_to_cpu(pe->software_state) & SPA_PE_VALID)) {
		rc = -EINVAL;
		goto unlock;
	}

	trace_ocxl_context_remove(current->pid, spa->spa_mem, pasid,
				be32_to_cpu(pe->pid), be32_to_cpu(pe->tid));

	memset(pe, 0, sizeof(struct ocxl_process_element));
	/*
	 * The barrier makes sure the PE is removed from the SPA
	 * before we clear the NPU context cache below, so that the
	 * old PE cannot be reloaded erroneously.
	 */
	mb();

	/*
	 * hook to platform code
	 * On powerpc, the entry needs to be cleared from the context
	 * cache of the NPU.
	 */
	rc = pnv_ocxl_spa_remove_pe_from_cache(link->platform_data, pe_handle);
	WARN_ON(rc);

	pe_data = radix_tree_delete(&spa->pe_tree, pe_handle);
	if (!pe_data) {
		WARN(1, "Couldn't find pe data when removing PE\n");
	} else {
		if (pe_data->mm) {
			if (link->arva) {
				trace_ocxl_release_mmu_notifier(pasid,
								pe_data->mm->context.id);
				mmu_notifier_unregister(&pe_data->mmu_notifier,
							pe_data->mm);
				spin_lock(&link->atsd_lock);
				pnv_ocxl_tlb_invalidate(link->arva,
							pe_data->mm->context.id,
							0ull, PAGE_SIZE);
				spin_unlock(&link->atsd_lock);
			}
			mm_context_remove_copro(pe_data->mm);
			mmdrop(pe_data->mm);
		}
		kfree_rcu(pe_data, rcu);
	}
unlock:
	mutex_unlock(&spa->spa_lock);
	return rc;
}
EXPORT_SYMBOL_GPL(ocxl_link_remove_pe);
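
/*
 * Allocate a hardware interrupt source for the AFU from the XIVE
 * controller, within the per-link budget tracked by irq_available.
 */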
int ocxl_link_irq_alloc(void *link_handle, int *hw_irq)
{
	struct ocxl_link *link = (struct ocxl_link *) link_handle;
	int irq;

	if (atomic_dec_if_positive(&link->irq_available) < 0)
		return -ENOSPC;

	irq = xive_native_alloc_irq();
	if (!irq) {
		atomic_inc(&link->irq_available);
		return -ENXIO;
	}

	*hw_irq = irq;
	return 0;
}
EXPORT_SYMBOL_GPL(ocxl_link_irq_alloc);

void ocxl_link_free_irq(void *link_handle, int hw_irq)
{
	struct ocxl_link *link = (struct ocxl_link *) link_handle;

	xive_native_free_irq(hw_irq);
	atomic_inc(&link->irq_available);
}
EXPORT_SYMBOL_GPL(ocxl_link_free_irq);