// SPDX-License-Identifier: GPL-2.0+
// Copyright 2017 IBM Corp.
#include <linux/sched/mm.h>
#include <linux/mutex.h>
#include <linux/mm_types.h>
#include <linux/mmu_context.h>
#include <linux/mmu_notifier.h>
#include <asm/copro.h>
#include <asm/pnv-ocxl.h>
#include <asm/xive.h>
#include <misc/ocxl.h>
#include "ocxl_internal.h"
#include "trace.h"

#define SPA_PASID_BITS		15
#define SPA_PASID_MAX		((1 << SPA_PASID_BITS) - 1)
#define SPA_PE_MASK		SPA_PASID_MAX
#define SPA_SPA_SIZE_LOG	22 /* Each SPA is 4MB */

#define SPA_CFG_SF		(1ull << (63-0))
#define SPA_CFG_TA		(1ull << (63-1))
#define SPA_CFG_HV		(1ull << (63-3))
#define SPA_CFG_UV		(1ull << (63-4))
#define SPA_CFG_XLAT_hpt	(0ull << (63-6)) /* Hashed page table (HPT) mode */
#define SPA_CFG_XLAT_roh	(2ull << (63-6)) /* Radix on HPT mode */
#define SPA_CFG_XLAT_ror	(3ull << (63-6)) /* Radix on Radix mode */
#define SPA_CFG_PR		(1ull << (63-49))
#define SPA_CFG_TC		(1ull << (63-54))
#define SPA_CFG_DR		(1ull << (63-59))

#define SPA_XSL_TF		(1ull << (63-3))  /* Translation fault */
#define SPA_XSL_S		(1ull << (63-38)) /* Store operation */

#define SPA_PE_VALID		0x80000000
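
/*
 * The Shared Process Area (SPA) is a 4MB block of memory, shared with
 * the device and made of 128-byte process elements indexed by PE
 * handle, i.e. the low SPA_PASID_BITS bits of the PASID. Each valid
 * entry describes one attached context: its PID/TID, LPID, AMR and
 * translation configuration.
 */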

struct pe_data {
	struct mm_struct *mm;
	/* callback to trigger when a translation fault occurs */
	void (*xsl_err_cb)(void *data, u64 addr, u64 dsisr);
	/* opaque pointer to be passed to the above callback */
	void *xsl_err_data;
	struct rcu_head rcu;
	struct ocxl_link *link;
	struct mmu_notifier mmu_notifier;
};

struct spa {
	struct ocxl_process_element *spa_mem;
	int spa_order;
	struct mutex spa_lock;
	struct radix_tree_root pe_tree; /* Maps PE handles to pe_data */
	char *irq_name;
	int virq;
	void __iomem *reg_dsisr;
	void __iomem *reg_dar;
	void __iomem *reg_tfc;
	void __iomem *reg_pe_handle;
	/*
	 * The following fields are used by the memory fault
	 * interrupt handler. We can only have one interrupt at a
	 * time. The NPU won't raise another interrupt until the
	 * previous one has been ack'd by writing to the TFC register
	 */
	struct xsl_fault {
		struct work_struct fault_work;
		u64 pe;
		u64 dsisr;
		u64 dar;
		struct pe_data pe_data;
	} xsl_fault;
};

/*
 * An opencapi link can be used by several PCI functions. We have
 * one link per device slot.
 *
 * A linked list of opencapi links should suffice, as there's a
 * limited number of opencapi slots on a system and lookup is only
 * done when the device is probed
 */
struct ocxl_link {
	struct list_head list;
	struct kref ref;
	int domain;
	int bus;
	int dev;
	void __iomem *arva; /* ATSD register virtual address */
	spinlock_t atsd_lock; /* to serialize shootdowns */
	atomic_t irq_available;
	struct spa *spa;
	void *platform_data;
};
static struct list_head links_list = LIST_HEAD_INIT(links_list);
static DEFINE_MUTEX(links_list_lock);

enum xsl_response {
	CONTINUE,
	ADDRESS_ERROR,
	RESTART,
};

static void read_irq(struct spa *spa, u64 *dsisr, u64 *dar, u64 *pe)
{
	u64 reg;

	*dsisr = in_be64(spa->reg_dsisr);
	*dar = in_be64(spa->reg_dar);
	reg = in_be64(spa->reg_pe_handle);
	*pe = reg & SPA_PE_MASK;
}
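
/*
 * Acknowledge a translation fault by writing to the TFC register,
 * either asking for the translation to be restarted (RESTART) or
 * reporting an address error back to the AFU (ADDRESS_ERROR). The
 * NPU won't raise a new translation interrupt until this write.
 */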
static void ack_irq(struct spa *spa, enum xsl_response r)
{
	u64 reg = 0;

	/* continue is not supported */
	if (r == RESTART)
		reg = PPC_BIT(31);
	else if (r == ADDRESS_ERROR)
		reg = PPC_BIT(30);
	else
		WARN(1, "Invalid irq response %d\n", r);

	if (reg) {
		trace_ocxl_fault_ack(spa->spa_mem, spa->xsl_fault.pe,
				spa->xsl_fault.dsisr, spa->xsl_fault.dar, reg);
		out_be64(spa->reg_tfc, reg);
	}
}
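
/*
 * Bottom half of the translation fault handler: resolve the fault in
 * the context's mm with copro_handle_mm_fault(), notify the AFU driver
 * through its xsl_err callback if that fails, then ack the interrupt.
 * Runs from a workqueue because fault resolution can sleep.
 */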
static void xsl_fault_handler_bh(struct work_struct *fault_work)
{
	vm_fault_t flt = 0;
	unsigned long access, flags, inv_flags = 0;
	enum xsl_response r;
	struct xsl_fault *fault = container_of(fault_work, struct xsl_fault,
					fault_work);
	struct spa *spa = container_of(fault, struct spa, xsl_fault);
	int rc;

	/*
	 * We must release a reference on mm_users whenever exiting this
	 * function (taken in the memory fault interrupt handler)
	 */
	rc = copro_handle_mm_fault(fault->pe_data.mm, fault->dar, fault->dsisr,
				&flt);
	if (rc) {
		pr_debug("copro_handle_mm_fault failed: %d\n", rc);
		if (fault->pe_data.xsl_err_cb) {
			fault->pe_data.xsl_err_cb(
				fault->pe_data.xsl_err_data,
				fault->dar, fault->dsisr);
		}
		r = ADDRESS_ERROR;
		goto ack;
	}

	if (!radix_enabled()) {
		/*
		 * update_mmu_cache() will not have loaded the hash
		 * since current->trap is not a 0x400 or 0x300, so
		 * just call hash_page_mm() here.
		 */
		access = _PAGE_PRESENT | _PAGE_READ;
		if (fault->dsisr & SPA_XSL_S)
			access |= _PAGE_WRITE;

		if (get_region_id(fault->dar) != USER_REGION_ID)
			access |= _PAGE_PRIVILEGED;

		local_irq_save(flags);
		hash_page_mm(fault->pe_data.mm, fault->dar, access, 0x300,
			inv_flags);
		local_irq_restore(flags);
	}
	r = RESTART;
ack:
	mmput(fault->pe_data.mm);
	ack_irq(spa, r);
}
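
/*
 * Hard interrupt handler for the XSL translation fault: read the fault
 * registers, look up the faulting context in the PE radix tree and, if
 * a valid mm is found, hand the fault off to the bottom half. Otherwise
 * the fault is ack'd with an address error.
 */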
static irqreturn_t xsl_fault_handler(int irq, void *data)
{
	struct ocxl_link *link = (struct ocxl_link *) data;
	struct spa *spa = link->spa;
	u64 dsisr, dar, pe_handle;
	struct pe_data *pe_data;
	struct ocxl_process_element *pe;
	int pid;
	bool schedule = false;

	read_irq(spa, &dsisr, &dar, &pe_handle);
	trace_ocxl_fault(spa->spa_mem, pe_handle, dsisr, dar, -1);

	WARN_ON(pe_handle > SPA_PE_MASK);
	pe = spa->spa_mem + pe_handle;
	pid = be32_to_cpu(pe->pid);
	/* We could be reading all null values here if the PE is being
	 * removed while an interrupt kicks in. It's not supposed to
	 * happen if the driver notified the AFU to terminate the
	 * PASID, and the AFU waited for pending operations before
	 * acknowledging. But even if it happens, we won't find a
	 * memory context below and fail silently, so it should be ok.
	 */
	if (!(dsisr & SPA_XSL_TF)) {
		WARN(1, "Invalid xsl interrupt fault register %#llx\n", dsisr);
		ack_irq(spa, ADDRESS_ERROR);
		return IRQ_HANDLED;
	}

	rcu_read_lock();
	pe_data = radix_tree_lookup(&spa->pe_tree, pe_handle);
	if (!pe_data) {
		/*
		 * Could only happen if the driver didn't notify the
		 * AFU about PASID termination before removing the PE,
		 * or the AFU didn't wait for all memory access to
		 * have completed.
		 *
		 * Either way, we fail early, but we shouldn't log an
		 * error message, as it is a valid (if unexpected)
		 * scenario
		 */
		rcu_read_unlock();
		pr_debug("Unknown mm context for xsl interrupt\n");
		ack_irq(spa, ADDRESS_ERROR);
		return IRQ_HANDLED;
	}

	if (!pe_data->mm) {
		/*
		 * translation fault from a kernel context - an OpenCAPI
		 * device tried to access a bad kernel address
		 */
		rcu_read_unlock();
		pr_warn("Unresolved OpenCAPI xsl fault in kernel context\n");
		ack_irq(spa, ADDRESS_ERROR);
		return IRQ_HANDLED;
	}
	WARN_ON(pe_data->mm->context.id != pid);

	if (mmget_not_zero(pe_data->mm)) {
		spa->xsl_fault.pe = pe_handle;
		spa->xsl_fault.dar = dar;
		spa->xsl_fault.dsisr = dsisr;
		spa->xsl_fault.pe_data = *pe_data;
		schedule = true;
		/* mm_users count released by bottom half */
	}
	rcu_read_unlock();
	if (schedule)
		schedule_work(&spa->xsl_fault.fault_work);
	else
		ack_irq(spa, ADDRESS_ERROR);
	return IRQ_HANDLED;
}

static void unmap_irq_registers(struct spa *spa)
{
	pnv_ocxl_unmap_xsl_regs(spa->reg_dsisr, spa->reg_dar, spa->reg_tfc,
				spa->reg_pe_handle);
}

static int map_irq_registers(struct pci_dev *dev, struct spa *spa)
{
	return pnv_ocxl_map_xsl_regs(dev, &spa->reg_dsisr, &spa->reg_dar,
				&spa->reg_tfc, &spa->reg_pe_handle);
}
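
/*
 * Map the XSL fault registers and wire up the translation fault
 * interrupt for this link: get the hardware interrupt from the
 * platform, map it into the Linux IRQ space and register
 * xsl_fault_handler() on it.
 */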
static int setup_xsl_irq(struct pci_dev *dev, struct ocxl_link *link)
{
	struct spa *spa = link->spa;
	int rc;
	int hwirq;

	rc = pnv_ocxl_get_xsl_irq(dev, &hwirq);
	if (rc)
		return rc;

	rc = map_irq_registers(dev, spa);
	if (rc)
		return rc;

	spa->irq_name = kasprintf(GFP_KERNEL, "ocxl-xsl-%x-%x-%x",
				link->domain, link->bus, link->dev);
	if (!spa->irq_name) {
		dev_err(&dev->dev, "Can't allocate name for xsl interrupt\n");
		rc = -ENOMEM;
		goto err_xsl;
	}
	/*
	 * At some point, we'll need to look into allowing a higher
	 * number of interrupts. Could we have an IRQ domain per link?
	 */
	spa->virq = irq_create_mapping(NULL, hwirq);
	if (!spa->virq) {
		dev_err(&dev->dev,
			"irq_create_mapping failed for translation interrupt\n");
		rc = -EINVAL;
		goto err_name;
	}

	dev_dbg(&dev->dev, "hwirq %d mapped to virq %d\n", hwirq, spa->virq);

	rc = request_irq(spa->virq, xsl_fault_handler, 0, spa->irq_name,
			link);
	if (rc) {
		dev_err(&dev->dev,
			"request_irq failed for translation interrupt: %d\n",
			rc);
		rc = -EINVAL;
		goto err_mapping;
	}
	return 0;

err_mapping:
	irq_dispose_mapping(spa->virq);
err_name:
	kfree(spa->irq_name);
err_xsl:
	unmap_irq_registers(spa);
	return rc;
}

static void release_xsl_irq(struct ocxl_link *link)
{
	struct spa *spa = link->spa;

	if (spa->virq) {
		free_irq(spa->virq, link);
		irq_dispose_mapping(spa->virq);
	}
	kfree(spa->irq_name);
	unmap_irq_registers(spa);
}

static int alloc_spa(struct pci_dev *dev, struct ocxl_link *link)
{
	struct spa *spa;

	spa = kzalloc(sizeof(struct spa), GFP_KERNEL);
	if (!spa)
		return -ENOMEM;

	mutex_init(&spa->spa_lock);
	INIT_RADIX_TREE(&spa->pe_tree, GFP_KERNEL);
	INIT_WORK(&spa->xsl_fault.fault_work, xsl_fault_handler_bh);

	spa->spa_order = SPA_SPA_SIZE_LOG - PAGE_SHIFT;
	spa->spa_mem = (struct ocxl_process_element *)
		__get_free_pages(GFP_KERNEL | __GFP_ZERO, spa->spa_order);
	if (!spa->spa_mem) {
		dev_err(&dev->dev, "Can't allocate Shared Process Area\n");
		kfree(spa);
		return -ENOMEM;
	}
	pr_debug("Allocated SPA for %x:%x:%x at %p\n", link->domain, link->bus,
		link->dev, spa->spa_mem);

	link->spa = spa;
	return 0;
}

static void free_spa(struct ocxl_link *link)
{
	struct spa *spa = link->spa;

	pr_debug("Freeing SPA for %x:%x:%x\n", link->domain, link->bus,
		link->dev);

	if (spa && spa->spa_mem) {
		free_pages((unsigned long) spa->spa_mem, spa->spa_order);
		kfree(spa);
		link->spa = NULL;
	}
}

static int alloc_link(struct pci_dev *dev, int PE_mask, struct ocxl_link **out_link)
{
	struct ocxl_link *link;
	int rc;

	link = kzalloc(sizeof(struct ocxl_link), GFP_KERNEL);
	if (!link)
		return -ENOMEM;

	kref_init(&link->ref);
	link->domain = pci_domain_nr(dev->bus);
	link->bus = dev->bus->number;
	link->dev = PCI_SLOT(dev->devfn);
	atomic_set(&link->irq_available, MAX_IRQ_PER_LINK);
	spin_lock_init(&link->atsd_lock);

	rc = alloc_spa(dev, link);
	if (rc)
		goto err_free;

	rc = setup_xsl_irq(dev, link);
	if (rc)
		goto err_spa;

	/* platform specific hook */
	rc = pnv_ocxl_spa_setup(dev, link->spa->spa_mem, PE_mask,
				&link->platform_data);
	if (rc)
		goto err_xsl_irq;

	/* if link->arva is not defined, MMIO registers are not used to
	 * generate TLB invalidate. PowerBus snooping is enabled.
	 * Otherwise, PowerBus snooping is disabled. TLB Invalidates are
	 * initiated using MMIO registers.
	 */
	pnv_ocxl_map_lpar(dev, mfspr(SPRN_LPID), 0, &link->arva);

	*out_link = link;
	return 0;

err_xsl_irq:
	release_xsl_irq(link);
err_spa:
	free_spa(link);
err_free:
	kfree(link);
	return rc;
}

static void free_link(struct ocxl_link *link)
{
	release_xsl_irq(link);
	free_spa(link);
	kfree(link);
}
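
/*
 * Find or create the ocxl_link for a device slot and return it as an
 * opaque handle. All PCI functions of an OpenCAPI device share the
 * same link; the first caller allocates it, later callers just take a
 * reference.
 */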
int ocxl_link_setup(struct pci_dev *dev, int PE_mask, void **link_handle)
{
	int rc = 0;
	struct ocxl_link *link;

	mutex_lock(&links_list_lock);
	list_for_each_entry(link, &links_list, list) {
		/* The functions of a device all share the same link */
		if (link->domain == pci_domain_nr(dev->bus) &&
			link->bus == dev->bus->number &&
			link->dev == PCI_SLOT(dev->devfn)) {
			kref_get(&link->ref);
			*link_handle = link;
			goto unlock;
		}
	}
	rc = alloc_link(dev, PE_mask, &link);
	if (rc)
		goto unlock;

	list_add(&link->list, &links_list);
	*link_handle = link;
unlock:
	mutex_unlock(&links_list_lock);
	return rc;
}
EXPORT_SYMBOL_GPL(ocxl_link_setup);

static void release_xsl(struct kref *ref)
{
	struct ocxl_link *link = container_of(ref, struct ocxl_link, ref);

	if (link->arva) {
		pnv_ocxl_unmap_lpar(link->arva);
		link->arva = NULL;
	}

	list_del(&link->list);
	/* call platform code before releasing data */
	pnv_ocxl_spa_release(link->platform_data);
	free_link(link);
}

void ocxl_link_release(struct pci_dev *dev, void *link_handle)
{
	struct ocxl_link *link = (struct ocxl_link *) link_handle;

	mutex_lock(&links_list_lock);
	kref_put(&link->ref, release_xsl);
	mutex_unlock(&links_list_lock);
}
EXPORT_SYMBOL_GPL(ocxl_link_release);
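
/*
 * When MMIO-based TLB invalidation is in use (link->arva is mapped),
 * this mmu notifier mirrors CPU-side invalidations to the device by
 * issuing an ATSD for each page in the invalidated range.
 */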
static void invalidate_range(struct mmu_notifier *mn,
			     struct mm_struct *mm,
			     unsigned long start, unsigned long end)
{
	struct pe_data *pe_data = container_of(mn, struct pe_data, mmu_notifier);
	struct ocxl_link *link = pe_data->link;
	unsigned long addr, pid, page_size = PAGE_SIZE;

	pid = mm->context.id;
	trace_ocxl_mmu_notifier_range(start, end, pid);

	spin_lock(&link->atsd_lock);
	for (addr = start; addr < end; addr += page_size)
		pnv_ocxl_tlb_invalidate(link->arva, pid, addr, page_size);
	spin_unlock(&link->atsd_lock);
}

static const struct mmu_notifier_ops ocxl_mmu_notifier_ops = {
	.invalidate_range = invalidate_range,
};
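
/*
 * Build the config_state word of a process element from the current
 * translation mode (hash or radix) and from whether the context is a
 * kernel or user one.
 */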
static u64 calculate_cfg_state(bool kernel)
{
	u64 state;

	state = SPA_CFG_DR;
	if (mfspr(SPRN_LPCR) & LPCR_TC)
		state |= SPA_CFG_TC;
	if (radix_enabled())
		state |= SPA_CFG_XLAT_ror;
	else
		state |= SPA_CFG_XLAT_hpt;
	state |= SPA_CFG_HV;
	if (kernel) {
		if (mfmsr() & MSR_SF)
			state |= SPA_CFG_SF;
	} else {
		state |= SPA_CFG_PR;
		if (!test_tsk_thread_flag(current, TIF_32BIT))
			state |= SPA_CFG_SF;
	}
	return state;
}
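
/*
 * Add a process element to the SPA for the given PASID and mark it
 * valid, so the device can issue translated accesses on behalf of
 * that context. For user contexts, a copro is registered (and, when
 * ATSD registers are mapped, an mmu notifier) so the device sees TLB
 * invalidations.
 */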
int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
		u64 amr, u16 bdf, struct mm_struct *mm,
		void (*xsl_err_cb)(void *data, u64 addr, u64 dsisr),
		void *xsl_err_data)
{
	struct ocxl_link *link = (struct ocxl_link *) link_handle;
	struct spa *spa = link->spa;
	struct ocxl_process_element *pe;
	int pe_handle, rc = 0;
	struct pe_data *pe_data;

	BUILD_BUG_ON(sizeof(struct ocxl_process_element) != 128);
	if (pasid > SPA_PASID_MAX)
		return -EINVAL;

	mutex_lock(&spa->spa_lock);
	pe_handle = pasid & SPA_PE_MASK;
	pe = spa->spa_mem + pe_handle;

	if (pe->software_state) {
		rc = -EBUSY;
		goto unlock;
	}

	pe_data = kmalloc(sizeof(*pe_data), GFP_KERNEL);
	if (!pe_data) {
		rc = -ENOMEM;
		goto unlock;
	}

	pe_data->mm = mm;
	pe_data->xsl_err_cb = xsl_err_cb;
	pe_data->xsl_err_data = xsl_err_data;
	pe_data->link = link;
	pe_data->mmu_notifier.ops = &ocxl_mmu_notifier_ops;

	memset(pe, 0, sizeof(struct ocxl_process_element));
	pe->config_state = cpu_to_be64(calculate_cfg_state(pidr == 0));
	pe->pasid = cpu_to_be32(pasid << (31 - 19));
	pe->bdf = cpu_to_be16(bdf);
	pe->lpid = cpu_to_be32(mfspr(SPRN_LPID));
	pe->pid = cpu_to_be32(pidr);
	pe->tid = cpu_to_be32(tidr);
	pe->amr = cpu_to_be64(amr);
	pe->software_state = cpu_to_be32(SPA_PE_VALID);

	/*
	 * For user contexts, register a copro so that TLBIs are seen
	 * by the nest MMU. If we have a kernel context, TLBIs are
	 * already global.
	 */
	if (mm) {
		mm_context_add_copro(mm);
		if (link->arva) {
			/* Use MMIO registers for the TLB Invalidate
			 * operations
			 */
			trace_ocxl_init_mmu_notifier(pasid, mm->context.id);
			mmu_notifier_register(&pe_data->mmu_notifier, mm);
		}
	}

	/*
	 * Barrier is to make sure PE is visible in the SPA before it
	 * is used by the device. It also helps with the global TLBI
	 * invalidation
	 */
	mb();
	radix_tree_insert(&spa->pe_tree, pe_handle, pe_data);

	/*
	 * The mm must stay valid for as long as the device uses it. We
	 * lower the count when the context is removed from the SPA.
	 *
	 * We grab mm_count (and not mm_users), as we don't want to
	 * end up in a circular dependency if a process mmaps its
	 * mmio, therefore incrementing the file ref count when
	 * calling mmap(), and forgets to unmap before exiting. In
	 * that scenario, when the kernel handles the death of the
	 * process, the file is not cleaned because unmap was not
	 * called, and the mm wouldn't be freed because we would still
	 * have a reference on mm_users. Incrementing mm_count solves
	 * the problem.
	 */
	if (mm)
		mmgrab(mm);
	trace_ocxl_context_add(current->pid, spa->spa_mem, pasid, pidr, tidr);
unlock:
	mutex_unlock(&spa->spa_lock);
	return rc;
}
EXPORT_SYMBOL_GPL(ocxl_link_add_pe);
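
/*
 * Update the thread id (TID) of an existing process element and clear
 * the NPU context cache entry so the device reloads the updated PE.
 */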
int ocxl_link_update_pe(void *link_handle, int pasid, __u16 tid)
{
	struct ocxl_link *link = (struct ocxl_link *) link_handle;
	struct spa *spa = link->spa;
	struct ocxl_process_element *pe;
	int pe_handle, rc;

	if (pasid > SPA_PASID_MAX)
		return -EINVAL;

	pe_handle = pasid & SPA_PE_MASK;
	pe = spa->spa_mem + pe_handle;

	mutex_lock(&spa->spa_lock);

	pe->tid = cpu_to_be32(tid);

	/*
	 * The barrier makes sure the PE is updated
	 * before we clear the NPU context cache below, so that the
	 * old PE cannot be reloaded erroneously.
	 */
	mb();

	/*
	 * hook to platform code
	 * On powerpc, the entry needs to be cleared from the context
	 * cache of the NPU.
	 */
	rc = pnv_ocxl_spa_remove_pe_from_cache(link->platform_data, pe_handle);
	WARN_ON(rc);

	mutex_unlock(&spa->spa_lock);
	return rc;
}
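
/*
 * Remove the process element for a PASID: invalidate it in the SPA,
 * flush the NPU context cache, tear down the mmu notifier and copro
 * registration, and drop the reference taken on the mm in
 * ocxl_link_add_pe().
 */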
int ocxl_link_remove_pe(void *link_handle, int pasid)
{
	struct ocxl_link *link = (struct ocxl_link *) link_handle;
	struct spa *spa = link->spa;
	struct ocxl_process_element *pe;
	struct pe_data *pe_data;
	int pe_handle, rc;

	if (pasid > SPA_PASID_MAX)
		return -EINVAL;

	/*
	 * About synchronization with our memory fault handler:
	 *
	 * Before removing the PE, the driver is supposed to have
	 * notified the AFU, which should have cleaned up and made
	 * sure the PASID is no longer in use, including pending
	 * interrupts. However, there's no way to be sure...
	 *
	 * We clear the PE and remove the context from our radix
	 * tree. From that point on, any new interrupt for that
	 * context will fail silently, which is ok. As mentioned
	 * above, that's not expected, but it could happen if the
	 * driver or AFU didn't do the right thing.
	 *
	 * There could still be a bottom half running, but we don't
	 * need to wait/flush, as it is managing a reference count on
	 * the mm it reads from the radix tree.
	 */
	pe_handle = pasid & SPA_PE_MASK;
	pe = spa->spa_mem + pe_handle;

	mutex_lock(&spa->spa_lock);

	if (!(be32_to_cpu(pe->software_state) & SPA_PE_VALID)) {
		rc = -EINVAL;
		goto unlock;
	}

	trace_ocxl_context_remove(current->pid, spa->spa_mem, pasid,
				be32_to_cpu(pe->pid), be32_to_cpu(pe->tid));

	memset(pe, 0, sizeof(struct ocxl_process_element));
	/*
	 * The barrier makes sure the PE is removed from the SPA
	 * before we clear the NPU context cache below, so that the
	 * old PE cannot be reloaded erroneously.
	 */
	mb();

	/*
	 * hook to platform code
	 * On powerpc, the entry needs to be cleared from the context
	 * cache of the NPU.
	 */
	rc = pnv_ocxl_spa_remove_pe_from_cache(link->platform_data, pe_handle);
	WARN_ON(rc);

	pe_data = radix_tree_delete(&spa->pe_tree, pe_handle);
	if (!pe_data) {
		WARN(1, "Couldn't find pe data when removing PE\n");
	} else {
		if (pe_data->mm) {
			if (link->arva) {
				trace_ocxl_release_mmu_notifier(pasid,
								pe_data->mm->context.id);
				mmu_notifier_unregister(&pe_data->mmu_notifier,
							pe_data->mm);
				spin_lock(&link->atsd_lock);
				pnv_ocxl_tlb_invalidate(link->arva,
							pe_data->mm->context.id,
							0ull, PAGE_SIZE);
				spin_unlock(&link->atsd_lock);
			}
			mm_context_remove_copro(pe_data->mm);
			mmdrop(pe_data->mm);
		}
		kfree_rcu(pe_data, rcu);
	}
unlock:
	mutex_unlock(&spa->spa_lock);
	return rc;
}
EXPORT_SYMBOL_GPL(ocxl_link_remove_pe);
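
/*
 * Allocate a hardware interrupt source for the AFU from the XIVE
 * controller, within the per-link budget tracked by irq_available.
 */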
int ocxl_link_irq_alloc(void *link_handle, int *hw_irq)
{
	struct ocxl_link *link = (struct ocxl_link *) link_handle;
	int irq;

	if (atomic_dec_if_positive(&link->irq_available) < 0)
		return -ENOSPC;

	irq = xive_native_alloc_irq();
	if (!irq) {
		atomic_inc(&link->irq_available);
		return -ENXIO;
	}

	*hw_irq = irq;
	return 0;
}
EXPORT_SYMBOL_GPL(ocxl_link_irq_alloc);

void ocxl_link_free_irq(void *link_handle, int hw_irq)
{
	struct ocxl_link *link = (struct ocxl_link *) link_handle;

	xive_native_free_irq(hw_irq);
	atomic_inc(&link->irq_available);
}
EXPORT_SYMBOL_GPL(ocxl_link_free_irq);