// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2019 Intel Corporation. All rights rsvd. */
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/dmaengine.h>
#include <linux/delay.h>
#include <linux/iommu.h>
#include <linux/sched/mm.h>
#include <uapi/linux/idxd.h>
#include "../dmaengine.h"
#include "idxd.h"
#include "registers.h"

enum irq_work_type {
	IRQ_WORK_NORMAL = 0,
	IRQ_WORK_PROCESS_ENVELOPE,
	IRQ_WORK_PROCESS_FAULT,
};

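/*
 * Work item used to resubmit a descriptor from process context after the
 * device revoked its interrupt handle.
 */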
struct idxd_resubmit {
	struct work_struct work;
	struct idxd_desc *desc;
};

struct idxd_int_handle_revoke {
	struct work_struct work;
	struct idxd_device *idxd;
};

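/*
 * Recovery work queued from idxd_misc_thread() when the device halts with a
 * software-recoverable reset type: reset the device, reapply its saved
 * configuration, and re-enable every workqueue that was enabled before.
 */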
static void idxd_device_reinit(struct work_struct *work)
{
	struct idxd_device *idxd = container_of(work, struct idxd_device, work);
	struct device *dev = &idxd->pdev->dev;
	int rc, i;

	idxd_device_reset(idxd);
	rc = idxd_device_config(idxd);
	if (rc < 0)
		goto out;

	rc = idxd_device_enable(idxd);
	if (rc < 0)
		goto out;

	for (i = 0; i < idxd->max_wqs; i++) {
		if (test_bit(i, idxd->wq_enable_map)) {
			struct idxd_wq *wq = idxd->wqs[i];

			rc = idxd_wq_enable(wq);
			if (rc < 0) {
				clear_bit(i, idxd->wq_enable_map);
				dev_warn(dev, "Unable to re-enable wq %s\n",
					 dev_name(wq_confdev(wq)));
			}
		}
	}

	return;

out:
	idxd_device_clear_state(idxd);
}

/*
 * The function sends a drain descriptor for the interrupt handle. The drain
 * ensures all descriptors with this interrupt handle are flushed and the
 * interrupt will allow the cleanup of the outstanding descriptors.
 */
static void idxd_int_handle_revoke_drain(struct idxd_irq_entry *ie)
{
	struct idxd_wq *wq = ie_to_wq(ie);
	struct idxd_device *idxd = wq->idxd;
	struct device *dev = &idxd->pdev->dev;
	struct dsa_hw_desc desc = {};
	void __iomem *portal;
	int rc;

	/* Issue a simple drain operation with interrupt but no completion record */
	desc.flags = IDXD_OP_FLAG_RCI;
	desc.opcode = DSA_OPCODE_DRAIN;
	desc.priv = 1;

	if (ie->pasid != IOMMU_PASID_INVALID)
		desc.pasid = ie->pasid;
	desc.int_handle = ie->int_handle;
	portal = idxd_wq_portal_addr(wq);

	/*
	 * The wmb() makes sure that the descriptor is all there before we
	 * issue.
	 */
	wmb();
	if (wq_dedicated(wq)) {
		iosubmit_cmds512(portal, &desc, 1);
	} else {
		rc = idxd_enqcmds(wq, portal, &desc);
		/* This should not fail unless hardware failed. */
		if (rc < 0)
			dev_warn(dev, "Failed to submit drain desc on wq %d\n", wq->id);
	}
}

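/*
 * Complete, with abort status, every descriptor pending on this interrupt
 * entry whose completion record reports an invalid interrupt handle.
 */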
static void idxd_abort_invalid_int_handle_descs(struct idxd_irq_entry *ie)
{
	LIST_HEAD(flist);
	struct idxd_desc *d, *t;
	struct llist_node *head;

	spin_lock(&ie->list_lock);
	head = llist_del_all(&ie->pending_llist);
	if (head) {
		llist_for_each_entry_safe(d, t, head, llnode)
			list_add_tail(&d->list, &ie->work_list);
	}

	list_for_each_entry_safe(d, t, &ie->work_list, list) {
		if (d->completion->status == DSA_COMP_INT_HANDLE_INVAL)
			list_move_tail(&d->list, &flist);
	}
	spin_unlock(&ie->list_lock);

	list_for_each_entry_safe(d, t, &flist, list) {
		list_del(&d->list);
		idxd_desc_complete(d, IDXD_COMPLETE_ABORT, true);
	}
}

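/*
 * Work handler for the interrupt-handle-revoked interrupt cause: request a
 * fresh handle for every vector that has one and swap it in, pausing
 * submissions on enabled kernel workqueues while the handle changes.
 */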
static void idxd_int_handle_revoke(struct work_struct *work)
{
	struct idxd_int_handle_revoke *revoke =
		container_of(work, struct idxd_int_handle_revoke, work);
	struct idxd_device *idxd = revoke->idxd;
	struct pci_dev *pdev = idxd->pdev;
	struct device *dev = &pdev->dev;
	int i, new_handle, rc;

	if (!idxd->request_int_handles) {
		kfree(revoke);
		dev_warn(dev, "Unexpected int handle refresh interrupt.\n");
		return;
	}

	/*
	 * The loop attempts to acquire a new interrupt handle for all interrupt
	 * vectors that support a handle. If a new interrupt handle is acquired and the
	 * wq is kernel type, the driver will kill the percpu_ref to pause all
	 * ongoing descriptor submissions. The interrupt handle is then changed.
	 * After the change, the percpu_ref is revived and all the pending submissions
	 * are woken to try again. A drain is sent for the interrupt handle
	 * at the end to make sure all invalid int handle descriptors are processed.
	 */
	for (i = 1; i < idxd->irq_cnt; i++) {
		struct idxd_irq_entry *ie = idxd_get_ie(idxd, i);
		struct idxd_wq *wq = ie_to_wq(ie);

		if (ie->int_handle == INVALID_INT_HANDLE)
			continue;

		rc = idxd_device_request_int_handle(idxd, i, &new_handle, IDXD_IRQ_MSIX);
		if (rc < 0) {
			dev_warn(dev, "get int handle %d failed: %d\n", i, rc);
			/*
			 * Failed to acquire new interrupt handle. Kill the WQ
			 * and release all the pending submitters. The submitters will
			 * get error return code and handle appropriately.
			 */
			ie->int_handle = INVALID_INT_HANDLE;
			idxd_wq_quiesce(wq);
			idxd_abort_invalid_int_handle_descs(ie);
			continue;
		}

		/* No change in interrupt handle, nothing needs to be done */
		if (ie->int_handle == new_handle)
			continue;

		if (wq->state != IDXD_WQ_ENABLED || wq->type != IDXD_WQT_KERNEL) {
			/*
			 * All the MSIX interrupts are allocated at once during probe.
			 * Therefore we need to update all interrupts even if the WQ
			 * isn't supporting interrupt operations.
			 */
			ie->int_handle = new_handle;
			continue;
		}

		mutex_lock(&wq->wq_lock);
		reinit_completion(&wq->wq_resurrect);

		/* Kill percpu_ref to pause additional descriptor submissions */
		percpu_ref_kill(&wq->wq_active);

		/* Wait for all submitters to quiesce before we change the interrupt handle */
		wait_for_completion(&wq->wq_dead);

		ie->int_handle = new_handle;

		/* Revive percpu ref and wake up all the waiting submitters */
		percpu_ref_reinit(&wq->wq_active);
		complete_all(&wq->wq_resurrect);
		mutex_unlock(&wq->wq_lock);

		/*
		 * The delay here is to wait for all possible MOVDIR64B that
		 * are issued before percpu_ref_kill() has happened to have
		 * reached the PCIe domain before the drain is issued. The driver
		 * needs to ensure that the drain descriptor issued does not pass
		 * all the other issued descriptors that contain the invalid
		 * interrupt handle in order to ensure that the drain descriptor
		 * interrupt will allow the cleanup of all the descriptors with
		 * invalid interrupt handle.
		 */
		if (wq_dedicated(wq))
			udelay(100);
		idxd_int_handle_revoke_drain(ie);
	}
	kfree(revoke);
}

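/*
 * Deferred handling of an event log entry that faulted on its completion
 * record: running from a workqueue allows the completion record to be copied
 * into the user address space identified by the wq and PASID in the entry.
 */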
static void idxd_evl_fault_work(struct work_struct *work)
{
	struct idxd_evl_fault *fault = container_of(work, struct idxd_evl_fault, work);
	struct idxd_wq *wq = fault->wq;
	struct idxd_device *idxd = wq->idxd;
	struct device *dev = &idxd->pdev->dev;
	struct idxd_evl *evl = idxd->evl;
	struct __evl_entry *entry_head = fault->entry;
	void *cr = (void *)entry_head + idxd->data->evl_cr_off;
	int cr_size = idxd->data->compl_size;
	u8 *status = (u8 *)cr + idxd->data->cr_status_off;
	u8 *result = (u8 *)cr + idxd->data->cr_result_off;
	int copied, copy_size;
	bool *bf;

	switch (fault->status) {
	case DSA_COMP_CRA_XLAT:
		if (entry_head->batch && entry_head->first_err_in_batch)
			evl->batch_fail[entry_head->batch_id] = false;

		copy_size = cr_size;
		idxd_user_counter_increment(wq, entry_head->pasid, COUNTER_FAULTS);
		break;
	case DSA_COMP_BATCH_EVL_ERR:
		bf = &evl->batch_fail[entry_head->batch_id];

		copy_size = entry_head->rcr || *bf ? cr_size : 0;
		if (*bf) {
			if (*status == DSA_COMP_SUCCESS)
				*status = DSA_COMP_BATCH_FAIL;
			*result = 1;
			*bf = false;
		}
		idxd_user_counter_increment(wq, entry_head->pasid, COUNTER_FAULTS);
		break;
	case DSA_COMP_DRAIN_EVL:
		copy_size = cr_size;
		break;
	default:
		copy_size = 0;
		dev_dbg_ratelimited(dev, "Unrecognized error code: %#x\n", fault->status);
		break;
	}

	if (copy_size == 0) {
		kmem_cache_free(idxd->evl_cache, fault);
		return;
	}

	/*
	 * Copy completion record to fault_addr in user address space
	 * that is found by wq and PASID.
	 */
	copied = idxd_copy_cr(wq, entry_head->pasid, entry_head->fault_addr,
			      cr, copy_size);
	/*
	 * The task that triggered the page fault is currently unknown
	 * because multiple threads may share the user address space, or the
	 * task may already have exited before this fault is handled.
	 * So if the copy fails, SIGSEGV cannot be sent to the task.
	 * Just print an error for the failure. The user application
	 * waiting for the completion record will time out on this
	 * failure.
	 */
	switch (fault->status) {
	case DSA_COMP_CRA_XLAT:
		if (copied != copy_size) {
			idxd_user_counter_increment(wq, entry_head->pasid, COUNTER_FAULT_FAILS);
			dev_dbg_ratelimited(dev, "Failed to write to completion record: (%d:%d)\n",
					    copy_size, copied);
			if (entry_head->batch)
				evl->batch_fail[entry_head->batch_id] = true;
		}
		break;
	case DSA_COMP_BATCH_EVL_ERR:
		if (copied != copy_size) {
			idxd_user_counter_increment(wq, entry_head->pasid, COUNTER_FAULT_FAILS);
			dev_dbg_ratelimited(dev, "Failed to write to batch completion record: (%d:%d)\n",
					    copy_size, copied);
		}
		break;
	case DSA_COMP_DRAIN_EVL:
		if (copied != copy_size)
			dev_dbg_ratelimited(dev, "Failed to write to drain completion record: (%d:%d)\n",
					    copy_size, copied);
		break;
	}

	kmem_cache_free(idxd->evl_cache, fault);
}

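/*
 * Handle a single event log entry: completion record faults, drain and batch
 * error entries are deferred to idxd_evl_fault_work(); anything else is only
 * logged.
 */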
static void process_evl_entry(struct idxd_device *idxd,
			      struct __evl_entry *entry_head, unsigned int index)
{
	struct device *dev = &idxd->pdev->dev;
	struct idxd_evl *evl = idxd->evl;
	u8 status;

	if (test_bit(index, evl->bmap)) {
		clear_bit(index, evl->bmap);
	} else {
		status = DSA_COMP_STATUS(entry_head->error);

		if (status == DSA_COMP_CRA_XLAT || status == DSA_COMP_DRAIN_EVL ||
		    status == DSA_COMP_BATCH_EVL_ERR) {
			struct idxd_evl_fault *fault;
			int ent_size = evl_ent_size(idxd);

			if (entry_head->rci)
				dev_dbg(dev, "Completion Int Req set, ignoring!\n");

			if (!entry_head->rcr && status == DSA_COMP_DRAIN_EVL)
				return;

			fault = kmem_cache_alloc(idxd->evl_cache, GFP_ATOMIC);
			if (fault) {
				struct idxd_wq *wq = idxd->wqs[entry_head->wq_idx];

				fault->wq = wq;
				fault->status = status;
				memcpy(&fault->entry, entry_head, ent_size);
				INIT_WORK(&fault->work, idxd_evl_fault_work);
				queue_work(wq->wq, &fault->work);
			} else {
				dev_warn(dev, "Failed to service fault work.\n");
			}
		} else {
			dev_warn_ratelimited(dev, "Device error %#x operation: %#x fault addr: %#llx\n",
					     status, entry_head->operation,
					     entry_head->fault_addr);
		}
	}
}

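/*
 * Drain the event log ring: acknowledge the pending interrupt, walk the
 * entries from head to tail through process_evl_entry(), then publish the
 * new head back to the device.
 */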
static void process_evl_entries(struct idxd_device *idxd)
{
	union evl_status_reg evl_status;
	unsigned int h, t;
	struct idxd_evl *evl = idxd->evl;
	struct __evl_entry *entry_head;
	unsigned int ent_size = evl_ent_size(idxd);
	u32 size;

	evl_status.bits = 0;
	evl_status.int_pending = 1;

	mutex_lock(&evl->lock);
	/* Clear interrupt pending bit */
	iowrite32(evl_status.bits_upper32,
		  idxd->reg_base + IDXD_EVLSTATUS_OFFSET + sizeof(u32));
	evl_status.bits = ioread64(idxd->reg_base + IDXD_EVLSTATUS_OFFSET);
	t = evl_status.tail;
	h = evl_status.head;
	size = idxd->evl->size;

	while (h != t) {
		entry_head = (struct __evl_entry *)(evl->log + (h * ent_size));
		process_evl_entry(idxd, entry_head, h);
		h = (h + 1) % size;
	}

	evl_status.head = h;
	iowrite32(evl_status.bits_lower32, idxd->reg_base + IDXD_EVLSTATUS_OFFSET);
	mutex_unlock(&evl->lock);
}

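/*
 * Threaded handler for the device misc interrupt vector: read and acknowledge
 * INTCAUSE, then handle software error, interrupt handle revocation, command
 * completion, perfmon overflow and event log causes. A halt state falls
 * through to the recovery path at the end.
 */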
irqreturn_t idxd_misc_thread(int vec, void *data)
{
	struct idxd_irq_entry *irq_entry = data;
	struct idxd_device *idxd = ie_to_idxd(irq_entry);
	struct device *dev = &idxd->pdev->dev;
	union gensts_reg gensts;
	u32 val = 0;
	int i;
	bool err = false;
	u32 cause;

	cause = ioread32(idxd->reg_base + IDXD_INTCAUSE_OFFSET);
	if (!cause)
		return IRQ_NONE;

	iowrite32(cause, idxd->reg_base + IDXD_INTCAUSE_OFFSET);

	if (cause & IDXD_INTC_HALT_STATE)
		goto halt;

	if (cause & IDXD_INTC_ERR) {
		spin_lock(&idxd->dev_lock);
		for (i = 0; i < 4; i++)
			idxd->sw_err.bits[i] = ioread64(idxd->reg_base +
					IDXD_SWERR_OFFSET + i * sizeof(u64));

		iowrite64(idxd->sw_err.bits[0] & IDXD_SWERR_ACK,
			  idxd->reg_base + IDXD_SWERR_OFFSET);

		if (idxd->sw_err.valid && idxd->sw_err.wq_idx_valid) {
			int id = idxd->sw_err.wq_idx;
			struct idxd_wq *wq = idxd->wqs[id];

			if (wq->type == IDXD_WQT_USER)
				wake_up_interruptible(&wq->err_queue);
		} else {
			int i;

			for (i = 0; i < idxd->max_wqs; i++) {
				struct idxd_wq *wq = idxd->wqs[i];

				if (wq->type == IDXD_WQT_USER)
					wake_up_interruptible(&wq->err_queue);
			}
		}

		spin_unlock(&idxd->dev_lock);
		val |= IDXD_INTC_ERR;

		for (i = 0; i < 4; i++)
			dev_warn_ratelimited(dev, "err[%d]: %#16.16llx\n",
					     i, idxd->sw_err.bits[i]);
		err = true;
	}

	if (cause & IDXD_INTC_INT_HANDLE_REVOKED) {
		struct idxd_int_handle_revoke *revoke;

		val |= IDXD_INTC_INT_HANDLE_REVOKED;

		revoke = kzalloc(sizeof(*revoke), GFP_ATOMIC);
		if (revoke) {
			revoke->idxd = idxd;
			INIT_WORK(&revoke->work, idxd_int_handle_revoke);
			queue_work(idxd->wq, &revoke->work);
		} else {
			dev_err(dev, "Failed to allocate work for int handle revoke\n");
			idxd_wqs_quiesce(idxd);
		}
	}

	if (cause & IDXD_INTC_CMD) {
		val |= IDXD_INTC_CMD;
		complete(idxd->cmd_done);
	}

	if (cause & IDXD_INTC_OCCUPY) {
		/* Driver does not utilize occupancy interrupt */
		val |= IDXD_INTC_OCCUPY;
	}

	if (cause & IDXD_INTC_PERFMON_OVFL) {
		val |= IDXD_INTC_PERFMON_OVFL;
		perfmon_counter_overflow(idxd);
	}

	if (cause & IDXD_INTC_EVL) {
		val |= IDXD_INTC_EVL;
		process_evl_entries(idxd);
	}

	val ^= cause;
	if (val)
		dev_warn_once(dev, "Unexpected interrupt cause bits set: %#x\n",
			      val);

	if (!err)
		goto out;

halt:
	gensts.bits = ioread32(idxd->reg_base + IDXD_GENSTATS_OFFSET);
	if (gensts.state == IDXD_DEVICE_STATE_HALT) {
		idxd->state = IDXD_DEV_HALTED;
		if (gensts.reset_type == IDXD_DEVICE_RESET_SOFTWARE) {
			/*
			 * If we need a software reset, we will throw the work
			 * onto a workqueue in order to allow interrupts
			 * for the device command completions.
			 */
			INIT_WORK(&idxd->work, idxd_device_reinit);
			queue_work(idxd->wq, &idxd->work);
		} else {
			idxd->state = IDXD_DEV_HALTED;
			idxd_wqs_quiesce(idxd);
			idxd_wqs_unmap_portal(idxd);
			idxd_device_clear_state(idxd);
			dev_err(&idxd->pdev->dev,
				"idxd halted, need %s.\n",
				gensts.reset_type == IDXD_DEVICE_RESET_FLR ?
				"FLR" : "system reset");
		}
	}

out:
	return IRQ_HANDLED;
}

static void idxd_int_handle_resubmit_work(struct work_struct *work)
{
	struct idxd_resubmit *irw = container_of(work, struct idxd_resubmit, work);
	struct idxd_desc *desc = irw->desc;
	struct idxd_wq *wq = desc->wq;
	int rc;

	desc->completion->status = 0;
	rc = idxd_submit_desc(wq, desc);
	if (rc < 0) {
		dev_dbg(&wq->idxd->pdev->dev, "Failed to resubmit desc %d to wq %d.\n",
			desc->id, wq->id);
		/*
		 * If the error is not -EAGAIN, it means the submission failed because
		 * the wq has been killed rather than because of an ENQCMDS failure.
		 * Here the driver needs to notify the submitter of the failure by
		 * reporting abort status.
		 *
		 * -EAGAIN comes from ENQCMDS failure. idxd_submit_desc() will handle
		 * the abort.
		 */
		if (rc != -EAGAIN) {
			desc->completion->status = IDXD_COMP_DESC_ABORT;
			idxd_desc_complete(desc, IDXD_COMPLETE_ABORT, false);
		}
		idxd_free_desc(wq, desc);
	}
	kfree(irw);
}

bool idxd_queue_int_handle_resubmit(struct idxd_desc *desc)
{
	struct idxd_wq *wq = desc->wq;
	struct idxd_device *idxd = wq->idxd;
	struct idxd_resubmit *irw;

	irw = kzalloc(sizeof(*irw), GFP_KERNEL);
	if (!irw)
		return false;

	irw->desc = desc;
	INIT_WORK(&irw->work, idxd_int_handle_resubmit_work);
	queue_work(idxd->wq, &irw->work);
	return true;
}

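/*
 * Drain the lockless pending list: completed descriptors are finished here;
 * descriptors still owned by hardware are moved onto the locked work_list
 * for the next pass.
 */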
static void irq_process_pending_llist(struct idxd_irq_entry *irq_entry)
{
	struct idxd_desc *desc, *t;
	struct llist_node *head;

	head = llist_del_all(&irq_entry->pending_llist);
	if (!head)
		return;

	llist_for_each_entry_safe(desc, t, head, llnode) {
		u8 status = desc->completion->status & DSA_COMP_STATUS_MASK;

		if (status) {
			/*
			 * Check against the original status as ABORT is software defined
			 * and 0xff, which DSA_COMP_STATUS_MASK can mask out.
			 */
			if (unlikely(desc->completion->status == IDXD_COMP_DESC_ABORT)) {
				idxd_desc_complete(desc, IDXD_COMPLETE_ABORT, true);
				continue;
			}

			idxd_desc_complete(desc, IDXD_COMPLETE_NORMAL, true);
		} else {
			spin_lock(&irq_entry->list_lock);
			list_add_tail(&desc->list,
				      &irq_entry->work_list);
			spin_unlock(&irq_entry->list_lock);
		}
	}
}

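/*
 * Under list_lock, move every descriptor that already has a completion status
 * onto a private list, then complete those descriptors outside the lock.
 */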
static void irq_process_work_list(struct idxd_irq_entry *irq_entry)
{
	LIST_HEAD(flist);
	struct idxd_desc *desc, *n;

	/*
	 * This lock protects the list from corruption by accesses that happen
	 * outside of the irq handler thread.
	 */
	spin_lock(&irq_entry->list_lock);
	if (list_empty(&irq_entry->work_list)) {
		spin_unlock(&irq_entry->list_lock);
		return;
	}

	list_for_each_entry_safe(desc, n, &irq_entry->work_list, list) {
		if (desc->completion->status) {
			list_move_tail(&desc->list, &flist);
		}
	}

	spin_unlock(&irq_entry->list_lock);

	list_for_each_entry_safe(desc, n, &flist, list) {
		/*
		 * Check against the original status as ABORT is software defined
		 * and 0xff, which DSA_COMP_STATUS_MASK can mask out.
		 */
		list_del(&desc->list);

		if (unlikely(desc->completion->status == IDXD_COMP_DESC_ABORT)) {
			idxd_desc_complete(desc, IDXD_COMPLETE_ABORT, true);
			continue;
		}

		idxd_desc_complete(desc, IDXD_COMPLETE_NORMAL, true);
	}
}

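/* Threaded handler for a work queue's descriptor completion interrupt. */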
irqreturn_t idxd_wq_thread(int irq, void *data)
{
	struct idxd_irq_entry *irq_entry = data;

	/*
	 * There are two lists we are processing. The pending_llist is where the
	 * submitter adds all the submitted descriptors after sending them to
	 * the workqueue. It's a lockless singly linked list. The work_list
	 * is the common Linux doubly linked list. We are in a scenario of
	 * multiple producers and a single consumer. The producers are all
	 * the kernel submitters of descriptors, and the consumer is the
	 * kernel irq handler thread for the msix vector when using threaded
	 * irq. To work with the restrictions of llist to remain lockless,
	 * we are doing the following steps:
	 * 1. Iterate through the work_list and process any completed
	 *    descriptor. Delete the completed entries during iteration.
	 * 2. llist_del_all() from the pending list.
	 * 3. Iterate through the llist that was deleted from the pending list
	 *    and process the completed entries.
	 * 4. If the entry is still waiting on hardware, list_add_tail() to
	 *    the work_list.
	 */
	irq_process_work_list(irq_entry);
	irq_process_pending_llist(irq_entry);

	return IRQ_HANDLED;
}