// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2019 Intel Corporation. All rights rsvd. */
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/device.h>
#include <linux/sched/task.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/cdev.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/iommu.h>
#include <linux/highmem.h>
#include <uapi/linux/idxd.h>
#include <linux/xarray.h>
#include "registers.h"
#include "idxd.h"
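
/*
 * Character device interface for IDXD (DSA/IAA) work queues. Each enabled
 * "user" type work queue is exposed as a device node (typically
 * /dev/dsa/wqX.Y or /dev/iax/wqX.Y, from the device name set below); user
 * space opens the node, mmap()s the WQ portal, and submits descriptors
 * either directly through the portal or via write().
 */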
struct idxd_cdev_context {
	const char *name;
	dev_t devt;
	struct ida minor_ida;
};

/*
 * Since user file names are global in DSA devices, define their ida's as
 * global to avoid conflicting file names.
 */
static DEFINE_IDA(file_ida);
static DEFINE_MUTEX(ida_lock);
/*
 * ictx is an array indexed by accelerator type; enum idxd_type is used as
 * the index.
 */
static struct idxd_cdev_context ictx[IDXD_TYPE_MAX] = {
	{ .name = "dsa" },
	{ .name = "iax" }
};
struct idxd_user_context {
	struct idxd_wq *wq;
	struct task_struct *task;
	unsigned int pasid;
	struct mm_struct *mm;
	unsigned int flags;
	struct iommu_sva *sva;
	struct idxd_dev idxd_dev;
	u64 counters[COUNTER_MAX];
	int id;
	pid_t pid;
};
static void idxd_cdev_evl_drain_pasid(struct idxd_wq *wq, u32 pasid);
static void idxd_xa_pasid_remove(struct idxd_user_context *ctx);
static inline struct idxd_user_context *dev_to_uctx(struct device *dev)
{
	struct idxd_dev *idxd_dev = confdev_to_idxd_dev(dev);

	return container_of(idxd_dev, struct idxd_user_context, idxd_dev);
}
static ssize_t cr_faults_show(struct device *dev, struct device_attribute *attr, char *buf)
{
	struct idxd_user_context *ctx = dev_to_uctx(dev);

	return sysfs_emit(buf, "%llu\n", ctx->counters[COUNTER_FAULTS]);
}
static DEVICE_ATTR_RO(cr_faults);
static ssize_t cr_fault_failures_show(struct device *dev,
				      struct device_attribute *attr, char *buf)
{
	struct idxd_user_context *ctx = dev_to_uctx(dev);

	return sysfs_emit(buf, "%llu\n", ctx->counters[COUNTER_FAULT_FAILS]);
}
static DEVICE_ATTR_RO(cr_fault_failures);
static ssize_t pid_show(struct device *dev, struct device_attribute *attr, char *buf)
{
	struct idxd_user_context *ctx = dev_to_uctx(dev);

	return sysfs_emit(buf, "%u\n", ctx->pid);
}
static DEVICE_ATTR_RO(pid);
static struct attribute *cdev_file_attributes[] = {
	&dev_attr_cr_faults.attr,
	&dev_attr_cr_fault_failures.attr,
	&dev_attr_pid.attr,
	NULL
};
static umode_t cdev_file_attr_visible(struct kobject *kobj, struct attribute *a, int n)
{
	struct device *dev = container_of(kobj, typeof(*dev), kobj);
	struct idxd_user_context *ctx = dev_to_uctx(dev);
	struct idxd_wq *wq = ctx->wq;

	if (!wq_pasid_enabled(wq))
		return 0;

	return a->mode;
}
static const struct attribute_group cdev_file_attribute_group = {
	.attrs = cdev_file_attributes,
	.is_visible = cdev_file_attr_visible,
};

static const struct attribute_group *cdev_file_attribute_groups[] = {
	&cdev_file_attribute_group,
	NULL
};
static void idxd_file_dev_release(struct device *dev)
{
	struct idxd_user_context *ctx = dev_to_uctx(dev);
	struct idxd_wq *wq = ctx->wq;
	struct idxd_device *idxd = wq->idxd;
	int rc;

	mutex_lock(&ida_lock);
	ida_free(&file_ida, ctx->id);
	mutex_unlock(&ida_lock);

	/* Wait for in-flight operations to complete. */
	if (wq_shared(wq)) {
		idxd_device_drain_pasid(idxd, ctx->pasid);
	} else {
		if (device_user_pasid_enabled(idxd)) {
			/* The wq disable in the disable pasid function will drain the wq */
			rc = idxd_wq_disable_pasid(wq);
			if (rc < 0)
				dev_err(dev, "wq disable pasid failed.\n");
		} else {
			idxd_wq_drain(wq);
		}
	}

	if (ctx->sva) {
		idxd_cdev_evl_drain_pasid(wq, ctx->pasid);
		iommu_sva_unbind_device(ctx->sva);
		idxd_xa_pasid_remove(ctx);
	}
	kfree(ctx);
	mutex_lock(&wq->wq_lock);
	idxd_wq_put(wq);
	mutex_unlock(&wq->wq_lock);
}
static const struct device_type idxd_cdev_file_type = {
	.name = "idxd_file",
	.release = idxd_file_dev_release,
	.groups = cdev_file_attribute_groups,
};
static void idxd_cdev_dev_release(struct device *dev)
{
	struct idxd_cdev *idxd_cdev = dev_to_cdev(dev);
	struct idxd_cdev_context *cdev_ctx;
	struct idxd_wq *wq = idxd_cdev->wq;

	cdev_ctx = &ictx[wq->idxd->data->type];
	ida_free(&cdev_ctx->minor_ida, idxd_cdev->minor);
	kfree(idxd_cdev);
}
static const struct device_type idxd_cdev_device_type = {
	.name = "idxd_cdev",
	.release = idxd_cdev_dev_release,
};
static inline struct idxd_cdev *inode_idxd_cdev(struct inode *inode)
{
	struct cdev *cdev = inode->i_cdev;

	return container_of(cdev, struct idxd_cdev, cdev);
}
static inline struct idxd_wq *inode_wq(struct inode *inode)
{
	struct idxd_cdev *idxd_cdev = inode_idxd_cdev(inode);

	return idxd_cdev->wq;
}
static void idxd_xa_pasid_remove(struct idxd_user_context *ctx)
{
	struct idxd_wq *wq = ctx->wq;
	void *ptr;

	mutex_lock(&wq->uc_lock);
	ptr = xa_cmpxchg(&wq->upasid_xa, ctx->pasid, ctx, NULL, GFP_KERNEL);
	if (ptr != (void *)ctx)
		dev_warn(&wq->idxd->pdev->dev, "xarray cmpxchg failed for pasid %u\n",
			 ctx->pasid);
	mutex_unlock(&wq->uc_lock);
}
void idxd_user_counter_increment(struct idxd_wq *wq, u32 pasid, int index)
{
	struct idxd_user_context *ctx;

	if (index >= COUNTER_MAX)
		return;

	mutex_lock(&wq->uc_lock);
	ctx = xa_load(&wq->upasid_xa, pasid);
	if (!ctx) {
		mutex_unlock(&wq->uc_lock);
		return;
	}
	ctx->counters[index]++;
	mutex_unlock(&wq->uc_lock);
}
static int idxd_cdev_open(struct inode *inode, struct file *filp)
{
	struct idxd_user_context *ctx;
	struct idxd_device *idxd;
	struct idxd_wq *wq;
	struct device *dev, *fdev;
	int rc = 0;
	struct iommu_sva *sva;
	unsigned int pasid;
	struct idxd_cdev *idxd_cdev;

	wq = inode_wq(inode);
	idxd = wq->idxd;
	dev = &idxd->pdev->dev;

	dev_dbg(dev, "%s called: %d\n", __func__, idxd_wq_refcount(wq));

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	mutex_lock(&wq->wq_lock);

	if (idxd_wq_refcount(wq) > 0 && wq_dedicated(wq)) {
		rc = -EBUSY;
		goto failed;
	}

	ctx->wq = wq;
	filp->private_data = ctx;
	ctx->pid = current->pid;

	if (device_user_pasid_enabled(idxd)) {
		sva = iommu_sva_bind_device(dev, current->mm);
		if (IS_ERR(sva)) {
			rc = PTR_ERR(sva);
			dev_err(dev, "pasid allocation failed: %d\n", rc);
			goto failed;
		}

		pasid = iommu_sva_get_pasid(sva);
		if (pasid == IOMMU_PASID_INVALID) {
			rc = -EINVAL;
			goto failed_get_pasid;
		}

		ctx->sva = sva;
		ctx->pasid = pasid;
		ctx->mm = current->mm;

		mutex_lock(&wq->uc_lock);
		rc = xa_insert(&wq->upasid_xa, pasid, ctx, GFP_KERNEL);
		mutex_unlock(&wq->uc_lock);
		if (rc < 0)
			dev_warn(dev, "PASID entry already exists in xarray.\n");

		if (wq_dedicated(wq)) {
			rc = idxd_wq_set_pasid(wq, pasid);
			if (rc < 0) {
				dev_err(dev, "wq set pasid failed: %d\n", rc);
				goto failed_set_pasid;
			}
		}
	}

	idxd_cdev = wq->idxd_cdev;
	mutex_lock(&ida_lock);
	ctx->id = ida_alloc(&file_ida, GFP_KERNEL);
	mutex_unlock(&ida_lock);
	if (ctx->id < 0) {
		dev_warn(dev, "ida alloc failure\n");
		goto failed_ida;
	}
	ctx->idxd_dev.type = IDXD_DEV_CDEV_FILE;
	fdev = user_ctx_dev(ctx);
	device_initialize(fdev);
	fdev->parent = cdev_dev(idxd_cdev);
	fdev->bus = &dsa_bus_type;
	fdev->type = &idxd_cdev_file_type;

	rc = dev_set_name(fdev, "file%d", ctx->id);
	if (rc < 0) {
		dev_warn(dev, "set name failure\n");
		goto failed_dev_name;
	}

	rc = device_add(fdev);
	if (rc < 0) {
		dev_warn(dev, "file device add failure\n");
		goto failed_dev_add;
	}

	idxd_wq_get(wq);
	mutex_unlock(&wq->wq_lock);
	return 0;

failed_dev_add:
failed_dev_name:
	put_device(fdev);
failed_ida:
failed_set_pasid:
	if (device_user_pasid_enabled(idxd))
		idxd_xa_pasid_remove(ctx);
failed_get_pasid:
	if (device_user_pasid_enabled(idxd))
		iommu_sva_unbind_device(sva);
failed:
	mutex_unlock(&wq->wq_lock);
	kfree(ctx);
	return rc;
}
static void idxd_cdev_evl_drain_pasid(struct idxd_wq *wq, u32 pasid)
{
	struct idxd_device *idxd = wq->idxd;
	struct idxd_evl *evl = idxd->evl;
	union evl_status_reg status;
	u16 h, t, size;
	int ent_size = evl_ent_size(idxd);
	struct __evl_entry *entry_head;

	if (!evl)
		return;

	mutex_lock(&evl->lock);
	status.bits = ioread64(idxd->reg_base + IDXD_EVLSTATUS_OFFSET);
	t = status.tail;
	h = status.head;
	size = evl->size;

	while (h != t) {
		entry_head = (struct __evl_entry *)(evl->log + (h * ent_size));
		if (entry_head->pasid == pasid && entry_head->wq_idx == wq->id)
			set_bit(h, evl->bmap);
		h = (h + 1) % size;
	}
	drain_workqueue(wq->wq);
	mutex_unlock(&evl->lock);
}
static int idxd_cdev_release(struct inode *node, struct file *filep)
{
	struct idxd_user_context *ctx = filep->private_data;
	struct idxd_wq *wq = ctx->wq;
	struct idxd_device *idxd = wq->idxd;
	struct device *dev = &idxd->pdev->dev;

	dev_dbg(dev, "%s called\n", __func__);
	filep->private_data = NULL;

	device_unregister(user_ctx_dev(ctx));

	return 0;
}
static int check_vma(struct idxd_wq *wq, struct vm_area_struct *vma,
		     const char *func)
{
	struct device *dev = &wq->idxd->pdev->dev;

	if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) {
		dev_info_ratelimited(dev,
				     "%s: %s: mapping too large: %lu\n",
				     current->comm, func,
				     vma->vm_end - vma->vm_start);
		return -EINVAL;
	}

	return 0;
}
static int idxd_cdev_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct idxd_user_context *ctx = filp->private_data;
	struct idxd_wq *wq = ctx->wq;
	struct idxd_device *idxd = wq->idxd;
	struct pci_dev *pdev = idxd->pdev;
	phys_addr_t base = pci_resource_start(pdev, IDXD_WQ_BAR);
	unsigned long pfn;
	int rc;

	dev_dbg(&pdev->dev, "%s called\n", __func__);

	/*
	 * Due to an erratum in some of the devices supported by the driver,
	 * direct user submission to the device can be unsafe.
	 * (See the INTEL-SA-01084 security advisory)
	 *
	 * For the devices that exhibit this behavior, require that the user
	 * has CAP_SYS_RAWIO capabilities.
	 */
	if (!idxd->user_submission_safe && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	rc = check_vma(wq, vma, __func__);
	if (rc < 0)
		return rc;

	vm_flags_set(vma, VM_DONTCOPY);
	pfn = (base + idxd_get_wq_portal_full_offset(wq->id,
				IDXD_PORTAL_LIMITED)) >> PAGE_SHIFT;
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	vma->vm_private_data = ctx;

	return io_remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE,
			vma->vm_page_prot);
}
static int idxd_submit_user_descriptor(struct idxd_user_context *ctx,
				       struct dsa_hw_desc __user *udesc)
{
	struct idxd_wq *wq = ctx->wq;
	struct idxd_dev *idxd_dev = &wq->idxd->idxd_dev;
	const uint64_t comp_addr_align = is_dsa_dev(idxd_dev) ? 0x20 : 0x40;
	void __iomem *portal = idxd_wq_portal_addr(wq);
	struct dsa_hw_desc descriptor __aligned(64);
	int rc;

	rc = copy_from_user(&descriptor, udesc, sizeof(descriptor));
	if (rc)
		return -EFAULT;

	/*
	 * DSA devices are capable of indirect ("batch") command submission.
	 * On devices where direct user submissions are not safe, we cannot
	 * allow this since there is no good way for us to verify these
	 * indirect commands.
	 */
	if (is_dsa_dev(idxd_dev) && descriptor.opcode == DSA_OPCODE_BATCH &&
		!wq->idxd->user_submission_safe)
		return -EINVAL;
	/*
	 * As per the programming specification, the completion address must be
	 * aligned to 32 or 64 bytes. If this is violated the hardware
	 * engine can get very confused (security issue).
	 */
	if (!IS_ALIGNED(descriptor.completion_addr, comp_addr_align))
		return -EINVAL;

	if (wq_dedicated(wq))
		iosubmit_cmds512(portal, &descriptor, 1);
	else {
		descriptor.priv = 0;
		descriptor.pasid = ctx->pasid;
		rc = idxd_enqcmds(wq, portal, &descriptor);
		if (rc < 0)
			return rc;
	}

	return 0;
}
static ssize_t idxd_cdev_write(struct file *filp, const char __user *buf, size_t len,
			       loff_t *unused)
{
	struct dsa_hw_desc __user *udesc = (struct dsa_hw_desc __user *)buf;
	struct idxd_user_context *ctx = filp->private_data;
	ssize_t written = 0;
	int i;

	for (i = 0; i < len/sizeof(struct dsa_hw_desc); i++) {
		int rc = idxd_submit_user_descriptor(ctx, udesc + i);

		if (rc)
			return written ? written : rc;

		written += sizeof(struct dsa_hw_desc);
	}

	return written;
}
static __poll_t idxd_cdev_poll(struct file *filp,
			       struct poll_table_struct *wait)
{
	struct idxd_user_context *ctx = filp->private_data;
	struct idxd_wq *wq = ctx->wq;
	struct idxd_device *idxd = wq->idxd;
	__poll_t out = 0;

	poll_wait(filp, &wq->err_queue, wait);
	spin_lock(&idxd->dev_lock);
	if (idxd->sw_err.valid)
		out = EPOLLIN | EPOLLRDNORM;
	spin_unlock(&idxd->dev_lock);

	return out;
}
static const struct file_operations idxd_cdev_fops = {
	.owner = THIS_MODULE,
	.open = idxd_cdev_open,
	.release = idxd_cdev_release,
	.mmap = idxd_cdev_mmap,
	.write = idxd_cdev_write,
	.poll = idxd_cdev_poll,
};
int idxd_cdev_get_major(struct idxd_device *idxd)
{
	return MAJOR(ictx[idxd->data->type].devt);
}
int idxd_wq_add_cdev(struct idxd_wq *wq)
{
	struct idxd_device *idxd = wq->idxd;
	struct idxd_cdev *idxd_cdev;
	struct cdev *cdev;
	struct device *dev;
	struct idxd_cdev_context *cdev_ctx;
	int rc, minor;

	idxd_cdev = kzalloc(sizeof(*idxd_cdev), GFP_KERNEL);
	if (!idxd_cdev)
		return -ENOMEM;

	idxd_cdev->idxd_dev.type = IDXD_DEV_CDEV;
	idxd_cdev->wq = wq;
	cdev = &idxd_cdev->cdev;
	dev = cdev_dev(idxd_cdev);
	cdev_ctx = &ictx[wq->idxd->data->type];
	minor = ida_alloc_max(&cdev_ctx->minor_ida, MINORMASK, GFP_KERNEL);
	if (minor < 0) {
		kfree(idxd_cdev);
		return minor;
	}
	idxd_cdev->minor = minor;

	device_initialize(dev);
	dev->parent = wq_confdev(wq);
	dev->bus = &dsa_bus_type;
	dev->type = &idxd_cdev_device_type;
	dev->devt = MKDEV(MAJOR(cdev_ctx->devt), minor);

	rc = dev_set_name(dev, "%s/wq%u.%u", idxd->data->name_prefix, idxd->id, wq->id);
	if (rc < 0)
		goto err;

	wq->idxd_cdev = idxd_cdev;
	cdev_init(cdev, &idxd_cdev_fops);
	rc = cdev_device_add(cdev, dev);
	if (rc) {
		dev_dbg(&wq->idxd->pdev->dev, "cdev_add failed: %d\n", rc);
		goto err;
	}

	return 0;

 err:
	put_device(dev);
	wq->idxd_cdev = NULL;
	return rc;
}
void idxd_wq_del_cdev(struct idxd_wq *wq)
{
	struct idxd_cdev *idxd_cdev;

	idxd_cdev = wq->idxd_cdev;
	wq->idxd_cdev = NULL;
	cdev_device_del(&idxd_cdev->cdev, cdev_dev(idxd_cdev));
	put_device(cdev_dev(idxd_cdev));
}
static int idxd_user_drv_probe(struct idxd_dev *idxd_dev)
{
	struct device *dev = &idxd_dev->conf_dev;
	struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev);
	struct idxd_device *idxd = wq->idxd;
	int rc;

	if (idxd->state != IDXD_DEV_ENABLED)
		return -ENXIO;

	mutex_lock(&wq->wq_lock);

	if (!idxd_wq_driver_name_match(wq, dev)) {
		idxd->cmd_status = IDXD_SCMD_WQ_NO_DRV_NAME;
		rc = -ENODEV;
		goto wq_err;
	}

	/*
	 * User type WQ is enabled only when SVA is enabled for two reasons:
	 *   - If no IOMMU or IOMMU Passthrough without SVA, userspace
	 *     can directly access physical address through the WQ.
	 *   - The IDXD cdev driver does not provide any way to pin
	 *     user pages and translate the address from user VA to IOVA or
	 *     PA without IOMMU SVA. Therefore the application has no way
	 *     to instruct the device to perform DMA. This makes
	 *     the cdev not usable for normal application usage.
	 */
	if (!device_user_pasid_enabled(idxd)) {
		idxd->cmd_status = IDXD_SCMD_WQ_USER_NO_IOMMU;
		dev_dbg(&idxd->pdev->dev,
			"User type WQ cannot be enabled without SVA.\n");

		rc = -EOPNOTSUPP;
		goto wq_err;
	}

	wq->wq = create_workqueue(dev_name(wq_confdev(wq)));
	if (!wq->wq) {
		rc = -ENOMEM;
		goto wq_err;
	}

	wq->type = IDXD_WQT_USER;
	rc = idxd_drv_enable_wq(wq);
	if (rc < 0)
		goto err;

	rc = idxd_wq_add_cdev(wq);
	if (rc < 0) {
		idxd->cmd_status = IDXD_SCMD_CDEV_ERR;
		goto err_cdev;
	}

	idxd->cmd_status = 0;
	mutex_unlock(&wq->wq_lock);
	return 0;

err_cdev:
	idxd_drv_disable_wq(wq);
err:
	destroy_workqueue(wq->wq);
	wq->type = IDXD_WQT_NONE;
wq_err:
	mutex_unlock(&wq->wq_lock);
	return rc;
}
static void idxd_user_drv_remove(struct idxd_dev *idxd_dev)
{
	struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev);

	mutex_lock(&wq->wq_lock);
	idxd_wq_del_cdev(wq);
	idxd_drv_disable_wq(wq);
	wq->type = IDXD_WQT_NONE;
	destroy_workqueue(wq->wq);
	wq->wq = NULL;
	mutex_unlock(&wq->wq_lock);
}
static enum idxd_dev_type dev_types[] = {
	IDXD_DEV_WQ,
	IDXD_DEV_NONE,
};

struct idxd_device_driver idxd_user_drv = {
	.probe = idxd_user_drv_probe,
	.remove = idxd_user_drv_remove,
	.name = "user",
	.type = dev_types,
};
EXPORT_SYMBOL_GPL(idxd_user_drv);
int idxd_cdev_register(void)
{
	int rc, i;

	for (i = 0; i < IDXD_TYPE_MAX; i++) {
		ida_init(&ictx[i].minor_ida);
		rc = alloc_chrdev_region(&ictx[i].devt, 0, MINORMASK,
					 ictx[i].name);
		if (rc)
			goto err_free_chrdev_region;
	}

	return 0;

err_free_chrdev_region:
	for (i--; i >= 0; i--)
		unregister_chrdev_region(ictx[i].devt, MINORMASK);

	return rc;
}
void idxd_cdev_remove(void)
{
	int i;

	for (i = 0; i < IDXD_TYPE_MAX; i++) {
		unregister_chrdev_region(ictx[i].devt, MINORMASK);
		ida_destroy(&ictx[i].minor_ida);
	}
}
/**
 * idxd_copy_cr - copy completion record to user address space found by wq and
 *		  PASID
 * @wq: work queue
 * @pasid: user PASID
 * @addr: user fault address to write
 * @cr: completion record
 * @len: number of bytes to copy
 *
 * This is called by a work that handles completion record fault.
 *
 * Return: number of bytes copied.
 */
int idxd_copy_cr(struct idxd_wq *wq, ioasid_t pasid, unsigned long addr,
		 void *cr, int len)
{
	struct device *dev = &wq->idxd->pdev->dev;
	int left = len, status_size = 1;
	struct idxd_user_context *ctx;
	struct mm_struct *mm;

	mutex_lock(&wq->uc_lock);

	ctx = xa_load(&wq->upasid_xa, pasid);
	if (!ctx) {
		dev_warn(dev, "No user context\n");
		goto out;
	}

	mm = ctx->mm;
	/*
	 * The completion record fault handling work is running in kernel
	 * thread context. It temporarily switches to the mm to copy cr
	 * to addr in the mm.
	 */
	kthread_use_mm(mm);
	left = copy_to_user((void __user *)addr + status_size, cr + status_size,
			    len - status_size);
	/*
	 * Copy status only after the rest of completion record is copied
	 * successfully so that the user gets the complete completion record
	 * when a non-zero status is polled.
	 */
	if (!left) {
		u8 status;

		/*
		 * Ensure that the completion record's status field is written
		 * after the rest of the completion record has been written.
		 * This ensures that the user receives the correct completion
		 * record information once polling for a non-zero status.
		 */
		wmb();
		status = *(u8 *)cr;
		if (put_user(status, (u8 __user *)addr))
			left += status_size;
	} else {
		left += status_size;
	}
	kthread_unuse_mm(mm);

out:
	mutex_unlock(&wq->uc_lock);

	return len - left;
}