// SPDX-License-Identifier: GPL-2.0-only
/*
 * IOMMU API for RISC-V IOMMU implementations.
 *
 * Copyright © 2022-2024 Rivos Inc.
 * Copyright © 2023 FORTH-ICS/CARV
 *
 * Authors
 *	Tomasz Jeznach <tjeznach@rivosinc.com>
 *	Nick Kossifidis <mick@ics.forth.gr>
 */

#define pr_fmt(fmt) "riscv-iommu: " fmt

#include <linux/compiler.h>
#include <linux/crash_dump.h>
#include <linux/init.h>
#include <linux/iommu.h>
#include <linux/iopoll.h>
#include <linux/kernel.h>
#include <linux/pci.h>

#include "../iommu-pages.h"
#include "iommu-bits.h"
#include "iommu.h"
/* Timeouts in [us] */
#define RISCV_IOMMU_QCSR_TIMEOUT	150000
#define RISCV_IOMMU_QUEUE_TIMEOUT	150000
#define RISCV_IOMMU_DDTP_TIMEOUT	10000000
#define RISCV_IOMMU_IOTINVAL_TIMEOUT	90000000

/* Number of entries per CMD/FLT queue, should be <= INT_MAX */
#define RISCV_IOMMU_DEF_CQ_COUNT	8192
#define RISCV_IOMMU_DEF_FQ_COUNT	4096

/* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */
#define phys_to_ppn(pa)  (((pa) >> 2) & (((1ULL << 44) - 1) << 10))
#define ppn_to_phys(pn)	 (((pn) << 2) & (((1ULL << 44) - 1) << 12))
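/*
 * Illustrative example (not used by the driver, values assumed for
 * demonstration): for a 4 KiB aligned physical address pa = 0x80200000,
 * PPN = pa >> 12 = 0x80200, which the register layout stores at bit 10:
 *   phys_to_ppn(0x80200000) == (0x80200000 >> 2) & GENMASK_ULL(53, 10)
 *                           == 0x20080000 == 0x80200 << 10
 * and the conversion round-trips:
 *   ppn_to_phys(0x20080000) == (0x20080000 << 2) & GENMASK_ULL(55, 12)
 *                           == 0x80200000
 */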
#define dev_to_iommu(dev) \
	iommu_get_iommu_dev(dev, struct riscv_iommu_device, iommu)

/* IOMMU PSCID allocation namespace. */
static DEFINE_IDA(riscv_iommu_pscids);
#define RISCV_IOMMU_MAX_PSCID		(BIT(20) - 1)
/* Device resource-managed allocations */
struct riscv_iommu_devres {
	void *addr;
	int order;
};

static void riscv_iommu_devres_pages_release(struct device *dev, void *res)
{
	struct riscv_iommu_devres *devres = res;

	iommu_free_pages(devres->addr, devres->order);
}
static int riscv_iommu_devres_pages_match(struct device *dev, void *res, void *p)
{
	struct riscv_iommu_devres *devres = res;
	struct riscv_iommu_devres *target = p;

	return devres->addr == target->addr;
}
static void *riscv_iommu_get_pages(struct riscv_iommu_device *iommu, int order)
{
	struct riscv_iommu_devres *devres;
	void *addr;

	addr = iommu_alloc_pages_node(dev_to_node(iommu->dev),
				      GFP_KERNEL_ACCOUNT, order);
	if (unlikely(!addr))
		return NULL;

	devres = devres_alloc(riscv_iommu_devres_pages_release,
			      sizeof(struct riscv_iommu_devres), GFP_KERNEL);

	if (unlikely(!devres)) {
		iommu_free_pages(addr, order);
		return NULL;
	}

	devres->addr = addr;
	devres->order = order;

	devres_add(iommu->dev, devres);

	return addr;
}
static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr)
{
	struct riscv_iommu_devres devres = { .addr = addr };

	devres_release(iommu->dev, riscv_iommu_devres_pages_release,
		       riscv_iommu_devres_pages_match, &devres);
}
/*
 * Hardware queue allocation and management.
 */

/* Setup queue base, control registers and default queue length */
#define RISCV_IOMMU_QUEUE_INIT(q, name) do {				\
	struct riscv_iommu_queue *_q = q;				\
	_q->qid = RISCV_IOMMU_INTR_ ## name;				\
	_q->qbr = RISCV_IOMMU_REG_ ## name ## B;			\
	_q->qcr = RISCV_IOMMU_REG_ ## name ## CSR;			\
	_q->mask = _q->mask ?: (RISCV_IOMMU_DEF_ ## name ## _COUNT) - 1;\
} while (0)

/* Note: offsets are the same for all queues */
#define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB))
#define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB))
#define Q_ITEM(q, index) ((q)->mask & (index))
#define Q_IPSR(q) BIT((q)->qid)
/*
 * Discover queue ring buffer hardware configuration, allocate in-memory
 * ring buffer or use fixed I/O memory location, configure queue base register.
 * Must be called before hardware queue is enabled.
 *
 * @queue - data structure, configured with RISCV_IOMMU_QUEUE_INIT()
 * @entry_size - queue single element size in bytes.
 */
static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu,
				   struct riscv_iommu_queue *queue,
				   size_t entry_size)
{
	unsigned int logsz;
	u64 qb, rb;

	/*
	 * Use WARL base register property to discover maximum allowed
	 * number of entries and optional fixed IO address for queue location.
	 */
	riscv_iommu_writeq(iommu, queue->qbr, RISCV_IOMMU_QUEUE_LOG2SZ_FIELD);
	qb = riscv_iommu_readq(iommu, queue->qbr);

	/*
	 * Calculate and verify hardware supported queue length, as reported
	 * by the field LOG2SZ, where max queue length is equal to 2^(LOG2SZ + 1).
	 * Update queue size based on hardware supported value.
	 */
	logsz = ilog2(queue->mask);
	if (logsz > FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb))
		logsz = FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb);

	/*
	 * Use WARL base register property to discover an optional fixed IO
	 * address for queue ring buffer location. Otherwise allocate contiguous
	 * system memory.
	 */
	if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) {
		const size_t queue_size = entry_size << (logsz + 1);

		queue->phys = pfn_to_phys(FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb));
		queue->base = devm_ioremap(iommu->dev, queue->phys, queue_size);
	} else {
		do {
			const size_t queue_size = entry_size << (logsz + 1);
			const int order = get_order(queue_size);

			queue->base = riscv_iommu_get_pages(iommu, order);
			queue->phys = __pa(queue->base);
		} while (!queue->base && logsz-- > 0);
	}

	if (!queue->base)
		return -ENOMEM;

	qb = phys_to_ppn(queue->phys) |
	     FIELD_PREP(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, logsz);

	/* Update base register and read back to verify hw accepted our write */
	riscv_iommu_writeq(iommu, queue->qbr, qb);
	rb = riscv_iommu_readq(iommu, queue->qbr);
	if (rb != qb) {
		dev_err(iommu->dev, "queue #%u allocation failed\n", queue->qid);
		return -ENODEV;
	}

	/* Update actual queue mask */
	queue->mask = (2U << logsz) - 1;

	dev_dbg(iommu->dev, "queue #%u allocated 2^%u entries", queue->qid, logsz + 1);

	return 0;
}
/* Check interrupt queue status, IPSR */
static irqreturn_t riscv_iommu_queue_ipsr(int irq, void *data)
{
	struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;

	if (riscv_iommu_readl(queue->iommu, RISCV_IOMMU_REG_IPSR) & Q_IPSR(queue))
		return IRQ_WAKE_THREAD;

	return IRQ_NONE;
}
static int riscv_iommu_queue_vec(struct riscv_iommu_device *iommu, int n)
{
	/* Reuse ICVEC.CIV mask for all interrupt vectors mapping. */
	return (iommu->icvec >> (n * 4)) & RISCV_IOMMU_ICVEC_CIV;
}
/*
 * Enable queue processing in the hardware, register interrupt handler.
 *
 * @queue - data structure, already allocated with riscv_iommu_queue_alloc()
 * @irq_handler - threaded interrupt handler.
 */
static int riscv_iommu_queue_enable(struct riscv_iommu_device *iommu,
				    struct riscv_iommu_queue *queue,
				    irq_handler_t irq_handler)
{
	const unsigned int irq = iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)];
	u32 csr;
	int rc;

	if (queue->iommu)
		return -EBUSY;

	/* Polling not implemented */
	if (!irq)
		return -ENODEV;

	queue->iommu = iommu;
	rc = request_threaded_irq(irq, riscv_iommu_queue_ipsr, irq_handler,
				  IRQF_ONESHOT | IRQF_SHARED,
				  dev_name(iommu->dev), queue);
	if (rc) {
		queue->iommu = NULL;
		return rc;
	}

	/*
	 * Enable queue with interrupts, clear any memory fault if any.
	 * Wait for the hardware to acknowledge request and activate queue
	 * processing.
	 * Note: All CSR bitfields are in the same offsets for all queues.
	 */
	riscv_iommu_writel(iommu, queue->qcr,
			   RISCV_IOMMU_QUEUE_ENABLE |
			   RISCV_IOMMU_QUEUE_INTR_ENABLE |
			   RISCV_IOMMU_QUEUE_MEM_FAULT);

	riscv_iommu_readl_timeout(iommu, queue->qcr,
				  csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
				  10, RISCV_IOMMU_QCSR_TIMEOUT);

	if (RISCV_IOMMU_QUEUE_ACTIVE != (csr & (RISCV_IOMMU_QUEUE_ACTIVE |
						RISCV_IOMMU_QUEUE_BUSY |
						RISCV_IOMMU_QUEUE_MEM_FAULT))) {
		/* Best effort to stop and disable failing hardware queue. */
		riscv_iommu_writel(iommu, queue->qcr, 0);
		free_irq(irq, queue);
		queue->iommu = NULL;
		dev_err(iommu->dev, "queue #%u failed to start\n", queue->qid);
		return -EBUSY;
	}

	/* Clear any pending interrupt flag. */
	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));

	return 0;
}
/*
 * Disable queue. Wait for the hardware to acknowledge request and
 * stop processing enqueued requests. Report errors but continue.
 */
static void riscv_iommu_queue_disable(struct riscv_iommu_queue *queue)
{
	struct riscv_iommu_device *iommu = queue->iommu;
	u32 csr;

	if (!iommu)
		return;

	free_irq(iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)], queue);
	riscv_iommu_writel(iommu, queue->qcr, 0);
	riscv_iommu_readl_timeout(iommu, queue->qcr,
				  csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
				  10, RISCV_IOMMU_QCSR_TIMEOUT);

	if (csr & (RISCV_IOMMU_QUEUE_ACTIVE | RISCV_IOMMU_QUEUE_BUSY))
		dev_err(iommu->dev, "fail to disable hardware queue #%u, csr 0x%x\n",
			queue->qid, csr);

	queue->iommu = NULL;
}
/*
 * Returns number of available valid queue entries and the first item index.
 * Update shadow producer index if necessary.
 */
static int riscv_iommu_queue_consume(struct riscv_iommu_queue *queue,
				     unsigned int *index)
{
	unsigned int head = atomic_read(&queue->head);
	unsigned int tail = atomic_read(&queue->tail);
	unsigned int last = Q_ITEM(queue, tail);
	int available = (int)(tail - head);

	*index = head;

	if (available > 0)
		return available;

	/* read hardware producer index, check reserved register bits are not set. */
	if (riscv_iommu_readl_timeout(queue->iommu, Q_TAIL(queue),
				      tail, (tail & ~queue->mask) == 0,
				      0, RISCV_IOMMU_QUEUE_TIMEOUT)) {
		dev_err_once(queue->iommu->dev,
			     "Hardware error: queue access timeout\n");
		return 0;
	}

	if (tail == last)
		return 0;

	/* update shadow producer index */
	return (int)(atomic_add_return((tail - last) & queue->mask, &queue->tail) - head);
}
/*
 * Release processed queue entries, should match riscv_iommu_queue_consume() calls.
 */
static void riscv_iommu_queue_release(struct riscv_iommu_queue *queue, int count)
{
	const unsigned int head = atomic_add_return(count, &queue->head);

	riscv_iommu_writel(queue->iommu, Q_HEAD(queue), Q_ITEM(queue, head));
}
/* Return actual consumer index based on hardware reported queue head index. */
static unsigned int riscv_iommu_queue_cons(struct riscv_iommu_queue *queue)
{
	const unsigned int cons = atomic_read(&queue->head);
	const unsigned int last = Q_ITEM(queue, cons);
	unsigned int head;

	if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
				      !(head & ~queue->mask),
				      0, RISCV_IOMMU_QUEUE_TIMEOUT))
		return cons;

	return cons + ((head - last) & queue->mask);
}
/* Wait for submitted item to be processed. */
static int riscv_iommu_queue_wait(struct riscv_iommu_queue *queue,
				  unsigned int index,
				  unsigned int timeout_us)
{
	unsigned int cons = atomic_read(&queue->head);

	/* Already processed by the consumer */
	if ((int)(cons - index) > 0)
		return 0;

	/* Monitor consumer index */
	return readx_poll_timeout(riscv_iommu_queue_cons, queue, cons,
				  (int)(cons - index) > 0, 0, timeout_us);
}
/* Enqueue an entry and wait to be processed if timeout_us > 0
 *
 * Error handling for IOMMU hardware not responding in reasonable time
 * will be added as separate patch series along with other RAS features.
 * For now, only report hardware failure and continue.
 */
static unsigned int riscv_iommu_queue_send(struct riscv_iommu_queue *queue,
					   void *entry, size_t entry_size)
{
	unsigned int prod;
	unsigned int head;
	unsigned int tail;
	unsigned long flags;

	/* Do not preempt submission flow. */
	local_irq_save(flags);

	/* 1. Allocate some space in the queue */
	prod = atomic_inc_return(&queue->prod) - 1;
	head = atomic_read(&queue->head);

	/* 2. Wait for space availability. */
	if ((prod - head) > queue->mask) {
		if (readx_poll_timeout(atomic_read, &queue->head,
				       head, (prod - head) < queue->mask,
				       0, RISCV_IOMMU_QUEUE_TIMEOUT))
			goto err_busy;
	} else if ((prod - head) == queue->mask) {
		const unsigned int last = Q_ITEM(queue, head);

		if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
					      !(head & ~queue->mask) && head != last,
					      0, RISCV_IOMMU_QUEUE_TIMEOUT))
			goto err_busy;
		atomic_add((head - last) & queue->mask, &queue->head);
	}

	/* 3. Store entry in the ring buffer */
	memcpy(queue->base + Q_ITEM(queue, prod) * entry_size, entry, entry_size);

	/* 4. Wait for all previous entries to be ready */
	if (readx_poll_timeout(atomic_read, &queue->tail, tail, prod == tail,
			       0, RISCV_IOMMU_QUEUE_TIMEOUT))
		goto err_busy;

	/*
	 * 5. Make sure the ring buffer update (whether in normal or I/O memory) is
	 *    completed and visible before signaling the tail doorbell to fetch
	 *    the next command. 'fence ow, ow'
	 */
	dma_wmb();
	riscv_iommu_writel(queue->iommu, Q_TAIL(queue), Q_ITEM(queue, prod + 1));

	/*
	 * 6. Make sure the doorbell write to the device has finished before updating
	 *    the shadow tail index in normal memory. 'fence o, w'
	 */
	mmiowb();
	atomic_inc(&queue->tail);

	/* 7. Complete submission and restore local interrupts */
	local_irq_restore(flags);

	return prod;

err_busy:
	local_irq_restore(flags);
	dev_err_once(queue->iommu->dev, "Hardware error: command enqueue failed\n");

	return prod;
}
/*
 * IOMMU Command queue chapter 3.1
 */

/* Command queue interrupt handler thread function */
static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data)
{
	const struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
	unsigned int ctrl;

	/* Clear MF/CQ errors, complete error recovery to be implemented. */
	ctrl = riscv_iommu_readl(queue->iommu, queue->qcr);
	if (ctrl & (RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_TO |
		    RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_FENCE_W_IP)) {
		riscv_iommu_writel(queue->iommu, queue->qcr, ctrl);
		dev_warn(queue->iommu->dev,
			 "Queue #%u error; fault:%d timeout:%d illegal:%d fence_w_ip:%d\n",
			 queue->qid,
			 !!(ctrl & RISCV_IOMMU_CQCSR_CQMF),
			 !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO),
			 !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL),
			 !!(ctrl & RISCV_IOMMU_CQCSR_FENCE_W_IP));
	}

	/* Placeholder for command queue interrupt notifiers */

	/* Clear command interrupt pending. */
	riscv_iommu_writel(queue->iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));

	return IRQ_HANDLED;
}
/* Send command to the IOMMU command queue */
static void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu,
				 struct riscv_iommu_command *cmd)
{
	riscv_iommu_queue_send(&iommu->cmdq, cmd, sizeof(*cmd));
}
/* Send IOFENCE.C command and wait for all scheduled commands to complete. */
static void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu,
				 unsigned int timeout_us)
{
	struct riscv_iommu_command cmd;
	unsigned int prod;

	riscv_iommu_cmd_iofence(&cmd);
	prod = riscv_iommu_queue_send(&iommu->cmdq, &cmd, sizeof(cmd));

	if (!timeout_us)
		return;

	if (riscv_iommu_queue_wait(&iommu->cmdq, prod, timeout_us))
		dev_err_once(iommu->dev,
			     "Hardware error: command execution timeout\n");
}
/*
 * IOMMU Fault/Event queue chapter 3.2
 */

static void riscv_iommu_fault(struct riscv_iommu_device *iommu,
			      struct riscv_iommu_fq_record *event)
{
	unsigned int err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr);
	unsigned int devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr);

	/* Placeholder for future fault handling implementation, report only. */
	if (err)
		dev_warn_ratelimited(iommu->dev,
				     "Fault %d devid: 0x%x iotval: %llx iotval2: %llx\n",
				     err, devid, event->iotval, event->iotval2);
}
/* Fault queue interrupt handler thread function */
static irqreturn_t riscv_iommu_fltq_process(int irq, void *data)
{
	struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
	struct riscv_iommu_device *iommu = queue->iommu;
	struct riscv_iommu_fq_record *events;
	unsigned int ctrl, idx;
	int cnt, len;

	events = (struct riscv_iommu_fq_record *)queue->base;

	/* Clear fault interrupt pending and process all received fault events. */
	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));

	do {
		cnt = riscv_iommu_queue_consume(queue, &idx);
		for (len = 0; len < cnt; idx++, len++)
			riscv_iommu_fault(iommu, &events[Q_ITEM(queue, idx)]);
		riscv_iommu_queue_release(queue, cnt);
	} while (cnt > 0);

	/* Clear MF/OF errors, complete error recovery to be implemented. */
	ctrl = riscv_iommu_readl(iommu, queue->qcr);
	if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) {
		riscv_iommu_writel(iommu, queue->qcr, ctrl);
		dev_warn(iommu->dev,
			 "Queue #%u error; memory fault:%d overflow:%d\n",
			 queue->qid,
			 !!(ctrl & RISCV_IOMMU_FQCSR_FQMF),
			 !!(ctrl & RISCV_IOMMU_FQCSR_FQOF));
	}

	return IRQ_HANDLED;
}
/* Lookup and initialize device context info structure. */
static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu,
						 unsigned int devid)
{
	const bool base_format = !(iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT);
	unsigned int depth;
	unsigned long ddt, old, new;
	void *ptr;
	u8 ddi_bits[3] = { 0 };
	u64 *ddtp = NULL;

	/* Make sure the mode is valid */
	if (iommu->ddt_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL ||
	    iommu->ddt_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL)
		return NULL;

	/*
	 * Device id partitioning for base format:
	 * DDI[0]: bits 0 - 6   (1st level) (7 bits)
	 * DDI[1]: bits 7 - 15  (2nd level) (9 bits)
	 * DDI[2]: bits 16 - 23 (3rd level) (8 bits)
	 *
	 * For extended format:
	 * DDI[0]: bits 0 - 5   (1st level) (6 bits)
	 * DDI[1]: bits 6 - 14  (2nd level) (9 bits)
	 * DDI[2]: bits 15 - 23 (3rd level) (9 bits)
	 */
	if (base_format) {
		ddi_bits[0] = 7;
		ddi_bits[1] = 7 + 9;
		ddi_bits[2] = 7 + 9 + 8;
	} else {
		ddi_bits[0] = 6;
		ddi_bits[1] = 6 + 9;
		ddi_bits[2] = 6 + 9 + 9;
	}

	/* Make sure device id is within range */
	depth = iommu->ddt_mode - RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL;
	if (devid >= (1 << ddi_bits[depth]))
		return NULL;

	/* Get to the level of the non-leaf node that holds the device context */
	for (ddtp = iommu->ddt_root; depth-- > 0;) {
		const int split = ddi_bits[depth];
		/*
		 * Each non-leaf node is 64bits wide and on each level
		 * nodes are indexed by DDI[depth].
		 */
		ddtp += (devid >> split) & 0x1FF;

		/*
		 * Check if this node has been populated and if not
		 * allocate a new level and populate it.
		 */
		do {
			ddt = READ_ONCE(*(unsigned long *)ddtp);
			if (ddt & RISCV_IOMMU_DDTE_V) {
				ddtp = __va(ppn_to_phys(ddt));
				break;
			}

			ptr = riscv_iommu_get_pages(iommu, 0);
			if (!ptr)
				return NULL;

			new = phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_V;
			old = cmpxchg_relaxed((unsigned long *)ddtp, ddt, new);

			if (old == ddt) {
				ddtp = ptr;
				break;
			}

			/* Race setting DDT detected, re-read and retry. */
			riscv_iommu_free_pages(iommu, ptr);
		} while (1);
	}

	/*
	 * Grab the node that matches DDI[depth], note that when using base
	 * format the device context is 4 * 64bits, and the extended format
	 * is 8 * 64bits, hence the (3 - base_format) below.
	 */
	ddtp += (devid & ((64 << base_format) - 1)) << (3 - base_format);

	return (struct riscv_iommu_dc *)ddtp;
}
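/*
 * Worked example for the device ID partitioning above (illustrative only,
 * devid value assumed): in base format, devid 0x1234 splits into
 * DDI[0] = 0x1234 & 0x7f = 0x34, DDI[1] = (0x1234 >> 7) & 0x1ff = 0x24 and
 * DDI[2] = (0x1234 >> 16) & 0xff = 0. In 2LVL mode the root non-leaf node
 * is indexed by DDI[1] = 0x24 and the leaf page holds the device context
 * at index DDI[0] = 0x34; with 4 x 64-bit device contexts in base format
 * that is a u64 offset of 0x34 << 2 = 0xd0 into the leaf page.
 */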
/*
 * This is best effort IOMMU translation shutdown flow.
 * Disable IOMMU without waiting for hardware response.
 */
static void riscv_iommu_disable(struct riscv_iommu_device *iommu)
{
	riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, 0);
	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_CQCSR, 0);
	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FQCSR, 0);
	riscv_iommu_writel(iommu, RISCV_IOMMU_REG_PQCSR, 0);
}
#define riscv_iommu_read_ddtp(iommu) ({ \
	u64 ddtp; \
	riscv_iommu_readq_timeout((iommu), RISCV_IOMMU_REG_DDTP, ddtp, \
				  !(ddtp & RISCV_IOMMU_DDTP_BUSY), 10, \
				  RISCV_IOMMU_DDTP_TIMEOUT); \
	ddtp; })
static int riscv_iommu_iodir_alloc(struct riscv_iommu_device *iommu)
{
	u64 ddtp;
	unsigned int mode;

	ddtp = riscv_iommu_read_ddtp(iommu);
	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
		return -EBUSY;

	/*
	 * It is optional for the hardware to report a fixed address for device
	 * directory root page when DDT.MODE is OFF or BARE.
	 */
	mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
	if (mode == RISCV_IOMMU_DDTP_IOMMU_MODE_BARE ||
	    mode == RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) {
		/* Use WARL to discover hardware fixed DDT PPN */
		riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
				   FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, mode));
		ddtp = riscv_iommu_read_ddtp(iommu);
		if (ddtp & RISCV_IOMMU_DDTP_BUSY)
			return -EBUSY;

		iommu->ddt_phys = ppn_to_phys(ddtp);
		if (iommu->ddt_phys)
			iommu->ddt_root = devm_ioremap(iommu->dev,
						       iommu->ddt_phys, PAGE_SIZE);
		if (iommu->ddt_root)
			memset(iommu->ddt_root, 0, PAGE_SIZE);
	}

	if (!iommu->ddt_root) {
		iommu->ddt_root = riscv_iommu_get_pages(iommu, 0);
		iommu->ddt_phys = __pa(iommu->ddt_root);
	}

	if (!iommu->ddt_root)
		return -ENOMEM;

	return 0;
}
/*
 * Discover supported DDT modes starting from requested value,
 * configure DDTP register with accepted mode and root DDT address.
 * Accepted iommu->ddt_mode is updated on success.
 */
static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
				      unsigned int ddtp_mode)
{
	struct device *dev = iommu->dev;
	u64 ddtp, rq_ddtp;
	unsigned int mode, rq_mode = ddtp_mode;
	struct riscv_iommu_command cmd;

	ddtp = riscv_iommu_read_ddtp(iommu);
	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
		return -EBUSY;

	/* Disallow state transition from xLVL to xLVL. */
	mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
	if (mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE &&
	    mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF &&
	    rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE &&
	    rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF)
		return -EINVAL;

	do {
		rq_ddtp = FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, rq_mode);
		if (rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
			rq_ddtp |= phys_to_ppn(iommu->ddt_phys);

		riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, rq_ddtp);
		ddtp = riscv_iommu_read_ddtp(iommu);
		if (ddtp & RISCV_IOMMU_DDTP_BUSY) {
			dev_err(dev, "timeout when setting ddtp (ddt mode: %u, read: %llx)\n",
				rq_mode, ddtp);
			return -EBUSY;
		}

		/* Verify IOMMU hardware accepts new DDTP config. */
		mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);

		if (rq_mode == mode)
			break;

		/* Hardware mandatory DDTP mode has not been accepted. */
		if (rq_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && rq_ddtp != ddtp) {
			dev_err(dev, "DDTP update failed hw: %llx vs %llx\n",
				ddtp, rq_ddtp);
			return -EINVAL;
		}

		/*
		 * Mode field is WARL, an IOMMU may support a subset of
		 * directory table levels in which case if we tried to set
		 * an unsupported number of levels we'll readback either
		 * a valid xLVL or off/bare. If we got off/bare, try again
		 * with a smaller xLVL.
		 */
		if (mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL &&
		    rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL) {
			dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode);
			rq_mode--;
			continue;
		}

		/*
		 * We tried all supported modes and IOMMU hardware failed to
		 * accept new settings, something went very wrong since off/bare
		 * and at least one xLVL must be supported.
		 */
		dev_err(dev, "DDTP hw mode %u, failed to set %u\n",
			mode, ddtp_mode);
		return -EINVAL;
	} while (1);

	iommu->ddt_mode = mode;
	if (mode != ddtp_mode)
		dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode);

	/* Invalidate device context cache */
	riscv_iommu_cmd_iodir_inval_ddt(&cmd);
	riscv_iommu_cmd_send(iommu, &cmd);

	/* Invalidate address translation cache */
	riscv_iommu_cmd_inval_vma(&cmd);
	riscv_iommu_cmd_send(iommu, &cmd);

	/* IOFENCE.C */
	riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);

	return 0;
}
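/*
 * Example of the WARL negotiation above (hypothetical hardware behaviour):
 * requesting 3LVL on an IOMMU that only implements up to 2LVL reads back
 * OFF/BARE, so the loop retries with 2LVL; once the readback matches the
 * requested mode, iommu->ddt_mode is updated and the directory and
 * translation caches are invalidated.
 */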
/* This struct contains protection domain specific IOMMU driver data. */
struct riscv_iommu_domain {
	struct iommu_domain domain;
	struct list_head bonds;
	spinlock_t lock;		/* protect bonds list updates. */
	int pscid;
	bool amo_enabled;
	int numa_node;
	unsigned int pgd_mode;
	unsigned long *pgd_root;
};

#define iommu_domain_to_riscv(iommu_domain) \
	container_of(iommu_domain, struct riscv_iommu_domain, domain)
/* Private IOMMU data for managed devices, dev_iommu_priv_* */
struct riscv_iommu_info {
	struct riscv_iommu_domain *domain;
};
/*
 * Linkage between an iommu_domain and attached devices.
 *
 * A protection domain requiring IOATC and DevATC translation cache
 * invalidations should be linked to its attached devices using
 * riscv_iommu_bond structures.
 * Devices should be linked to the domain before first use and unlinked after
 * the translations from the referenced protection domain can no longer be used.
 * Blocking and identity domains are not tracked here, as the IOMMU hardware
 * does not cache negative and/or identity (BARE mode) translations, and DevATC
 * is disabled for those protection domains.
 *
 * The device pointer and IOMMU data remain stable in the bond struct after
 * _probe_device() where it's attached to the managed IOMMU, up to the
 * completion of the _release_device() call. The release of the bond structure
 * is synchronized with the device release.
 */
struct riscv_iommu_bond {
	struct list_head list;
	struct rcu_head rcu;
	struct device *dev;
};
static int riscv_iommu_bond_link(struct riscv_iommu_domain *domain,
				 struct device *dev)
{
	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
	struct riscv_iommu_bond *bond;
	struct list_head *bonds;

	bond = kzalloc(sizeof(*bond), GFP_KERNEL);
	if (!bond)
		return -ENOMEM;
	bond->dev = dev;

	/*
	 * List of devices attached to the domain is arranged based on
	 * managed IOMMU device.
	 */

	spin_lock(&domain->lock);
	list_for_each(bonds, &domain->bonds)
		if (dev_to_iommu(list_entry(bonds, struct riscv_iommu_bond, list)->dev) == iommu)
			break;
	list_add_rcu(&bond->list, bonds);
	spin_unlock(&domain->lock);

	/* Synchronize with riscv_iommu_iotlb_inval() sequence. See comment below. */
	smp_mb();

	return 0;
}
static void riscv_iommu_bond_unlink(struct riscv_iommu_domain *domain,
				    struct device *dev)
{
	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
	struct riscv_iommu_bond *bond, *found = NULL;
	struct riscv_iommu_command cmd;
	int count = 0;

	if (!domain)
		return;

	spin_lock(&domain->lock);
	list_for_each_entry(bond, &domain->bonds, list) {
		if (found && count)
			break;
		else if (bond->dev == dev)
			found = bond;
		else if (dev_to_iommu(bond->dev) == iommu)
			count++;
	}
	if (found)
		list_del_rcu(&found->list);
	spin_unlock(&domain->lock);
	kfree_rcu(found, rcu);

	/*
	 * If this was the last bond between this domain and the IOMMU
	 * invalidate all cached entries for domain's PSCID.
	 */
	if (!count) {
		riscv_iommu_cmd_inval_vma(&cmd);
		riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
		riscv_iommu_cmd_send(iommu, &cmd);

		riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
	}
}
/*
 * Send IOTLB.INVAL for the whole address space for ranges larger than 2MB.
 * This limit will be replaced with range invalidations, if supported by
 * the hardware, once the RISC-V IOMMU architecture specification update
 * for range invalidations becomes available.
 */
#define RISCV_IOMMU_IOTLB_INVAL_LIMIT	(2 << 20)

static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
				    unsigned long start, unsigned long end)
{
	struct riscv_iommu_bond *bond;
	struct riscv_iommu_device *iommu, *prev;
	struct riscv_iommu_command cmd;
	unsigned long len = end - start + 1;
	unsigned long iova;

	/*
	 * For each IOMMU linked with this protection domain (via bonds->dev),
	 * an IOTLB invalidation command will be submitted and executed.
	 *
	 * Possible race with the domain attach flow is handled by sequencing
	 * bond creation - riscv_iommu_bond_link(), and device directory
	 * update - riscv_iommu_iodir_update().
	 *
	 * PTE Update / IOTLB Inval           Device attach & directory update
	 * --------------------------         --------------------------
	 * update page table entries          add dev to the bond list
	 * FENCE RW,RW                        FENCE RW,RW
	 * For all IOMMUs: (can be empty)     Update FSC/PSCID
	 *   FENCE IOW,IOW                      FENCE IOW,IOW
	 *   IOTLB.INVAL                        IODIR.INVAL
	 *   IOFENCE.C
	 *
	 * If the bond list is not yet updated with the new device, the
	 * directory context will be configured with already valid page table
	 * content. If an IOMMU is linked to the protection domain it will
	 * receive invalidation requests for updated page table entries.
	 */
	smp_mb();

	rcu_read_lock();

	prev = NULL;
	list_for_each_entry_rcu(bond, &domain->bonds, list) {
		iommu = dev_to_iommu(bond->dev);

		/*
		 * IOTLB invalidation request can be safely omitted if already sent
		 * to the IOMMU for the same PSCID, and with domain->bonds list
		 * arranged based on the device's IOMMU, it's sufficient to check
		 * last device the invalidation was sent to.
		 */
		if (iommu == prev)
			continue;

		riscv_iommu_cmd_inval_vma(&cmd);
		riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
		if (len && len < RISCV_IOMMU_IOTLB_INVAL_LIMIT) {
			for (iova = start; iova < end; iova += PAGE_SIZE) {
				riscv_iommu_cmd_inval_set_addr(&cmd, iova);
				riscv_iommu_cmd_send(iommu, &cmd);
			}
		} else {
			riscv_iommu_cmd_send(iommu, &cmd);
		}
		prev = iommu;
	}

	prev = NULL;
	list_for_each_entry_rcu(bond, &domain->bonds, list) {
		iommu = dev_to_iommu(bond->dev);
		if (iommu == prev)
			continue;

		riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
		prev = iommu;
	}
	rcu_read_unlock();
}
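/*
 * Illustrative example of the invalidation policy above (values assumed):
 * for start = 0x0, end = 0x3fff the range length is 16 KiB, below
 * RISCV_IOMMU_IOTLB_INVAL_LIMIT, so four address-specific IOTINVAL.VMA
 * commands are sent (0x0000, 0x1000, 0x2000, 0x3000 with 4 KiB pages).
 * For ranges of 2 MiB or more a single PSCID-wide IOTINVAL.VMA without an
 * address is sent instead, followed in both cases by IOFENCE.C per IOMMU.
 */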
#define RISCV_IOMMU_FSC_BARE 0
/*
 * Update IODIR for the device.
 *
 * During the execution of riscv_iommu_probe_device(), IODIR entries are
 * allocated for the device's identifiers. Device context invalidation
 * becomes necessary only if one of the updated entries was previously
 * marked as valid, given that invalid device context entries are not
 * cached by the IOMMU hardware.
 * In this implementation, updating a valid device context while the
 * device is not quiesced might be disruptive, potentially causing
 * interim translation faults.
 */
static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
				     struct device *dev, u64 fsc, u64 ta)
{
	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
	struct riscv_iommu_dc *dc;
	struct riscv_iommu_command cmd;
	bool sync_required = false;
	u64 tc;
	int i;

	for (i = 0; i < fwspec->num_ids; i++) {
		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
		tc = READ_ONCE(dc->tc);
		if (!(tc & RISCV_IOMMU_DC_TC_V))
			continue;

		WRITE_ONCE(dc->tc, tc & ~RISCV_IOMMU_DC_TC_V);

		/* Invalidate device context cached values */
		riscv_iommu_cmd_iodir_inval_ddt(&cmd);
		riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
		riscv_iommu_cmd_send(iommu, &cmd);
		sync_required = true;
	}

	if (sync_required)
		riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);

	/*
	 * For device context with DC_TC_PDTV = 0, translation attributes valid bit
	 * is stored as DC_TC_V bit (both sharing the same location at BIT(0)).
	 */
	for (i = 0; i < fwspec->num_ids; i++) {
		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
		tc = READ_ONCE(dc->tc);
		tc |= ta & RISCV_IOMMU_DC_TC_V;

		WRITE_ONCE(dc->fsc, fsc);
		WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID);
		/* Update device context, write TC.V as the last step. */
		dma_wmb();
		WRITE_ONCE(dc->tc, tc);

		/* Invalidate device context after update */
		riscv_iommu_cmd_iodir_inval_ddt(&cmd);
		riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
		riscv_iommu_cmd_send(iommu, &cmd);
	}

	riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
}
/*
 * IOVA page translation tree management.
 */

static void riscv_iommu_iotlb_flush_all(struct iommu_domain *iommu_domain)
{
	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);

	riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
}

static void riscv_iommu_iotlb_sync(struct iommu_domain *iommu_domain,
				   struct iommu_iotlb_gather *gather)
{
	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);

	riscv_iommu_iotlb_inval(domain, gather->start, gather->end);
}

#define PT_SHIFT (PAGE_SHIFT - ilog2(sizeof(pte_t)))

#define _io_pte_present(pte)	((pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE))
#define _io_pte_leaf(pte)	((pte) & _PAGE_LEAF)
#define _io_pte_none(pte)	((pte) == 0)
#define _io_pte_entry(pn, prot)	((_PAGE_PFN_MASK & ((pn) << _PAGE_PFN_SHIFT)) | (prot))
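/*
 * On RV64, PT_SHIFT = 12 - ilog2(8) = 9, so each page table level indexes
 * 512 entries. riscv_iommu_pte_alloc()/_fetch() start at
 * level = pgd_mode - SV39 + 2: Sv39 walks shifts 30/21/12 (1 GiB, 2 MiB,
 * 4 KiB leaves), Sv48 adds shift 39 (512 GiB) and Sv57 adds shift 48.
 * (Descriptive note only, no functional change.)
 */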
static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain,
				 unsigned long pte, struct list_head *freelist)
{
	unsigned long *ptr;
	int i;

	if (!_io_pte_present(pte) || _io_pte_leaf(pte))
		return;

	ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));

	/* Recursively free all sub page table pages */
	for (i = 0; i < PTRS_PER_PTE; i++) {
		pte = READ_ONCE(ptr[i]);
		if (!_io_pte_none(pte) && cmpxchg_relaxed(ptr + i, pte, 0) == pte)
			riscv_iommu_pte_free(domain, pte, freelist);
	}

	if (freelist)
		list_add_tail(&virt_to_page(ptr)->lru, freelist);
	else
		iommu_free_page(ptr);
}
static unsigned long *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain,
					    unsigned long iova, size_t pgsize,
					    gfp_t gfp)
{
	unsigned long *ptr = domain->pgd_root;
	unsigned long pte, old;
	int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
	void *addr;

	do {
		const int shift = PAGE_SHIFT + PT_SHIFT * level;

		ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
		/*
		 * Note: returned entry might be a non-leaf if there was
		 * existing mapping with smaller granularity. Up to the caller
		 * to replace and invalidate.
		 */
		if (((size_t)1 << shift) == pgsize)
			return ptr;
pte_retry:
		pte = READ_ONCE(*ptr);
		/*
		 * This is very likely incorrect as we should not be adding
		 * new mapping with smaller granularity on top
		 * of existing 2M/1G mapping. Fail.
		 */
		if (_io_pte_present(pte) && _io_pte_leaf(pte))
			return NULL;
		/*
		 * Non-leaf entry is missing, allocate and try to add to the
		 * page table. This might race with other mappings, retry.
		 */
		if (_io_pte_none(pte)) {
			addr = iommu_alloc_page_node(domain->numa_node, gfp);
			if (!addr)
				return NULL;
			old = pte;
			pte = _io_pte_entry(virt_to_pfn(addr), _PAGE_TABLE);
			if (cmpxchg_relaxed(ptr, old, pte) != old) {
				iommu_free_page(addr);
				goto pte_retry;
			}
		}
		ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
	} while (level-- > 0);

	return NULL;
}
static unsigned long *riscv_iommu_pte_fetch(struct riscv_iommu_domain *domain,
					    unsigned long iova, size_t *pte_pgsize)
{
	unsigned long *ptr = domain->pgd_root;
	unsigned long pte;
	int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;

	do {
		const int shift = PAGE_SHIFT + PT_SHIFT * level;

		ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
		pte = READ_ONCE(*ptr);
		if (_io_pte_present(pte) && _io_pte_leaf(pte)) {
			*pte_pgsize = (size_t)1 << shift;
			return ptr;
		}
		if (_io_pte_none(pte))
			return NULL;
		ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
	} while (level-- > 0);

	return NULL;
}
static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain,
				 unsigned long iova, phys_addr_t phys,
				 size_t pgsize, size_t pgcount, int prot,
				 gfp_t gfp, size_t *mapped)
{
	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
	size_t size = 0;
	unsigned long *ptr;
	unsigned long pte, old, pte_prot;
	int rc = 0;
	LIST_HEAD(freelist);

	if (!(prot & IOMMU_WRITE))
		pte_prot = _PAGE_BASE | _PAGE_READ;
	else if (domain->amo_enabled)
		pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE;
	else
		pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY;

	while (pgcount) {
		ptr = riscv_iommu_pte_alloc(domain, iova, pgsize, gfp);
		if (!ptr) {
			rc = -ENOMEM;
			break;
		}

		old = READ_ONCE(*ptr);
		pte = _io_pte_entry(phys_to_pfn(phys), pte_prot);
		if (cmpxchg_relaxed(ptr, old, pte) != old)
			continue;

		riscv_iommu_pte_free(domain, old, &freelist);

		size += pgsize;
		iova += pgsize;
		phys += pgsize;
		--pgcount;
	}

	*mapped = size;

	if (!list_empty(&freelist)) {
		/*
		 * In 1.0 spec version, the smallest scope we can use to
		 * invalidate all levels of page table (i.e. leaf and non-leaf)
		 * is an invalidate-all-PSCID IOTINVAL.VMA with AV=0.
		 * This will be updated with hardware support for
		 * capability.NL (non-leaf) IOTINVAL command.
		 */
		riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
		iommu_put_pages_list(&freelist);
	}

	return rc;
}
static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
				      unsigned long iova, size_t pgsize,
				      size_t pgcount,
				      struct iommu_iotlb_gather *gather)
{
	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
	size_t size = pgcount << __ffs(pgsize);
	unsigned long *ptr, old;
	size_t unmapped = 0;
	size_t pte_size;

	while (unmapped < size) {
		ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
		if (!ptr)
			return unmapped;

		/* partial unmap is not allowed, fail. */
		if (iova & (pte_size - 1))
			return unmapped;

		old = READ_ONCE(*ptr);
		if (cmpxchg_relaxed(ptr, old, 0) != old)
			continue;

		iommu_iotlb_gather_add_page(&domain->domain, gather, iova,
					    pte_size);

		iova += pte_size;
		unmapped += pte_size;
	}

	return unmapped;
}
static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain,
					    dma_addr_t iova)
{
	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
	unsigned long pte_size;
	unsigned long *ptr;

	ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
	/* No leaf entry exists for an unmapped IOVA; avoid a NULL dereference. */
	if (!ptr || _io_pte_none(*ptr) || !_io_pte_present(*ptr))
		return 0;

	return pfn_to_phys(__page_val_to_pfn(*ptr)) | (iova & (pte_size - 1));
}
static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain)
{
	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
	const unsigned long pfn = virt_to_pfn(domain->pgd_root);

	WARN_ON(!list_empty(&domain->bonds));

	if ((int)domain->pscid > 0)
		ida_free(&riscv_iommu_pscids, domain->pscid);

	riscv_iommu_pte_free(domain, _io_pte_entry(pfn, _PAGE_TABLE), NULL);
	kfree(domain);
}
*iommu
, int pgd_mode
)
1300 case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39
:
1301 return iommu
->caps
& RISCV_IOMMU_CAPABILITIES_SV39
;
1303 case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48
:
1304 return iommu
->caps
& RISCV_IOMMU_CAPABILITIES_SV48
;
1306 case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57
:
1307 return iommu
->caps
& RISCV_IOMMU_CAPABILITIES_SV57
;
static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
					    struct device *dev)
{
	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
	u64 fsc, ta;

	if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode))
		return -ENODEV;

	fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) |
	      FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root));
	ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) |
	     RISCV_IOMMU_PC_TA_V;

	if (riscv_iommu_bond_link(domain, dev))
		return -ENOMEM;

	riscv_iommu_iodir_update(iommu, dev, fsc, ta);
	riscv_iommu_bond_unlink(info->domain, dev);
	info->domain = domain;

	return 0;
}
static const struct iommu_domain_ops riscv_iommu_paging_domain_ops = {
	.attach_dev = riscv_iommu_attach_paging_domain,
	.free = riscv_iommu_free_paging_domain,
	.map_pages = riscv_iommu_map_pages,
	.unmap_pages = riscv_iommu_unmap_pages,
	.iova_to_phys = riscv_iommu_iova_to_phys,
	.iotlb_sync = riscv_iommu_iotlb_sync,
	.flush_iotlb_all = riscv_iommu_iotlb_flush_all,
};
static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
{
	struct riscv_iommu_domain *domain;
	struct riscv_iommu_device *iommu;
	unsigned int pgd_mode;
	dma_addr_t va_mask;
	int va_bits;

	iommu = dev_to_iommu(dev);
	if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57) {
		pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57;
		va_bits = 57;
	} else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48) {
		pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48;
		va_bits = 48;
	} else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39) {
		pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39;
		va_bits = 39;
	} else {
		dev_err(dev, "cannot find supported page table mode\n");
		return ERR_PTR(-ENODEV);
	}

	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
	if (!domain)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD_RCU(&domain->bonds);
	spin_lock_init(&domain->lock);
	domain->numa_node = dev_to_node(iommu->dev);
	domain->amo_enabled = !!(iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD);
	domain->pgd_mode = pgd_mode;
	domain->pgd_root = iommu_alloc_page_node(domain->numa_node,
						 GFP_KERNEL_ACCOUNT);
	if (!domain->pgd_root) {
		kfree(domain);
		return ERR_PTR(-ENOMEM);
	}

	domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
					RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);
	if (domain->pscid < 0) {
		iommu_free_page(domain->pgd_root);
		kfree(domain);
		return ERR_PTR(-ENOMEM);
	}

	/*
	 * Note: RISC-V Privilege spec mandates that virtual addresses
	 * need to be sign-extended, so if (VA_BITS - 1) is set, all
	 * bits >= VA_BITS need to also be set or else we'll get a
	 * page fault. However the code that creates the mappings
	 * above us (e.g. iommu_dma_alloc_iova()) won't do that for us
	 * for now, so we'll end up with invalid virtual addresses
	 * to map. As a workaround until we get this sorted out
	 * limit the available virtual addresses to VA_BITS - 1.
	 */
	va_mask = DMA_BIT_MASK(va_bits - 1);

	domain->domain.geometry.aperture_start = 0;
	domain->domain.geometry.aperture_end = va_mask;
	domain->domain.geometry.force_aperture = true;
	domain->domain.pgsize_bitmap = va_mask & (SZ_4K | SZ_2M | SZ_1G | SZ_512G);

	domain->domain.ops = &riscv_iommu_paging_domain_ops;

	return &domain->domain;
}
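/*
 * Illustrative aperture example (values assumed): with Sv39 the domain
 * uses va_bits = 39, so va_mask = DMA_BIT_MASK(38) = 0x3fffffffff and the
 * aperture covers IOVAs 0..2^38-1. Masking the page size bitmap drops
 * SZ_512G (bit 39) and leaves 4 KiB, 2 MiB and 1 GiB mappings available.
 */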
static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
					      struct device *dev)
{
	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);

	/* Make device context invalid, translation requests will fault w/ #258 */
	riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0);
	riscv_iommu_bond_unlink(info->domain, dev);
	info->domain = NULL;

	return 0;
}

static struct iommu_domain riscv_iommu_blocking_domain = {
	.type = IOMMU_DOMAIN_BLOCKED,
	.ops = &(const struct iommu_domain_ops) {
		.attach_dev = riscv_iommu_attach_blocking_domain,
	}
};
static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
					      struct device *dev)
{
	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);

	riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V);
	riscv_iommu_bond_unlink(info->domain, dev);
	info->domain = NULL;

	return 0;
}

static struct iommu_domain riscv_iommu_identity_domain = {
	.type = IOMMU_DOMAIN_IDENTITY,
	.ops = &(const struct iommu_domain_ops) {
		.attach_dev = riscv_iommu_attach_identity_domain,
	}
};
static struct iommu_group *riscv_iommu_device_group(struct device *dev)
{
	if (dev_is_pci(dev))
		return pci_device_group(dev);
	return generic_device_group(dev);
}
static int riscv_iommu_of_xlate(struct device *dev, const struct of_phandle_args *args)
{
	return iommu_fwspec_add_ids(dev, args->args, 1);
}
static struct iommu_device *riscv_iommu_probe_device(struct device *dev)
{
	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
	struct riscv_iommu_device *iommu;
	struct riscv_iommu_info *info;
	struct riscv_iommu_dc *dc;
	u64 tc;
	int i;

	if (!fwspec || !fwspec->iommu_fwnode->dev || !fwspec->num_ids)
		return ERR_PTR(-ENODEV);

	iommu = dev_get_drvdata(fwspec->iommu_fwnode->dev);
	if (!iommu)
		return ERR_PTR(-ENODEV);

	/*
	 * IOMMU hardware operating in fail-over BARE mode will provide
	 * identity translation for all connected devices anyway...
	 */
	if (iommu->ddt_mode <= RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
		return ERR_PTR(-ENODEV);

	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return ERR_PTR(-ENOMEM);
	/*
	 * Allocate and pre-configure device context entries in
	 * the device directory. Do not mark the context valid yet.
	 */
	tc = 0;
	if (iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD)
		tc |= RISCV_IOMMU_DC_TC_SADE;
	for (i = 0; i < fwspec->num_ids; i++) {
		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
		if (!dc) {
			kfree(info);
			return ERR_PTR(-ENODEV);
		}
		if (READ_ONCE(dc->tc) & RISCV_IOMMU_DC_TC_V)
			dev_warn(dev, "already attached to IOMMU device directory\n");
		WRITE_ONCE(dc->tc, tc);
	}

	dev_iommu_priv_set(dev, info);

	return &iommu->iommu;
}
static void riscv_iommu_release_device(struct device *dev)
{
	struct riscv_iommu_info *info = dev_iommu_priv_get(dev);

	kfree_rcu_mightsleep(info);
}
static const struct iommu_ops riscv_iommu_ops = {
	.pgsize_bitmap = SZ_4K,
	.of_xlate = riscv_iommu_of_xlate,
	.identity_domain = &riscv_iommu_identity_domain,
	.blocked_domain = &riscv_iommu_blocking_domain,
	.release_domain = &riscv_iommu_blocking_domain,
	.domain_alloc_paging = riscv_iommu_alloc_paging_domain,
	.device_group = riscv_iommu_device_group,
	.probe_device = riscv_iommu_probe_device,
	.release_device = riscv_iommu_release_device,
};
static int riscv_iommu_init_check(struct riscv_iommu_device *iommu)
{
	u64 ddtp;

	/*
	 * Make sure the IOMMU is switched off or in pass-through mode during
	 * regular boot flow and disable translation when we boot into a kexec
	 * kernel and the previous kernel left them enabled.
	 */
	ddtp = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_DDTP);
	if (ddtp & RISCV_IOMMU_DDTP_BUSY)
		return -EBUSY;

	if (FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp) >
	    RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) {
		if (!is_kdump_kernel())
			return -EBUSY;
		riscv_iommu_disable(iommu);
	}

	/* Configure accesses to in-memory data structures for CPU-native byte order. */
	if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
	    !!(iommu->fctl & RISCV_IOMMU_FCTL_BE)) {
		if (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_END))
			return -EINVAL;
		riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL,
				   iommu->fctl ^ RISCV_IOMMU_FCTL_BE);
		iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
		if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
		    !!(iommu->fctl & RISCV_IOMMU_FCTL_BE))
			return -EINVAL;
	}

	/*
	 * Distribute interrupt vectors, always use first vector for CIV.
	 * At least one interrupt is required. Read back and verify.
	 */
	if (!iommu->irqs_count)
		return -EINVAL;

	iommu->icvec = FIELD_PREP(RISCV_IOMMU_ICVEC_FIV, 1 % iommu->irqs_count) |
		       FIELD_PREP(RISCV_IOMMU_ICVEC_PIV, 2 % iommu->irqs_count) |
		       FIELD_PREP(RISCV_IOMMU_ICVEC_PMIV, 3 % iommu->irqs_count);
	riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_ICVEC, iommu->icvec);
	iommu->icvec = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_ICVEC);
	if (max(max(FIELD_GET(RISCV_IOMMU_ICVEC_CIV, iommu->icvec),
		    FIELD_GET(RISCV_IOMMU_ICVEC_FIV, iommu->icvec)),
		max(FIELD_GET(RISCV_IOMMU_ICVEC_PIV, iommu->icvec),
		    FIELD_GET(RISCV_IOMMU_ICVEC_PMIV, iommu->icvec))) >= iommu->irqs_count)
		return -EINVAL;

	return 0;
}
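/*
 * Example of the vector distribution above (interrupt counts assumed):
 * with irqs_count = 4 the driver requests CIV=0, FIV=1, PIV=2, PMIV=3;
 * with irqs_count = 2 this folds to CIV=0, FIV=1, PIV=0, PMIV=1, and a
 * single interrupt maps everything to vector 0. ICVEC is WARL, so the
 * value is read back and rejected if any vector exceeds irqs_count - 1.
 */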
void riscv_iommu_remove(struct riscv_iommu_device *iommu)
{
	iommu_device_unregister(&iommu->iommu);
	iommu_device_sysfs_remove(&iommu->iommu);
	riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
	riscv_iommu_queue_disable(&iommu->cmdq);
	riscv_iommu_queue_disable(&iommu->fltq);
}
int riscv_iommu_init(struct riscv_iommu_device *iommu)
{
	int rc;

	RISCV_IOMMU_QUEUE_INIT(&iommu->cmdq, CQ);
	RISCV_IOMMU_QUEUE_INIT(&iommu->fltq, FQ);

	rc = riscv_iommu_init_check(iommu);
	if (rc)
		return dev_err_probe(iommu->dev, rc, "unexpected device state\n");

	rc = riscv_iommu_iodir_alloc(iommu);
	if (rc)
		return rc;

	rc = riscv_iommu_queue_alloc(iommu, &iommu->cmdq,
				     sizeof(struct riscv_iommu_command));
	if (rc)
		return rc;

	rc = riscv_iommu_queue_alloc(iommu, &iommu->fltq,
				     sizeof(struct riscv_iommu_fq_record));
	if (rc)
		return rc;

	rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq, riscv_iommu_cmdq_process);
	if (rc)
		return rc;

	rc = riscv_iommu_queue_enable(iommu, &iommu->fltq, riscv_iommu_fltq_process);
	if (rc)
		goto err_queue_disable;

	rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_MAX);
	if (rc)
		goto err_queue_disable;

	rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s",
				    dev_name(iommu->dev));
	if (rc) {
		dev_err_probe(iommu->dev, rc, "cannot register sysfs interface\n");
		goto err_iodir_off;
	}

	rc = iommu_device_register(&iommu->iommu, &riscv_iommu_ops, iommu->dev);
	if (rc) {
		dev_err_probe(iommu->dev, rc, "cannot register iommu interface\n");
		goto err_remove_sysfs;
	}

	return 0;

err_remove_sysfs:
	iommu_device_sysfs_remove(&iommu->iommu);
err_iodir_off:
	riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
err_queue_disable:
	riscv_iommu_queue_disable(&iommu->fltq);
	riscv_iommu_queue_disable(&iommu->cmdq);
	return rc;
}