aboutsummaryrefslogtreecommitdiff
path: root/drivers/iommu/riscv/iommu.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/iommu/riscv/iommu.c')
-rw-r--r--drivers/iommu/riscv/iommu.c1661
1 files changed, 1661 insertions, 0 deletions
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
new file mode 100644
index 000000000000..8a05def774bd
--- /dev/null
+++ b/drivers/iommu/riscv/iommu.c
@@ -0,0 +1,1661 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * IOMMU API for RISC-V IOMMU implementations.
+ *
+ * Copyright © 2022-2024 Rivos Inc.
+ * Copyright © 2023 FORTH-ICS/CARV
+ *
+ * Authors
+ * Tomasz Jeznach <tjeznach@rivosinc.com>
+ * Nick Kossifidis <mick@ics.forth.gr>
+ */
+
+#define pr_fmt(fmt) "riscv-iommu: " fmt
+
+#include <linux/compiler.h>
+#include <linux/crash_dump.h>
+#include <linux/init.h>
+#include <linux/iommu.h>
+#include <linux/iopoll.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+
+#include "../iommu-pages.h"
+#include "iommu-bits.h"
+#include "iommu.h"
+
+/* Timeouts in [us] */
+#define RISCV_IOMMU_QCSR_TIMEOUT 150000
+#define RISCV_IOMMU_QUEUE_TIMEOUT 150000
+#define RISCV_IOMMU_DDTP_TIMEOUT 10000000
+#define RISCV_IOMMU_IOTINVAL_TIMEOUT 90000000
+
+/* Number of entries per CMD/FLT queue, should be <= INT_MAX */
+#define RISCV_IOMMU_DEF_CQ_COUNT 8192
+#define RISCV_IOMMU_DEF_FQ_COUNT 4096
+
+/* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */
+#define phys_to_ppn(pa) (((pa) >> 2) & (((1ULL << 44) - 1) << 10))
+#define ppn_to_phys(pn) (((pn) << 2) & (((1ULL << 44) - 1) << 12))
+
+#define dev_to_iommu(dev) \
+ iommu_get_iommu_dev(dev, struct riscv_iommu_device, iommu)
+
+/* IOMMU PSCID allocation namespace. */
+static DEFINE_IDA(riscv_iommu_pscids);
+#define RISCV_IOMMU_MAX_PSCID (BIT(20) - 1)
+
+/* Device resource-managed allocations */
+struct riscv_iommu_devres {
+ void *addr;
+ int order;
+};
+
+static void riscv_iommu_devres_pages_release(struct device *dev, void *res)
+{
+ struct riscv_iommu_devres *devres = res;
+
+ iommu_free_pages(devres->addr, devres->order);
+}
+
+static int riscv_iommu_devres_pages_match(struct device *dev, void *res, void *p)
+{
+ struct riscv_iommu_devres *devres = res;
+ struct riscv_iommu_devres *target = p;
+
+ return devres->addr == target->addr;
+}
+
+static void *riscv_iommu_get_pages(struct riscv_iommu_device *iommu, int order)
+{
+ struct riscv_iommu_devres *devres;
+ void *addr;
+
+ addr = iommu_alloc_pages_node(dev_to_node(iommu->dev),
+ GFP_KERNEL_ACCOUNT, order);
+ if (unlikely(!addr))
+ return NULL;
+
+ devres = devres_alloc(riscv_iommu_devres_pages_release,
+ sizeof(struct riscv_iommu_devres), GFP_KERNEL);
+
+ if (unlikely(!devres)) {
+ iommu_free_pages(addr, order);
+ return NULL;
+ }
+
+ devres->addr = addr;
+ devres->order = order;
+
+ devres_add(iommu->dev, devres);
+
+ return addr;
+}
+
+static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr)
+{
+ struct riscv_iommu_devres devres = { .addr = addr };
+
+ devres_release(iommu->dev, riscv_iommu_devres_pages_release,
+ riscv_iommu_devres_pages_match, &devres);
+}
+
+/*
+ * Hardware queue allocation and management.
+ */
+
+/* Setup queue base, control registers and default queue length */
+#define RISCV_IOMMU_QUEUE_INIT(q, name) do { \
+ struct riscv_iommu_queue *_q = q; \
+ _q->qid = RISCV_IOMMU_INTR_ ## name; \
+ _q->qbr = RISCV_IOMMU_REG_ ## name ## B; \
+ _q->qcr = RISCV_IOMMU_REG_ ## name ## CSR; \
+ _q->mask = _q->mask ?: (RISCV_IOMMU_DEF_ ## name ## _COUNT) - 1;\
+} while (0)
+
+/* Note: offsets are the same for all queues */
+#define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB))
+#define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB))
+#define Q_ITEM(q, index) ((q)->mask & (index))
+#define Q_IPSR(q) BIT((q)->qid)
+
+/*
+ * Discover queue ring buffer hardware configuration, allocate in-memory
+ * ring buffer or use fixed I/O memory location, configure queue base register.
+ * Must be called before hardware queue is enabled.
+ *
+ * @queue - data structure, configured with RISCV_IOMMU_QUEUE_INIT()
+ * @entry_size - queue single element size in bytes.
+ */
+static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu,
+ struct riscv_iommu_queue *queue,
+ size_t entry_size)
+{
+ unsigned int logsz;
+ u64 qb, rb;
+
+ /*
+ * Use WARL base register property to discover maximum allowed
+ * number of entries and optional fixed IO address for queue location.
+ */
+ riscv_iommu_writeq(iommu, queue->qbr, RISCV_IOMMU_QUEUE_LOG2SZ_FIELD);
+ qb = riscv_iommu_readq(iommu, queue->qbr);
+
+ /*
+ * Calculate and verify hardware supported queue length, as reported
+ * by the field LOG2SZ, where max queue length is equal to 2^(LOG2SZ + 1).
+ * Update queue size based on hardware supported value.
+ */
+ logsz = ilog2(queue->mask);
+ if (logsz > FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb))
+ logsz = FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb);
+
+ /*
+ * Use WARL base register property to discover an optional fixed IO
+ * address for queue ring buffer location. Otherwise allocate contiguous
+ * system memory.
+ */
+ if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) {
+ const size_t queue_size = entry_size << (logsz + 1);
+
+ queue->phys = pfn_to_phys(FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb));
+ queue->base = devm_ioremap(iommu->dev, queue->phys, queue_size);
+ } else {
+ do {
+ const size_t queue_size = entry_size << (logsz + 1);
+ const int order = get_order(queue_size);
+
+ queue->base = riscv_iommu_get_pages(iommu, order);
+ queue->phys = __pa(queue->base);
+ } while (!queue->base && logsz-- > 0);
+ }
+
+ if (!queue->base)
+ return -ENOMEM;
+
+ qb = phys_to_ppn(queue->phys) |
+ FIELD_PREP(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, logsz);
+
+ /* Update base register and read back to verify hw accepted our write */
+ riscv_iommu_writeq(iommu, queue->qbr, qb);
+ rb = riscv_iommu_readq(iommu, queue->qbr);
+ if (rb != qb) {
+ dev_err(iommu->dev, "queue #%u allocation failed\n", queue->qid);
+ return -ENODEV;
+ }
+
+ /* Update actual queue mask */
+ queue->mask = (2U << logsz) - 1;
+
+ dev_dbg(iommu->dev, "queue #%u allocated 2^%u entries",
+ queue->qid, logsz + 1);
+
+ return 0;
+}
+
+/* Check interrupt queue status, IPSR */
+static irqreturn_t riscv_iommu_queue_ipsr(int irq, void *data)
+{
+ struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
+
+ if (riscv_iommu_readl(queue->iommu, RISCV_IOMMU_REG_IPSR) & Q_IPSR(queue))
+ return IRQ_WAKE_THREAD;
+
+ return IRQ_NONE;
+}
+
+static int riscv_iommu_queue_vec(struct riscv_iommu_device *iommu, int n)
+{
+ /* Reuse ICVEC.CIV mask for all interrupt vectors mapping. */
+ return (iommu->icvec >> (n * 4)) & RISCV_IOMMU_ICVEC_CIV;
+}
+
+/*
+ * Enable queue processing in the hardware, register interrupt handler.
+ *
+ * @queue - data structure, already allocated with riscv_iommu_queue_alloc()
+ * @irq_handler - threaded interrupt handler.
+ */
+static int riscv_iommu_queue_enable(struct riscv_iommu_device *iommu,
+ struct riscv_iommu_queue *queue,
+ irq_handler_t irq_handler)
+{
+ const unsigned int irq = iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)];
+ u32 csr;
+ int rc;
+
+ if (queue->iommu)
+ return -EBUSY;
+
+ /* Polling not implemented */
+ if (!irq)
+ return -ENODEV;
+
+ queue->iommu = iommu;
+ rc = request_threaded_irq(irq, riscv_iommu_queue_ipsr, irq_handler,
+ IRQF_ONESHOT | IRQF_SHARED,
+ dev_name(iommu->dev), queue);
+ if (rc) {
+ queue->iommu = NULL;
+ return rc;
+ }
+
+ /*
+ * Enable queue with interrupts, clear any memory fault if any.
+ * Wait for the hardware to acknowledge request and activate queue
+ * processing.
+ * Note: All CSR bitfields are in the same offsets for all queues.
+ */
+ riscv_iommu_writel(iommu, queue->qcr,
+ RISCV_IOMMU_QUEUE_ENABLE |
+ RISCV_IOMMU_QUEUE_INTR_ENABLE |
+ RISCV_IOMMU_QUEUE_MEM_FAULT);
+
+ riscv_iommu_readl_timeout(iommu, queue->qcr,
+ csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
+ 10, RISCV_IOMMU_QCSR_TIMEOUT);
+
+ if (RISCV_IOMMU_QUEUE_ACTIVE != (csr & (RISCV_IOMMU_QUEUE_ACTIVE |
+ RISCV_IOMMU_QUEUE_BUSY |
+ RISCV_IOMMU_QUEUE_MEM_FAULT))) {
+ /* Best effort to stop and disable failing hardware queue. */
+ riscv_iommu_writel(iommu, queue->qcr, 0);
+ free_irq(irq, queue);
+ queue->iommu = NULL;
+ dev_err(iommu->dev, "queue #%u failed to start\n", queue->qid);
+ return -EBUSY;
+ }
+
+ /* Clear any pending interrupt flag. */
+ riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
+
+ return 0;
+}
+
+/*
+ * Disable queue. Wait for the hardware to acknowledge request and
+ * stop processing enqueued requests. Report errors but continue.
+ */
+static void riscv_iommu_queue_disable(struct riscv_iommu_queue *queue)
+{
+ struct riscv_iommu_device *iommu = queue->iommu;
+ u32 csr;
+
+ if (!iommu)
+ return;
+
+ free_irq(iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)], queue);
+ riscv_iommu_writel(iommu, queue->qcr, 0);
+ riscv_iommu_readl_timeout(iommu, queue->qcr,
+ csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
+ 10, RISCV_IOMMU_QCSR_TIMEOUT);
+
+ if (csr & (RISCV_IOMMU_QUEUE_ACTIVE | RISCV_IOMMU_QUEUE_BUSY))
+ dev_err(iommu->dev, "fail to disable hardware queue #%u, csr 0x%x\n",
+ queue->qid, csr);
+
+ queue->iommu = NULL;
+}
+
+/*
+ * Returns number of available valid queue entries and the first item index.
+ * Update shadow producer index if necessary.
+ */
+static int riscv_iommu_queue_consume(struct riscv_iommu_queue *queue,
+ unsigned int *index)
+{
+ unsigned int head = atomic_read(&queue->head);
+ unsigned int tail = atomic_read(&queue->tail);
+ unsigned int last = Q_ITEM(queue, tail);
+ int available = (int)(tail - head);
+
+ *index = head;
+
+ if (available > 0)
+ return available;
+
+ /* read hardware producer index, check reserved register bits are not set. */
+ if (riscv_iommu_readl_timeout(queue->iommu, Q_TAIL(queue),
+ tail, (tail & ~queue->mask) == 0,
+ 0, RISCV_IOMMU_QUEUE_TIMEOUT)) {
+ dev_err_once(queue->iommu->dev,
+ "Hardware error: queue access timeout\n");
+ return 0;
+ }
+
+ if (tail == last)
+ return 0;
+
+ /* update shadow producer index */
+ return (int)(atomic_add_return((tail - last) & queue->mask, &queue->tail) - head);
+}
+
+/*
+ * Release processed queue entries, should match riscv_iommu_queue_consume() calls.
+ */
+static void riscv_iommu_queue_release(struct riscv_iommu_queue *queue, int count)
+{
+ const unsigned int head = atomic_add_return(count, &queue->head);
+
+ riscv_iommu_writel(queue->iommu, Q_HEAD(queue), Q_ITEM(queue, head));
+}
+
+/* Return actual consumer index based on hardware reported queue head index. */
+static unsigned int riscv_iommu_queue_cons(struct riscv_iommu_queue *queue)
+{
+ const unsigned int cons = atomic_read(&queue->head);
+ const unsigned int last = Q_ITEM(queue, cons);
+ unsigned int head;
+
+ if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
+ !(head & ~queue->mask),
+ 0, RISCV_IOMMU_QUEUE_TIMEOUT))
+ return cons;
+
+ return cons + ((head - last) & queue->mask);
+}
+
+/* Wait for submitted item to be processed. */
+static int riscv_iommu_queue_wait(struct riscv_iommu_queue *queue,
+ unsigned int index,
+ unsigned int timeout_us)
+{
+ unsigned int cons = atomic_read(&queue->head);
+
+ /* Already processed by the consumer */
+ if ((int)(cons - index) > 0)
+ return 0;
+
+ /* Monitor consumer index */
+ return readx_poll_timeout(riscv_iommu_queue_cons, queue, cons,
+ (int)(cons - index) > 0, 0, timeout_us);
+}
+
+/* Enqueue an entry and wait to be processed if timeout_us > 0
+ *
+ * Error handling for IOMMU hardware not responding in reasonable time
+ * will be added as separate patch series along with other RAS features.
+ * For now, only report hardware failure and continue.
+ */
+static unsigned int riscv_iommu_queue_send(struct riscv_iommu_queue *queue,
+ void *entry, size_t entry_size)
+{
+ unsigned int prod;
+ unsigned int head;
+ unsigned int tail;
+ unsigned long flags;
+
+ /* Do not preempt submission flow. */
+ local_irq_save(flags);
+
+ /* 1. Allocate some space in the queue */
+ prod = atomic_inc_return(&queue->prod) - 1;
+ head = atomic_read(&queue->head);
+
+ /* 2. Wait for space availability. */
+ if ((prod - head) > queue->mask) {
+ if (readx_poll_timeout(atomic_read, &queue->head,
+ head, (prod - head) < queue->mask,
+ 0, RISCV_IOMMU_QUEUE_TIMEOUT))
+ goto err_busy;
+ } else if ((prod - head) == queue->mask) {
+ const unsigned int last = Q_ITEM(queue, head);
+
+ if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
+ !(head & ~queue->mask) && head != last,
+ 0, RISCV_IOMMU_QUEUE_TIMEOUT))
+ goto err_busy;
+ atomic_add((head - last) & queue->mask, &queue->head);
+ }
+
+ /* 3. Store entry in the ring buffer */
+ memcpy(queue->base + Q_ITEM(queue, prod) * entry_size, entry, entry_size);
+
+ /* 4. Wait for all previous entries to be ready */
+ if (readx_poll_timeout(atomic_read, &queue->tail, tail, prod == tail,
+ 0, RISCV_IOMMU_QUEUE_TIMEOUT))
+ goto err_busy;
+
+ /*
+ * 5. Make sure the ring buffer update (whether in normal or I/O memory) is
+ * completed and visible before signaling the tail doorbell to fetch
+ * the next command. 'fence ow, ow'
+ */
+ dma_wmb();
+ riscv_iommu_writel(queue->iommu, Q_TAIL(queue), Q_ITEM(queue, prod + 1));
+
+ /*
+ * 6. Make sure the doorbell write to the device has finished before updating
+ * the shadow tail index in normal memory. 'fence o, w'
+ */
+ mmiowb();
+ atomic_inc(&queue->tail);
+
+ /* 7. Complete submission and restore local interrupts */
+ local_irq_restore(flags);
+
+ return prod;
+
+err_busy:
+ local_irq_restore(flags);
+ dev_err_once(queue->iommu->dev, "Hardware error: command enqueue failed\n");
+
+ return prod;
+}
+
+/*
+ * IOMMU Command queue chapter 3.1
+ */
+
+/* Command queue interrupt handler thread function */
+static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data)
+{
+ const struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
+ unsigned int ctrl;
+
+ /* Clear MF/CQ errors, complete error recovery to be implemented. */
+ ctrl = riscv_iommu_readl(queue->iommu, queue->qcr);
+ if (ctrl & (RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_TO |
+ RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_FENCE_W_IP)) {
+ riscv_iommu_writel(queue->iommu, queue->qcr, ctrl);
+ dev_warn(queue->iommu->dev,
+ "Queue #%u error; fault:%d timeout:%d illegal:%d fence_w_ip:%d\n",
+ queue->qid,
+ !!(ctrl & RISCV_IOMMU_CQCSR_CQMF),
+ !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO),
+ !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL),
+ !!(ctrl & RISCV_IOMMU_CQCSR_FENCE_W_IP));
+ }
+
+ /* Placeholder for command queue interrupt notifiers */
+
+ /* Clear command interrupt pending. */
+ riscv_iommu_writel(queue->iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
+
+ return IRQ_HANDLED;
+}
+
+/* Send command to the IOMMU command queue */
+static void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu,
+ struct riscv_iommu_command *cmd)
+{
+ riscv_iommu_queue_send(&iommu->cmdq, cmd, sizeof(*cmd));
+}
+
+/* Send IOFENCE.C command and wait for all scheduled commands to complete. */
+static void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu,
+ unsigned int timeout_us)
+{
+ struct riscv_iommu_command cmd;
+ unsigned int prod;
+
+ riscv_iommu_cmd_iofence(&cmd);
+ prod = riscv_iommu_queue_send(&iommu->cmdq, &cmd, sizeof(cmd));
+
+ if (!timeout_us)
+ return;
+
+ if (riscv_iommu_queue_wait(&iommu->cmdq, prod, timeout_us))
+ dev_err_once(iommu->dev,
+ "Hardware error: command execution timeout\n");
+}
+
+/*
+ * IOMMU Fault/Event queue chapter 3.2
+ */
+
+static void riscv_iommu_fault(struct riscv_iommu_device *iommu,
+ struct riscv_iommu_fq_record *event)
+{
+ unsigned int err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr);
+ unsigned int devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr);
+
+ /* Placeholder for future fault handling implementation, report only. */
+ if (err)
+ dev_warn_ratelimited(iommu->dev,
+ "Fault %d devid: 0x%x iotval: %llx iotval2: %llx\n",
+ err, devid, event->iotval, event->iotval2);
+}
+
+/* Fault queue interrupt handler thread function */
+static irqreturn_t riscv_iommu_fltq_process(int irq, void *data)
+{
+ struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
+ struct riscv_iommu_device *iommu = queue->iommu;
+ struct riscv_iommu_fq_record *events;
+ unsigned int ctrl, idx;
+ int cnt, len;
+
+ events = (struct riscv_iommu_fq_record *)queue->base;
+
+ /* Clear fault interrupt pending and process all received fault events. */
+ riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
+
+ do {
+ cnt = riscv_iommu_queue_consume(queue, &idx);
+ for (len = 0; len < cnt; idx++, len++)
+ riscv_iommu_fault(iommu, &events[Q_ITEM(queue, idx)]);
+ riscv_iommu_queue_release(queue, cnt);
+ } while (cnt > 0);
+
+ /* Clear MF/OF errors, complete error recovery to be implemented. */
+ ctrl = riscv_iommu_readl(iommu, queue->qcr);
+ if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) {
+ riscv_iommu_writel(iommu, queue->qcr, ctrl);
+ dev_warn(iommu->dev,
+ "Queue #%u error; memory fault:%d overflow:%d\n",
+ queue->qid,
+ !!(ctrl & RISCV_IOMMU_FQCSR_FQMF),
+ !!(ctrl & RISCV_IOMMU_FQCSR_FQOF));
+ }
+
+ return IRQ_HANDLED;
+}
+
+/* Lookup and initialize device context info structure. */
+static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu,
+ unsigned int devid)
+{
+ const bool base_format = !(iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT);
+ unsigned int depth;
+ unsigned long ddt, old, new;
+ void *ptr;
+ u8 ddi_bits[3] = { 0 };
+ u64 *ddtp = NULL;
+
+ /* Make sure the mode is valid */
+ if (iommu->ddt_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL ||
+ iommu->ddt_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL)
+ return NULL;
+
+ /*
+ * Device id partitioning for base format:
+ * DDI[0]: bits 0 - 6 (1st level) (7 bits)
+ * DDI[1]: bits 7 - 15 (2nd level) (9 bits)
+ * DDI[2]: bits 16 - 23 (3rd level) (8 bits)
+ *
+ * For extended format:
+ * DDI[0]: bits 0 - 5 (1st level) (6 bits)
+ * DDI[1]: bits 6 - 14 (2nd level) (9 bits)
+ * DDI[2]: bits 15 - 23 (3rd level) (9 bits)
+ */
+ if (base_format) {
+ ddi_bits[0] = 7;
+ ddi_bits[1] = 7 + 9;
+ ddi_bits[2] = 7 + 9 + 8;
+ } else {
+ ddi_bits[0] = 6;
+ ddi_bits[1] = 6 + 9;
+ ddi_bits[2] = 6 + 9 + 9;
+ }
+
+ /* Make sure device id is within range */
+ depth = iommu->ddt_mode - RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL;
+ if (devid >= (1 << ddi_bits[depth]))
+ return NULL;
+
+ /* Get to the level of the non-leaf node that holds the device context */
+ for (ddtp = iommu->ddt_root; depth-- > 0;) {
+ const int split = ddi_bits[depth];
+ /*
+ * Each non-leaf node is 64bits wide and on each level
+ * nodes are indexed by DDI[depth].
+ */
+ ddtp += (devid >> split) & 0x1FF;
+
+ /*
+ * Check if this node has been populated and if not
+ * allocate a new level and populate it.
+ */
+ do {
+ ddt = READ_ONCE(*(unsigned long *)ddtp);
+ if (ddt & RISCV_IOMMU_DDTE_V) {
+ ddtp = __va(ppn_to_phys(ddt));
+ break;
+ }
+
+ ptr = riscv_iommu_get_pages(iommu, 0);
+ if (!ptr)
+ return NULL;
+
+ new = phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_V;
+ old = cmpxchg_relaxed((unsigned long *)ddtp, ddt, new);
+
+ if (old == ddt) {
+ ddtp = (u64 *)ptr;
+ break;
+ }
+
+ /* Race setting DDT detected, re-read and retry. */
+ riscv_iommu_free_pages(iommu, ptr);
+ } while (1);
+ }
+
+ /*
+ * Grab the node that matches DDI[depth], note that when using base
+ * format the device context is 4 * 64bits, and the extended format
+ * is 8 * 64bits, hence the (3 - base_format) below.
+ */
+ ddtp += (devid & ((64 << base_format) - 1)) << (3 - base_format);
+
+ return (struct riscv_iommu_dc *)ddtp;
+}
+
+/*
+ * This is best effort IOMMU translation shutdown flow.
+ * Disable IOMMU without waiting for hardware response.
+ */
+static void riscv_iommu_disable(struct riscv_iommu_device *iommu)
+{
+ riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, 0);
+ riscv_iommu_writel(iommu, RISCV_IOMMU_REG_CQCSR, 0);
+ riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FQCSR, 0);
+ riscv_iommu_writel(iommu, RISCV_IOMMU_REG_PQCSR, 0);
+}
+
+#define riscv_iommu_read_ddtp(iommu) ({ \
+ u64 ddtp; \
+ riscv_iommu_readq_timeout((iommu), RISCV_IOMMU_REG_DDTP, ddtp, \
+ !(ddtp & RISCV_IOMMU_DDTP_BUSY), 10, \
+ RISCV_IOMMU_DDTP_TIMEOUT); \
+ ddtp; })
+
+static int riscv_iommu_iodir_alloc(struct riscv_iommu_device *iommu)
+{
+ u64 ddtp;
+ unsigned int mode;
+
+ ddtp = riscv_iommu_read_ddtp(iommu);
+ if (ddtp & RISCV_IOMMU_DDTP_BUSY)
+ return -EBUSY;
+
+ /*
+ * It is optional for the hardware to report a fixed address for device
+ * directory root page when DDT.MODE is OFF or BARE.
+ */
+ mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
+ if (mode == RISCV_IOMMU_DDTP_IOMMU_MODE_BARE ||
+ mode == RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) {
+ /* Use WARL to discover hardware fixed DDT PPN */
+ riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
+ FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, mode));
+ ddtp = riscv_iommu_read_ddtp(iommu);
+ if (ddtp & RISCV_IOMMU_DDTP_BUSY)
+ return -EBUSY;
+
+ iommu->ddt_phys = ppn_to_phys(ddtp);
+ if (iommu->ddt_phys)
+ iommu->ddt_root = devm_ioremap(iommu->dev,
+ iommu->ddt_phys, PAGE_SIZE);
+ if (iommu->ddt_root)
+ memset(iommu->ddt_root, 0, PAGE_SIZE);
+ }
+
+ if (!iommu->ddt_root) {
+ iommu->ddt_root = riscv_iommu_get_pages(iommu, 0);
+ iommu->ddt_phys = __pa(iommu->ddt_root);
+ }
+
+ if (!iommu->ddt_root)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/*
+ * Discover supported DDT modes starting from requested value,
+ * configure DDTP register with accepted mode and root DDT address.
+ * Accepted iommu->ddt_mode is updated on success.
+ */
+static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
+ unsigned int ddtp_mode)
+{
+ struct device *dev = iommu->dev;
+ u64 ddtp, rq_ddtp;
+ unsigned int mode, rq_mode = ddtp_mode;
+ struct riscv_iommu_command cmd;
+
+ ddtp = riscv_iommu_read_ddtp(iommu);
+ if (ddtp & RISCV_IOMMU_DDTP_BUSY)
+ return -EBUSY;
+
+ /* Disallow state transition from xLVL to xLVL. */
+ mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
+ if (mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE &&
+ mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF &&
+ rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE &&
+ rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF)
+ return -EINVAL;
+
+ do {
+ rq_ddtp = FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, rq_mode);
+ if (rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
+ rq_ddtp |= phys_to_ppn(iommu->ddt_phys);
+
+ riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, rq_ddtp);
+ ddtp = riscv_iommu_read_ddtp(iommu);
+ if (ddtp & RISCV_IOMMU_DDTP_BUSY) {
+ dev_err(dev, "timeout when setting ddtp (ddt mode: %u, read: %llx)\n",
+ rq_mode, ddtp);
+ return -EBUSY;
+ }
+
+ /* Verify IOMMU hardware accepts new DDTP config. */
+ mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
+
+ if (rq_mode == mode)
+ break;
+
+ /* Hardware mandatory DDTP mode has not been accepted. */
+ if (rq_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && rq_ddtp != ddtp) {
+ dev_err(dev, "DDTP update failed hw: %llx vs %llx\n",
+ ddtp, rq_ddtp);
+ return -EINVAL;
+ }
+
+ /*
+ * Mode field is WARL, an IOMMU may support a subset of
+ * directory table levels in which case if we tried to set
+ * an unsupported number of levels we'll readback either
+ * a valid xLVL or off/bare. If we got off/bare, try again
+ * with a smaller xLVL.
+ */
+ if (mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL &&
+ rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL) {
+ dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode);
+ rq_mode--;
+ continue;
+ }
+
+ /*
+ * We tried all supported modes and IOMMU hardware failed to
+ * accept new settings, something went very wrong since off/bare
+ * and at least one xLVL must be supported.
+ */
+ dev_err(dev, "DDTP hw mode %u, failed to set %u\n",
+ mode, ddtp_mode);
+ return -EINVAL;
+ } while (1);
+
+ iommu->ddt_mode = mode;
+ if (mode != ddtp_mode)
+ dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode);
+
+ /* Invalidate device context cache */
+ riscv_iommu_cmd_iodir_inval_ddt(&cmd);
+ riscv_iommu_cmd_send(iommu, &cmd);
+
+ /* Invalidate address translation cache */
+ riscv_iommu_cmd_inval_vma(&cmd);
+ riscv_iommu_cmd_send(iommu, &cmd);
+
+ /* IOFENCE.C */
+ riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
+
+ return 0;
+}
+
+/* This struct contains protection domain specific IOMMU driver data. */
+struct riscv_iommu_domain {
+ struct iommu_domain domain;
+ struct list_head bonds;
+ spinlock_t lock; /* protect bonds list updates. */
+ int pscid;
+ bool amo_enabled;
+ int numa_node;
+ unsigned int pgd_mode;
+ unsigned long *pgd_root;
+};
+
+#define iommu_domain_to_riscv(iommu_domain) \
+ container_of(iommu_domain, struct riscv_iommu_domain, domain)
+
+/* Private IOMMU data for managed devices, dev_iommu_priv_* */
+struct riscv_iommu_info {
+ struct riscv_iommu_domain *domain;
+};
+
+/*
+ * Linkage between an iommu_domain and attached devices.
+ *
+ * Protection domain requiring IOATC and DevATC translation cache invalidations,
+ * should be linked to attached devices using a riscv_iommu_bond structure.
+ * Devices should be linked to the domain before first use and unlinked after
+ * the translations from the referenced protection domain can no longer be used.
+ * Blocking and identity domains are not tracked here, as the IOMMU hardware
+ * does not cache negative and/or identity (BARE mode) translations, and DevATC
+ * is disabled for those protection domains.
+ *
+ * The device pointer and IOMMU data remain stable in the bond struct after
+ * _probe_device() where it's attached to the managed IOMMU, up to the
+ * completion of the _release_device() call. The release of the bond structure
+ * is synchronized with the device release.
+ */
+struct riscv_iommu_bond {
+ struct list_head list;
+ struct rcu_head rcu;
+ struct device *dev;
+};
+
+static int riscv_iommu_bond_link(struct riscv_iommu_domain *domain,
+ struct device *dev)
+{
+ struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+ struct riscv_iommu_bond *bond;
+ struct list_head *bonds;
+
+ bond = kzalloc(sizeof(*bond), GFP_KERNEL);
+ if (!bond)
+ return -ENOMEM;
+ bond->dev = dev;
+
+ /*
+ * List of devices attached to the domain is arranged based on
+ * managed IOMMU device.
+ */
+
+ spin_lock(&domain->lock);
+ list_for_each(bonds, &domain->bonds)
+ if (dev_to_iommu(list_entry(bonds, struct riscv_iommu_bond, list)->dev) == iommu)
+ break;
+ list_add_rcu(&bond->list, bonds);
+ spin_unlock(&domain->lock);
+
+ /* Synchronize with riscv_iommu_iotlb_inval() sequence. See comment below. */
+ smp_mb();
+
+ return 0;
+}
+
+static void riscv_iommu_bond_unlink(struct riscv_iommu_domain *domain,
+ struct device *dev)
+{
+ struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+ struct riscv_iommu_bond *bond, *found = NULL;
+ struct riscv_iommu_command cmd;
+ int count = 0;
+
+ if (!domain)
+ return;
+
+ spin_lock(&domain->lock);
+ list_for_each_entry(bond, &domain->bonds, list) {
+ if (found && count)
+ break;
+ else if (bond->dev == dev)
+ found = bond;
+ else if (dev_to_iommu(bond->dev) == iommu)
+ count++;
+ }
+ if (found)
+ list_del_rcu(&found->list);
+ spin_unlock(&domain->lock);
+ kfree_rcu(found, rcu);
+
+ /*
+ * If this was the last bond between this domain and the IOMMU
+ * invalidate all cached entries for domain's PSCID.
+ */
+ if (!count) {
+ riscv_iommu_cmd_inval_vma(&cmd);
+ riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
+ riscv_iommu_cmd_send(iommu, &cmd);
+
+ riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
+ }
+}
+
+/*
+ * Send IOTLB.INVAL for whole address space for ranges larger than 2MB.
+ * This limit will be replaced with range invalidations, if supported by
+ * the hardware, when RISC-V IOMMU architecture specification update for
+ * range invalidations update will be available.
+ */
+#define RISCV_IOMMU_IOTLB_INVAL_LIMIT (2 << 20)
+
+static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
+ unsigned long start, unsigned long end)
+{
+ struct riscv_iommu_bond *bond;
+ struct riscv_iommu_device *iommu, *prev;
+ struct riscv_iommu_command cmd;
+ unsigned long len = end - start + 1;
+ unsigned long iova;
+
+ /*
+ * For each IOMMU linked with this protection domain (via bonds->dev),
+ * an IOTLB invaliation command will be submitted and executed.
+ *
+ * Possbile race with domain attach flow is handled by sequencing
+ * bond creation - riscv_iommu_bond_link(), and device directory
+ * update - riscv_iommu_iodir_update().
+ *
+ * PTE Update / IOTLB Inval Device attach & directory update
+ * -------------------------- --------------------------
+ * update page table entries add dev to the bond list
+ * FENCE RW,RW FENCE RW,RW
+ * For all IOMMUs: (can be empty) Update FSC/PSCID
+ * FENCE IOW,IOW FENCE IOW,IOW
+ * IOTLB.INVAL IODIR.INVAL
+ * IOFENCE.C
+ *
+ * If bond list is not updated with new device, directory context will
+ * be configured with already valid page table content. If an IOMMU is
+ * linked to the protection domain it will receive invalidation
+ * requests for updated page table entries.
+ */
+ smp_mb();
+
+ rcu_read_lock();
+
+ prev = NULL;
+ list_for_each_entry_rcu(bond, &domain->bonds, list) {
+ iommu = dev_to_iommu(bond->dev);
+
+ /*
+ * IOTLB invalidation request can be safely omitted if already sent
+ * to the IOMMU for the same PSCID, and with domain->bonds list
+ * arranged based on the device's IOMMU, it's sufficient to check
+ * last device the invalidation was sent to.
+ */
+ if (iommu == prev)
+ continue;
+
+ riscv_iommu_cmd_inval_vma(&cmd);
+ riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
+ if (len && len < RISCV_IOMMU_IOTLB_INVAL_LIMIT) {
+ for (iova = start; iova < end; iova += PAGE_SIZE) {
+ riscv_iommu_cmd_inval_set_addr(&cmd, iova);
+ riscv_iommu_cmd_send(iommu, &cmd);
+ }
+ } else {
+ riscv_iommu_cmd_send(iommu, &cmd);
+ }
+ prev = iommu;
+ }
+
+ prev = NULL;
+ list_for_each_entry_rcu(bond, &domain->bonds, list) {
+ iommu = dev_to_iommu(bond->dev);
+ if (iommu == prev)
+ continue;
+
+ riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
+ prev = iommu;
+ }
+ rcu_read_unlock();
+}
+
+#define RISCV_IOMMU_FSC_BARE 0
+
+/*
+ * Update IODIR for the device.
+ *
+ * During the execution of riscv_iommu_probe_device(), IODIR entries are
+ * allocated for the device's identifiers. Device context invalidation
+ * becomes necessary only if one of the updated entries was previously
+ * marked as valid, given that invalid device context entries are not
+ * cached by the IOMMU hardware.
+ * In this implementation, updating a valid device context while the
+ * device is not quiesced might be disruptive, potentially causing
+ * interim translation faults.
+ */
+static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
+ struct device *dev, u64 fsc, u64 ta)
+{
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ struct riscv_iommu_dc *dc;
+ struct riscv_iommu_command cmd;
+ bool sync_required = false;
+ u64 tc;
+ int i;
+
+ for (i = 0; i < fwspec->num_ids; i++) {
+ dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
+ tc = READ_ONCE(dc->tc);
+ if (!(tc & RISCV_IOMMU_DC_TC_V))
+ continue;
+
+ WRITE_ONCE(dc->tc, tc & ~RISCV_IOMMU_DC_TC_V);
+
+ /* Invalidate device context cached values */
+ riscv_iommu_cmd_iodir_inval_ddt(&cmd);
+ riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
+ riscv_iommu_cmd_send(iommu, &cmd);
+ sync_required = true;
+ }
+
+ if (sync_required)
+ riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
+
+ /*
+ * For device context with DC_TC_PDTV = 0, translation attributes valid bit
+ * is stored as DC_TC_V bit (both sharing the same location at BIT(0)).
+ */
+ for (i = 0; i < fwspec->num_ids; i++) {
+ dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
+ tc = READ_ONCE(dc->tc);
+ tc |= ta & RISCV_IOMMU_DC_TC_V;
+
+ WRITE_ONCE(dc->fsc, fsc);
+ WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID);
+ /* Update device context, write TC.V as the last step. */
+ dma_wmb();
+ WRITE_ONCE(dc->tc, tc);
+
+ /* Invalidate device context after update */
+ riscv_iommu_cmd_iodir_inval_ddt(&cmd);
+ riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
+ riscv_iommu_cmd_send(iommu, &cmd);
+ }
+
+ riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
+}
+
+/*
+ * IOVA page translation tree management.
+ */
+
+static void riscv_iommu_iotlb_flush_all(struct iommu_domain *iommu_domain)
+{
+ struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+
+ riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
+}
+
+static void riscv_iommu_iotlb_sync(struct iommu_domain *iommu_domain,
+ struct iommu_iotlb_gather *gather)
+{
+ struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+
+ riscv_iommu_iotlb_inval(domain, gather->start, gather->end);
+}
+
+#define PT_SHIFT (PAGE_SHIFT - ilog2(sizeof(pte_t)))
+
+#define _io_pte_present(pte) ((pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE))
+#define _io_pte_leaf(pte) ((pte) & _PAGE_LEAF)
+#define _io_pte_none(pte) ((pte) == 0)
+#define _io_pte_entry(pn, prot) ((_PAGE_PFN_MASK & ((pn) << _PAGE_PFN_SHIFT)) | (prot))
+
+static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain,
+ unsigned long pte, struct list_head *freelist)
+{
+ unsigned long *ptr;
+ int i;
+
+ if (!_io_pte_present(pte) || _io_pte_leaf(pte))
+ return;
+
+ ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
+
+ /* Recursively free all sub page table pages */
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pte = READ_ONCE(ptr[i]);
+ if (!_io_pte_none(pte) && cmpxchg_relaxed(ptr + i, pte, 0) == pte)
+ riscv_iommu_pte_free(domain, pte, freelist);
+ }
+
+ if (freelist)
+ list_add_tail(&virt_to_page(ptr)->lru, freelist);
+ else
+ iommu_free_page(ptr);
+}
+
+static unsigned long *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain,
+ unsigned long iova, size_t pgsize,
+ gfp_t gfp)
+{
+ unsigned long *ptr = domain->pgd_root;
+ unsigned long pte, old;
+ int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
+ void *addr;
+
+ do {
+ const int shift = PAGE_SHIFT + PT_SHIFT * level;
+
+ ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
+ /*
+ * Note: returned entry might be a non-leaf if there was
+ * existing mapping with smaller granularity. Up to the caller
+ * to replace and invalidate.
+ */
+ if (((size_t)1 << shift) == pgsize)
+ return ptr;
+pte_retry:
+ pte = READ_ONCE(*ptr);
+ /*
+ * This is very likely incorrect as we should not be adding
+ * new mapping with smaller granularity on top
+ * of existing 2M/1G mapping. Fail.
+ */
+ if (_io_pte_present(pte) && _io_pte_leaf(pte))
+ return NULL;
+ /*
+ * Non-leaf entry is missing, allocate and try to add to the
+ * page table. This might race with other mappings, retry.
+ */
+ if (_io_pte_none(pte)) {
+ addr = iommu_alloc_page_node(domain->numa_node, gfp);
+ if (!addr)
+ return NULL;
+ old = pte;
+ pte = _io_pte_entry(virt_to_pfn(addr), _PAGE_TABLE);
+ if (cmpxchg_relaxed(ptr, old, pte) != old) {
+ iommu_free_page(addr);
+ goto pte_retry;
+ }
+ }
+ ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
+ } while (level-- > 0);
+
+ return NULL;
+}
+
+static unsigned long *riscv_iommu_pte_fetch(struct riscv_iommu_domain *domain,
+ unsigned long iova, size_t *pte_pgsize)
+{
+ unsigned long *ptr = domain->pgd_root;
+ unsigned long pte;
+ int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
+
+ do {
+ const int shift = PAGE_SHIFT + PT_SHIFT * level;
+
+ ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
+ pte = READ_ONCE(*ptr);
+ if (_io_pte_present(pte) && _io_pte_leaf(pte)) {
+ *pte_pgsize = (size_t)1 << shift;
+ return ptr;
+ }
+ if (_io_pte_none(pte))
+ return NULL;
+ ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
+ } while (level-- > 0);
+
+ return NULL;
+}
+
+static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain,
+ unsigned long iova, phys_addr_t phys,
+ size_t pgsize, size_t pgcount, int prot,
+ gfp_t gfp, size_t *mapped)
+{
+ struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+ size_t size = 0;
+ unsigned long *ptr;
+ unsigned long pte, old, pte_prot;
+ int rc = 0;
+ LIST_HEAD(freelist);
+
+ if (!(prot & IOMMU_WRITE))
+ pte_prot = _PAGE_BASE | _PAGE_READ;
+ else if (domain->amo_enabled)
+ pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE;
+ else
+ pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY;
+
+ while (pgcount) {
+ ptr = riscv_iommu_pte_alloc(domain, iova, pgsize, gfp);
+ if (!ptr) {
+ rc = -ENOMEM;
+ break;
+ }
+
+ old = READ_ONCE(*ptr);
+ pte = _io_pte_entry(phys_to_pfn(phys), pte_prot);
+ if (cmpxchg_relaxed(ptr, old, pte) != old)
+ continue;
+
+ riscv_iommu_pte_free(domain, old, &freelist);
+
+ size += pgsize;
+ iova += pgsize;
+ phys += pgsize;
+ --pgcount;
+ }
+
+ *mapped = size;
+
+ if (!list_empty(&freelist)) {
+ /*
+ * In 1.0 spec version, the smallest scope we can use to
+ * invalidate all levels of page table (i.e. leaf and non-leaf)
+ * is an invalidate-all-PSCID IOTINVAL.VMA with AV=0.
+ * This will be updated with hardware support for
+ * capability.NL (non-leaf) IOTINVAL command.
+ */
+ riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
+ iommu_put_pages_list(&freelist);
+ }
+
+ return rc;
+}
+
+static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
+ unsigned long iova, size_t pgsize,
+ size_t pgcount,
+ struct iommu_iotlb_gather *gather)
+{
+ struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+ size_t size = pgcount << __ffs(pgsize);
+ unsigned long *ptr, old;
+ size_t unmapped = 0;
+ size_t pte_size;
+
+ while (unmapped < size) {
+ ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
+ if (!ptr)
+ return unmapped;
+
+ /* partial unmap is not allowed, fail. */
+ if (iova & (pte_size - 1))
+ return unmapped;
+
+ old = READ_ONCE(*ptr);
+ if (cmpxchg_relaxed(ptr, old, 0) != old)
+ continue;
+
+ iommu_iotlb_gather_add_page(&domain->domain, gather, iova,
+ pte_size);
+
+ iova += pte_size;
+ unmapped += pte_size;
+ }
+
+ return unmapped;
+}
+
+static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain,
+ dma_addr_t iova)
+{
+ struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+ unsigned long pte_size;
+ unsigned long *ptr;
+
+ ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
+ if (_io_pte_none(*ptr) || !_io_pte_present(*ptr))
+ return 0;
+
+ return pfn_to_phys(__page_val_to_pfn(*ptr)) | (iova & (pte_size - 1));
+}
+
+static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain)
+{
+ struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+ const unsigned long pfn = virt_to_pfn(domain->pgd_root);
+
+ WARN_ON(!list_empty(&domain->bonds));
+
+ if ((int)domain->pscid > 0)
+ ida_free(&riscv_iommu_pscids, domain->pscid);
+
+ riscv_iommu_pte_free(domain, _io_pte_entry(pfn, _PAGE_TABLE), NULL);
+ kfree(domain);
+}
+
+static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_mode)
+{
+ switch (pgd_mode) {
+ case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
+ return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39;
+
+ case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
+ return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48;
+
+ case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
+ return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57;
+ }
+ return false;
+}
+
+static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
+ struct device *dev)
+{
+ struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+ struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+ struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
+ u64 fsc, ta;
+
+ if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode))
+ return -ENODEV;
+
+ fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) |
+ FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root));
+ ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) |
+ RISCV_IOMMU_PC_TA_V;
+
+ if (riscv_iommu_bond_link(domain, dev))
+ return -ENOMEM;
+
+ riscv_iommu_iodir_update(iommu, dev, fsc, ta);
+ riscv_iommu_bond_unlink(info->domain, dev);
+ info->domain = domain;
+
+ return 0;
+}
+
+static const struct iommu_domain_ops riscv_iommu_paging_domain_ops = {
+ .attach_dev = riscv_iommu_attach_paging_domain,
+ .free = riscv_iommu_free_paging_domain,
+ .map_pages = riscv_iommu_map_pages,
+ .unmap_pages = riscv_iommu_unmap_pages,
+ .iova_to_phys = riscv_iommu_iova_to_phys,
+ .iotlb_sync = riscv_iommu_iotlb_sync,
+ .flush_iotlb_all = riscv_iommu_iotlb_flush_all,
+};
+
+static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
+{
+ struct riscv_iommu_domain *domain;
+ struct riscv_iommu_device *iommu;
+ unsigned int pgd_mode;
+ dma_addr_t va_mask;
+ int va_bits;
+
+ iommu = dev_to_iommu(dev);
+ if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57) {
+ pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57;
+ va_bits = 57;
+ } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48) {
+ pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48;
+ va_bits = 48;
+ } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39) {
+ pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39;
+ va_bits = 39;
+ } else {
+ dev_err(dev, "cannot find supported page table mode\n");
+ return ERR_PTR(-ENODEV);
+ }
+
+ domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+ if (!domain)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD_RCU(&domain->bonds);
+ spin_lock_init(&domain->lock);
+ domain->numa_node = dev_to_node(iommu->dev);
+ domain->amo_enabled = !!(iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD);
+ domain->pgd_mode = pgd_mode;
+ domain->pgd_root = iommu_alloc_page_node(domain->numa_node,
+ GFP_KERNEL_ACCOUNT);
+ if (!domain->pgd_root) {
+ kfree(domain);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
+ RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);
+ if (domain->pscid < 0) {
+ iommu_free_page(domain->pgd_root);
+ kfree(domain);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /*
+ * Note: RISC-V Privilege spec mandates that virtual addresses
+ * need to be sign-extended, so if (VA_BITS - 1) is set, all
+ * bits >= VA_BITS need to also be set or else we'll get a
+ * page fault. However the code that creates the mappings
+ * above us (e.g. iommu_dma_alloc_iova()) won't do that for us
+ * for now, so we'll end up with invalid virtual addresses
+ * to map. As a workaround until we get this sorted out
+ * limit the available virtual addresses to VA_BITS - 1.
+ */
+ va_mask = DMA_BIT_MASK(va_bits - 1);
+
+ domain->domain.geometry.aperture_start = 0;
+ domain->domain.geometry.aperture_end = va_mask;
+ domain->domain.geometry.force_aperture = true;
+ domain->domain.pgsize_bitmap = va_mask & (SZ_4K | SZ_2M | SZ_1G | SZ_512G);
+
+ domain->domain.ops = &riscv_iommu_paging_domain_ops;
+
+ return &domain->domain;
+}
+
+static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
+ struct device *dev)
+{
+ struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+ struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
+
+ /* Make device context invalid, translation requests will fault w/ #258 */
+ riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0);
+ riscv_iommu_bond_unlink(info->domain, dev);
+ info->domain = NULL;
+
+ return 0;
+}
+
+static struct iommu_domain riscv_iommu_blocking_domain = {
+ .type = IOMMU_DOMAIN_BLOCKED,
+ .ops = &(const struct iommu_domain_ops) {
+ .attach_dev = riscv_iommu_attach_blocking_domain,
+ }
+};
+
+static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
+ struct device *dev)
+{
+ struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+ struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
+
+ riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V);
+ riscv_iommu_bond_unlink(info->domain, dev);
+ info->domain = NULL;
+
+ return 0;
+}
+
+static struct iommu_domain riscv_iommu_identity_domain = {
+ .type = IOMMU_DOMAIN_IDENTITY,
+ .ops = &(const struct iommu_domain_ops) {
+ .attach_dev = riscv_iommu_attach_identity_domain,
+ }
+};
+
+static struct iommu_group *riscv_iommu_device_group(struct device *dev)
+{
+ if (dev_is_pci(dev))
+ return pci_device_group(dev);
+ return generic_device_group(dev);
+}
+
+static int riscv_iommu_of_xlate(struct device *dev, const struct of_phandle_args *args)
+{
+ return iommu_fwspec_add_ids(dev, args->args, 1);
+}
+
+static struct iommu_device *riscv_iommu_probe_device(struct device *dev)
+{
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ struct riscv_iommu_device *iommu;
+ struct riscv_iommu_info *info;
+ struct riscv_iommu_dc *dc;
+ u64 tc;
+ int i;
+
+ if (!fwspec || !fwspec->iommu_fwnode->dev || !fwspec->num_ids)
+ return ERR_PTR(-ENODEV);
+
+ iommu = dev_get_drvdata(fwspec->iommu_fwnode->dev);
+ if (!iommu)
+ return ERR_PTR(-ENODEV);
+
+ /*
+ * IOMMU hardware operating in fail-over BARE mode will provide
+ * identity translation for all connected devices anyway...
+ */
+ if (iommu->ddt_mode <= RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
+ return ERR_PTR(-ENODEV);
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return ERR_PTR(-ENOMEM);
+ /*
+ * Allocate and pre-configure device context entries in
+ * the device directory. Do not mark the context valid yet.
+ */
+ tc = 0;
+ if (iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD)
+ tc |= RISCV_IOMMU_DC_TC_SADE;
+ for (i = 0; i < fwspec->num_ids; i++) {
+ dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
+ if (!dc) {
+ kfree(info);
+ return ERR_PTR(-ENODEV);
+ }
+ if (READ_ONCE(dc->tc) & RISCV_IOMMU_DC_TC_V)
+ dev_warn(dev, "already attached to IOMMU device directory\n");
+ WRITE_ONCE(dc->tc, tc);
+ }
+
+ dev_iommu_priv_set(dev, info);
+
+ return &iommu->iommu;
+}
+
+static void riscv_iommu_release_device(struct device *dev)
+{
+ struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
+
+ kfree_rcu_mightsleep(info);
+}
+
+static const struct iommu_ops riscv_iommu_ops = {
+ .pgsize_bitmap = SZ_4K,
+ .of_xlate = riscv_iommu_of_xlate,
+ .identity_domain = &riscv_iommu_identity_domain,
+ .blocked_domain = &riscv_iommu_blocking_domain,
+ .release_domain = &riscv_iommu_blocking_domain,
+ .domain_alloc_paging = riscv_iommu_alloc_paging_domain,
+ .device_group = riscv_iommu_device_group,
+ .probe_device = riscv_iommu_probe_device,
+ .release_device = riscv_iommu_release_device,
+};
+
+static int riscv_iommu_init_check(struct riscv_iommu_device *iommu)
+{
+ u64 ddtp;
+
+ /*
+ * Make sure the IOMMU is switched off or in pass-through mode during
+ * regular boot flow and disable translation when we boot into a kexec
+ * kernel and the previous kernel left them enabled.
+ */
+ ddtp = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_DDTP);
+ if (ddtp & RISCV_IOMMU_DDTP_BUSY)
+ return -EBUSY;
+
+ if (FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp) >
+ RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) {
+ if (!is_kdump_kernel())
+ return -EBUSY;
+ riscv_iommu_disable(iommu);
+ }
+
+ /* Configure accesses to in-memory data structures for CPU-native byte order. */
+ if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
+ !!(iommu->fctl & RISCV_IOMMU_FCTL_BE)) {
+ if (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_END))
+ return -EINVAL;
+ riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL,
+ iommu->fctl ^ RISCV_IOMMU_FCTL_BE);
+ iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
+ if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
+ !!(iommu->fctl & RISCV_IOMMU_FCTL_BE))
+ return -EINVAL;
+ }
+
+ /*
+ * Distribute interrupt vectors, always use first vector for CIV.
+ * At least one interrupt is required. Read back and verify.
+ */
+ if (!iommu->irqs_count)
+ return -EINVAL;
+
+ iommu->icvec = FIELD_PREP(RISCV_IOMMU_ICVEC_FIV, 1 % iommu->irqs_count) |
+ FIELD_PREP(RISCV_IOMMU_ICVEC_PIV, 2 % iommu->irqs_count) |
+ FIELD_PREP(RISCV_IOMMU_ICVEC_PMIV, 3 % iommu->irqs_count);
+ riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_ICVEC, iommu->icvec);
+ iommu->icvec = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_ICVEC);
+ if (max(max(FIELD_GET(RISCV_IOMMU_ICVEC_CIV, iommu->icvec),
+ FIELD_GET(RISCV_IOMMU_ICVEC_FIV, iommu->icvec)),
+ max(FIELD_GET(RISCV_IOMMU_ICVEC_PIV, iommu->icvec),
+ FIELD_GET(RISCV_IOMMU_ICVEC_PMIV, iommu->icvec))) >= iommu->irqs_count)
+ return -EINVAL;
+
+ return 0;
+}
+
+void riscv_iommu_remove(struct riscv_iommu_device *iommu)
+{
+ iommu_device_unregister(&iommu->iommu);
+ iommu_device_sysfs_remove(&iommu->iommu);
+ riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
+ riscv_iommu_queue_disable(&iommu->cmdq);
+ riscv_iommu_queue_disable(&iommu->fltq);
+}
+
+int riscv_iommu_init(struct riscv_iommu_device *iommu)
+{
+ int rc;
+
+ RISCV_IOMMU_QUEUE_INIT(&iommu->cmdq, CQ);
+ RISCV_IOMMU_QUEUE_INIT(&iommu->fltq, FQ);
+
+ rc = riscv_iommu_init_check(iommu);
+ if (rc)
+ return dev_err_probe(iommu->dev, rc, "unexpected device state\n");
+
+ rc = riscv_iommu_iodir_alloc(iommu);
+ if (rc)
+ return rc;
+
+ rc = riscv_iommu_queue_alloc(iommu, &iommu->cmdq,
+ sizeof(struct riscv_iommu_command));
+ if (rc)
+ return rc;
+
+ rc = riscv_iommu_queue_alloc(iommu, &iommu->fltq,
+ sizeof(struct riscv_iommu_fq_record));
+ if (rc)
+ return rc;
+
+ rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq, riscv_iommu_cmdq_process);
+ if (rc)
+ return rc;
+
+ rc = riscv_iommu_queue_enable(iommu, &iommu->fltq, riscv_iommu_fltq_process);
+ if (rc)
+ goto err_queue_disable;
+
+ rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_MAX);
+ if (rc)
+ goto err_queue_disable;
+
+ rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s",
+ dev_name(iommu->dev));
+ if (rc) {
+ dev_err_probe(iommu->dev, rc, "cannot register sysfs interface\n");
+ goto err_iodir_off;
+ }
+
+ rc = iommu_device_register(&iommu->iommu, &riscv_iommu_ops, iommu->dev);
+ if (rc) {
+ dev_err_probe(iommu->dev, rc, "cannot register iommu interface\n");
+ goto err_remove_sysfs;
+ }
+
+ return 0;
+
+err_remove_sysfs:
+ iommu_device_sysfs_remove(&iommu->iommu);
+err_iodir_off:
+ riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
+err_queue_disable:
+ riscv_iommu_queue_disable(&iommu->fltq);
+ riscv_iommu_queue_disable(&iommu->cmdq);
+ return rc;
+}