diff options
Diffstat (limited to 'drivers/iommu/riscv/iommu.c')
-rw-r--r-- | drivers/iommu/riscv/iommu.c | 1661 |
1 files changed, 1661 insertions, 0 deletions
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c new file mode 100644 index 000000000000..8a05def774bd --- /dev/null +++ b/drivers/iommu/riscv/iommu.c @@ -0,0 +1,1661 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * IOMMU API for RISC-V IOMMU implementations. + * + * Copyright © 2022-2024 Rivos Inc. + * Copyright © 2023 FORTH-ICS/CARV + * + * Authors + * Tomasz Jeznach <tjeznach@rivosinc.com> + * Nick Kossifidis <mick@ics.forth.gr> + */ + +#define pr_fmt(fmt) "riscv-iommu: " fmt + +#include <linux/compiler.h> +#include <linux/crash_dump.h> +#include <linux/init.h> +#include <linux/iommu.h> +#include <linux/iopoll.h> +#include <linux/kernel.h> +#include <linux/pci.h> + +#include "../iommu-pages.h" +#include "iommu-bits.h" +#include "iommu.h" + +/* Timeouts in [us] */ +#define RISCV_IOMMU_QCSR_TIMEOUT 150000 +#define RISCV_IOMMU_QUEUE_TIMEOUT 150000 +#define RISCV_IOMMU_DDTP_TIMEOUT 10000000 +#define RISCV_IOMMU_IOTINVAL_TIMEOUT 90000000 + +/* Number of entries per CMD/FLT queue, should be <= INT_MAX */ +#define RISCV_IOMMU_DEF_CQ_COUNT 8192 +#define RISCV_IOMMU_DEF_FQ_COUNT 4096 + +/* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */ +#define phys_to_ppn(pa) (((pa) >> 2) & (((1ULL << 44) - 1) << 10)) +#define ppn_to_phys(pn) (((pn) << 2) & (((1ULL << 44) - 1) << 12)) + +#define dev_to_iommu(dev) \ + iommu_get_iommu_dev(dev, struct riscv_iommu_device, iommu) + +/* IOMMU PSCID allocation namespace. */ +static DEFINE_IDA(riscv_iommu_pscids); +#define RISCV_IOMMU_MAX_PSCID (BIT(20) - 1) + +/* Device resource-managed allocations */ +struct riscv_iommu_devres { + void *addr; + int order; +}; + +static void riscv_iommu_devres_pages_release(struct device *dev, void *res) +{ + struct riscv_iommu_devres *devres = res; + + iommu_free_pages(devres->addr, devres->order); +} + +static int riscv_iommu_devres_pages_match(struct device *dev, void *res, void *p) +{ + struct riscv_iommu_devres *devres = res; + struct riscv_iommu_devres *target = p; + + return devres->addr == target->addr; +} + +static void *riscv_iommu_get_pages(struct riscv_iommu_device *iommu, int order) +{ + struct riscv_iommu_devres *devres; + void *addr; + + addr = iommu_alloc_pages_node(dev_to_node(iommu->dev), + GFP_KERNEL_ACCOUNT, order); + if (unlikely(!addr)) + return NULL; + + devres = devres_alloc(riscv_iommu_devres_pages_release, + sizeof(struct riscv_iommu_devres), GFP_KERNEL); + + if (unlikely(!devres)) { + iommu_free_pages(addr, order); + return NULL; + } + + devres->addr = addr; + devres->order = order; + + devres_add(iommu->dev, devres); + + return addr; +} + +static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr) +{ + struct riscv_iommu_devres devres = { .addr = addr }; + + devres_release(iommu->dev, riscv_iommu_devres_pages_release, + riscv_iommu_devres_pages_match, &devres); +} + +/* + * Hardware queue allocation and management. + */ + +/* Setup queue base, control registers and default queue length */ +#define RISCV_IOMMU_QUEUE_INIT(q, name) do { \ + struct riscv_iommu_queue *_q = q; \ + _q->qid = RISCV_IOMMU_INTR_ ## name; \ + _q->qbr = RISCV_IOMMU_REG_ ## name ## B; \ + _q->qcr = RISCV_IOMMU_REG_ ## name ## CSR; \ + _q->mask = _q->mask ?: (RISCV_IOMMU_DEF_ ## name ## _COUNT) - 1;\ +} while (0) + +/* Note: offsets are the same for all queues */ +#define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB)) +#define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB)) +#define Q_ITEM(q, index) ((q)->mask & (index)) +#define Q_IPSR(q) BIT((q)->qid) + +/* + * Discover queue ring buffer hardware configuration, allocate in-memory + * ring buffer or use fixed I/O memory location, configure queue base register. + * Must be called before hardware queue is enabled. + * + * @queue - data structure, configured with RISCV_IOMMU_QUEUE_INIT() + * @entry_size - queue single element size in bytes. + */ +static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu, + struct riscv_iommu_queue *queue, + size_t entry_size) +{ + unsigned int logsz; + u64 qb, rb; + + /* + * Use WARL base register property to discover maximum allowed + * number of entries and optional fixed IO address for queue location. + */ + riscv_iommu_writeq(iommu, queue->qbr, RISCV_IOMMU_QUEUE_LOG2SZ_FIELD); + qb = riscv_iommu_readq(iommu, queue->qbr); + + /* + * Calculate and verify hardware supported queue length, as reported + * by the field LOG2SZ, where max queue length is equal to 2^(LOG2SZ + 1). + * Update queue size based on hardware supported value. + */ + logsz = ilog2(queue->mask); + if (logsz > FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb)) + logsz = FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb); + + /* + * Use WARL base register property to discover an optional fixed IO + * address for queue ring buffer location. Otherwise allocate contiguous + * system memory. + */ + if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) { + const size_t queue_size = entry_size << (logsz + 1); + + queue->phys = pfn_to_phys(FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)); + queue->base = devm_ioremap(iommu->dev, queue->phys, queue_size); + } else { + do { + const size_t queue_size = entry_size << (logsz + 1); + const int order = get_order(queue_size); + + queue->base = riscv_iommu_get_pages(iommu, order); + queue->phys = __pa(queue->base); + } while (!queue->base && logsz-- > 0); + } + + if (!queue->base) + return -ENOMEM; + + qb = phys_to_ppn(queue->phys) | + FIELD_PREP(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, logsz); + + /* Update base register and read back to verify hw accepted our write */ + riscv_iommu_writeq(iommu, queue->qbr, qb); + rb = riscv_iommu_readq(iommu, queue->qbr); + if (rb != qb) { + dev_err(iommu->dev, "queue #%u allocation failed\n", queue->qid); + return -ENODEV; + } + + /* Update actual queue mask */ + queue->mask = (2U << logsz) - 1; + + dev_dbg(iommu->dev, "queue #%u allocated 2^%u entries", + queue->qid, logsz + 1); + + return 0; +} + +/* Check interrupt queue status, IPSR */ +static irqreturn_t riscv_iommu_queue_ipsr(int irq, void *data) +{ + struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data; + + if (riscv_iommu_readl(queue->iommu, RISCV_IOMMU_REG_IPSR) & Q_IPSR(queue)) + return IRQ_WAKE_THREAD; + + return IRQ_NONE; +} + +static int riscv_iommu_queue_vec(struct riscv_iommu_device *iommu, int n) +{ + /* Reuse ICVEC.CIV mask for all interrupt vectors mapping. */ + return (iommu->icvec >> (n * 4)) & RISCV_IOMMU_ICVEC_CIV; +} + +/* + * Enable queue processing in the hardware, register interrupt handler. + * + * @queue - data structure, already allocated with riscv_iommu_queue_alloc() + * @irq_handler - threaded interrupt handler. + */ +static int riscv_iommu_queue_enable(struct riscv_iommu_device *iommu, + struct riscv_iommu_queue *queue, + irq_handler_t irq_handler) +{ + const unsigned int irq = iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)]; + u32 csr; + int rc; + + if (queue->iommu) + return -EBUSY; + + /* Polling not implemented */ + if (!irq) + return -ENODEV; + + queue->iommu = iommu; + rc = request_threaded_irq(irq, riscv_iommu_queue_ipsr, irq_handler, + IRQF_ONESHOT | IRQF_SHARED, + dev_name(iommu->dev), queue); + if (rc) { + queue->iommu = NULL; + return rc; + } + + /* + * Enable queue with interrupts, clear any memory fault if any. + * Wait for the hardware to acknowledge request and activate queue + * processing. + * Note: All CSR bitfields are in the same offsets for all queues. + */ + riscv_iommu_writel(iommu, queue->qcr, + RISCV_IOMMU_QUEUE_ENABLE | + RISCV_IOMMU_QUEUE_INTR_ENABLE | + RISCV_IOMMU_QUEUE_MEM_FAULT); + + riscv_iommu_readl_timeout(iommu, queue->qcr, + csr, !(csr & RISCV_IOMMU_QUEUE_BUSY), + 10, RISCV_IOMMU_QCSR_TIMEOUT); + + if (RISCV_IOMMU_QUEUE_ACTIVE != (csr & (RISCV_IOMMU_QUEUE_ACTIVE | + RISCV_IOMMU_QUEUE_BUSY | + RISCV_IOMMU_QUEUE_MEM_FAULT))) { + /* Best effort to stop and disable failing hardware queue. */ + riscv_iommu_writel(iommu, queue->qcr, 0); + free_irq(irq, queue); + queue->iommu = NULL; + dev_err(iommu->dev, "queue #%u failed to start\n", queue->qid); + return -EBUSY; + } + + /* Clear any pending interrupt flag. */ + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue)); + + return 0; +} + +/* + * Disable queue. Wait for the hardware to acknowledge request and + * stop processing enqueued requests. Report errors but continue. + */ +static void riscv_iommu_queue_disable(struct riscv_iommu_queue *queue) +{ + struct riscv_iommu_device *iommu = queue->iommu; + u32 csr; + + if (!iommu) + return; + + free_irq(iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)], queue); + riscv_iommu_writel(iommu, queue->qcr, 0); + riscv_iommu_readl_timeout(iommu, queue->qcr, + csr, !(csr & RISCV_IOMMU_QUEUE_BUSY), + 10, RISCV_IOMMU_QCSR_TIMEOUT); + + if (csr & (RISCV_IOMMU_QUEUE_ACTIVE | RISCV_IOMMU_QUEUE_BUSY)) + dev_err(iommu->dev, "fail to disable hardware queue #%u, csr 0x%x\n", + queue->qid, csr); + + queue->iommu = NULL; +} + +/* + * Returns number of available valid queue entries and the first item index. + * Update shadow producer index if necessary. + */ +static int riscv_iommu_queue_consume(struct riscv_iommu_queue *queue, + unsigned int *index) +{ + unsigned int head = atomic_read(&queue->head); + unsigned int tail = atomic_read(&queue->tail); + unsigned int last = Q_ITEM(queue, tail); + int available = (int)(tail - head); + + *index = head; + + if (available > 0) + return available; + + /* read hardware producer index, check reserved register bits are not set. */ + if (riscv_iommu_readl_timeout(queue->iommu, Q_TAIL(queue), + tail, (tail & ~queue->mask) == 0, + 0, RISCV_IOMMU_QUEUE_TIMEOUT)) { + dev_err_once(queue->iommu->dev, + "Hardware error: queue access timeout\n"); + return 0; + } + + if (tail == last) + return 0; + + /* update shadow producer index */ + return (int)(atomic_add_return((tail - last) & queue->mask, &queue->tail) - head); +} + +/* + * Release processed queue entries, should match riscv_iommu_queue_consume() calls. + */ +static void riscv_iommu_queue_release(struct riscv_iommu_queue *queue, int count) +{ + const unsigned int head = atomic_add_return(count, &queue->head); + + riscv_iommu_writel(queue->iommu, Q_HEAD(queue), Q_ITEM(queue, head)); +} + +/* Return actual consumer index based on hardware reported queue head index. */ +static unsigned int riscv_iommu_queue_cons(struct riscv_iommu_queue *queue) +{ + const unsigned int cons = atomic_read(&queue->head); + const unsigned int last = Q_ITEM(queue, cons); + unsigned int head; + + if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head, + !(head & ~queue->mask), + 0, RISCV_IOMMU_QUEUE_TIMEOUT)) + return cons; + + return cons + ((head - last) & queue->mask); +} + +/* Wait for submitted item to be processed. */ +static int riscv_iommu_queue_wait(struct riscv_iommu_queue *queue, + unsigned int index, + unsigned int timeout_us) +{ + unsigned int cons = atomic_read(&queue->head); + + /* Already processed by the consumer */ + if ((int)(cons - index) > 0) + return 0; + + /* Monitor consumer index */ + return readx_poll_timeout(riscv_iommu_queue_cons, queue, cons, + (int)(cons - index) > 0, 0, timeout_us); +} + +/* Enqueue an entry and wait to be processed if timeout_us > 0 + * + * Error handling for IOMMU hardware not responding in reasonable time + * will be added as separate patch series along with other RAS features. + * For now, only report hardware failure and continue. + */ +static unsigned int riscv_iommu_queue_send(struct riscv_iommu_queue *queue, + void *entry, size_t entry_size) +{ + unsigned int prod; + unsigned int head; + unsigned int tail; + unsigned long flags; + + /* Do not preempt submission flow. */ + local_irq_save(flags); + + /* 1. Allocate some space in the queue */ + prod = atomic_inc_return(&queue->prod) - 1; + head = atomic_read(&queue->head); + + /* 2. Wait for space availability. */ + if ((prod - head) > queue->mask) { + if (readx_poll_timeout(atomic_read, &queue->head, + head, (prod - head) < queue->mask, + 0, RISCV_IOMMU_QUEUE_TIMEOUT)) + goto err_busy; + } else if ((prod - head) == queue->mask) { + const unsigned int last = Q_ITEM(queue, head); + + if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head, + !(head & ~queue->mask) && head != last, + 0, RISCV_IOMMU_QUEUE_TIMEOUT)) + goto err_busy; + atomic_add((head - last) & queue->mask, &queue->head); + } + + /* 3. Store entry in the ring buffer */ + memcpy(queue->base + Q_ITEM(queue, prod) * entry_size, entry, entry_size); + + /* 4. Wait for all previous entries to be ready */ + if (readx_poll_timeout(atomic_read, &queue->tail, tail, prod == tail, + 0, RISCV_IOMMU_QUEUE_TIMEOUT)) + goto err_busy; + + /* + * 5. Make sure the ring buffer update (whether in normal or I/O memory) is + * completed and visible before signaling the tail doorbell to fetch + * the next command. 'fence ow, ow' + */ + dma_wmb(); + riscv_iommu_writel(queue->iommu, Q_TAIL(queue), Q_ITEM(queue, prod + 1)); + + /* + * 6. Make sure the doorbell write to the device has finished before updating + * the shadow tail index in normal memory. 'fence o, w' + */ + mmiowb(); + atomic_inc(&queue->tail); + + /* 7. Complete submission and restore local interrupts */ + local_irq_restore(flags); + + return prod; + +err_busy: + local_irq_restore(flags); + dev_err_once(queue->iommu->dev, "Hardware error: command enqueue failed\n"); + + return prod; +} + +/* + * IOMMU Command queue chapter 3.1 + */ + +/* Command queue interrupt handler thread function */ +static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data) +{ + const struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data; + unsigned int ctrl; + + /* Clear MF/CQ errors, complete error recovery to be implemented. */ + ctrl = riscv_iommu_readl(queue->iommu, queue->qcr); + if (ctrl & (RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_TO | + RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_FENCE_W_IP)) { + riscv_iommu_writel(queue->iommu, queue->qcr, ctrl); + dev_warn(queue->iommu->dev, + "Queue #%u error; fault:%d timeout:%d illegal:%d fence_w_ip:%d\n", + queue->qid, + !!(ctrl & RISCV_IOMMU_CQCSR_CQMF), + !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO), + !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL), + !!(ctrl & RISCV_IOMMU_CQCSR_FENCE_W_IP)); + } + + /* Placeholder for command queue interrupt notifiers */ + + /* Clear command interrupt pending. */ + riscv_iommu_writel(queue->iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue)); + + return IRQ_HANDLED; +} + +/* Send command to the IOMMU command queue */ +static void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu, + struct riscv_iommu_command *cmd) +{ + riscv_iommu_queue_send(&iommu->cmdq, cmd, sizeof(*cmd)); +} + +/* Send IOFENCE.C command and wait for all scheduled commands to complete. */ +static void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu, + unsigned int timeout_us) +{ + struct riscv_iommu_command cmd; + unsigned int prod; + + riscv_iommu_cmd_iofence(&cmd); + prod = riscv_iommu_queue_send(&iommu->cmdq, &cmd, sizeof(cmd)); + + if (!timeout_us) + return; + + if (riscv_iommu_queue_wait(&iommu->cmdq, prod, timeout_us)) + dev_err_once(iommu->dev, + "Hardware error: command execution timeout\n"); +} + +/* + * IOMMU Fault/Event queue chapter 3.2 + */ + +static void riscv_iommu_fault(struct riscv_iommu_device *iommu, + struct riscv_iommu_fq_record *event) +{ + unsigned int err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr); + unsigned int devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr); + + /* Placeholder for future fault handling implementation, report only. */ + if (err) + dev_warn_ratelimited(iommu->dev, + "Fault %d devid: 0x%x iotval: %llx iotval2: %llx\n", + err, devid, event->iotval, event->iotval2); +} + +/* Fault queue interrupt handler thread function */ +static irqreturn_t riscv_iommu_fltq_process(int irq, void *data) +{ + struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data; + struct riscv_iommu_device *iommu = queue->iommu; + struct riscv_iommu_fq_record *events; + unsigned int ctrl, idx; + int cnt, len; + + events = (struct riscv_iommu_fq_record *)queue->base; + + /* Clear fault interrupt pending and process all received fault events. */ + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue)); + + do { + cnt = riscv_iommu_queue_consume(queue, &idx); + for (len = 0; len < cnt; idx++, len++) + riscv_iommu_fault(iommu, &events[Q_ITEM(queue, idx)]); + riscv_iommu_queue_release(queue, cnt); + } while (cnt > 0); + + /* Clear MF/OF errors, complete error recovery to be implemented. */ + ctrl = riscv_iommu_readl(iommu, queue->qcr); + if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) { + riscv_iommu_writel(iommu, queue->qcr, ctrl); + dev_warn(iommu->dev, + "Queue #%u error; memory fault:%d overflow:%d\n", + queue->qid, + !!(ctrl & RISCV_IOMMU_FQCSR_FQMF), + !!(ctrl & RISCV_IOMMU_FQCSR_FQOF)); + } + + return IRQ_HANDLED; +} + +/* Lookup and initialize device context info structure. */ +static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu, + unsigned int devid) +{ + const bool base_format = !(iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT); + unsigned int depth; + unsigned long ddt, old, new; + void *ptr; + u8 ddi_bits[3] = { 0 }; + u64 *ddtp = NULL; + + /* Make sure the mode is valid */ + if (iommu->ddt_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL || + iommu->ddt_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL) + return NULL; + + /* + * Device id partitioning for base format: + * DDI[0]: bits 0 - 6 (1st level) (7 bits) + * DDI[1]: bits 7 - 15 (2nd level) (9 bits) + * DDI[2]: bits 16 - 23 (3rd level) (8 bits) + * + * For extended format: + * DDI[0]: bits 0 - 5 (1st level) (6 bits) + * DDI[1]: bits 6 - 14 (2nd level) (9 bits) + * DDI[2]: bits 15 - 23 (3rd level) (9 bits) + */ + if (base_format) { + ddi_bits[0] = 7; + ddi_bits[1] = 7 + 9; + ddi_bits[2] = 7 + 9 + 8; + } else { + ddi_bits[0] = 6; + ddi_bits[1] = 6 + 9; + ddi_bits[2] = 6 + 9 + 9; + } + + /* Make sure device id is within range */ + depth = iommu->ddt_mode - RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL; + if (devid >= (1 << ddi_bits[depth])) + return NULL; + + /* Get to the level of the non-leaf node that holds the device context */ + for (ddtp = iommu->ddt_root; depth-- > 0;) { + const int split = ddi_bits[depth]; + /* + * Each non-leaf node is 64bits wide and on each level + * nodes are indexed by DDI[depth]. + */ + ddtp += (devid >> split) & 0x1FF; + + /* + * Check if this node has been populated and if not + * allocate a new level and populate it. + */ + do { + ddt = READ_ONCE(*(unsigned long *)ddtp); + if (ddt & RISCV_IOMMU_DDTE_V) { + ddtp = __va(ppn_to_phys(ddt)); + break; + } + + ptr = riscv_iommu_get_pages(iommu, 0); + if (!ptr) + return NULL; + + new = phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_V; + old = cmpxchg_relaxed((unsigned long *)ddtp, ddt, new); + + if (old == ddt) { + ddtp = (u64 *)ptr; + break; + } + + /* Race setting DDT detected, re-read and retry. */ + riscv_iommu_free_pages(iommu, ptr); + } while (1); + } + + /* + * Grab the node that matches DDI[depth], note that when using base + * format the device context is 4 * 64bits, and the extended format + * is 8 * 64bits, hence the (3 - base_format) below. + */ + ddtp += (devid & ((64 << base_format) - 1)) << (3 - base_format); + + return (struct riscv_iommu_dc *)ddtp; +} + +/* + * This is best effort IOMMU translation shutdown flow. + * Disable IOMMU without waiting for hardware response. + */ +static void riscv_iommu_disable(struct riscv_iommu_device *iommu) +{ + riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, 0); + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_CQCSR, 0); + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FQCSR, 0); + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_PQCSR, 0); +} + +#define riscv_iommu_read_ddtp(iommu) ({ \ + u64 ddtp; \ + riscv_iommu_readq_timeout((iommu), RISCV_IOMMU_REG_DDTP, ddtp, \ + !(ddtp & RISCV_IOMMU_DDTP_BUSY), 10, \ + RISCV_IOMMU_DDTP_TIMEOUT); \ + ddtp; }) + +static int riscv_iommu_iodir_alloc(struct riscv_iommu_device *iommu) +{ + u64 ddtp; + unsigned int mode; + + ddtp = riscv_iommu_read_ddtp(iommu); + if (ddtp & RISCV_IOMMU_DDTP_BUSY) + return -EBUSY; + + /* + * It is optional for the hardware to report a fixed address for device + * directory root page when DDT.MODE is OFF or BARE. + */ + mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp); + if (mode == RISCV_IOMMU_DDTP_IOMMU_MODE_BARE || + mode == RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) { + /* Use WARL to discover hardware fixed DDT PPN */ + riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, + FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, mode)); + ddtp = riscv_iommu_read_ddtp(iommu); + if (ddtp & RISCV_IOMMU_DDTP_BUSY) + return -EBUSY; + + iommu->ddt_phys = ppn_to_phys(ddtp); + if (iommu->ddt_phys) + iommu->ddt_root = devm_ioremap(iommu->dev, + iommu->ddt_phys, PAGE_SIZE); + if (iommu->ddt_root) + memset(iommu->ddt_root, 0, PAGE_SIZE); + } + + if (!iommu->ddt_root) { + iommu->ddt_root = riscv_iommu_get_pages(iommu, 0); + iommu->ddt_phys = __pa(iommu->ddt_root); + } + + if (!iommu->ddt_root) + return -ENOMEM; + + return 0; +} + +/* + * Discover supported DDT modes starting from requested value, + * configure DDTP register with accepted mode and root DDT address. + * Accepted iommu->ddt_mode is updated on success. + */ +static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu, + unsigned int ddtp_mode) +{ + struct device *dev = iommu->dev; + u64 ddtp, rq_ddtp; + unsigned int mode, rq_mode = ddtp_mode; + struct riscv_iommu_command cmd; + + ddtp = riscv_iommu_read_ddtp(iommu); + if (ddtp & RISCV_IOMMU_DDTP_BUSY) + return -EBUSY; + + /* Disallow state transition from xLVL to xLVL. */ + mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp); + if (mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE && + mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF && + rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE && + rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) + return -EINVAL; + + do { + rq_ddtp = FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, rq_mode); + if (rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) + rq_ddtp |= phys_to_ppn(iommu->ddt_phys); + + riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, rq_ddtp); + ddtp = riscv_iommu_read_ddtp(iommu); + if (ddtp & RISCV_IOMMU_DDTP_BUSY) { + dev_err(dev, "timeout when setting ddtp (ddt mode: %u, read: %llx)\n", + rq_mode, ddtp); + return -EBUSY; + } + + /* Verify IOMMU hardware accepts new DDTP config. */ + mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp); + + if (rq_mode == mode) + break; + + /* Hardware mandatory DDTP mode has not been accepted. */ + if (rq_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && rq_ddtp != ddtp) { + dev_err(dev, "DDTP update failed hw: %llx vs %llx\n", + ddtp, rq_ddtp); + return -EINVAL; + } + + /* + * Mode field is WARL, an IOMMU may support a subset of + * directory table levels in which case if we tried to set + * an unsupported number of levels we'll readback either + * a valid xLVL or off/bare. If we got off/bare, try again + * with a smaller xLVL. + */ + if (mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && + rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL) { + dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode); + rq_mode--; + continue; + } + + /* + * We tried all supported modes and IOMMU hardware failed to + * accept new settings, something went very wrong since off/bare + * and at least one xLVL must be supported. + */ + dev_err(dev, "DDTP hw mode %u, failed to set %u\n", + mode, ddtp_mode); + return -EINVAL; + } while (1); + + iommu->ddt_mode = mode; + if (mode != ddtp_mode) + dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode); + + /* Invalidate device context cache */ + riscv_iommu_cmd_iodir_inval_ddt(&cmd); + riscv_iommu_cmd_send(iommu, &cmd); + + /* Invalidate address translation cache */ + riscv_iommu_cmd_inval_vma(&cmd); + riscv_iommu_cmd_send(iommu, &cmd); + + /* IOFENCE.C */ + riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); + + return 0; +} + +/* This struct contains protection domain specific IOMMU driver data. */ +struct riscv_iommu_domain { + struct iommu_domain domain; + struct list_head bonds; + spinlock_t lock; /* protect bonds list updates. */ + int pscid; + bool amo_enabled; + int numa_node; + unsigned int pgd_mode; + unsigned long *pgd_root; +}; + +#define iommu_domain_to_riscv(iommu_domain) \ + container_of(iommu_domain, struct riscv_iommu_domain, domain) + +/* Private IOMMU data for managed devices, dev_iommu_priv_* */ +struct riscv_iommu_info { + struct riscv_iommu_domain *domain; +}; + +/* + * Linkage between an iommu_domain and attached devices. + * + * Protection domain requiring IOATC and DevATC translation cache invalidations, + * should be linked to attached devices using a riscv_iommu_bond structure. + * Devices should be linked to the domain before first use and unlinked after + * the translations from the referenced protection domain can no longer be used. + * Blocking and identity domains are not tracked here, as the IOMMU hardware + * does not cache negative and/or identity (BARE mode) translations, and DevATC + * is disabled for those protection domains. + * + * The device pointer and IOMMU data remain stable in the bond struct after + * _probe_device() where it's attached to the managed IOMMU, up to the + * completion of the _release_device() call. The release of the bond structure + * is synchronized with the device release. + */ +struct riscv_iommu_bond { + struct list_head list; + struct rcu_head rcu; + struct device *dev; +}; + +static int riscv_iommu_bond_link(struct riscv_iommu_domain *domain, + struct device *dev) +{ + struct riscv_iommu_device *iommu = dev_to_iommu(dev); + struct riscv_iommu_bond *bond; + struct list_head *bonds; + + bond = kzalloc(sizeof(*bond), GFP_KERNEL); + if (!bond) + return -ENOMEM; + bond->dev = dev; + + /* + * List of devices attached to the domain is arranged based on + * managed IOMMU device. + */ + + spin_lock(&domain->lock); + list_for_each(bonds, &domain->bonds) + if (dev_to_iommu(list_entry(bonds, struct riscv_iommu_bond, list)->dev) == iommu) + break; + list_add_rcu(&bond->list, bonds); + spin_unlock(&domain->lock); + + /* Synchronize with riscv_iommu_iotlb_inval() sequence. See comment below. */ + smp_mb(); + + return 0; +} + +static void riscv_iommu_bond_unlink(struct riscv_iommu_domain *domain, + struct device *dev) +{ + struct riscv_iommu_device *iommu = dev_to_iommu(dev); + struct riscv_iommu_bond *bond, *found = NULL; + struct riscv_iommu_command cmd; + int count = 0; + + if (!domain) + return; + + spin_lock(&domain->lock); + list_for_each_entry(bond, &domain->bonds, list) { + if (found && count) + break; + else if (bond->dev == dev) + found = bond; + else if (dev_to_iommu(bond->dev) == iommu) + count++; + } + if (found) + list_del_rcu(&found->list); + spin_unlock(&domain->lock); + kfree_rcu(found, rcu); + + /* + * If this was the last bond between this domain and the IOMMU + * invalidate all cached entries for domain's PSCID. + */ + if (!count) { + riscv_iommu_cmd_inval_vma(&cmd); + riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid); + riscv_iommu_cmd_send(iommu, &cmd); + + riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); + } +} + +/* + * Send IOTLB.INVAL for whole address space for ranges larger than 2MB. + * This limit will be replaced with range invalidations, if supported by + * the hardware, when RISC-V IOMMU architecture specification update for + * range invalidations update will be available. + */ +#define RISCV_IOMMU_IOTLB_INVAL_LIMIT (2 << 20) + +static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain, + unsigned long start, unsigned long end) +{ + struct riscv_iommu_bond *bond; + struct riscv_iommu_device *iommu, *prev; + struct riscv_iommu_command cmd; + unsigned long len = end - start + 1; + unsigned long iova; + + /* + * For each IOMMU linked with this protection domain (via bonds->dev), + * an IOTLB invaliation command will be submitted and executed. + * + * Possbile race with domain attach flow is handled by sequencing + * bond creation - riscv_iommu_bond_link(), and device directory + * update - riscv_iommu_iodir_update(). + * + * PTE Update / IOTLB Inval Device attach & directory update + * -------------------------- -------------------------- + * update page table entries add dev to the bond list + * FENCE RW,RW FENCE RW,RW + * For all IOMMUs: (can be empty) Update FSC/PSCID + * FENCE IOW,IOW FENCE IOW,IOW + * IOTLB.INVAL IODIR.INVAL + * IOFENCE.C + * + * If bond list is not updated with new device, directory context will + * be configured with already valid page table content. If an IOMMU is + * linked to the protection domain it will receive invalidation + * requests for updated page table entries. + */ + smp_mb(); + + rcu_read_lock(); + + prev = NULL; + list_for_each_entry_rcu(bond, &domain->bonds, list) { + iommu = dev_to_iommu(bond->dev); + + /* + * IOTLB invalidation request can be safely omitted if already sent + * to the IOMMU for the same PSCID, and with domain->bonds list + * arranged based on the device's IOMMU, it's sufficient to check + * last device the invalidation was sent to. + */ + if (iommu == prev) + continue; + + riscv_iommu_cmd_inval_vma(&cmd); + riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid); + if (len && len < RISCV_IOMMU_IOTLB_INVAL_LIMIT) { + for (iova = start; iova < end; iova += PAGE_SIZE) { + riscv_iommu_cmd_inval_set_addr(&cmd, iova); + riscv_iommu_cmd_send(iommu, &cmd); + } + } else { + riscv_iommu_cmd_send(iommu, &cmd); + } + prev = iommu; + } + + prev = NULL; + list_for_each_entry_rcu(bond, &domain->bonds, list) { + iommu = dev_to_iommu(bond->dev); + if (iommu == prev) + continue; + + riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); + prev = iommu; + } + rcu_read_unlock(); +} + +#define RISCV_IOMMU_FSC_BARE 0 + +/* + * Update IODIR for the device. + * + * During the execution of riscv_iommu_probe_device(), IODIR entries are + * allocated for the device's identifiers. Device context invalidation + * becomes necessary only if one of the updated entries was previously + * marked as valid, given that invalid device context entries are not + * cached by the IOMMU hardware. + * In this implementation, updating a valid device context while the + * device is not quiesced might be disruptive, potentially causing + * interim translation faults. + */ +static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu, + struct device *dev, u64 fsc, u64 ta) +{ + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct riscv_iommu_dc *dc; + struct riscv_iommu_command cmd; + bool sync_required = false; + u64 tc; + int i; + + for (i = 0; i < fwspec->num_ids; i++) { + dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]); + tc = READ_ONCE(dc->tc); + if (!(tc & RISCV_IOMMU_DC_TC_V)) + continue; + + WRITE_ONCE(dc->tc, tc & ~RISCV_IOMMU_DC_TC_V); + + /* Invalidate device context cached values */ + riscv_iommu_cmd_iodir_inval_ddt(&cmd); + riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]); + riscv_iommu_cmd_send(iommu, &cmd); + sync_required = true; + } + + if (sync_required) + riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); + + /* + * For device context with DC_TC_PDTV = 0, translation attributes valid bit + * is stored as DC_TC_V bit (both sharing the same location at BIT(0)). + */ + for (i = 0; i < fwspec->num_ids; i++) { + dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]); + tc = READ_ONCE(dc->tc); + tc |= ta & RISCV_IOMMU_DC_TC_V; + + WRITE_ONCE(dc->fsc, fsc); + WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID); + /* Update device context, write TC.V as the last step. */ + dma_wmb(); + WRITE_ONCE(dc->tc, tc); + + /* Invalidate device context after update */ + riscv_iommu_cmd_iodir_inval_ddt(&cmd); + riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]); + riscv_iommu_cmd_send(iommu, &cmd); + } + + riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); +} + +/* + * IOVA page translation tree management. + */ + +static void riscv_iommu_iotlb_flush_all(struct iommu_domain *iommu_domain) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + + riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX); +} + +static void riscv_iommu_iotlb_sync(struct iommu_domain *iommu_domain, + struct iommu_iotlb_gather *gather) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + + riscv_iommu_iotlb_inval(domain, gather->start, gather->end); +} + +#define PT_SHIFT (PAGE_SHIFT - ilog2(sizeof(pte_t))) + +#define _io_pte_present(pte) ((pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE)) +#define _io_pte_leaf(pte) ((pte) & _PAGE_LEAF) +#define _io_pte_none(pte) ((pte) == 0) +#define _io_pte_entry(pn, prot) ((_PAGE_PFN_MASK & ((pn) << _PAGE_PFN_SHIFT)) | (prot)) + +static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain, + unsigned long pte, struct list_head *freelist) +{ + unsigned long *ptr; + int i; + + if (!_io_pte_present(pte) || _io_pte_leaf(pte)) + return; + + ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte)); + + /* Recursively free all sub page table pages */ + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = READ_ONCE(ptr[i]); + if (!_io_pte_none(pte) && cmpxchg_relaxed(ptr + i, pte, 0) == pte) + riscv_iommu_pte_free(domain, pte, freelist); + } + + if (freelist) + list_add_tail(&virt_to_page(ptr)->lru, freelist); + else + iommu_free_page(ptr); +} + +static unsigned long *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain, + unsigned long iova, size_t pgsize, + gfp_t gfp) +{ + unsigned long *ptr = domain->pgd_root; + unsigned long pte, old; + int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2; + void *addr; + + do { + const int shift = PAGE_SHIFT + PT_SHIFT * level; + + ptr += ((iova >> shift) & (PTRS_PER_PTE - 1)); + /* + * Note: returned entry might be a non-leaf if there was + * existing mapping with smaller granularity. Up to the caller + * to replace and invalidate. + */ + if (((size_t)1 << shift) == pgsize) + return ptr; +pte_retry: + pte = READ_ONCE(*ptr); + /* + * This is very likely incorrect as we should not be adding + * new mapping with smaller granularity on top + * of existing 2M/1G mapping. Fail. + */ + if (_io_pte_present(pte) && _io_pte_leaf(pte)) + return NULL; + /* + * Non-leaf entry is missing, allocate and try to add to the + * page table. This might race with other mappings, retry. + */ + if (_io_pte_none(pte)) { + addr = iommu_alloc_page_node(domain->numa_node, gfp); + if (!addr) + return NULL; + old = pte; + pte = _io_pte_entry(virt_to_pfn(addr), _PAGE_TABLE); + if (cmpxchg_relaxed(ptr, old, pte) != old) { + iommu_free_page(addr); + goto pte_retry; + } + } + ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte)); + } while (level-- > 0); + + return NULL; +} + +static unsigned long *riscv_iommu_pte_fetch(struct riscv_iommu_domain *domain, + unsigned long iova, size_t *pte_pgsize) +{ + unsigned long *ptr = domain->pgd_root; + unsigned long pte; + int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2; + + do { + const int shift = PAGE_SHIFT + PT_SHIFT * level; + + ptr += ((iova >> shift) & (PTRS_PER_PTE - 1)); + pte = READ_ONCE(*ptr); + if (_io_pte_present(pte) && _io_pte_leaf(pte)) { + *pte_pgsize = (size_t)1 << shift; + return ptr; + } + if (_io_pte_none(pte)) + return NULL; + ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte)); + } while (level-- > 0); + + return NULL; +} + +static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain, + unsigned long iova, phys_addr_t phys, + size_t pgsize, size_t pgcount, int prot, + gfp_t gfp, size_t *mapped) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + size_t size = 0; + unsigned long *ptr; + unsigned long pte, old, pte_prot; + int rc = 0; + LIST_HEAD(freelist); + + if (!(prot & IOMMU_WRITE)) + pte_prot = _PAGE_BASE | _PAGE_READ; + else if (domain->amo_enabled) + pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE; + else + pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY; + + while (pgcount) { + ptr = riscv_iommu_pte_alloc(domain, iova, pgsize, gfp); + if (!ptr) { + rc = -ENOMEM; + break; + } + + old = READ_ONCE(*ptr); + pte = _io_pte_entry(phys_to_pfn(phys), pte_prot); + if (cmpxchg_relaxed(ptr, old, pte) != old) + continue; + + riscv_iommu_pte_free(domain, old, &freelist); + + size += pgsize; + iova += pgsize; + phys += pgsize; + --pgcount; + } + + *mapped = size; + + if (!list_empty(&freelist)) { + /* + * In 1.0 spec version, the smallest scope we can use to + * invalidate all levels of page table (i.e. leaf and non-leaf) + * is an invalidate-all-PSCID IOTINVAL.VMA with AV=0. + * This will be updated with hardware support for + * capability.NL (non-leaf) IOTINVAL command. + */ + riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX); + iommu_put_pages_list(&freelist); + } + + return rc; +} + +static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain, + unsigned long iova, size_t pgsize, + size_t pgcount, + struct iommu_iotlb_gather *gather) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + size_t size = pgcount << __ffs(pgsize); + unsigned long *ptr, old; + size_t unmapped = 0; + size_t pte_size; + + while (unmapped < size) { + ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size); + if (!ptr) + return unmapped; + + /* partial unmap is not allowed, fail. */ + if (iova & (pte_size - 1)) + return unmapped; + + old = READ_ONCE(*ptr); + if (cmpxchg_relaxed(ptr, old, 0) != old) + continue; + + iommu_iotlb_gather_add_page(&domain->domain, gather, iova, + pte_size); + + iova += pte_size; + unmapped += pte_size; + } + + return unmapped; +} + +static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain, + dma_addr_t iova) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + unsigned long pte_size; + unsigned long *ptr; + + ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size); + if (_io_pte_none(*ptr) || !_io_pte_present(*ptr)) + return 0; + + return pfn_to_phys(__page_val_to_pfn(*ptr)) | (iova & (pte_size - 1)); +} + +static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + const unsigned long pfn = virt_to_pfn(domain->pgd_root); + + WARN_ON(!list_empty(&domain->bonds)); + + if ((int)domain->pscid > 0) + ida_free(&riscv_iommu_pscids, domain->pscid); + + riscv_iommu_pte_free(domain, _io_pte_entry(pfn, _PAGE_TABLE), NULL); + kfree(domain); +} + +static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_mode) +{ + switch (pgd_mode) { + case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39: + return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39; + + case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48: + return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48; + + case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57: + return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57; + } + return false; +} + +static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain, + struct device *dev) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + struct riscv_iommu_device *iommu = dev_to_iommu(dev); + struct riscv_iommu_info *info = dev_iommu_priv_get(dev); + u64 fsc, ta; + + if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode)) + return -ENODEV; + + fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) | + FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root)); + ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) | + RISCV_IOMMU_PC_TA_V; + + if (riscv_iommu_bond_link(domain, dev)) + return -ENOMEM; + + riscv_iommu_iodir_update(iommu, dev, fsc, ta); + riscv_iommu_bond_unlink(info->domain, dev); + info->domain = domain; + + return 0; +} + +static const struct iommu_domain_ops riscv_iommu_paging_domain_ops = { + .attach_dev = riscv_iommu_attach_paging_domain, + .free = riscv_iommu_free_paging_domain, + .map_pages = riscv_iommu_map_pages, + .unmap_pages = riscv_iommu_unmap_pages, + .iova_to_phys = riscv_iommu_iova_to_phys, + .iotlb_sync = riscv_iommu_iotlb_sync, + .flush_iotlb_all = riscv_iommu_iotlb_flush_all, +}; + +static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev) +{ + struct riscv_iommu_domain *domain; + struct riscv_iommu_device *iommu; + unsigned int pgd_mode; + dma_addr_t va_mask; + int va_bits; + + iommu = dev_to_iommu(dev); + if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57) { + pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57; + va_bits = 57; + } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48) { + pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48; + va_bits = 48; + } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39) { + pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39; + va_bits = 39; + } else { + dev_err(dev, "cannot find supported page table mode\n"); + return ERR_PTR(-ENODEV); + } + + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD_RCU(&domain->bonds); + spin_lock_init(&domain->lock); + domain->numa_node = dev_to_node(iommu->dev); + domain->amo_enabled = !!(iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD); + domain->pgd_mode = pgd_mode; + domain->pgd_root = iommu_alloc_page_node(domain->numa_node, + GFP_KERNEL_ACCOUNT); + if (!domain->pgd_root) { + kfree(domain); + return ERR_PTR(-ENOMEM); + } + + domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1, + RISCV_IOMMU_MAX_PSCID, GFP_KERNEL); + if (domain->pscid < 0) { + iommu_free_page(domain->pgd_root); + kfree(domain); + return ERR_PTR(-ENOMEM); + } + + /* + * Note: RISC-V Privilege spec mandates that virtual addresses + * need to be sign-extended, so if (VA_BITS - 1) is set, all + * bits >= VA_BITS need to also be set or else we'll get a + * page fault. However the code that creates the mappings + * above us (e.g. iommu_dma_alloc_iova()) won't do that for us + * for now, so we'll end up with invalid virtual addresses + * to map. As a workaround until we get this sorted out + * limit the available virtual addresses to VA_BITS - 1. + */ + va_mask = DMA_BIT_MASK(va_bits - 1); + + domain->domain.geometry.aperture_start = 0; + domain->domain.geometry.aperture_end = va_mask; + domain->domain.geometry.force_aperture = true; + domain->domain.pgsize_bitmap = va_mask & (SZ_4K | SZ_2M | SZ_1G | SZ_512G); + + domain->domain.ops = &riscv_iommu_paging_domain_ops; + + return &domain->domain; +} + +static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain, + struct device *dev) +{ + struct riscv_iommu_device *iommu = dev_to_iommu(dev); + struct riscv_iommu_info *info = dev_iommu_priv_get(dev); + + /* Make device context invalid, translation requests will fault w/ #258 */ + riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0); + riscv_iommu_bond_unlink(info->domain, dev); + info->domain = NULL; + + return 0; +} + +static struct iommu_domain riscv_iommu_blocking_domain = { + .type = IOMMU_DOMAIN_BLOCKED, + .ops = &(const struct iommu_domain_ops) { + .attach_dev = riscv_iommu_attach_blocking_domain, + } +}; + +static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain, + struct device *dev) +{ + struct riscv_iommu_device *iommu = dev_to_iommu(dev); + struct riscv_iommu_info *info = dev_iommu_priv_get(dev); + + riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V); + riscv_iommu_bond_unlink(info->domain, dev); + info->domain = NULL; + + return 0; +} + +static struct iommu_domain riscv_iommu_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &(const struct iommu_domain_ops) { + .attach_dev = riscv_iommu_attach_identity_domain, + } +}; + +static struct iommu_group *riscv_iommu_device_group(struct device *dev) +{ + if (dev_is_pci(dev)) + return pci_device_group(dev); + return generic_device_group(dev); +} + +static int riscv_iommu_of_xlate(struct device *dev, const struct of_phandle_args *args) +{ + return iommu_fwspec_add_ids(dev, args->args, 1); +} + +static struct iommu_device *riscv_iommu_probe_device(struct device *dev) +{ + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct riscv_iommu_device *iommu; + struct riscv_iommu_info *info; + struct riscv_iommu_dc *dc; + u64 tc; + int i; + + if (!fwspec || !fwspec->iommu_fwnode->dev || !fwspec->num_ids) + return ERR_PTR(-ENODEV); + + iommu = dev_get_drvdata(fwspec->iommu_fwnode->dev); + if (!iommu) + return ERR_PTR(-ENODEV); + + /* + * IOMMU hardware operating in fail-over BARE mode will provide + * identity translation for all connected devices anyway... + */ + if (iommu->ddt_mode <= RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) + return ERR_PTR(-ENODEV); + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + /* + * Allocate and pre-configure device context entries in + * the device directory. Do not mark the context valid yet. + */ + tc = 0; + if (iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD) + tc |= RISCV_IOMMU_DC_TC_SADE; + for (i = 0; i < fwspec->num_ids; i++) { + dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]); + if (!dc) { + kfree(info); + return ERR_PTR(-ENODEV); + } + if (READ_ONCE(dc->tc) & RISCV_IOMMU_DC_TC_V) + dev_warn(dev, "already attached to IOMMU device directory\n"); + WRITE_ONCE(dc->tc, tc); + } + + dev_iommu_priv_set(dev, info); + + return &iommu->iommu; +} + +static void riscv_iommu_release_device(struct device *dev) +{ + struct riscv_iommu_info *info = dev_iommu_priv_get(dev); + + kfree_rcu_mightsleep(info); +} + +static const struct iommu_ops riscv_iommu_ops = { + .pgsize_bitmap = SZ_4K, + .of_xlate = riscv_iommu_of_xlate, + .identity_domain = &riscv_iommu_identity_domain, + .blocked_domain = &riscv_iommu_blocking_domain, + .release_domain = &riscv_iommu_blocking_domain, + .domain_alloc_paging = riscv_iommu_alloc_paging_domain, + .device_group = riscv_iommu_device_group, + .probe_device = riscv_iommu_probe_device, + .release_device = riscv_iommu_release_device, +}; + +static int riscv_iommu_init_check(struct riscv_iommu_device *iommu) +{ + u64 ddtp; + + /* + * Make sure the IOMMU is switched off or in pass-through mode during + * regular boot flow and disable translation when we boot into a kexec + * kernel and the previous kernel left them enabled. + */ + ddtp = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_DDTP); + if (ddtp & RISCV_IOMMU_DDTP_BUSY) + return -EBUSY; + + if (FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp) > + RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) { + if (!is_kdump_kernel()) + return -EBUSY; + riscv_iommu_disable(iommu); + } + + /* Configure accesses to in-memory data structures for CPU-native byte order. */ + if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) != + !!(iommu->fctl & RISCV_IOMMU_FCTL_BE)) { + if (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_END)) + return -EINVAL; + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL, + iommu->fctl ^ RISCV_IOMMU_FCTL_BE); + iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL); + if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) != + !!(iommu->fctl & RISCV_IOMMU_FCTL_BE)) + return -EINVAL; + } + + /* + * Distribute interrupt vectors, always use first vector for CIV. + * At least one interrupt is required. Read back and verify. + */ + if (!iommu->irqs_count) + return -EINVAL; + + iommu->icvec = FIELD_PREP(RISCV_IOMMU_ICVEC_FIV, 1 % iommu->irqs_count) | + FIELD_PREP(RISCV_IOMMU_ICVEC_PIV, 2 % iommu->irqs_count) | + FIELD_PREP(RISCV_IOMMU_ICVEC_PMIV, 3 % iommu->irqs_count); + riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_ICVEC, iommu->icvec); + iommu->icvec = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_ICVEC); + if (max(max(FIELD_GET(RISCV_IOMMU_ICVEC_CIV, iommu->icvec), + FIELD_GET(RISCV_IOMMU_ICVEC_FIV, iommu->icvec)), + max(FIELD_GET(RISCV_IOMMU_ICVEC_PIV, iommu->icvec), + FIELD_GET(RISCV_IOMMU_ICVEC_PMIV, iommu->icvec))) >= iommu->irqs_count) + return -EINVAL; + + return 0; +} + +void riscv_iommu_remove(struct riscv_iommu_device *iommu) +{ + iommu_device_unregister(&iommu->iommu); + iommu_device_sysfs_remove(&iommu->iommu); + riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF); + riscv_iommu_queue_disable(&iommu->cmdq); + riscv_iommu_queue_disable(&iommu->fltq); +} + +int riscv_iommu_init(struct riscv_iommu_device *iommu) +{ + int rc; + + RISCV_IOMMU_QUEUE_INIT(&iommu->cmdq, CQ); + RISCV_IOMMU_QUEUE_INIT(&iommu->fltq, FQ); + + rc = riscv_iommu_init_check(iommu); + if (rc) + return dev_err_probe(iommu->dev, rc, "unexpected device state\n"); + + rc = riscv_iommu_iodir_alloc(iommu); + if (rc) + return rc; + + rc = riscv_iommu_queue_alloc(iommu, &iommu->cmdq, + sizeof(struct riscv_iommu_command)); + if (rc) + return rc; + + rc = riscv_iommu_queue_alloc(iommu, &iommu->fltq, + sizeof(struct riscv_iommu_fq_record)); + if (rc) + return rc; + + rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq, riscv_iommu_cmdq_process); + if (rc) + return rc; + + rc = riscv_iommu_queue_enable(iommu, &iommu->fltq, riscv_iommu_fltq_process); + if (rc) + goto err_queue_disable; + + rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_MAX); + if (rc) + goto err_queue_disable; + + rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s", + dev_name(iommu->dev)); + if (rc) { + dev_err_probe(iommu->dev, rc, "cannot register sysfs interface\n"); + goto err_iodir_off; + } + + rc = iommu_device_register(&iommu->iommu, &riscv_iommu_ops, iommu->dev); + if (rc) { + dev_err_probe(iommu->dev, rc, "cannot register iommu interface\n"); + goto err_remove_sysfs; + } + + return 0; + +err_remove_sysfs: + iommu_device_sysfs_remove(&iommu->iommu); +err_iodir_off: + riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF); +err_queue_disable: + riscv_iommu_queue_disable(&iommu->fltq); + riscv_iommu_queue_disable(&iommu->cmdq); + return rc; +} |