diff options
Diffstat (limited to 'drivers/iommu/arm-smmu-v3.c')
-rw-r--r-- | drivers/iommu/arm-smmu-v3.c | 1597 |
1 files changed, 1199 insertions, 398 deletions
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index c5c93e48b4db..aa3ac2a03807 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -21,8 +21,7 @@ #include <linux/io-pgtable.h> #include <linux/iommu.h> #include <linux/iopoll.h> -#include <linux/init.h> -#include <linux/moduleparam.h> +#include <linux/module.h> #include <linux/msi.h> #include <linux/of.h> #include <linux/of_address.h> @@ -181,12 +180,13 @@ #define ARM_SMMU_MEMATTR_DEVICE_nGnRE 0x1 #define ARM_SMMU_MEMATTR_OIWB 0xf -#define Q_IDX(q, p) ((p) & ((1 << (q)->max_n_shift) - 1)) -#define Q_WRP(q, p) ((p) & (1 << (q)->max_n_shift)) -#define Q_OVERFLOW_FLAG (1 << 31) -#define Q_OVF(q, p) ((p) & Q_OVERFLOW_FLAG) +#define Q_IDX(llq, p) ((p) & ((1 << (llq)->max_n_shift) - 1)) +#define Q_WRP(llq, p) ((p) & (1 << (llq)->max_n_shift)) +#define Q_OVERFLOW_FLAG (1U << 31) +#define Q_OVF(p) ((p) & Q_OVERFLOW_FLAG) #define Q_ENT(q, p) ((q)->base + \ - Q_IDX(q, p) * (q)->ent_dwords) + Q_IDX(&((q)->llq), p) * \ + (q)->ent_dwords) #define Q_BASE_RWA (1UL << 62) #define Q_BASE_ADDR_MASK GENMASK_ULL(51, 5) @@ -223,9 +223,15 @@ #define STRTAB_STE_0_S1FMT GENMASK_ULL(5, 4) #define STRTAB_STE_0_S1FMT_LINEAR 0 +#define STRTAB_STE_0_S1FMT_64K_L2 2 #define STRTAB_STE_0_S1CTXPTR_MASK GENMASK_ULL(51, 6) #define STRTAB_STE_0_S1CDMAX GENMASK_ULL(63, 59) +#define STRTAB_STE_1_S1DSS GENMASK_ULL(1, 0) +#define STRTAB_STE_1_S1DSS_TERMINATE 0x0 +#define STRTAB_STE_1_S1DSS_BYPASS 0x1 +#define STRTAB_STE_1_S1DSS_SSID0 0x2 + #define STRTAB_STE_1_S1C_CACHE_NC 0UL #define STRTAB_STE_1_S1C_CACHE_WBRA 1UL #define STRTAB_STE_1_S1C_CACHE_WT 2UL @@ -250,6 +256,13 @@ #define STRTAB_STE_2_S2VMID GENMASK_ULL(15, 0) #define STRTAB_STE_2_VTCR GENMASK_ULL(50, 32) +#define STRTAB_STE_2_VTCR_S2T0SZ GENMASK_ULL(5, 0) +#define STRTAB_STE_2_VTCR_S2SL0 GENMASK_ULL(7, 6) +#define STRTAB_STE_2_VTCR_S2IR0 GENMASK_ULL(9, 8) +#define STRTAB_STE_2_VTCR_S2OR0 GENMASK_ULL(11, 10) +#define STRTAB_STE_2_VTCR_S2SH0 GENMASK_ULL(13, 12) +#define STRTAB_STE_2_VTCR_S2TG GENMASK_ULL(15, 14) +#define STRTAB_STE_2_VTCR_S2PS GENMASK_ULL(18, 16) #define STRTAB_STE_2_S2AA64 (1UL << 51) #define STRTAB_STE_2_S2ENDI (1UL << 52) #define STRTAB_STE_2_S2PTW (1UL << 54) @@ -257,30 +270,34 @@ #define STRTAB_STE_3_S2TTB_MASK GENMASK_ULL(51, 4) -/* Context descriptor (stage-1 only) */ +/* + * Context descriptors. + * + * Linear: when less than 1024 SSIDs are supported + * 2lvl: at most 1024 L1 entries, + * 1024 lazy entries per table. + */ +#define CTXDESC_SPLIT 10 +#define CTXDESC_L2_ENTRIES (1 << CTXDESC_SPLIT) + +#define CTXDESC_L1_DESC_DWORDS 1 +#define CTXDESC_L1_DESC_V (1UL << 0) +#define CTXDESC_L1_DESC_L2PTR_MASK GENMASK_ULL(51, 12) + #define CTXDESC_CD_DWORDS 8 #define CTXDESC_CD_0_TCR_T0SZ GENMASK_ULL(5, 0) -#define ARM64_TCR_T0SZ GENMASK_ULL(5, 0) #define CTXDESC_CD_0_TCR_TG0 GENMASK_ULL(7, 6) -#define ARM64_TCR_TG0 GENMASK_ULL(15, 14) #define CTXDESC_CD_0_TCR_IRGN0 GENMASK_ULL(9, 8) -#define ARM64_TCR_IRGN0 GENMASK_ULL(9, 8) #define CTXDESC_CD_0_TCR_ORGN0 GENMASK_ULL(11, 10) -#define ARM64_TCR_ORGN0 GENMASK_ULL(11, 10) #define CTXDESC_CD_0_TCR_SH0 GENMASK_ULL(13, 12) -#define ARM64_TCR_SH0 GENMASK_ULL(13, 12) #define CTXDESC_CD_0_TCR_EPD0 (1ULL << 14) -#define ARM64_TCR_EPD0 (1ULL << 7) #define CTXDESC_CD_0_TCR_EPD1 (1ULL << 30) -#define ARM64_TCR_EPD1 (1ULL << 23) #define CTXDESC_CD_0_ENDI (1UL << 15) #define CTXDESC_CD_0_V (1UL << 31) #define CTXDESC_CD_0_TCR_IPS GENMASK_ULL(34, 32) -#define ARM64_TCR_IPS GENMASK_ULL(34, 32) #define CTXDESC_CD_0_TCR_TBI0 (1ULL << 38) -#define ARM64_TCR_TBI0 (1ULL << 37) #define CTXDESC_CD_0_AA64 (1UL << 41) #define CTXDESC_CD_0_S (1UL << 44) @@ -291,9 +308,11 @@ #define CTXDESC_CD_1_TTB0_MASK GENMASK_ULL(51, 4) -/* Convert between AArch64 (CPU) TCR format and SMMU CD format */ -#define ARM_SMMU_TCR2CD(tcr, fld) FIELD_PREP(CTXDESC_CD_0_TCR_##fld, \ - FIELD_GET(ARM64_TCR_##fld, tcr)) +/* + * When the SMMU only supports linear context descriptor tables, pick a + * reasonable size limit (64kB). + */ +#define CTXDESC_LINEAR_CDMAX ilog2(SZ_64K / (CTXDESC_CD_DWORDS << 3)) /* Command queue */ #define CMDQ_ENT_SZ_SHIFT 4 @@ -306,6 +325,15 @@ #define CMDQ_ERR_CERROR_ABT_IDX 2 #define CMDQ_ERR_CERROR_ATC_INV_IDX 3 +#define CMDQ_PROD_OWNED_FLAG Q_OVERFLOW_FLAG + +/* + * This is used to size the command queue and therefore must be at least + * BITS_PER_LONG so that the valid_map works correctly (it relies on the + * total number of queue entries being a multiple of BITS_PER_LONG). + */ +#define CMDQ_BATCH_ENTRIES BITS_PER_LONG + #define CMDQ_0_OP GENMASK_ULL(7, 0) #define CMDQ_0_SSV (1UL << 11) @@ -313,6 +341,7 @@ #define CMDQ_PREFETCH_1_SIZE GENMASK_ULL(4, 0) #define CMDQ_PREFETCH_1_ADDR_MASK GENMASK_ULL(63, 12) +#define CMDQ_CFGI_0_SSID GENMASK_ULL(31, 12) #define CMDQ_CFGI_0_SID GENMASK_ULL(63, 32) #define CMDQ_CFGI_1_LEAF (1UL << 0) #define CMDQ_CFGI_1_RANGE GENMASK_ULL(4, 0) @@ -368,17 +397,12 @@ #define PRIQ_1_ADDR_MASK GENMASK_ULL(63, 12) /* High-level queue structures */ -#define ARM_SMMU_POLL_TIMEOUT_US 100 -#define ARM_SMMU_CMDQ_SYNC_TIMEOUT_US 1000000 /* 1s! */ -#define ARM_SMMU_CMDQ_SYNC_SPIN_COUNT 10 +#define ARM_SMMU_POLL_TIMEOUT_US 1000000 /* 1s! */ +#define ARM_SMMU_POLL_SPIN_COUNT 10 #define MSI_IOVA_BASE 0x8000000 #define MSI_IOVA_LENGTH 0x100000 -/* - * not really modular, but the easiest way to keep compat with existing - * bootargs behaviour is to continue using module_param_named here. - */ static bool disable_bypass = 1; module_param_named(disable_bypass, disable_bypass, bool, S_IRUGO); MODULE_PARM_DESC(disable_bypass, @@ -431,8 +455,11 @@ struct arm_smmu_cmdq_ent { #define CMDQ_OP_CFGI_STE 0x3 #define CMDQ_OP_CFGI_ALL 0x4 + #define CMDQ_OP_CFGI_CD 0x5 + #define CMDQ_OP_CFGI_CD_ALL 0x6 struct { u32 sid; + u32 ssid; union { bool leaf; u8 span; @@ -472,13 +499,29 @@ struct arm_smmu_cmdq_ent { #define CMDQ_OP_CMD_SYNC 0x46 struct { - u32 msidata; u64 msiaddr; } sync; }; }; +struct arm_smmu_ll_queue { + union { + u64 val; + struct { + u32 prod; + u32 cons; + }; + struct { + atomic_t prod; + atomic_t cons; + } atomic; + u8 __pad[SMP_CACHE_BYTES]; + } ____cacheline_aligned_in_smp; + u32 max_n_shift; +}; + struct arm_smmu_queue { + struct arm_smmu_ll_queue llq; int irq; /* Wired interrupt */ __le64 *base; @@ -486,17 +529,23 @@ struct arm_smmu_queue { u64 q_base; size_t ent_dwords; - u32 max_n_shift; - u32 prod; - u32 cons; u32 __iomem *prod_reg; u32 __iomem *cons_reg; }; +struct arm_smmu_queue_poll { + ktime_t timeout; + unsigned int delay; + unsigned int spin_cnt; + bool wfe; +}; + struct arm_smmu_cmdq { struct arm_smmu_queue q; - spinlock_t lock; + atomic_long_t *valid_map; + atomic_t owner_prod; + atomic_t lock; }; struct arm_smmu_evtq { @@ -516,16 +565,30 @@ struct arm_smmu_strtab_l1_desc { dma_addr_t l2ptr_dma; }; +struct arm_smmu_ctx_desc { + u16 asid; + u64 ttbr; + u64 tcr; + u64 mair; +}; + +struct arm_smmu_l1_ctx_desc { + __le64 *l2ptr; + dma_addr_t l2ptr_dma; +}; + +struct arm_smmu_ctx_desc_cfg { + __le64 *cdtab; + dma_addr_t cdtab_dma; + struct arm_smmu_l1_ctx_desc *l1_desc; + unsigned int num_l1_ents; +}; + struct arm_smmu_s1_cfg { - __le64 *cdptr; - dma_addr_t cdptr_dma; - - struct arm_smmu_ctx_desc { - u16 asid; - u64 ttbr; - u64 tcr; - u64 mair; - } cd; + struct arm_smmu_ctx_desc_cfg cdcfg; + struct arm_smmu_ctx_desc cd; + u8 s1fmt; + u8 s1cdmax; }; struct arm_smmu_s2_cfg { @@ -576,8 +639,6 @@ struct arm_smmu_device { int gerr_irq; int combined_irq; - u32 sync_nr; - u8 prev_cmd_opcode; unsigned long ias; /* IPA */ unsigned long oas; /* PA */ @@ -596,12 +657,6 @@ struct arm_smmu_device { struct arm_smmu_strtab_cfg strtab_cfg; - /* Hi16xx adds an extra 32 bits of goodness to its MSI payload */ - union { - u32 sync_count; - u64 padding; - }; - /* IOMMU core code handle */ struct iommu_device iommu; }; @@ -614,7 +669,8 @@ struct arm_smmu_master { struct list_head domain_head; u32 *sids; unsigned int num_sids; - bool ats_enabled :1; + bool ats_enabled; + unsigned int ssid_bits; }; /* SMMU private data for an IOMMU domain */ @@ -631,6 +687,7 @@ struct arm_smmu_domain { struct io_pgtable_ops *pgtbl_ops; bool non_strict; + atomic_t nr_ats_masters; enum arm_smmu_domain_stage stage; union { @@ -685,85 +742,97 @@ static void parse_driver_options(struct arm_smmu_device *smmu) } /* Low-level queue manipulation functions */ -static bool queue_full(struct arm_smmu_queue *q) +static bool queue_has_space(struct arm_smmu_ll_queue *q, u32 n) +{ + u32 space, prod, cons; + + prod = Q_IDX(q, q->prod); + cons = Q_IDX(q, q->cons); + + if (Q_WRP(q, q->prod) == Q_WRP(q, q->cons)) + space = (1 << q->max_n_shift) - (prod - cons); + else + space = cons - prod; + + return space >= n; +} + +static bool queue_full(struct arm_smmu_ll_queue *q) { return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) && Q_WRP(q, q->prod) != Q_WRP(q, q->cons); } -static bool queue_empty(struct arm_smmu_queue *q) +static bool queue_empty(struct arm_smmu_ll_queue *q) { return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) && Q_WRP(q, q->prod) == Q_WRP(q, q->cons); } -static void queue_sync_cons(struct arm_smmu_queue *q) +static bool queue_consumed(struct arm_smmu_ll_queue *q, u32 prod) { - q->cons = readl_relaxed(q->cons_reg); + return ((Q_WRP(q, q->cons) == Q_WRP(q, prod)) && + (Q_IDX(q, q->cons) > Q_IDX(q, prod))) || + ((Q_WRP(q, q->cons) != Q_WRP(q, prod)) && + (Q_IDX(q, q->cons) <= Q_IDX(q, prod))); } -static void queue_inc_cons(struct arm_smmu_queue *q) +static void queue_sync_cons_out(struct arm_smmu_queue *q) { - u32 cons = (Q_WRP(q, q->cons) | Q_IDX(q, q->cons)) + 1; - - q->cons = Q_OVF(q, q->cons) | Q_WRP(q, cons) | Q_IDX(q, cons); - /* * Ensure that all CPU accesses (reads and writes) to the queue * are complete before we update the cons pointer. */ mb(); - writel_relaxed(q->cons, q->cons_reg); + writel_relaxed(q->llq.cons, q->cons_reg); +} + +static void queue_inc_cons(struct arm_smmu_ll_queue *q) +{ + u32 cons = (Q_WRP(q, q->cons) | Q_IDX(q, q->cons)) + 1; + q->cons = Q_OVF(q->cons) | Q_WRP(q, cons) | Q_IDX(q, cons); } -static int queue_sync_prod(struct arm_smmu_queue *q) +static int queue_sync_prod_in(struct arm_smmu_queue *q) { int ret = 0; u32 prod = readl_relaxed(q->prod_reg); - if (Q_OVF(q, prod) != Q_OVF(q, q->prod)) + if (Q_OVF(prod) != Q_OVF(q->llq.prod)) ret = -EOVERFLOW; - q->prod = prod; + q->llq.prod = prod; return ret; } -static void queue_inc_prod(struct arm_smmu_queue *q) +static u32 queue_inc_prod_n(struct arm_smmu_ll_queue *q, int n) { - u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + 1; - - q->prod = Q_OVF(q, q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod); - writel(q->prod, q->prod_reg); + u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + n; + return Q_OVF(q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod); } -/* - * Wait for the SMMU to consume items. If sync is true, wait until the queue - * is empty. Otherwise, wait until there is at least one free slot. - */ -static int queue_poll_cons(struct arm_smmu_queue *q, bool sync, bool wfe) +static void queue_poll_init(struct arm_smmu_device *smmu, + struct arm_smmu_queue_poll *qp) { - ktime_t timeout; - unsigned int delay = 1, spin_cnt = 0; - - /* Wait longer if it's a CMD_SYNC */ - timeout = ktime_add_us(ktime_get(), sync ? - ARM_SMMU_CMDQ_SYNC_TIMEOUT_US : - ARM_SMMU_POLL_TIMEOUT_US); + qp->delay = 1; + qp->spin_cnt = 0; + qp->wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV); + qp->timeout = ktime_add_us(ktime_get(), ARM_SMMU_POLL_TIMEOUT_US); +} - while (queue_sync_cons(q), (sync ? !queue_empty(q) : queue_full(q))) { - if (ktime_compare(ktime_get(), timeout) > 0) - return -ETIMEDOUT; +static int queue_poll(struct arm_smmu_queue_poll *qp) +{ + if (ktime_compare(ktime_get(), qp->timeout) > 0) + return -ETIMEDOUT; - if (wfe) { - wfe(); - } else if (++spin_cnt < ARM_SMMU_CMDQ_SYNC_SPIN_COUNT) { - cpu_relax(); - continue; - } else { - udelay(delay); - delay *= 2; - spin_cnt = 0; - } + if (qp->wfe) { + wfe(); + } else if (++qp->spin_cnt < ARM_SMMU_POLL_SPIN_COUNT) { + cpu_relax(); + } else { + udelay(qp->delay); + qp->delay *= 2; + qp->spin_cnt = 0; } return 0; @@ -777,16 +846,6 @@ static void queue_write(__le64 *dst, u64 *src, size_t n_dwords) *dst++ = cpu_to_le64(*src++); } -static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent) -{ - if (queue_full(q)) - return -ENOSPC; - - queue_write(Q_ENT(q, q->prod), ent, q->ent_dwords); - queue_inc_prod(q); - return 0; -} - static void queue_read(__le64 *dst, u64 *src, size_t n_dwords) { int i; @@ -797,11 +856,12 @@ static void queue_read(__le64 *dst, u64 *src, size_t n_dwords) static int queue_remove_raw(struct arm_smmu_queue *q, u64 *ent) { - if (queue_empty(q)) + if (queue_empty(&q->llq)) return -EAGAIN; - queue_read(ent, Q_ENT(q, q->cons), q->ent_dwords); - queue_inc_cons(q); + queue_read(ent, Q_ENT(q, q->llq.cons), q->ent_dwords); + queue_inc_cons(&q->llq); + queue_sync_cons_out(q); return 0; } @@ -820,15 +880,22 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent) cmd[1] |= FIELD_PREP(CMDQ_PREFETCH_1_SIZE, ent->prefetch.size); cmd[1] |= ent->prefetch.addr & CMDQ_PREFETCH_1_ADDR_MASK; break; + case CMDQ_OP_CFGI_CD: + cmd[0] |= FIELD_PREP(CMDQ_CFGI_0_SSID, ent->cfgi.ssid); + /* Fallthrough */ case CMDQ_OP_CFGI_STE: cmd[0] |= FIELD_PREP(CMDQ_CFGI_0_SID, ent->cfgi.sid); cmd[1] |= FIELD_PREP(CMDQ_CFGI_1_LEAF, ent->cfgi.leaf); break; + case CMDQ_OP_CFGI_CD_ALL: + cmd[0] |= FIELD_PREP(CMDQ_CFGI_0_SID, ent->cfgi.sid); + break; case CMDQ_OP_CFGI_ALL: /* Cover the entire SID range */ cmd[1] |= FIELD_PREP(CMDQ_CFGI_1_RANGE, 31); break; case CMDQ_OP_TLBI_NH_VA: + cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid); cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid); cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf); cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_VA_MASK; @@ -868,20 +935,14 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent) cmd[1] |= FIELD_PREP(CMDQ_PRI_1_RESP, ent->pri.resp); break; case CMDQ_OP_CMD_SYNC: - if (ent->sync.msiaddr) + if (ent->sync.msiaddr) { cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_IRQ); - else + cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK; + } else { cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_SEV); + } cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSH, ARM_SMMU_SH_ISH); cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIATTR, ARM_SMMU_MEMATTR_OIWB); - /* - * Commands are written little-endian, but we want the SMMU to - * receive MSIData, and thus write it back to memory, in CPU - * byte order, so big-endian needs an extra byteswap here. - */ - cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIDATA, - cpu_to_le32(ent->sync.msidata)); - cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK; break; default: return -ENOENT; @@ -890,6 +951,27 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent) return 0; } +static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu, + u32 prod) +{ + struct arm_smmu_queue *q = &smmu->cmdq.q; + struct arm_smmu_cmdq_ent ent = { + .opcode = CMDQ_OP_CMD_SYNC, + }; + + /* + * Beware that Hi16xx adds an extra 32 bits of goodness to its MSI + * payload, so the write will zero the entire command on that platform. + */ + if (smmu->features & ARM_SMMU_FEAT_MSI && + smmu->features & ARM_SMMU_FEAT_COHERENCY) { + ent.sync.msiaddr = q->base_dma + Q_IDX(&q->llq, prod) * + q->ent_dwords * 8; + } + + arm_smmu_cmdq_build_cmd(cmd, &ent); +} + static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu) { static const char *cerror_str[] = { @@ -948,156 +1030,691 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu) queue_write(Q_ENT(q, cons), cmd, q->ent_dwords); } -static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd) +/* + * Command queue locking. + * This is a form of bastardised rwlock with the following major changes: + * + * - The only LOCK routines are exclusive_trylock() and shared_lock(). + * Neither have barrier semantics, and instead provide only a control + * dependency. + * + * - The UNLOCK routines are supplemented with shared_tryunlock(), which + * fails if the caller appears to be the last lock holder (yes, this is + * racy). All successful UNLOCK routines have RELEASE semantics. + */ +static void arm_smmu_cmdq_shared_lock(struct arm_smmu_cmdq *cmdq) { - struct arm_smmu_queue *q = &smmu->cmdq.q; - bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV); + int val; - smmu->prev_cmd_opcode = FIELD_GET(CMDQ_0_OP, cmd[0]); + /* + * We can try to avoid the cmpxchg() loop by simply incrementing the + * lock counter. When held in exclusive state, the lock counter is set + * to INT_MIN so these increments won't hurt as the value will remain + * negative. + */ + if (atomic_fetch_inc_relaxed(&cmdq->lock) >= 0) + return; - while (queue_insert_raw(q, cmd) == -ENOSPC) { - if (queue_poll_cons(q, false, wfe)) - dev_err_ratelimited(smmu->dev, "CMDQ timeout\n"); + do { + val = atomic_cond_read_relaxed(&cmdq->lock, VAL >= 0); + } while (atomic_cmpxchg_relaxed(&cmdq->lock, val, val + 1) != val); +} + +static void arm_smmu_cmdq_shared_unlock(struct arm_smmu_cmdq *cmdq) +{ + (void)atomic_dec_return_release(&cmdq->lock); +} + +static bool arm_smmu_cmdq_shared_tryunlock(struct arm_smmu_cmdq *cmdq) +{ + if (atomic_read(&cmdq->lock) == 1) + return false; + + arm_smmu_cmdq_shared_unlock(cmdq); + return true; +} + +#define arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags) \ +({ \ + bool __ret; \ + local_irq_save(flags); \ + __ret = !atomic_cmpxchg_relaxed(&cmdq->lock, 0, INT_MIN); \ + if (!__ret) \ + local_irq_restore(flags); \ + __ret; \ +}) + +#define arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags) \ +({ \ + atomic_set_release(&cmdq->lock, 0); \ + local_irq_restore(flags); \ +}) + + +/* + * Command queue insertion. + * This is made fiddly by our attempts to achieve some sort of scalability + * since there is one queue shared amongst all of the CPUs in the system. If + * you like mixed-size concurrency, dependency ordering and relaxed atomics, + * then you'll *love* this monstrosity. + * + * The basic idea is to split the queue up into ranges of commands that are + * owned by a given CPU; the owner may not have written all of the commands + * itself, but is responsible for advancing the hardware prod pointer when + * the time comes. The algorithm is roughly: + * + * 1. Allocate some space in the queue. At this point we also discover + * whether the head of the queue is currently owned by another CPU, + * or whether we are the owner. + * + * 2. Write our commands into our allocated slots in the queue. + * + * 3. Mark our slots as valid in arm_smmu_cmdq.valid_map. + * + * 4. If we are an owner: + * a. Wait for the previous owner to finish. + * b. Mark the queue head as unowned, which tells us the range + * that we are responsible for publishing. + * c. Wait for all commands in our owned range to become valid. + * d. Advance the hardware prod pointer. + * e. Tell the next owner we've finished. + * + * 5. If we are inserting a CMD_SYNC (we may or may not have been an + * owner), then we need to stick around until it has completed: + * a. If we have MSIs, the SMMU can write back into the CMD_SYNC + * to clear the first 4 bytes. + * b. Otherwise, we spin waiting for the hardware cons pointer to + * advance past our command. + * + * The devil is in the details, particularly the use of locking for handling + * SYNC completion and freeing up space in the queue before we think that it is + * full. + */ +static void __arm_smmu_cmdq_poll_set_valid_map(struct arm_smmu_cmdq *cmdq, + u32 sprod, u32 eprod, bool set) +{ + u32 swidx, sbidx, ewidx, ebidx; + struct arm_smmu_ll_queue llq = { + .max_n_shift = cmdq->q.llq.max_n_shift, + .prod = sprod, + }; + + ewidx = BIT_WORD(Q_IDX(&llq, eprod)); + ebidx = Q_IDX(&llq, eprod) % BITS_PER_LONG; + + while (llq.prod != eprod) { + unsigned long mask; + atomic_long_t *ptr; + u32 limit = BITS_PER_LONG; + + swidx = BIT_WORD(Q_IDX(&llq, llq.prod)); + sbidx = Q_IDX(&llq, llq.prod) % BITS_PER_LONG; + + ptr = &cmdq->valid_map[swidx]; + + if ((swidx == ewidx) && (sbidx < ebidx)) + limit = ebidx; + + mask = GENMASK(limit - 1, sbidx); + + /* + * The valid bit is the inverse of the wrap bit. This means + * that a zero-initialised queue is invalid and, after marking + * all entries as valid, they become invalid again when we + * wrap. + */ + if (set) { + atomic_long_xor(mask, ptr); + } else { /* Poll */ + unsigned long valid; + + valid = (ULONG_MAX + !!Q_WRP(&llq, llq.prod)) & mask; + atomic_long_cond_read_relaxed(ptr, (VAL & mask) == valid); + } + + llq.prod = queue_inc_prod_n(&llq, limit - sbidx); } } -static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, - struct arm_smmu_cmdq_ent *ent) +/* Mark all entries in the range [sprod, eprod) as valid */ +static void arm_smmu_cmdq_set_valid_map(struct arm_smmu_cmdq *cmdq, + u32 sprod, u32 eprod) +{ + __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, true); +} + +/* Wait for all entries in the range [sprod, eprod) to become valid */ +static void arm_smmu_cmdq_poll_valid_map(struct arm_smmu_cmdq *cmdq, + u32 sprod, u32 eprod) +{ + __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, false); +} + +/* Wait for the command queue to become non-full */ +static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu, + struct arm_smmu_ll_queue *llq) { - u64 cmd[CMDQ_ENT_DWORDS]; unsigned long flags; + struct arm_smmu_queue_poll qp; + struct arm_smmu_cmdq *cmdq = &smmu->cmdq; + int ret = 0; - if (arm_smmu_cmdq_build_cmd(cmd, ent)) { - dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n", - ent->opcode); - return; + /* + * Try to update our copy of cons by grabbing exclusive cmdq access. If + * that fails, spin until somebody else updates it for us. + */ + if (arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags)) { + WRITE_ONCE(cmdq->q.llq.cons, readl_relaxed(cmdq->q.cons_reg)); + arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags); + llq->val = READ_ONCE(cmdq->q.llq.val); + return 0; } - spin_lock_irqsave(&smmu->cmdq.lock, flags); - arm_smmu_cmdq_insert_cmd(smmu, cmd); - spin_unlock_irqrestore(&smmu->cmdq.lock, flags); + queue_poll_init(smmu, &qp); + do { + llq->val = READ_ONCE(smmu->cmdq.q.llq.val); + if (!queue_full(llq)) + break; + + ret = queue_poll(&qp); + } while (!ret); + + return ret; } /* - * The difference between val and sync_idx is bounded by the maximum size of - * a queue at 2^20 entries, so 32 bits is plenty for wrap-safe arithmetic. + * Wait until the SMMU signals a CMD_SYNC completion MSI. + * Must be called with the cmdq lock held in some capacity. */ -static int __arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx) +static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu, + struct arm_smmu_ll_queue *llq) { - ktime_t timeout; - u32 val; + int ret = 0; + struct arm_smmu_queue_poll qp; + struct arm_smmu_cmdq *cmdq = &smmu->cmdq; + u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod)); - timeout = ktime_add_us(ktime_get(), ARM_SMMU_CMDQ_SYNC_TIMEOUT_US); - val = smp_cond_load_acquire(&smmu->sync_count, - (int)(VAL - sync_idx) >= 0 || - !ktime_before(ktime_get(), timeout)); + queue_poll_init(smmu, &qp); - return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0; + /* + * The MSI won't generate an event, since it's being written back + * into the command queue. + */ + qp.wfe = false; + smp_cond_load_relaxed(cmd, !VAL || (ret = queue_poll(&qp))); + llq->cons = ret ? llq->prod : queue_inc_prod_n(llq, 1); + return ret; } -static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu) +/* + * Wait until the SMMU cons index passes llq->prod. + * Must be called with the cmdq lock held in some capacity. + */ +static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu, + struct arm_smmu_ll_queue *llq) { - u64 cmd[CMDQ_ENT_DWORDS]; - unsigned long flags; - struct arm_smmu_cmdq_ent ent = { - .opcode = CMDQ_OP_CMD_SYNC, - .sync = { - .msiaddr = virt_to_phys(&smmu->sync_count), - }, - }; + struct arm_smmu_queue_poll qp; + struct arm_smmu_cmdq *cmdq = &smmu->cmdq; + u32 prod = llq->prod; + int ret = 0; - spin_lock_irqsave(&smmu->cmdq.lock, flags); + queue_poll_init(smmu, &qp); + llq->val = READ_ONCE(smmu->cmdq.q.llq.val); + do { + if (queue_consumed(llq, prod)) + break; - /* Piggy-back on the previous command if it's a SYNC */ - if (smmu->prev_cmd_opcode == CMDQ_OP_CMD_SYNC) { - ent.sync.msidata = smmu->sync_nr; - } else { - ent.sync.msidata = ++smmu->sync_nr; - arm_smmu_cmdq_build_cmd(cmd, &ent); - arm_smmu_cmdq_insert_cmd(smmu, cmd); - } + ret = queue_poll(&qp); - spin_unlock_irqrestore(&smmu->cmdq.lock, flags); + /* + * This needs to be a readl() so that our subsequent call + * to arm_smmu_cmdq_shared_tryunlock() can fail accurately. + * + * Specifically, we need to ensure that we observe all + * shared_lock()s by other CMD_SYNCs that share our owner, + * so that a failing call to tryunlock() means that we're + * the last one out and therefore we can safely advance + * cmdq->q.llq.cons. Roughly speaking: + * + * CPU 0 CPU1 CPU2 (us) + * + * if (sync) + * shared_lock(); + * + * dma_wmb(); + * set_valid_map(); + * + * if (owner) { + * poll_valid_map(); + * <control dependency> + * writel(prod_reg); + * + * readl(cons_reg); + * tryunlock(); + * + * Requires us to see CPU 0's shared_lock() acquisition. + */ + llq->cons = readl(cmdq->q.cons_reg); + } while (!ret); - return __arm_smmu_sync_poll_msi(smmu, ent.sync.msidata); + return ret; } -static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu) +static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu, + struct arm_smmu_ll_queue *llq) { - u64 cmd[CMDQ_ENT_DWORDS]; + if (smmu->features & ARM_SMMU_FEAT_MSI && + smmu->features & ARM_SMMU_FEAT_COHERENCY) + return __arm_smmu_cmdq_poll_until_msi(smmu, llq); + + return __arm_smmu_cmdq_poll_until_consumed(smmu, llq); +} + +static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds, + u32 prod, int n) +{ + int i; + struct arm_smmu_ll_queue llq = { + .max_n_shift = cmdq->q.llq.max_n_shift, + .prod = prod, + }; + + for (i = 0; i < n; ++i) { + u64 *cmd = &cmds[i * CMDQ_ENT_DWORDS]; + + prod = queue_inc_prod_n(&llq, i); + queue_write(Q_ENT(&cmdq->q, prod), cmd, CMDQ_ENT_DWORDS); + } +} + +/* + * This is the actual insertion function, and provides the following + * ordering guarantees to callers: + * + * - There is a dma_wmb() before publishing any commands to the queue. + * This can be relied upon to order prior writes to data structures + * in memory (such as a CD or an STE) before the command. + * + * - On completion of a CMD_SYNC, there is a control dependency. + * This can be relied upon to order subsequent writes to memory (e.g. + * freeing an IOVA) after completion of the CMD_SYNC. + * + * - Command insertion is totally ordered, so if two CPUs each race to + * insert their own list of commands then all of the commands from one + * CPU will appear before any of the commands from the other CPU. + */ +static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, + u64 *cmds, int n, bool sync) +{ + u64 cmd_sync[CMDQ_ENT_DWORDS]; + u32 prod; unsigned long flags; - bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV); - struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC }; - int ret; + bool owner; + struct arm_smmu_cmdq *cmdq = &smmu->cmdq; + struct arm_smmu_ll_queue llq = { + .max_n_shift = cmdq->q.llq.max_n_shift, + }, head = llq; + int ret = 0; - arm_smmu_cmdq_build_cmd(cmd, &ent); + /* 1. Allocate some space in the queue */ + local_irq_save(flags); + llq.val = READ_ONCE(cmdq->q.llq.val); + do { + u64 old; + + while (!queue_has_space(&llq, n + sync)) { + local_irq_restore(flags); + if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq)) + dev_err_ratelimited(smmu->dev, "CMDQ timeout\n"); + local_irq_save(flags); + } + + head.cons = llq.cons; + head.prod = queue_inc_prod_n(&llq, n + sync) | + CMDQ_PROD_OWNED_FLAG; + + old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val); + if (old == llq.val) + break; + + llq.val = old; + } while (1); + owner = !(llq.prod & CMDQ_PROD_OWNED_FLAG); + head.prod &= ~CMDQ_PROD_OWNED_FLAG; + llq.prod &= ~CMDQ_PROD_OWNED_FLAG; + + /* + * 2. Write our commands into the queue + * Dependency ordering from the cmpxchg() loop above. + */ + arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n); + if (sync) { + prod = queue_inc_prod_n(&llq, n); + arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod); + queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS); + + /* + * In order to determine completion of our CMD_SYNC, we must + * ensure that the queue can't wrap twice without us noticing. + * We achieve that by taking the cmdq lock as shared before + * marking our slot as valid. + */ + arm_smmu_cmdq_shared_lock(cmdq); + } + + /* 3. Mark our slots as valid, ensuring commands are visible first */ + dma_wmb(); + arm_smmu_cmdq_set_valid_map(cmdq, llq.prod, head.prod); - spin_lock_irqsave(&smmu->cmdq.lock, flags); - arm_smmu_cmdq_insert_cmd(smmu, cmd); - ret = queue_poll_cons(&smmu->cmdq.q, true, wfe); - spin_unlock_irqrestore(&smmu->cmdq.lock, flags); + /* 4. If we are the owner, take control of the SMMU hardware */ + if (owner) { + /* a. Wait for previous owner to finish */ + atomic_cond_read_relaxed(&cmdq->owner_prod, VAL == llq.prod); + + /* b. Stop gathering work by clearing the owned flag */ + prod = atomic_fetch_andnot_relaxed(CMDQ_PROD_OWNED_FLAG, + &cmdq->q.llq.atomic.prod); + prod &= ~CMDQ_PROD_OWNED_FLAG; + + /* + * c. Wait for any gathered work to be written to the queue. + * Note that we read our own entries so that we have the control + * dependency required by (d). + */ + arm_smmu_cmdq_poll_valid_map(cmdq, llq.prod, prod); + + /* + * d. Advance the hardware prod pointer + * Control dependency ordering from the entries becoming valid. + */ + writel_relaxed(prod, cmdq->q.prod_reg); + + /* + * e. Tell the next owner we're done + * Make sure we've updated the hardware first, so that we don't + * race to update prod and potentially move it backwards. + */ + atomic_set_release(&cmdq->owner_prod, prod); + } + + /* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */ + if (sync) { + llq.prod = queue_inc_prod_n(&llq, n); + ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq); + if (ret) { + dev_err_ratelimited(smmu->dev, + "CMD_SYNC timeout at 0x%08x [hwprod 0x%08x, hwcons 0x%08x]\n", + llq.prod, + readl_relaxed(cmdq->q.prod_reg), + readl_relaxed(cmdq->q.cons_reg)); + } + + /* + * Try to unlock the cmq lock. This will fail if we're the last + * reader, in which case we can safely update cmdq->q.llq.cons + */ + if (!arm_smmu_cmdq_shared_tryunlock(cmdq)) { + WRITE_ONCE(cmdq->q.llq.cons, llq.cons); + arm_smmu_cmdq_shared_unlock(cmdq); + } + } + local_irq_restore(flags); return ret; } -static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu) +static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq_ent *ent) { - int ret; - bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) && - (smmu->features & ARM_SMMU_FEAT_COHERENCY); + u64 cmd[CMDQ_ENT_DWORDS]; - ret = msi ? __arm_smmu_cmdq_issue_sync_msi(smmu) - : __arm_smmu_cmdq_issue_sync(smmu); - if (ret) - dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n"); - return ret; + if (arm_smmu_cmdq_build_cmd(cmd, ent)) { + dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n", + ent->opcode); + return -EINVAL; + } + + return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false); +} + +static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu) +{ + return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true); } /* Context descriptor manipulation functions */ -static u64 arm_smmu_cpu_tcr_to_cd(u64 tcr) +static void arm_smmu_sync_cd(struct arm_smmu_domain *smmu_domain, + int ssid, bool leaf) { - u64 val = 0; + size_t i; + unsigned long flags; + struct arm_smmu_master *master; + struct arm_smmu_device *smmu = smmu_domain->smmu; + struct arm_smmu_cmdq_ent cmd = { + .opcode = CMDQ_OP_CFGI_CD, + .cfgi = { + .ssid = ssid, + .leaf = leaf, + }, + }; + + spin_lock_irqsave(&smmu_domain->devices_lock, flags); + list_for_each_entry(master, &smmu_domain->devices, domain_head) { + for (i = 0; i < master->num_sids; i++) { + cmd.cfgi.sid = master->sids[i]; + arm_smmu_cmdq_issue_cmd(smmu, &cmd); + } + } + spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); - /* Repack the TCR. Just care about TTBR0 for now */ - val |= ARM_SMMU_TCR2CD(tcr, T0SZ); - val |= ARM_SMMU_TCR2CD(tcr, TG0); - val |= ARM_SMMU_TCR2CD(tcr, IRGN0); - val |= ARM_SMMU_TCR2CD(tcr, ORGN0); - val |= ARM_SMMU_TCR2CD(tcr, SH0); - val |= ARM_SMMU_TCR2CD(tcr, EPD0); - val |= ARM_SMMU_TCR2CD(tcr, EPD1); - val |= ARM_SMMU_TCR2CD(tcr, IPS); + arm_smmu_cmdq_issue_sync(smmu); +} - return val; +static int arm_smmu_alloc_cd_leaf_table(struct arm_smmu_device *smmu, + struct arm_smmu_l1_ctx_desc *l1_desc) +{ + size_t size = CTXDESC_L2_ENTRIES * (CTXDESC_CD_DWORDS << 3); + + l1_desc->l2ptr = dmam_alloc_coherent(smmu->dev, size, + &l1_desc->l2ptr_dma, GFP_KERNEL); + if (!l1_desc->l2ptr) { + dev_warn(smmu->dev, + "failed to allocate context descriptor table\n"); + return -ENOMEM; + } + return 0; } -static void arm_smmu_write_ctx_desc(struct arm_smmu_device *smmu, - struct arm_smmu_s1_cfg *cfg) +static void arm_smmu_write_cd_l1_desc(__le64 *dst, + struct arm_smmu_l1_ctx_desc *l1_desc) { - u64 val; + u64 val = (l1_desc->l2ptr_dma & CTXDESC_L1_DESC_L2PTR_MASK) | + CTXDESC_L1_DESC_V; + WRITE_ONCE(*dst, cpu_to_le64(val)); +} + +static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_domain *smmu_domain, + u32 ssid) +{ + __le64 *l1ptr; + unsigned int idx; + struct arm_smmu_l1_ctx_desc *l1_desc; + struct arm_smmu_device *smmu = smmu_domain->smmu; + struct arm_smmu_ctx_desc_cfg *cdcfg = &smmu_domain->s1_cfg.cdcfg; + + if (smmu_domain->s1_cfg.s1fmt == STRTAB_STE_0_S1FMT_LINEAR) + return cdcfg->cdtab + ssid * CTXDESC_CD_DWORDS; + + idx = ssid >> CTXDESC_SPLIT; + l1_desc = &cdcfg->l1_desc[idx]; + if (!l1_desc->l2ptr) { + if (arm_smmu_alloc_cd_leaf_table(smmu, l1_desc)) + return NULL; + + l1ptr = cdcfg->cdtab + idx * CTXDESC_L1_DESC_DWORDS; + arm_smmu_write_cd_l1_desc(l1ptr, l1_desc); + /* An invalid L1CD can be cached */ + arm_smmu_sync_cd(smmu_domain, ssid, false); + } + idx = ssid & (CTXDESC_L2_ENTRIES - 1); + return l1_desc->l2ptr + idx * CTXDESC_CD_DWORDS; +} + +static int arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain, + int ssid, struct arm_smmu_ctx_desc *cd) +{ /* - * We don't need to issue any invalidation here, as we'll invalidate - * the STE when installing the new entry anyway. + * This function handles the following cases: + * + * (1) Install primary CD, for normal DMA traffic (SSID = 0). + * (2) Install a secondary CD, for SID+SSID traffic. + * (3) Update ASID of a CD. Atomically write the first 64 bits of the + * CD, then invalidate the old entry and mappings. + * (4) Remove a secondary CD. */ - val = arm_smmu_cpu_tcr_to_cd(cfg->cd.tcr) | + u64 val; + bool cd_live; + __le64 *cdptr; + struct arm_smmu_device *smmu = smmu_domain->smmu; + + if (WARN_ON(ssid >= (1 << smmu_domain->s1_cfg.s1cdmax))) + return -E2BIG; + + cdptr = arm_smmu_get_cd_ptr(smmu_domain, ssid); + if (!cdptr) + return -ENOMEM; + + val = le64_to_cpu(cdptr[0]); + cd_live = !!(val & CTXDESC_CD_0_V); + + if (!cd) { /* (4) */ + val = 0; + } else if (cd_live) { /* (3) */ + val &= ~CTXDESC_CD_0_ASID; + val |= FIELD_PREP(CTXDESC_CD_0_ASID, cd->asid); + /* + * Until CD+TLB invalidation, both ASIDs may be used for tagging + * this substream's traffic + */ + } else { /* (1) and (2) */ + cdptr[1] = cpu_to_le64(cd->ttbr & CTXDESC_CD_1_TTB0_MASK); + cdptr[2] = 0; + cdptr[3] = cpu_to_le64(cd->mair); + + /* + * STE is live, and the SMMU might read dwords of this CD in any + * order. Ensure that it observes valid values before reading + * V=1. + */ + arm_smmu_sync_cd(smmu_domain, ssid, true); + + val = cd->tcr | #ifdef __BIG_ENDIAN - CTXDESC_CD_0_ENDI | + CTXDESC_CD_0_ENDI | #endif - CTXDESC_CD_0_R | CTXDESC_CD_0_A | CTXDESC_CD_0_ASET | - CTXDESC_CD_0_AA64 | FIELD_PREP(CTXDESC_CD_0_ASID, cfg->cd.asid) | - CTXDESC_CD_0_V; + CTXDESC_CD_0_R | CTXDESC_CD_0_A | CTXDESC_CD_0_ASET | + CTXDESC_CD_0_AA64 | + FIELD_PREP(CTXDESC_CD_0_ASID, cd->asid) | + CTXDESC_CD_0_V; + + /* STALL_MODEL==0b10 && CD.S==0 is ILLEGAL */ + if (smmu->features & ARM_SMMU_FEAT_STALL_FORCE) + val |= CTXDESC_CD_0_S; + } + + /* + * The SMMU accesses 64-bit values atomically. See IHI0070Ca 3.21.3 + * "Configuration structures and configuration invalidation completion" + * + * The size of single-copy atomic reads made by the SMMU is + * IMPLEMENTATION DEFINED but must be at least 64 bits. Any single + * field within an aligned 64-bit span of a structure can be altered + * without first making the structure invalid. + */ + WRITE_ONCE(cdptr[0], cpu_to_le64(val)); + arm_smmu_sync_cd(smmu_domain, ssid, true); + return 0; +} + +static int arm_smmu_alloc_cd_tables(struct arm_smmu_domain *smmu_domain) +{ + int ret; + size_t l1size; + size_t max_contexts; + struct arm_smmu_device *smmu = smmu_domain->smmu; + struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg; + struct arm_smmu_ctx_desc_cfg *cdcfg = &cfg->cdcfg; + + max_contexts = 1 << cfg->s1cdmax; + + if (!(smmu->features & ARM_SMMU_FEAT_2_LVL_CDTAB) || + max_contexts <= CTXDESC_L2_ENTRIES) { + cfg->s1fmt = STRTAB_STE_0_S1FMT_LINEAR; + cdcfg->num_l1_ents = max_contexts; + + l1size = max_contexts * (CTXDESC_CD_DWORDS << 3); + } else { + cfg->s1fmt = STRTAB_STE_0_S1FMT_64K_L2; + cdcfg->num_l1_ents = DIV_ROUND_UP(max_contexts, + CTXDESC_L2_ENTRIES); + + cdcfg->l1_desc = devm_kcalloc(smmu->dev, cdcfg->num_l1_ents, + sizeof(*cdcfg->l1_desc), + GFP_KERNEL); + if (!cdcfg->l1_desc) + return -ENOMEM; + + l1size = cdcfg->num_l1_ents * (CTXDESC_L1_DESC_DWORDS << 3); + } + + cdcfg->cdtab = dmam_alloc_coherent(smmu->dev, l1size, &cdcfg->cdtab_dma, + GFP_KERNEL); + if (!cdcfg->cdtab) { + dev_warn(smmu->dev, "failed to allocate context descriptor\n"); + ret = -ENOMEM; + goto err_free_l1; + } + + return 0; + +err_free_l1: + if (cdcfg->l1_desc) { + devm_kfree(smmu->dev, cdcfg->l1_desc); + cdcfg->l1_desc = NULL; + } + return ret; +} + +static void arm_smmu_free_cd_tables(struct arm_smmu_domain *smmu_domain) +{ + int i; + size_t size, l1size; + struct arm_smmu_device *smmu = smmu_domain->smmu; + struct arm_smmu_ctx_desc_cfg *cdcfg = &smmu_domain->s1_cfg.cdcfg; - /* STALL_MODEL==0b10 && CD.S==0 is ILLEGAL */ - if (smmu->features & ARM_SMMU_FEAT_STALL_FORCE) - val |= CTXDESC_CD_0_S; + if (cdcfg->l1_desc) { + size = CTXDESC_L2_ENTRIES * (CTXDESC_CD_DWORDS << 3); - cfg->cdptr[0] = cpu_to_le64(val); + for (i = 0; i < cdcfg->num_l1_ents; i++) { + if (!cdcfg->l1_desc[i].l2ptr) + continue; - val = cfg->cd.ttbr & CTXDESC_CD_1_TTB0_MASK; - cfg->cdptr[1] = cpu_to_le64(val); + dmam_free_coherent(smmu->dev, size, + cdcfg->l1_desc[i].l2ptr, + cdcfg->l1_desc[i].l2ptr_dma); + } + devm_kfree(smmu->dev, cdcfg->l1_desc); + cdcfg->l1_desc = NULL; + + l1size = cdcfg->num_l1_ents * (CTXDESC_L1_DESC_DWORDS << 3); + } else { + l1size = cdcfg->num_l1_ents * (CTXDESC_CD_DWORDS << 3); + } - cfg->cdptr[3] = cpu_to_le64(cfg->cd.mair); + dmam_free_coherent(smmu->dev, l1size, cdcfg->cdtab, cdcfg->cdtab_dma); + cdcfg->cdtab_dma = 0; + cdcfg->cdtab = NULL; } /* Stream table manipulation functions */ @@ -1219,6 +1836,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, if (s1_cfg) { BUG_ON(ste_live); dst[1] = cpu_to_le64( + FIELD_PREP(STRTAB_STE_1_S1DSS, STRTAB_STE_1_S1DSS_SSID0) | FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) | FIELD_PREP(STRTAB_STE_1_S1COR, STRTAB_STE_1_S1C_CACHE_WBRA) | FIELD_PREP(STRTAB_STE_1_S1CSH, ARM_SMMU_SH_ISH) | @@ -1228,8 +1846,10 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, !(smmu->features & ARM_SMMU_FEAT_STALL_FORCE)) dst[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD); - val |= (s1_cfg->cdptr_dma & STRTAB_STE_0_S1CTXPTR_MASK) | - FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S1_TRANS); + val |= (s1_cfg->cdcfg.cdtab_dma & STRTAB_STE_0_S1CTXPTR_MASK) | + FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S1_TRANS) | + FIELD_PREP(STRTAB_STE_0_S1CDMAX, s1_cfg->s1cdmax) | + FIELD_PREP(STRTAB_STE_0_S1FMT, s1_cfg->s1fmt); } if (s2_cfg) { @@ -1253,7 +1873,8 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, STRTAB_STE_1_EATS_TRANS)); arm_smmu_sync_ste_for_sid(smmu, sid); - dst[0] = cpu_to_le64(val); + /* See comment in arm_smmu_write_ctx_desc() */ + WRITE_ONCE(dst[0], cpu_to_le64(val)); arm_smmu_sync_ste_for_sid(smmu, sid); /* It's likely that we'll want to use the new STE soon */ @@ -1286,7 +1907,7 @@ static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid) desc->span = STRTAB_SPLIT + 1; desc->l2ptr = dmam_alloc_coherent(smmu->dev, size, &desc->l2ptr_dma, - GFP_KERNEL | __GFP_ZERO); + GFP_KERNEL); if (!desc->l2ptr) { dev_err(smmu->dev, "failed to allocate l2 stream table for SID %u\n", @@ -1305,6 +1926,7 @@ static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev) int i; struct arm_smmu_device *smmu = dev; struct arm_smmu_queue *q = &smmu->evtq.q; + struct arm_smmu_ll_queue *llq = &q->llq; u64 evt[EVTQ_ENT_DWORDS]; do { @@ -1322,12 +1944,13 @@ static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev) * Not much we can do on overflow, so scream and pretend we're * trying harder. */ - if (queue_sync_prod(q) == -EOVERFLOW) + if (queue_sync_prod_in(q) == -EOVERFLOW) dev_err(smmu->dev, "EVTQ overflow detected -- events lost\n"); - } while (!queue_empty(q)); + } while (!queue_empty(llq)); /* Sync our overflow flag, as we believe we're up to speed */ - q->cons = Q_OVF(q, q->prod) | Q_WRP(q, q->cons) | Q_IDX(q, q->cons); + llq->cons = Q_OVF(llq->prod) | Q_WRP(llq, llq->cons) | + Q_IDX(llq, llq->cons); return IRQ_HANDLED; } @@ -1373,19 +1996,21 @@ static irqreturn_t arm_smmu_priq_thread(int irq, void *dev) { struct arm_smmu_device *smmu = dev; struct arm_smmu_queue *q = &smmu->priq.q; + struct arm_smmu_ll_queue *llq = &q->llq; u64 evt[PRIQ_ENT_DWORDS]; do { while (!queue_remove_raw(q, evt)) arm_smmu_handle_ppr(smmu, evt); - if (queue_sync_prod(q) == -EOVERFLOW) + if (queue_sync_prod_in(q) == -EOVERFLOW) dev_err(smmu->dev, "PRIQ overflow detected -- requests lost\n"); - } while (!queue_empty(q)); + } while (!queue_empty(llq)); /* Sync our overflow flag, as we believe we're up to speed */ - q->cons = Q_OVF(q, q->prod) | Q_WRP(q, q->cons) | Q_IDX(q, q->cons); - writel(q->cons, q->cons_reg); + llq->cons = Q_OVF(llq->prod) | Q_WRP(llq, llq->cons) | + Q_IDX(llq, llq->cons); + queue_sync_cons_out(q); return IRQ_HANDLED; } @@ -1534,6 +2159,23 @@ static int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS)) return 0; + /* + * Ensure that we've completed prior invalidation of the main TLBs + * before we read 'nr_ats_masters' in case of a concurrent call to + * arm_smmu_enable_ats(): + * + * // unmap() // arm_smmu_enable_ats() + * TLBI+SYNC atomic_inc(&nr_ats_masters); + * smp_mb(); [...] + * atomic_read(&nr_ats_masters); pci_enable_ats() // writel() + * + * Ensures that we always see the incremented 'nr_ats_masters' count if + * ATS was enabled at the PCI device before completion of the TLBI. + */ + smp_mb(); + if (!atomic_read(&smmu_domain->nr_ats_masters)) + return 0; + arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd); spin_lock_irqsave(&smmu_domain->devices_lock, flags); @@ -1545,13 +2187,6 @@ static int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, } /* IO_PGTABLE API */ -static void arm_smmu_tlb_sync(void *cookie) -{ - struct arm_smmu_domain *smmu_domain = cookie; - - arm_smmu_cmdq_issue_sync(smmu_domain->smmu); -} - static void arm_smmu_tlb_inv_context(void *cookie) { struct arm_smmu_domain *smmu_domain = cookie; @@ -1570,25 +2205,32 @@ static void arm_smmu_tlb_inv_context(void *cookie) /* * NOTE: when io-pgtable is in non-strict mode, we may get here with * PTEs previously cleared by unmaps on the current CPU not yet visible - * to the SMMU. We are relying on the DSB implicit in queue_inc_prod() - * to guarantee those are observed before the TLBI. Do be careful, 007. + * to the SMMU. We are relying on the dma_wmb() implicit during cmd + * insertion to guarantee those are observed before the TLBI. Do be + * careful, 007. */ arm_smmu_cmdq_issue_cmd(smmu, &cmd); arm_smmu_cmdq_issue_sync(smmu); + arm_smmu_atc_inv_domain(smmu_domain, 0, 0, 0); } -static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size, - size_t granule, bool leaf, void *cookie) +static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size, + size_t granule, bool leaf, + struct arm_smmu_domain *smmu_domain) { - struct arm_smmu_domain *smmu_domain = cookie; + u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS]; struct arm_smmu_device *smmu = smmu_domain->smmu; + unsigned long start = iova, end = iova + size; + int i = 0; struct arm_smmu_cmdq_ent cmd = { .tlbi = { .leaf = leaf, - .addr = iova, }, }; + if (!size) + return; + if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { cmd.opcode = CMDQ_OP_TLBI_NH_VA; cmd.tlbi.asid = smmu_domain->s1_cfg.cd.asid; @@ -1597,16 +2239,54 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size, cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid; } - do { - arm_smmu_cmdq_issue_cmd(smmu, &cmd); - cmd.tlbi.addr += granule; - } while (size -= granule); + while (iova < end) { + if (i == CMDQ_BATCH_ENTRIES) { + arm_smmu_cmdq_issue_cmdlist(smmu, cmds, i, false); + i = 0; + } + + cmd.tlbi.addr = iova; + arm_smmu_cmdq_build_cmd(&cmds[i * CMDQ_ENT_DWORDS], &cmd); + iova += granule; + i++; + } + + arm_smmu_cmdq_issue_cmdlist(smmu, cmds, i, true); + + /* + * Unfortunately, this can't be leaf-only since we may have + * zapped an entire table. + */ + arm_smmu_atc_inv_domain(smmu_domain, 0, start, size); } -static const struct iommu_gather_ops arm_smmu_gather_ops = { +static void arm_smmu_tlb_inv_page_nosync(struct iommu_iotlb_gather *gather, + unsigned long iova, size_t granule, + void *cookie) +{ + struct arm_smmu_domain *smmu_domain = cookie; + struct iommu_domain *domain = &smmu_domain->domain; + + iommu_iotlb_gather_add_page(domain, gather, iova, granule); +} + +static void arm_smmu_tlb_inv_walk(unsigned long iova, size_t size, + size_t granule, void *cookie) +{ + arm_smmu_tlb_inv_range(iova, size, granule, false, cookie); +} + +static void arm_smmu_tlb_inv_leaf(unsigned long iova, size_t size, + size_t granule, void *cookie) +{ + arm_smmu_tlb_inv_range(iova, size, granule, true, cookie); +} + +static const struct iommu_flush_ops arm_smmu_flush_ops = { .tlb_flush_all = arm_smmu_tlb_inv_context, - .tlb_add_flush = arm_smmu_tlb_inv_range_nosync, - .tlb_sync = arm_smmu_tlb_sync, + .tlb_flush_walk = arm_smmu_tlb_inv_walk, + .tlb_flush_leaf = arm_smmu_tlb_inv_leaf, + .tlb_add_page = arm_smmu_tlb_inv_page_nosync, }; /* IOMMU API */ @@ -1683,12 +2363,8 @@ static void arm_smmu_domain_free(struct iommu_domain *domain) if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg; - if (cfg->cdptr) { - dmam_free_coherent(smmu_domain->smmu->dev, - CTXDESC_CD_DWORDS << 3, - cfg->cdptr, - cfg->cdptr_dma); - + if (cfg->cdcfg.cdtab) { + arm_smmu_free_cd_tables(smmu_domain); arm_smmu_bitmap_free(smmu->asid_map, cfg->cd.asid); } } else { @@ -1701,55 +2377,82 @@ static void arm_smmu_domain_free(struct iommu_domain *domain) } static int arm_smmu_domain_finalise_s1(struct arm_smmu_domain *smmu_domain, + struct arm_smmu_master *master, struct io_pgtable_cfg *pgtbl_cfg) { int ret; int asid; struct arm_smmu_device *smmu = smmu_domain->smmu; struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg; + typeof(&pgtbl_cfg->arm_lpae_s1_cfg.tcr) tcr = &pgtbl_cfg->arm_lpae_s1_cfg.tcr; asid = arm_smmu_bitmap_alloc(smmu->asid_map, smmu->asid_bits); if (asid < 0) return asid; - cfg->cdptr = dmam_alloc_coherent(smmu->dev, CTXDESC_CD_DWORDS << 3, - &cfg->cdptr_dma, - GFP_KERNEL | __GFP_ZERO); - if (!cfg->cdptr) { - dev_warn(smmu->dev, "failed to allocate context descriptor\n"); - ret = -ENOMEM; + cfg->s1cdmax = master->ssid_bits; + + ret = arm_smmu_alloc_cd_tables(smmu_domain); + if (ret) goto out_free_asid; - } cfg->cd.asid = (u16)asid; - cfg->cd.ttbr = pgtbl_cfg->arm_lpae_s1_cfg.ttbr[0]; - cfg->cd.tcr = pgtbl_cfg->arm_lpae_s1_cfg.tcr; - cfg->cd.mair = pgtbl_cfg->arm_lpae_s1_cfg.mair[0]; + cfg->cd.ttbr = pgtbl_cfg->arm_lpae_s1_cfg.ttbr; + cfg->cd.tcr = FIELD_PREP(CTXDESC_CD_0_TCR_T0SZ, tcr->tsz) | + FIELD_PREP(CTXDESC_CD_0_TCR_TG0, tcr->tg) | + FIELD_PREP(CTXDESC_CD_0_TCR_IRGN0, tcr->irgn) | + FIELD_PREP(CTXDESC_CD_0_TCR_ORGN0, tcr->orgn) | + FIELD_PREP(CTXDESC_CD_0_TCR_SH0, tcr->sh) | + FIELD_PREP(CTXDESC_CD_0_TCR_IPS, tcr->ips) | + CTXDESC_CD_0_TCR_EPD1 | CTXDESC_CD_0_AA64; + cfg->cd.mair = pgtbl_cfg->arm_lpae_s1_cfg.mair; + + /* + * Note that this will end up calling arm_smmu_sync_cd() before + * the master has been added to the devices list for this domain. + * This isn't an issue because the STE hasn't been installed yet. + */ + ret = arm_smmu_write_ctx_desc(smmu_domain, 0, &cfg->cd); + if (ret) + goto out_free_cd_tables; + return 0; +out_free_cd_tables: + arm_smmu_free_cd_tables(smmu_domain); out_free_asid: arm_smmu_bitmap_free(smmu->asid_map, asid); return ret; } static int arm_smmu_domain_finalise_s2(struct arm_smmu_domain *smmu_domain, + struct arm_smmu_master *master, struct io_pgtable_cfg *pgtbl_cfg) { int vmid; struct arm_smmu_device *smmu = smmu_domain->smmu; struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg; + typeof(&pgtbl_cfg->arm_lpae_s2_cfg.vtcr) vtcr; vmid = arm_smmu_bitmap_alloc(smmu->vmid_map, smmu->vmid_bits); if (vmid < 0) return vmid; + vtcr = &pgtbl_cfg->arm_lpae_s2_cfg.vtcr; cfg->vmid = (u16)vmid; cfg->vttbr = pgtbl_cfg->arm_lpae_s2_cfg.vttbr; - cfg->vtcr = pgtbl_cfg->arm_lpae_s2_cfg.vtcr; + cfg->vtcr = FIELD_PREP(STRTAB_STE_2_VTCR_S2T0SZ, vtcr->tsz) | + FIELD_PREP(STRTAB_STE_2_VTCR_S2SL0, vtcr->sl) | + FIELD_PREP(STRTAB_STE_2_VTCR_S2IR0, vtcr->irgn) | + FIELD_PREP(STRTAB_STE_2_VTCR_S2OR0, vtcr->orgn) | + FIELD_PREP(STRTAB_STE_2_VTCR_S2SH0, vtcr->sh) | + FIELD_PREP(STRTAB_STE_2_VTCR_S2TG, vtcr->tg) | + FIELD_PREP(STRTAB_STE_2_VTCR_S2PS, vtcr->ps); return 0; } -static int arm_smmu_domain_finalise(struct iommu_domain *domain) +static int arm_smmu_domain_finalise(struct iommu_domain *domain, + struct arm_smmu_master *master) { int ret; unsigned long ias, oas; @@ -1757,6 +2460,7 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain) struct io_pgtable_cfg pgtbl_cfg; struct io_pgtable_ops *pgtbl_ops; int (*finalise_stage_fn)(struct arm_smmu_domain *, + struct arm_smmu_master *, struct io_pgtable_cfg *); struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_device *smmu = smmu_domain->smmu; @@ -1796,7 +2500,7 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain) .ias = ias, .oas = oas, .coherent_walk = smmu->features & ARM_SMMU_FEAT_COHERENCY, - .tlb = &arm_smmu_gather_ops, + .tlb = &arm_smmu_flush_ops, .iommu_dev = smmu->dev, }; @@ -1811,7 +2515,7 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain) domain->geometry.aperture_end = (1UL << pgtbl_cfg.ias) - 1; domain->geometry.force_aperture = true; - ret = finalise_stage_fn(smmu_domain, &pgtbl_cfg); + ret = finalise_stage_fn(smmu_domain, master, &pgtbl_cfg); if (ret < 0) { free_io_pgtable_ops(pgtbl_ops); return ret; @@ -1863,44 +2567,65 @@ static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master) } } -static int arm_smmu_enable_ats(struct arm_smmu_master *master) +#ifdef CONFIG_PCI_ATS +static bool arm_smmu_ats_supported(struct arm_smmu_master *master) { - int ret; - size_t stu; struct pci_dev *pdev; struct arm_smmu_device *smmu = master->smmu; struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(master->dev); if (!(smmu->features & ARM_SMMU_FEAT_ATS) || !dev_is_pci(master->dev) || !(fwspec->flags & IOMMU_FWSPEC_PCI_RC_ATS) || pci_ats_disabled()) - return -ENXIO; + return false; pdev = to_pci_dev(master->dev); - if (pdev->untrusted) - return -EPERM; + return !pdev->untrusted && pdev->ats_cap; +} +#else +static bool arm_smmu_ats_supported(struct arm_smmu_master *master) +{ + return false; +} +#endif + +static void arm_smmu_enable_ats(struct arm_smmu_master *master) +{ + size_t stu; + struct pci_dev *pdev; + struct arm_smmu_device *smmu = master->smmu; + struct arm_smmu_domain *smmu_domain = master->domain; + + /* Don't enable ATS at the endpoint if it's not enabled in the STE */ + if (!master->ats_enabled) + return; /* Smallest Translation Unit: log2 of the smallest supported granule */ stu = __ffs(smmu->pgsize_bitmap); + pdev = to_pci_dev(master->dev); - ret = pci_enable_ats(pdev, stu); - if (ret) - return ret; - - master->ats_enabled = true; - return 0; + atomic_inc(&smmu_domain->nr_ats_masters); + arm_smmu_atc_inv_domain(smmu_domain, 0, 0, 0); + if (pci_enable_ats(pdev, stu)) + dev_err(master->dev, "Failed to enable ATS (STU %zu)\n", stu); } static void arm_smmu_disable_ats(struct arm_smmu_master *master) { struct arm_smmu_cmdq_ent cmd; + struct arm_smmu_domain *smmu_domain = master->domain; - if (!master->ats_enabled || !dev_is_pci(master->dev)) + if (!master->ats_enabled) return; + pci_disable_ats(to_pci_dev(master->dev)); + /* + * Ensure ATS is disabled at the endpoint before we issue the + * ATC invalidation via the SMMU. + */ + wmb(); arm_smmu_atc_inv_to_cmd(0, 0, 0, &cmd); arm_smmu_atc_inv_master(master, &cmd); - pci_disable_ats(to_pci_dev(master->dev)); - master->ats_enabled = false; + atomic_dec(&smmu_domain->nr_ats_masters); } static void arm_smmu_detach_dev(struct arm_smmu_master *master) @@ -1911,14 +2636,15 @@ static void arm_smmu_detach_dev(struct arm_smmu_master *master) if (!smmu_domain) return; + arm_smmu_disable_ats(master); + spin_lock_irqsave(&smmu_domain->devices_lock, flags); list_del(&master->domain_head); spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); master->domain = NULL; + master->ats_enabled = false; arm_smmu_install_ste_for_dev(master); - - arm_smmu_disable_ats(master); } static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) @@ -1942,7 +2668,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) if (!smmu_domain->smmu) { smmu_domain->smmu = smmu; - ret = arm_smmu_domain_finalise(domain); + ret = arm_smmu_domain_finalise(domain, master); if (ret) { smmu_domain->smmu = NULL; goto out_unlock; @@ -1954,28 +2680,35 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) dev_name(smmu->dev)); ret = -ENXIO; goto out_unlock; + } else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1 && + master->ssid_bits != smmu_domain->s1_cfg.s1cdmax) { + dev_err(dev, + "cannot attach to incompatible domain (%u SSID bits != %u)\n", + smmu_domain->s1_cfg.s1cdmax, master->ssid_bits); + ret = -EINVAL; + goto out_unlock; } master->domain = smmu_domain; + if (smmu_domain->stage != ARM_SMMU_DOMAIN_BYPASS) + master->ats_enabled = arm_smmu_ats_supported(master); + + arm_smmu_install_ste_for_dev(master); + spin_lock_irqsave(&smmu_domain->devices_lock, flags); list_add(&master->domain_head, &smmu_domain->devices); spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); - if (smmu_domain->stage != ARM_SMMU_DOMAIN_BYPASS) - arm_smmu_enable_ats(master); + arm_smmu_enable_ats(master); - if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) - arm_smmu_write_ctx_desc(smmu, &smmu_domain->s1_cfg); - - arm_smmu_install_ste_for_dev(master); out_unlock: mutex_unlock(&smmu_domain->init_mutex); return ret; } static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot) + phys_addr_t paddr, size_t size, int prot, gfp_t gfp) { struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; @@ -1985,21 +2718,16 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, return ops->map(ops, iova, paddr, size, prot); } -static size_t -arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size) +static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, + size_t size, struct iommu_iotlb_gather *gather) { - int ret; struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; if (!ops) return 0; - ret = ops->unmap(ops, iova, size); - if (ret && arm_smmu_atc_inv_domain(smmu_domain, 0, iova, size)) - return 0; - - return ret; + return ops->unmap(ops, iova, size, gather); } static void arm_smmu_flush_iotlb_all(struct iommu_domain *domain) @@ -2010,12 +2738,13 @@ static void arm_smmu_flush_iotlb_all(struct iommu_domain *domain) arm_smmu_tlb_inv_context(smmu_domain); } -static void arm_smmu_iotlb_sync(struct iommu_domain *domain) +static void arm_smmu_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) { - struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu; + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - if (smmu) - arm_smmu_cmdq_issue_sync(smmu); + arm_smmu_tlb_inv_range(gather->start, gather->end - gather->start, + gather->pgsize, true, smmu_domain); } static phys_addr_t @@ -2034,16 +2763,11 @@ arm_smmu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) static struct platform_driver arm_smmu_driver; -static int arm_smmu_match_node(struct device *dev, const void *data) -{ - return dev->fwnode == data; -} - static struct arm_smmu_device *arm_smmu_get_by_fwnode(struct fwnode_handle *fwnode) { - struct device *dev = driver_find_device(&arm_smmu_driver.driver, NULL, - fwnode, arm_smmu_match_node); + struct device *dev = driver_find_device_by_fwnode(&arm_smmu_driver.driver, + fwnode); put_device(dev); return dev ? dev_get_drvdata(dev) : NULL; } @@ -2070,51 +2794,66 @@ static int arm_smmu_add_device(struct device *dev) if (!fwspec || fwspec->ops != &arm_smmu_ops) return -ENODEV; - /* - * We _can_ actually withstand dodgy bus code re-calling add_device() - * without an intervening remove_device()/of_xlate() sequence, but - * we're not going to do so quietly... - */ - if (WARN_ON_ONCE(fwspec->iommu_priv)) { - master = fwspec->iommu_priv; - smmu = master->smmu; - } else { - smmu = arm_smmu_get_by_fwnode(fwspec->iommu_fwnode); - if (!smmu) - return -ENODEV; - master = kzalloc(sizeof(*master), GFP_KERNEL); - if (!master) - return -ENOMEM; - master->dev = dev; - master->smmu = smmu; - master->sids = fwspec->ids; - master->num_sids = fwspec->num_ids; - fwspec->iommu_priv = master; - } + if (WARN_ON_ONCE(fwspec->iommu_priv)) + return -EBUSY; + + smmu = arm_smmu_get_by_fwnode(fwspec->iommu_fwnode); + if (!smmu) + return -ENODEV; + + master = kzalloc(sizeof(*master), GFP_KERNEL); + if (!master) + return -ENOMEM; + + master->dev = dev; + master->smmu = smmu; + master->sids = fwspec->ids; + master->num_sids = fwspec->num_ids; + fwspec->iommu_priv = master; /* Check the SIDs are in range of the SMMU and our stream table */ for (i = 0; i < master->num_sids; i++) { u32 sid = master->sids[i]; - if (!arm_smmu_sid_in_range(smmu, sid)) - return -ERANGE; + if (!arm_smmu_sid_in_range(smmu, sid)) { + ret = -ERANGE; + goto err_free_master; + } /* Ensure l2 strtab is initialised */ if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) { ret = arm_smmu_init_l2_strtab(smmu, sid); if (ret) - return ret; + goto err_free_master; } } + master->ssid_bits = min(smmu->ssid_bits, fwspec->num_pasid_bits); + + if (!(smmu->features & ARM_SMMU_FEAT_2_LVL_CDTAB)) + master->ssid_bits = min_t(u8, master->ssid_bits, + CTXDESC_LINEAR_CDMAX); + + ret = iommu_device_link(&smmu->iommu, dev); + if (ret) + goto err_free_master; + group = iommu_group_get_for_dev(dev); - if (!IS_ERR(group)) { - iommu_group_put(group); - iommu_device_link(&smmu->iommu, dev); + if (IS_ERR(group)) { + ret = PTR_ERR(group); + goto err_unlink; } - return PTR_ERR_OR_ZERO(group); + iommu_group_put(group); + return 0; + +err_unlink: + iommu_device_unlink(&smmu->iommu, dev); +err_free_master: + kfree(master); + fwspec->iommu_priv = NULL; + return ret; } static void arm_smmu_remove_device(struct device *dev) @@ -2246,15 +2985,6 @@ static void arm_smmu_get_resv_regions(struct device *dev, iommu_dma_get_resv_regions(dev, head); } -static void arm_smmu_put_resv_regions(struct device *dev, - struct list_head *head) -{ - struct iommu_resv_region *entry, *next; - - list_for_each_entry_safe(entry, next, head, list) - kfree(entry); -} - static struct iommu_ops arm_smmu_ops = { .capable = arm_smmu_capable, .domain_alloc = arm_smmu_domain_alloc, @@ -2272,7 +3002,7 @@ static struct iommu_ops arm_smmu_ops = { .domain_set_attr = arm_smmu_domain_set_attr, .of_xlate = arm_smmu_of_xlate, .get_resv_regions = arm_smmu_get_resv_regions, - .put_resv_regions = arm_smmu_put_resv_regions, + .put_resv_regions = generic_iommu_put_resv_regions, .pgsize_bitmap = -1UL, /* Restricted during device attach */ }; @@ -2286,13 +3016,13 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu, size_t qsz; do { - qsz = ((1 << q->max_n_shift) * dwords) << 3; + qsz = ((1 << q->llq.max_n_shift) * dwords) << 3; q->base = dmam_alloc_coherent(smmu->dev, qsz, &q->base_dma, GFP_KERNEL); if (q->base || qsz < PAGE_SIZE) break; - q->max_n_shift--; + q->llq.max_n_shift--; } while (1); if (!q->base) { @@ -2304,7 +3034,7 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu, if (!WARN_ON(q->base_dma & (qsz - 1))) { dev_info(smmu->dev, "allocated %u entries for %s\n", - 1 << q->max_n_shift, name); + 1 << q->llq.max_n_shift, name); } q->prod_reg = arm_smmu_page1_fixup(prod_off, smmu); @@ -2313,24 +3043,55 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu, q->q_base = Q_BASE_RWA; q->q_base |= q->base_dma & Q_BASE_ADDR_MASK; - q->q_base |= FIELD_PREP(Q_BASE_LOG2SIZE, q->max_n_shift); + q->q_base |= FIELD_PREP(Q_BASE_LOG2SIZE, q->llq.max_n_shift); - q->prod = q->cons = 0; + q->llq.prod = q->llq.cons = 0; return 0; } +static void arm_smmu_cmdq_free_bitmap(void *data) +{ + unsigned long *bitmap = data; + bitmap_free(bitmap); +} + +static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu) +{ + int ret = 0; + struct arm_smmu_cmdq *cmdq = &smmu->cmdq; + unsigned int nents = 1 << cmdq->q.llq.max_n_shift; + atomic_long_t *bitmap; + + atomic_set(&cmdq->owner_prod, 0); + atomic_set(&cmdq->lock, 0); + + bitmap = (atomic_long_t *)bitmap_zalloc(nents, GFP_KERNEL); + if (!bitmap) { + dev_err(smmu->dev, "failed to allocate cmdq bitmap\n"); + ret = -ENOMEM; + } else { + cmdq->valid_map = bitmap; + devm_add_action(smmu->dev, arm_smmu_cmdq_free_bitmap, bitmap); + } + + return ret; +} + static int arm_smmu_init_queues(struct arm_smmu_device *smmu) { int ret; /* cmdq */ - spin_lock_init(&smmu->cmdq.lock); ret = arm_smmu_init_one_queue(smmu, &smmu->cmdq.q, ARM_SMMU_CMDQ_PROD, ARM_SMMU_CMDQ_CONS, CMDQ_ENT_DWORDS, "cmdq"); if (ret) return ret; + ret = arm_smmu_cmdq_init(smmu); + if (ret) + return ret; + /* evtq */ ret = arm_smmu_init_one_queue(smmu, &smmu->evtq.q, ARM_SMMU_EVTQ_PROD, ARM_SMMU_EVTQ_CONS, EVTQ_ENT_DWORDS, @@ -2388,7 +3149,7 @@ static int arm_smmu_init_strtab_2lvl(struct arm_smmu_device *smmu) l1size = cfg->num_l1_ents * (STRTAB_L1_DESC_DWORDS << 3); strtab = dmam_alloc_coherent(smmu->dev, l1size, &cfg->strtab_dma, - GFP_KERNEL | __GFP_ZERO); + GFP_KERNEL); if (!strtab) { dev_err(smmu->dev, "failed to allocate l1 stream table (%u bytes)\n", @@ -2415,7 +3176,7 @@ static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu) size = (1 << smmu->sid_bits) * (STRTAB_STE_DWORDS << 3); strtab = dmam_alloc_coherent(smmu->dev, size, &cfg->strtab_dma, - GFP_KERNEL | __GFP_ZERO); + GFP_KERNEL); if (!strtab) { dev_err(smmu->dev, "failed to allocate linear stream table (%u bytes)\n", @@ -2708,8 +3469,8 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass) /* Command queue */ writeq_relaxed(smmu->cmdq.q.q_base, smmu->base + ARM_SMMU_CMDQ_BASE); - writel_relaxed(smmu->cmdq.q.prod, smmu->base + ARM_SMMU_CMDQ_PROD); - writel_relaxed(smmu->cmdq.q.cons, smmu->base + ARM_SMMU_CMDQ_CONS); + writel_relaxed(smmu->cmdq.q.llq.prod, smmu->base + ARM_SMMU_CMDQ_PROD); + writel_relaxed(smmu->cmdq.q.llq.cons, smmu->base + ARM_SMMU_CMDQ_CONS); enables = CR0_CMDQEN; ret = arm_smmu_write_reg_sync(smmu, enables, ARM_SMMU_CR0, @@ -2736,9 +3497,9 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass) /* Event queue */ writeq_relaxed(smmu->evtq.q.q_base, smmu->base + ARM_SMMU_EVTQ_BASE); - writel_relaxed(smmu->evtq.q.prod, + writel_relaxed(smmu->evtq.q.llq.prod, arm_smmu_page1_fixup(ARM_SMMU_EVTQ_PROD, smmu)); - writel_relaxed(smmu->evtq.q.cons, + writel_relaxed(smmu->evtq.q.llq.cons, arm_smmu_page1_fixup(ARM_SMMU_EVTQ_CONS, smmu)); enables |= CR0_EVTQEN; @@ -2753,9 +3514,9 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass) if (smmu->features & ARM_SMMU_FEAT_PRI) { writeq_relaxed(smmu->priq.q.q_base, smmu->base + ARM_SMMU_PRIQ_BASE); - writel_relaxed(smmu->priq.q.prod, + writel_relaxed(smmu->priq.q.llq.prod, arm_smmu_page1_fixup(ARM_SMMU_PRIQ_PROD, smmu)); - writel_relaxed(smmu->priq.q.cons, + writel_relaxed(smmu->priq.q.llq.cons, arm_smmu_page1_fixup(ARM_SMMU_PRIQ_CONS, smmu)); enables |= CR0_PRIQEN; @@ -2909,18 +3670,24 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) } /* Queue sizes, capped to ensure natural alignment */ - smmu->cmdq.q.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT, - FIELD_GET(IDR1_CMDQS, reg)); - if (!smmu->cmdq.q.max_n_shift) { - /* Odd alignment restrictions on the base, so ignore for now */ - dev_err(smmu->dev, "unit-length command queue not supported\n"); + smmu->cmdq.q.llq.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT, + FIELD_GET(IDR1_CMDQS, reg)); + if (smmu->cmdq.q.llq.max_n_shift <= ilog2(CMDQ_BATCH_ENTRIES)) { + /* + * We don't support splitting up batches, so one batch of + * commands plus an extra sync needs to fit inside the command + * queue. There's also no way we can handle the weird alignment + * restrictions on the base pointer for a unit-length queue. + */ + dev_err(smmu->dev, "command queue size <= %d entries not supported\n", + CMDQ_BATCH_ENTRIES); return -ENXIO; } - smmu->evtq.q.max_n_shift = min_t(u32, EVTQ_MAX_SZ_SHIFT, - FIELD_GET(IDR1_EVTQS, reg)); - smmu->priq.q.max_n_shift = min_t(u32, PRIQ_MAX_SZ_SHIFT, - FIELD_GET(IDR1_PRIQS, reg)); + smmu->evtq.q.llq.max_n_shift = min_t(u32, EVTQ_MAX_SZ_SHIFT, + FIELD_GET(IDR1_EVTQS, reg)); + smmu->priq.q.llq.max_n_shift = min_t(u32, PRIQ_MAX_SZ_SHIFT, + FIELD_GET(IDR1_PRIQS, reg)); /* SID/SSID sizes */ smmu->ssid_bits = FIELD_GET(IDR1_SSIDSIZE, reg); @@ -3069,6 +3836,43 @@ static unsigned long arm_smmu_resource_size(struct arm_smmu_device *smmu) return SZ_128K; } +static int arm_smmu_set_bus_ops(struct iommu_ops *ops) +{ + int err; + +#ifdef CONFIG_PCI + if (pci_bus_type.iommu_ops != ops) { + err = bus_set_iommu(&pci_bus_type, ops); + if (err) + return err; + } +#endif +#ifdef CONFIG_ARM_AMBA + if (amba_bustype.iommu_ops != ops) { + err = bus_set_iommu(&amba_bustype, ops); + if (err) + goto err_reset_pci_ops; + } +#endif + if (platform_bus_type.iommu_ops != ops) { + err = bus_set_iommu(&platform_bus_type, ops); + if (err) + goto err_reset_amba_ops; + } + + return 0; + +err_reset_amba_ops: +#ifdef CONFIG_ARM_AMBA + bus_set_iommu(&amba_bustype, NULL); +#endif +err_reset_pci_ops: __maybe_unused; +#ifdef CONFIG_PCI + bus_set_iommu(&pci_bus_type, NULL); +#endif + return err; +} + static int arm_smmu_device_probe(struct platform_device *pdev) { int irq, ret; @@ -3098,7 +3902,7 @@ static int arm_smmu_device_probe(struct platform_device *pdev) /* Base address */ res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (resource_size(res) + 1 < arm_smmu_resource_size(smmu)) { + if (resource_size(res) < arm_smmu_resource_size(smmu)) { dev_err(dev, "MMIO region too small (%pr)\n", res); return -EINVAL; } @@ -3110,19 +3914,19 @@ static int arm_smmu_device_probe(struct platform_device *pdev) /* Interrupt lines */ - irq = platform_get_irq_byname(pdev, "combined"); + irq = platform_get_irq_byname_optional(pdev, "combined"); if (irq > 0) smmu->combined_irq = irq; else { - irq = platform_get_irq_byname(pdev, "eventq"); + irq = platform_get_irq_byname_optional(pdev, "eventq"); if (irq > 0) smmu->evtq.q.irq = irq; - irq = platform_get_irq_byname(pdev, "priq"); + irq = platform_get_irq_byname_optional(pdev, "priq"); if (irq > 0) smmu->priq.q.irq = irq; - irq = platform_get_irq_byname(pdev, "gerror"); + irq = platform_get_irq_byname_optional(pdev, "gerror"); if (irq > 0) smmu->gerr_irq = irq; } @@ -3159,48 +3963,45 @@ static int arm_smmu_device_probe(struct platform_device *pdev) return ret; } -#ifdef CONFIG_PCI - if (pci_bus_type.iommu_ops != &arm_smmu_ops) { - pci_request_acs(); - ret = bus_set_iommu(&pci_bus_type, &arm_smmu_ops); - if (ret) - return ret; - } -#endif -#ifdef CONFIG_ARM_AMBA - if (amba_bustype.iommu_ops != &arm_smmu_ops) { - ret = bus_set_iommu(&amba_bustype, &arm_smmu_ops); - if (ret) - return ret; - } -#endif - if (platform_bus_type.iommu_ops != &arm_smmu_ops) { - ret = bus_set_iommu(&platform_bus_type, &arm_smmu_ops); - if (ret) - return ret; - } - return 0; + return arm_smmu_set_bus_ops(&arm_smmu_ops); } -static void arm_smmu_device_shutdown(struct platform_device *pdev) +static int arm_smmu_device_remove(struct platform_device *pdev) { struct arm_smmu_device *smmu = platform_get_drvdata(pdev); + arm_smmu_set_bus_ops(NULL); + iommu_device_unregister(&smmu->iommu); + iommu_device_sysfs_remove(&smmu->iommu); arm_smmu_device_disable(smmu); + + return 0; +} + +static void arm_smmu_device_shutdown(struct platform_device *pdev) +{ + arm_smmu_device_remove(pdev); } static const struct of_device_id arm_smmu_of_match[] = { { .compatible = "arm,smmu-v3", }, { }, }; +MODULE_DEVICE_TABLE(of, arm_smmu_of_match); static struct platform_driver arm_smmu_driver = { .driver = { - .name = "arm-smmu-v3", - .of_match_table = of_match_ptr(arm_smmu_of_match), - .suppress_bind_attrs = true, + .name = "arm-smmu-v3", + .of_match_table = arm_smmu_of_match, + .suppress_bind_attrs = true, }, .probe = arm_smmu_device_probe, + .remove = arm_smmu_device_remove, .shutdown = arm_smmu_device_shutdown, }; -builtin_platform_driver(arm_smmu_driver); +module_platform_driver(arm_smmu_driver); + +MODULE_DESCRIPTION("IOMMU API for ARM architected SMMUv3 implementations"); +MODULE_AUTHOR("Will Deacon <will@kernel.org>"); +MODULE_ALIAS("platform:arm-smmu-v3"); +MODULE_LICENSE("GPL v2"); |