summaryrefslogtreecommitdiffstats
path: root/drivers/infiniband/hw/mlx5
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/infiniband/hw/mlx5')
-rw-r--r--drivers/infiniband/hw/mlx5/Makefile2
-rw-r--r--drivers/infiniband/hw/mlx5/cmd.c140
-rw-r--r--drivers/infiniband/hw/mlx5/cmd.h6
-rw-r--r--drivers/infiniband/hw/mlx5/cq.c41
-rw-r--r--drivers/infiniband/hw/mlx5/devx.c288
-rw-r--r--drivers/infiniband/hw/mlx5/doorbell.c3
-rw-r--r--drivers/infiniband/hw/mlx5/flow.c63
-rw-r--r--drivers/infiniband/hw/mlx5/gsi.c5
-rw-r--r--drivers/infiniband/hw/mlx5/ib_rep.c2
-rw-r--r--drivers/infiniband/hw/mlx5/ib_rep.h2
-rw-r--r--drivers/infiniband/hw/mlx5/ib_virt.c24
-rw-r--r--drivers/infiniband/hw/mlx5/mad.c124
-rw-r--r--drivers/infiniband/hw/mlx5/main.c831
-rw-r--r--drivers/infiniband/hw/mlx5/mem.c236
-rw-r--r--drivers/infiniband/hw/mlx5/mlx5_ib.h171
-rw-r--r--drivers/infiniband/hw/mlx5/mr.c360
-rw-r--r--drivers/infiniband/hw/mlx5/odp.c1185
-rw-r--r--drivers/infiniband/hw/mlx5/qp.c291
-rw-r--r--drivers/infiniband/hw/mlx5/restrack.c90
-rw-r--r--drivers/infiniband/hw/mlx5/srq.c2
-rw-r--r--drivers/infiniband/hw/mlx5/srq_cmd.c6
21 files changed, 2229 insertions, 1643 deletions
diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile
index 9924be8384d8..d0a043ccbe58 100644
--- a/drivers/infiniband/hw/mlx5/Makefile
+++ b/drivers/infiniband/hw/mlx5/Makefile
@@ -3,7 +3,7 @@ obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o
mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq_cmd.o \
srq.o mr.o ah.o mad.o gsi.o ib_virt.o cmd.o \
- cong.o
+ cong.o restrack.o
mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o
mlx5_ib-$(CONFIG_MLX5_ESWITCH) += ib_rep.o
mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += devx.o
diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c
index 6c8645033102..4c26492ab8a3 100644
--- a/drivers/infiniband/hw/mlx5/cmd.c
+++ b/drivers/infiniband/hw/mlx5/cmd.c
@@ -157,7 +157,7 @@ int mlx5_cmd_alloc_memic(struct mlx5_dm *dm, phys_addr_t *addr,
return -ENOMEM;
}
-int mlx5_cmd_dealloc_memic(struct mlx5_dm *dm, phys_addr_t addr, u64 length)
+void mlx5_cmd_dealloc_memic(struct mlx5_dm *dm, phys_addr_t addr, u64 length)
{
struct mlx5_core_dev *dev = dm->dev;
u64 hw_start_addr = MLX5_CAP64_DEV_MEM(dev, memic_bar_start_addr);
@@ -175,145 +175,13 @@ int mlx5_cmd_dealloc_memic(struct mlx5_dm *dm, phys_addr_t addr, u64 length)
MLX5_SET(dealloc_memic_in, in, memic_size, length);
err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
-
- if (!err) {
- spin_lock(&dm->lock);
- bitmap_clear(dm->memic_alloc_pages,
- start_page_idx, num_pages);
- spin_unlock(&dm->lock);
- }
-
- return err;
-}
-
-int mlx5_cmd_alloc_sw_icm(struct mlx5_dm *dm, int type, u64 length,
- u16 uid, phys_addr_t *addr, u32 *obj_id)
-{
- struct mlx5_core_dev *dev = dm->dev;
- u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
- u32 in[MLX5_ST_SZ_DW(create_sw_icm_in)] = {};
- unsigned long *block_map;
- u64 icm_start_addr;
- u32 log_icm_size;
- u32 num_blocks;
- u32 max_blocks;
- u64 block_idx;
- void *sw_icm;
- int ret;
-
- MLX5_SET(general_obj_in_cmd_hdr, in, opcode,
- MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
- MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_SW_ICM);
- MLX5_SET(general_obj_in_cmd_hdr, in, uid, uid);
-
- switch (type) {
- case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
- icm_start_addr = MLX5_CAP64_DEV_MEM(dev,
- steering_sw_icm_start_address);
- log_icm_size = MLX5_CAP_DEV_MEM(dev, log_steering_sw_icm_size);
- block_map = dm->steering_sw_icm_alloc_blocks;
- break;
- case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
- icm_start_addr = MLX5_CAP64_DEV_MEM(dev,
- header_modify_sw_icm_start_address);
- log_icm_size = MLX5_CAP_DEV_MEM(dev,
- log_header_modify_sw_icm_size);
- block_map = dm->header_modify_sw_icm_alloc_blocks;
- break;
- default:
- return -EINVAL;
- }
-
- num_blocks = (length + MLX5_SW_ICM_BLOCK_SIZE(dev) - 1) >>
- MLX5_LOG_SW_ICM_BLOCK_SIZE(dev);
- max_blocks = BIT(log_icm_size - MLX5_LOG_SW_ICM_BLOCK_SIZE(dev));
- spin_lock(&dm->lock);
- block_idx = bitmap_find_next_zero_area(block_map,
- max_blocks,
- 0,
- num_blocks, 0);
-
- if (block_idx < max_blocks)
- bitmap_set(block_map,
- block_idx, num_blocks);
-
- spin_unlock(&dm->lock);
-
- if (block_idx >= max_blocks)
- return -ENOMEM;
-
- sw_icm = MLX5_ADDR_OF(create_sw_icm_in, in, sw_icm);
- icm_start_addr += block_idx << MLX5_LOG_SW_ICM_BLOCK_SIZE(dev);
- MLX5_SET64(sw_icm, sw_icm, sw_icm_start_addr,
- icm_start_addr);
- MLX5_SET(sw_icm, sw_icm, log_sw_icm_size, ilog2(length));
-
- ret = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
- if (ret) {
- spin_lock(&dm->lock);
- bitmap_clear(block_map,
- block_idx, num_blocks);
- spin_unlock(&dm->lock);
-
- return ret;
- }
-
- *addr = icm_start_addr;
- *obj_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
-
- return 0;
-}
-
-int mlx5_cmd_dealloc_sw_icm(struct mlx5_dm *dm, int type, u64 length,
- u16 uid, phys_addr_t addr, u32 obj_id)
-{
- struct mlx5_core_dev *dev = dm->dev;
- u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
- u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
- unsigned long *block_map;
- u32 num_blocks;
- u64 start_idx;
- int err;
-
- num_blocks = (length + MLX5_SW_ICM_BLOCK_SIZE(dev) - 1) >>
- MLX5_LOG_SW_ICM_BLOCK_SIZE(dev);
-
- switch (type) {
- case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
- start_idx =
- (addr - MLX5_CAP64_DEV_MEM(
- dev, steering_sw_icm_start_address)) >>
- MLX5_LOG_SW_ICM_BLOCK_SIZE(dev);
- block_map = dm->steering_sw_icm_alloc_blocks;
- break;
- case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
- start_idx =
- (addr -
- MLX5_CAP64_DEV_MEM(
- dev, header_modify_sw_icm_start_address)) >>
- MLX5_LOG_SW_ICM_BLOCK_SIZE(dev);
- block_map = dm->header_modify_sw_icm_alloc_blocks;
- break;
- default:
- return -EINVAL;
- }
-
- MLX5_SET(general_obj_in_cmd_hdr, in, opcode,
- MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
- MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_SW_ICM);
- MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, obj_id);
- MLX5_SET(general_obj_in_cmd_hdr, in, uid, uid);
-
- err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
if (err)
- return err;
+ return;
spin_lock(&dm->lock);
- bitmap_clear(block_map,
- start_idx, num_blocks);
+ bitmap_clear(dm->memic_alloc_pages,
+ start_page_idx, num_pages);
spin_unlock(&dm->lock);
-
- return 0;
}
int mlx5_cmd_query_ext_ppcnt_counters(struct mlx5_core_dev *dev, void *out)
diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h
index 0572dcba6eae..945ebce73613 100644
--- a/drivers/infiniband/hw/mlx5/cmd.h
+++ b/drivers/infiniband/hw/mlx5/cmd.h
@@ -46,7 +46,7 @@ int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *mdev,
void *in, int in_size);
int mlx5_cmd_alloc_memic(struct mlx5_dm *dm, phys_addr_t *addr,
u64 length, u32 alignment);
-int mlx5_cmd_dealloc_memic(struct mlx5_dm *dm, phys_addr_t addr, u64 length);
+void mlx5_cmd_dealloc_memic(struct mlx5_dm *dm, phys_addr_t addr, u64 length);
void mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid);
void mlx5_cmd_destroy_tir(struct mlx5_core_dev *dev, u32 tirn, u16 uid);
void mlx5_cmd_destroy_tis(struct mlx5_core_dev *dev, u32 tisn, u16 uid);
@@ -65,8 +65,4 @@ int mlx5_cmd_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id,
u16 uid);
int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb,
u16 opmod, u8 port);
-int mlx5_cmd_alloc_sw_icm(struct mlx5_dm *dm, int type, u64 length,
- u16 uid, phys_addr_t *addr, u32 *obj_id);
-int mlx5_cmd_dealloc_sw_icm(struct mlx5_dm *dm, int type, u64 length,
- u16 uid, phys_addr_t addr, u32 obj_id);
#endif /* MLX5_IB_CMD_H */
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 45f48cde6b9d..367a71bc5f4b 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -423,9 +423,6 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq,
struct mlx5_cqe64 *cqe64;
struct mlx5_core_qp *mqp;
struct mlx5_ib_wq *wq;
- struct mlx5_sig_err_cqe *sig_err_cqe;
- struct mlx5_core_mkey *mmkey;
- struct mlx5_ib_mr *mr;
uint8_t opcode;
uint32_t qpn;
u16 wqe_ctr;
@@ -519,27 +516,29 @@ repoll:
}
}
break;
- case MLX5_CQE_SIG_ERR:
- sig_err_cqe = (struct mlx5_sig_err_cqe *)cqe64;
+ case MLX5_CQE_SIG_ERR: {
+ struct mlx5_sig_err_cqe *sig_err_cqe =
+ (struct mlx5_sig_err_cqe *)cqe64;
+ struct mlx5_core_sig_ctx *sig;
- xa_lock(&dev->mdev->priv.mkey_table);
- mmkey = xa_load(&dev->mdev->priv.mkey_table,
+ xa_lock(&dev->sig_mrs);
+ sig = xa_load(&dev->sig_mrs,
mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey)));
- mr = to_mibmr(mmkey);
- get_sig_err_item(sig_err_cqe, &mr->sig->err_item);
- mr->sig->sig_err_exists = true;
- mr->sig->sigerr_count++;
+ get_sig_err_item(sig_err_cqe, &sig->err_item);
+ sig->sig_err_exists = true;
+ sig->sigerr_count++;
mlx5_ib_warn(dev, "CQN: 0x%x Got SIGERR on key: 0x%x err_type %x err_offset %llx expected %x actual %x\n",
- cq->mcq.cqn, mr->sig->err_item.key,
- mr->sig->err_item.err_type,
- mr->sig->err_item.sig_err_offset,
- mr->sig->err_item.expected,
- mr->sig->err_item.actual);
+ cq->mcq.cqn, sig->err_item.key,
+ sig->err_item.err_type,
+ sig->err_item.sig_err_offset,
+ sig->err_item.expected,
+ sig->err_item.actual);
- xa_unlock(&dev->mdev->priv.mkey_table);
+ xa_unlock(&dev->sig_mrs);
goto repoll;
}
+ }
return 0;
}
@@ -709,8 +708,8 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
*cqe_size = ucmd.cqe_size;
cq->buf.umem =
- ib_umem_get(udata, ucmd.buf_addr, entries * ucmd.cqe_size,
- IB_ACCESS_LOCAL_WRITE, 1);
+ ib_umem_get(&dev->ib_dev, ucmd.buf_addr,
+ entries * ucmd.cqe_size, IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(cq->buf.umem)) {
err = PTR_ERR(cq->buf.umem);
return err;
@@ -1109,9 +1108,9 @@ static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
if (ucmd.cqe_size && SIZE_MAX / ucmd.cqe_size <= entries - 1)
return -EINVAL;
- umem = ib_umem_get(udata, ucmd.buf_addr,
+ umem = ib_umem_get(&dev->ib_dev, ucmd.buf_addr,
(size_t)ucmd.cqe_size * entries,
- IB_ACCESS_LOCAL_WRITE, 1);
+ IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(umem)) {
err = PTR_ERR(umem);
return err;
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index af5bbb35c058..46e1ab771f10 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -30,7 +30,7 @@ enum devx_obj_flags {
struct devx_async_data {
struct mlx5_ib_dev *mdev;
struct list_head list;
- struct ib_uobject *fd_uobj;
+ struct devx_async_cmd_event_file *ev_file;
struct mlx5_async_work cb_work;
u16 cmd_out_len;
/* must be last field in this structure */
@@ -72,7 +72,6 @@ struct devx_event_subscription {
struct rcu_head rcu;
u64 cookie;
struct devx_async_event_file *ev_file;
- struct file *filp; /* Upon hot unplug we need a direct access to */
struct eventfd_ctx *eventfd;
};
@@ -100,6 +99,7 @@ struct devx_obj {
struct mlx5_ib_devx_mr devx_mr;
struct mlx5_core_dct core_dct;
struct mlx5_core_cq core_cq;
+ u32 flow_counter_bulk_size;
};
struct list_head event_sub; /* holds devx_event_subscription entries */
};
@@ -192,15 +192,20 @@ bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type)
}
}
-bool mlx5_ib_devx_is_flow_counter(void *obj, u32 *counter_id)
+bool mlx5_ib_devx_is_flow_counter(void *obj, u32 offset, u32 *counter_id)
{
struct devx_obj *devx_obj = obj;
u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox, opcode);
if (opcode == MLX5_CMD_OP_DEALLOC_FLOW_COUNTER) {
+
+ if (offset && offset >= devx_obj->flow_counter_bulk_size)
+ return false;
+
*counter_id = MLX5_GET(dealloc_flow_counter_in,
devx_obj->dinbox,
flow_counter_id);
+ *counter_id += offset;
return true;
}
@@ -233,6 +238,8 @@ static bool is_legacy_obj_event_num(u16 event_num)
case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR:
case MLX5_EVENT_TYPE_DCT_DRAINED:
case MLX5_EVENT_TYPE_COMP:
+ case MLX5_EVENT_TYPE_DCT_KEY_VIOLATION:
+ case MLX5_EVENT_TYPE_XRQ_ERROR:
return true;
default:
return false;
@@ -315,8 +322,10 @@ static u16 get_event_obj_type(unsigned long event_type, struct mlx5_eqe *eqe)
case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR:
return eqe->data.qp_srq.type;
case MLX5_EVENT_TYPE_CQ_ERROR:
+ case MLX5_EVENT_TYPE_XRQ_ERROR:
return 0;
case MLX5_EVENT_TYPE_DCT_DRAINED:
+ case MLX5_EVENT_TYPE_DCT_KEY_VIOLATION:
return MLX5_EVENT_QUEUE_TYPE_DCT;
default:
return MLX5_GET(affiliated_event_header, &eqe->data, obj_type);
@@ -542,6 +551,8 @@ static u64 devx_get_obj_id(const void *in)
break;
case MLX5_CMD_OP_ARM_XRQ:
case MLX5_CMD_OP_SET_XRQ_DC_PARAMS_ENTRY:
+ case MLX5_CMD_OP_RELEASE_XRQ_ERROR:
+ case MLX5_CMD_OP_MODIFY_XRQ:
obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_XRQ,
MLX5_GET(arm_xrq_in, in, xrqn));
break;
@@ -776,6 +787,14 @@ static bool devx_is_obj_create_cmd(const void *in, u16 *opcode)
return true;
return false;
}
+ case MLX5_CMD_OP_CREATE_PSV:
+ {
+ u8 num_psv = MLX5_GET(create_psv_in, in, num_psv);
+
+ if (num_psv == 1)
+ return true;
+ return false;
+ }
default:
return false;
}
@@ -810,6 +829,8 @@ static bool devx_is_obj_modify_cmd(const void *in)
case MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION:
case MLX5_CMD_OP_ARM_XRQ:
case MLX5_CMD_OP_SET_XRQ_DC_PARAMS_ENTRY:
+ case MLX5_CMD_OP_RELEASE_XRQ_ERROR:
+ case MLX5_CMD_OP_MODIFY_XRQ:
return true;
case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY:
{
@@ -922,6 +943,7 @@ static bool devx_is_general_cmd(void *in, struct mlx5_ib_dev *dev)
case MLX5_CMD_OP_QUERY_CONG_STATUS:
case MLX5_CMD_OP_QUERY_CONG_PARAMS:
case MLX5_CMD_OP_QUERY_CONG_STATISTICS:
+ case MLX5_CMD_OP_QUERY_LAG:
return true;
default:
return false;
@@ -1215,6 +1237,12 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din,
case MLX5_CMD_OP_ALLOC_XRCD:
MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DEALLOC_XRCD);
break;
+ case MLX5_CMD_OP_CREATE_PSV:
+ MLX5_SET(general_obj_in_cmd_hdr, din, opcode,
+ MLX5_CMD_OP_DESTROY_PSV);
+ MLX5_SET(destroy_psv_in, din, psvn,
+ MLX5_GET(create_psv_out, out, psv0_index));
+ break;
default:
/* The entry must match to one of the devx_is_obj_create_cmd */
WARN_ON(true);
@@ -1242,8 +1270,8 @@ static int devx_handle_mkey_indirect(struct devx_obj *obj,
mkey->pd = MLX5_GET(mkc, mkc, pd);
devx_mr->ndescs = MLX5_GET(mkc, mkc, translations_octword_size);
- return xa_err(xa_store(&dev->mdev->priv.mkey_table,
- mlx5_base_mkey(mkey->key), mkey, GFP_KERNEL));
+ return xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(mkey->key), mkey,
+ GFP_KERNEL));
}
static int devx_handle_mkey_create(struct mlx5_ib_dev *dev,
@@ -1275,29 +1303,6 @@ static int devx_handle_mkey_create(struct mlx5_ib_dev *dev,
return 0;
}
-static void devx_free_indirect_mkey(struct rcu_head *rcu)
-{
- kfree(container_of(rcu, struct devx_obj, devx_mr.rcu));
-}
-
-/* This function to delete from the radix tree needs to be called before
- * destroying the underlying mkey. Otherwise a race might occur in case that
- * other thread will get the same mkey before this one will be deleted,
- * in that case it will fail via inserting to the tree its own data.
- *
- * Note:
- * An error in the destroy is not expected unless there is some other indirect
- * mkey which points to this one. In a kernel cleanup flow it will be just
- * destroyed in the iterative destruction call. In a user flow, in case
- * the application didn't close in the expected order it's its own problem,
- * the mkey won't be part of the tree, in both cases the kernel is safe.
- */
-static void devx_cleanup_mkey(struct devx_obj *obj)
-{
- xa_erase(&obj->ib_dev->mdev->priv.mkey_table,
- mlx5_base_mkey(obj->devx_mr.mmkey.key));
-}
-
static void devx_cleanup_subscription(struct mlx5_ib_dev *dev,
struct devx_event_subscription *sub)
{
@@ -1339,8 +1344,16 @@ static int devx_obj_cleanup(struct ib_uobject *uobject,
int ret;
dev = mlx5_udata_to_mdev(&attrs->driver_udata);
- if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY)
- devx_cleanup_mkey(obj);
+ if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
+ /*
+ * The pagefault_single_data_segment() does commands against
+ * the mmkey, we must wait for that to stop before freeing the
+ * mkey, as another allocation could get the same mkey #.
+ */
+ xa_erase(&obj->ib_dev->odp_mkeys,
+ mlx5_base_mkey(obj->devx_mr.mmkey.key));
+ synchronize_srcu(&dev->odp_srcu);
+ }
if (obj->flags & DEVX_OBJ_FLAGS_DCT)
ret = mlx5_core_destroy_dct(obj->ib_dev->mdev, &obj->core_dct);
@@ -1359,12 +1372,6 @@ static int devx_obj_cleanup(struct ib_uobject *uobject,
devx_cleanup_subscription(dev, sub_entry);
mutex_unlock(&devx_event_table->event_xa_lock);
- if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
- call_srcu(&dev->mr_srcu, &obj->devx_mr.rcu,
- devx_free_indirect_mkey);
- return ret;
- }
-
kfree(obj);
return ret;
}
@@ -1461,6 +1468,13 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)(
if (err)
goto obj_free;
+ if (opcode == MLX5_CMD_OP_ALLOC_FLOW_COUNTER) {
+ u8 bulk = MLX5_GET(alloc_flow_counter_in,
+ cmd_in,
+ flow_counter_bulk);
+ obj->flow_counter_bulk_size = 128UL * bulk;
+ }
+
uobj->object = obj;
INIT_LIST_HEAD(&obj->event_sub);
obj->ib_dev = dev;
@@ -1468,26 +1482,21 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)(
&obj_id);
WARN_ON(obj->dinlen > MLX5_MAX_DESTROY_INBOX_SIZE_DW * sizeof(u32));
- if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
- err = devx_handle_mkey_indirect(obj, dev, cmd_in, cmd_out);
- if (err)
- goto obj_destroy;
- }
-
err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT, cmd_out, cmd_out_len);
if (err)
- goto err_copy;
+ goto obj_destroy;
if (opcode == MLX5_CMD_OP_CREATE_GENERAL_OBJECT)
obj_type = MLX5_GET(general_obj_in_cmd_hdr, cmd_in, obj_type);
-
obj->obj_id = get_enc_obj_id(opcode | obj_type << 16, obj_id);
+ if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
+ err = devx_handle_mkey_indirect(obj, dev, cmd_in, cmd_out);
+ if (err)
+ goto obj_destroy;
+ }
return 0;
-err_copy:
- if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY)
- devx_cleanup_mkey(obj);
obj_destroy:
if (obj->flags & DEVX_OBJ_FLAGS_DCT)
mlx5_core_destroy_dct(obj->ib_dev->mdev, &obj->core_dct);
@@ -1664,21 +1673,20 @@ static void devx_query_callback(int status, struct mlx5_async_work *context)
{
struct devx_async_data *async_data =
container_of(context, struct devx_async_data, cb_work);
- struct ib_uobject *fd_uobj = async_data->fd_uobj;
- struct devx_async_cmd_event_file *ev_file;
- struct devx_async_event_queue *ev_queue;
+ struct devx_async_cmd_event_file *ev_file = async_data->ev_file;
+ struct devx_async_event_queue *ev_queue = &ev_file->ev_queue;
unsigned long flags;
- ev_file = container_of(fd_uobj, struct devx_async_cmd_event_file,
- uobj);
- ev_queue = &ev_file->ev_queue;
-
+ /*
+ * Note that if the struct devx_async_cmd_event_file uobj begins to be
+ * destroyed it will block at mlx5_cmd_cleanup_async_ctx() until this
+ * routine returns, ensuring that it always remains valid here.
+ */
spin_lock_irqsave(&ev_queue->lock, flags);
list_add_tail(&async_data->list, &ev_queue->event_list);
spin_unlock_irqrestore(&ev_queue->lock, flags);
wake_up_interruptible(&ev_queue->poll_wait);
- fput(fd_uobj->object);
}
#define MAX_ASYNC_BYTES_IN_USE (1024 * 1024) /* 1MB */
@@ -1747,9 +1755,8 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY)(
async_data->cmd_out_len = cmd_out_len;
async_data->mdev = mdev;
- async_data->fd_uobj = fd_uobj;
+ async_data->ev_file = ev_file;
- get_file(fd_uobj->object);
MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid);
err = mlx5_cmd_exec_cb(&ev_file->async_ctx, cmd_in,
uverbs_attr_get_len(attrs,
@@ -1759,12 +1766,10 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY)(
devx_query_callback, &async_data->cb_work);
if (err)
- goto cb_err;
+ goto free_async;
return 0;
-cb_err:
- fput(fd_uobj->object);
free_async:
kvfree(async_data);
sub_bytes:
@@ -2022,6 +2027,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT)(
goto err;
list_add_tail(&event_sub->event_list, &sub_list);
+ uverbs_uobject_get(&ev_file->uobj);
if (use_eventfd) {
event_sub->eventfd =
eventfd_ctx_fdget(redirect_fd);
@@ -2035,7 +2041,6 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT)(
event_sub->cookie = cookie;
event_sub->ev_file = ev_file;
- event_sub->filp = fd_uobj->object;
/* May be needed upon cleanup the devx object/subscription */
event_sub->xa_key_level1 = key_level1;
event_sub->xa_key_level2 = obj_id;
@@ -2089,7 +2094,7 @@ err:
if (event_sub->eventfd)
eventfd_ctx_put(event_sub->eventfd);
-
+ uverbs_uobject_put(&event_sub->ev_file->uobj);
kfree(event_sub);
}
@@ -2124,7 +2129,7 @@ static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext,
if (err)
return err;
- obj->umem = ib_umem_get(&attrs->driver_udata, addr, size, access, 0);
+ obj->umem = ib_umem_get(&dev->ib_dev, addr, size, access);
if (IS_ERR(obj->umem))
return PTR_ERR(obj->umem);
@@ -2285,7 +2290,11 @@ static u32 devx_get_obj_id_from_event(unsigned long event_type, void *data)
case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
obj_id = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
break;
+ case MLX5_EVENT_TYPE_XRQ_ERROR:
+ obj_id = be32_to_cpu(eqe->data.xrq_err.type_xrqn) & 0xffffff;
+ break;
case MLX5_EVENT_TYPE_DCT_DRAINED:
+ case MLX5_EVENT_TYPE_DCT_KEY_VIOLATION:
obj_id = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff;
break;
case MLX5_EVENT_TYPE_CQ_ERROR:
@@ -2310,7 +2319,8 @@ static int deliver_event(struct devx_event_subscription *event_sub,
if (ev_file->omit_data) {
spin_lock_irqsave(&ev_file->lock, flags);
- if (!list_empty(&event_sub->event_list)) {
+ if (!list_empty(&event_sub->event_list) ||
+ ev_file->is_destroyed) {
spin_unlock_irqrestore(&ev_file->lock, flags);
return 0;
}
@@ -2334,7 +2344,10 @@ static int deliver_event(struct devx_event_subscription *event_sub,
memcpy(event_data->hdr.out_data, data, sizeof(struct mlx5_eqe));
spin_lock_irqsave(&ev_file->lock, flags);
- list_add_tail(&event_data->list, &ev_file->event_list);
+ if (!ev_file->is_destroyed)
+ list_add_tail(&event_data->list, &ev_file->event_list);
+ else
+ kfree(event_data);
spin_unlock_irqrestore(&ev_file->lock, flags);
wake_up_interruptible(&ev_file->poll_wait);
@@ -2347,17 +2360,10 @@ static void dispatch_event_fd(struct list_head *fd_list,
struct devx_event_subscription *item;
list_for_each_entry_rcu(item, fd_list, xa_list) {
- if (!get_file_rcu(item->filp))
- continue;
-
- if (item->eventfd) {
+ if (item->eventfd)
eventfd_signal(item->eventfd, 1);
- fput(item->filp);
- continue;
- }
-
- deliver_event(item, data);
- fput(item->filp);
+ else
+ deliver_event(item, data);
}
}
@@ -2465,11 +2471,11 @@ static ssize_t devx_async_cmd_event_read(struct file *filp, char __user *buf,
return -ERESTARTSYS;
}
- if (list_empty(&ev_queue->event_list) &&
- ev_queue->is_destroyed)
- return -EIO;
-
spin_lock_irq(&ev_queue->lock);
+ if (ev_queue->is_destroyed) {
+ spin_unlock_irq(&ev_queue->lock);
+ return -EIO;
+ }
}
event = list_entry(ev_queue->event_list.next,
@@ -2495,23 +2501,6 @@ static ssize_t devx_async_cmd_event_read(struct file *filp, char __user *buf,
return ret;
}
-static int devx_async_cmd_event_close(struct inode *inode, struct file *filp)
-{
- struct ib_uobject *uobj = filp->private_data;
- struct devx_async_cmd_event_file *comp_ev_file = container_of(
- uobj, struct devx_async_cmd_event_file, uobj);
- struct devx_async_data *entry, *tmp;
-
- spin_lock_irq(&comp_ev_file->ev_queue.lock);
- list_for_each_entry_safe(entry, tmp,
- &comp_ev_file->ev_queue.event_list, list)
- kvfree(entry);
- spin_unlock_irq(&comp_ev_file->ev_queue.lock);
-
- uverbs_close_fd(filp);
- return 0;
-}
-
static __poll_t devx_async_cmd_event_poll(struct file *filp,
struct poll_table_struct *wait)
{
@@ -2535,7 +2524,7 @@ static const struct file_operations devx_async_cmd_event_fops = {
.owner = THIS_MODULE,
.read = devx_async_cmd_event_read,
.poll = devx_async_cmd_event_poll,
- .release = devx_async_cmd_event_close,
+ .release = uverbs_uobject_fd_release,
.llseek = no_llseek,
};
@@ -2560,10 +2549,6 @@ static ssize_t devx_async_event_read(struct file *filp, char __user *buf,
return -EOVERFLOW;
}
- if (ev_file->is_destroyed) {
- spin_unlock_irq(&ev_file->lock);
- return -EIO;
- }
while (list_empty(&ev_file->event_list)) {
spin_unlock_irq(&ev_file->lock);
@@ -2639,81 +2624,96 @@ static __poll_t devx_async_event_poll(struct file *filp,
return pollflags;
}
-static int devx_async_event_close(struct inode *inode, struct file *filp)
+static void devx_free_subscription(struct rcu_head *rcu)
{
- struct devx_async_event_file *ev_file = filp->private_data;
- struct devx_event_subscription *event_sub, *event_sub_tmp;
- struct devx_async_event_data *entry, *tmp;
- struct mlx5_ib_dev *dev = ev_file->dev;
+ struct devx_event_subscription *event_sub =
+ container_of(rcu, struct devx_event_subscription, rcu);
- mutex_lock(&dev->devx_event_table.event_xa_lock);
- /* delete the subscriptions which are related to this FD */
- list_for_each_entry_safe(event_sub, event_sub_tmp,
- &ev_file->subscribed_events_list, file_list) {
- devx_cleanup_subscription(dev, event_sub);
- if (event_sub->eventfd)
- eventfd_ctx_put(event_sub->eventfd);
-
- list_del_rcu(&event_sub->file_list);
- /* subscription may not be used by the read API any more */
- kfree_rcu(event_sub, rcu);
- }
-
- mutex_unlock(&dev->devx_event_table.event_xa_lock);
-
- /* free the pending events allocation */
- if (!ev_file->omit_data) {
- spin_lock_irq(&ev_file->lock);
- list_for_each_entry_safe(entry, tmp,
- &ev_file->event_list, list)
- kfree(entry); /* read can't come any more */
- spin_unlock_irq(&ev_file->lock);
- }
-
- uverbs_close_fd(filp);
- put_device(&dev->ib_dev.dev);
- return 0;
+ if (event_sub->eventfd)
+ eventfd_ctx_put(event_sub->eventfd);
+ uverbs_uobject_put(&event_sub->ev_file->uobj);
+ kfree(event_sub);
}
static const struct file_operations devx_async_event_fops = {
.owner = THIS_MODULE,
.read = devx_async_event_read,
.poll = devx_async_event_poll,
- .release = devx_async_event_close,
+ .release = uverbs_uobject_fd_release,
.llseek = no_llseek,
};
-static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj,
- enum rdma_remove_reason why)
+static int devx_async_cmd_event_destroy_uobj(struct ib_uobject *uobj,
+ enum rdma_remove_reason why)
{
struct devx_async_cmd_event_file *comp_ev_file =
container_of(uobj, struct devx_async_cmd_event_file,
uobj);
struct devx_async_event_queue *ev_queue = &comp_ev_file->ev_queue;
+ struct devx_async_data *entry, *tmp;
spin_lock_irq(&ev_queue->lock);
ev_queue->is_destroyed = 1;
spin_unlock_irq(&ev_queue->lock);
-
- if (why == RDMA_REMOVE_DRIVER_REMOVE)
- wake_up_interruptible(&ev_queue->poll_wait);
+ wake_up_interruptible(&ev_queue->poll_wait);
mlx5_cmd_cleanup_async_ctx(&comp_ev_file->async_ctx);
+
+ spin_lock_irq(&comp_ev_file->ev_queue.lock);
+ list_for_each_entry_safe(entry, tmp,
+ &comp_ev_file->ev_queue.event_list, list) {
+ list_del(&entry->list);
+ kvfree(entry);
+ }
+ spin_unlock_irq(&comp_ev_file->ev_queue.lock);
return 0;
};
-static int devx_hot_unplug_async_event_file(struct ib_uobject *uobj,
- enum rdma_remove_reason why)
+static int devx_async_event_destroy_uobj(struct ib_uobject *uobj,
+ enum rdma_remove_reason why)
{
struct devx_async_event_file *ev_file =
container_of(uobj, struct devx_async_event_file,
uobj);
+ struct devx_event_subscription *event_sub, *event_sub_tmp;
+ struct mlx5_ib_dev *dev = ev_file->dev;
spin_lock_irq(&ev_file->lock);
ev_file->is_destroyed = 1;
- spin_unlock_irq(&ev_file->lock);
+ /* free the pending events allocation */
+ if (ev_file->omit_data) {
+ struct devx_event_subscription *event_sub, *tmp;
+
+ list_for_each_entry_safe(event_sub, tmp, &ev_file->event_list,
+ event_list)
+ list_del_init(&event_sub->event_list);
+
+ } else {
+ struct devx_async_event_data *entry, *tmp;
+
+ list_for_each_entry_safe(entry, tmp, &ev_file->event_list,
+ list) {
+ list_del(&entry->list);
+ kfree(entry);
+ }
+ }
+
+ spin_unlock_irq(&ev_file->lock);
wake_up_interruptible(&ev_file->poll_wait);
+
+ mutex_lock(&dev->devx_event_table.event_xa_lock);
+ /* delete the subscriptions which are related to this FD */
+ list_for_each_entry_safe(event_sub, event_sub_tmp,
+ &ev_file->subscribed_events_list, file_list) {
+ devx_cleanup_subscription(dev, event_sub);
+ list_del_rcu(&event_sub->file_list);
+ /* subscription may not be used by the read API any more */
+ call_rcu(&event_sub->rcu, devx_free_subscription);
+ }
+ mutex_unlock(&dev->devx_event_table.event_xa_lock);
+
+ put_device(&dev->ib_dev.dev);
return 0;
};
@@ -2899,7 +2899,7 @@ DECLARE_UVERBS_NAMED_METHOD(
DECLARE_UVERBS_NAMED_OBJECT(
MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD,
UVERBS_TYPE_ALLOC_FD(sizeof(struct devx_async_cmd_event_file),
- devx_hot_unplug_async_cmd_event_file,
+ devx_async_cmd_event_destroy_uobj,
&devx_async_cmd_event_fops, "[devx_async_cmd]",
O_RDONLY),
&UVERBS_METHOD(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC));
@@ -2917,7 +2917,7 @@ DECLARE_UVERBS_NAMED_METHOD(
DECLARE_UVERBS_NAMED_OBJECT(
MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD,
UVERBS_TYPE_ALLOC_FD(sizeof(struct devx_async_event_file),
- devx_hot_unplug_async_event_file,
+ devx_async_event_destroy_uobj,
&devx_async_event_fops, "[devx_async_event]",
O_RDONLY),
&UVERBS_METHOD(MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC));
diff --git a/drivers/infiniband/hw/mlx5/doorbell.c b/drivers/infiniband/hw/mlx5/doorbell.c
index 8f4e5f22b84c..61475b571531 100644
--- a/drivers/infiniband/hw/mlx5/doorbell.c
+++ b/drivers/infiniband/hw/mlx5/doorbell.c
@@ -64,7 +64,8 @@ int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context,
page->user_virt = (virt & PAGE_MASK);
page->refcnt = 0;
- page->umem = ib_umem_get(udata, virt & PAGE_MASK, PAGE_SIZE, 0, 0);
+ page->umem = ib_umem_get(context->ibucontext.device, virt & PAGE_MASK,
+ PAGE_SIZE, 0);
if (IS_ERR(page->umem)) {
err = PTR_ERR(page->umem);
kfree(page);
diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c
index b8841355fcd5..dbee17d22d50 100644
--- a/drivers/infiniband/hw/mlx5/flow.c
+++ b/drivers/infiniband/hw/mlx5/flow.c
@@ -32,6 +32,9 @@ mlx5_ib_ft_type_to_namespace(enum mlx5_ib_uapi_flow_table_type table_type,
case MLX5_IB_UAPI_FLOW_TABLE_TYPE_FDB:
*namespace = MLX5_FLOW_NAMESPACE_FDB;
break;
+ case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_RX:
+ *namespace = MLX5_FLOW_NAMESPACE_RDMA_RX;
+ break;
default:
return -EINVAL;
}
@@ -82,6 +85,8 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
struct mlx5_ib_dev *dev = mlx5_udata_to_mdev(&attrs->driver_udata);
int len, ret, i;
u32 counter_id = 0;
+ u32 *offset_attr;
+ u32 offset = 0;
if (!capable(CAP_NET_RAW))
return -EPERM;
@@ -101,6 +106,11 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB && !dest_devx)
return -EINVAL;
+ /* Allow only DEVX object or QP as dest when inserting to RDMA_RX */
+ if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) &&
+ ((!dest_devx && !dest_qp) || (dest_devx && dest_qp)))
+ return -EINVAL;
+
if (dest_devx) {
devx_obj = uverbs_attr_get_obj(
attrs, MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX);
@@ -112,8 +122,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
*/
if (!mlx5_ib_devx_is_flow_dest(devx_obj, &dest_id, &dest_type))
return -EINVAL;
- /* Allow only flow table as dest when inserting to FDB */
- if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB &&
+ /* Allow only flow table as dest when inserting to FDB or RDMA_RX */
+ if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB ||
+ fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) &&
dest_type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE)
return -EINVAL;
} else if (dest_qp) {
@@ -142,8 +153,27 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
if (len) {
devx_obj = arr_flow_actions[0]->object;
- if (!mlx5_ib_devx_is_flow_counter(devx_obj, &counter_id))
+ if (uverbs_attr_is_valid(attrs,
+ MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET)) {
+
+ int num_offsets = uverbs_attr_ptr_get_array_size(
+ attrs,
+ MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET,
+ sizeof(u32));
+
+ if (num_offsets != 1)
+ return -EINVAL;
+
+ offset_attr = uverbs_attr_get_alloced_ptr(
+ attrs,
+ MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET);
+ offset = *offset_attr;
+ }
+
+ if (!mlx5_ib_devx_is_flow_counter(devx_obj, offset,
+ &counter_id))
return -EINVAL;
+
flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
}
@@ -322,11 +352,11 @@ void mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction)
switch (maction->flow_action_raw.sub_type) {
case MLX5_IB_FLOW_ACTION_MODIFY_HEADER:
mlx5_modify_header_dealloc(maction->flow_action_raw.dev->mdev,
- maction->flow_action_raw.action_id);
+ maction->flow_action_raw.modify_hdr);
break;
case MLX5_IB_FLOW_ACTION_PACKET_REFORMAT:
mlx5_packet_reformat_dealloc(maction->flow_action_raw.dev->mdev,
- maction->flow_action_raw.action_id);
+ maction->flow_action_raw.pkt_reformat);
break;
case MLX5_IB_FLOW_ACTION_DECAP:
break;
@@ -352,10 +382,11 @@ mlx5_ib_create_modify_header(struct mlx5_ib_dev *dev,
if (!maction)
return ERR_PTR(-ENOMEM);
- ret = mlx5_modify_header_alloc(dev->mdev, namespace, num_actions, in,
- &maction->flow_action_raw.action_id);
+ maction->flow_action_raw.modify_hdr =
+ mlx5_modify_header_alloc(dev->mdev, namespace, num_actions, in);
- if (ret) {
+ if (IS_ERR(maction->flow_action_raw.modify_hdr)) {
+ ret = PTR_ERR(maction->flow_action_raw.modify_hdr);
kfree(maction);
return ERR_PTR(ret);
}
@@ -479,11 +510,13 @@ static int mlx5_ib_flow_action_create_packet_reformat_ctx(
if (ret)
return ret;
- ret = mlx5_packet_reformat_alloc(dev->mdev, prm_prt, len,
- in, namespace,
- &maction->flow_action_raw.action_id);
- if (ret)
+ maction->flow_action_raw.pkt_reformat =
+ mlx5_packet_reformat_alloc(dev->mdev, prm_prt, len,
+ in, namespace);
+ if (IS_ERR(maction->flow_action_raw.pkt_reformat)) {
+ ret = PTR_ERR(maction->flow_action_raw.pkt_reformat);
return ret;
+ }
maction->flow_action_raw.sub_type =
MLX5_IB_FLOW_ACTION_PACKET_REFORMAT;
@@ -586,7 +619,11 @@ DECLARE_UVERBS_NAMED_METHOD(
UVERBS_ATTR_IDRS_ARR(MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX,
MLX5_IB_OBJECT_DEVX_OBJ,
UVERBS_ACCESS_READ, 1, 1,
- UA_OPTIONAL));
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET,
+ UVERBS_ATTR_MIN_SIZE(sizeof(u32)),
+ UA_OPTIONAL,
+ UA_ALLOC_AND_COPY));
DECLARE_UVERBS_NAMED_METHOD_DESTROY(
MLX5_IB_METHOD_DESTROY_FLOW,
diff --git a/drivers/infiniband/hw/mlx5/gsi.c b/drivers/infiniband/hw/mlx5/gsi.c
index 4950df3f71b6..1ae6fd95acaa 100644
--- a/drivers/infiniband/hw/mlx5/gsi.c
+++ b/drivers/infiniband/hw/mlx5/gsi.c
@@ -263,7 +263,7 @@ static struct ib_qp *create_gsi_ud_qp(struct mlx5_ib_gsi_qp *gsi)
},
.sq_sig_type = gsi->sq_sig_type,
.qp_type = IB_QPT_UD,
- .create_flags = mlx5_ib_create_qp_sqpn_qp1(),
+ .create_flags = MLX5_IB_QP_CREATE_SQPN_QP1,
};
return ib_create_qp(pd, &init_attr);
@@ -507,8 +507,7 @@ int mlx5_ib_gsi_post_send(struct ib_qp *qp, const struct ib_send_wr *wr,
ret = ib_post_send(tx_qp, &cur_wr.wr, bad_wr);
if (ret) {
/* Undo the effect of adding the outstanding wr */
- gsi->outstanding_pi = (gsi->outstanding_pi - 1) %
- gsi->cap.max_send_wr;
+ gsi->outstanding_pi--;
goto err;
}
spin_unlock_irqrestore(&gsi->lock, flags);
diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c
index 74ce9249e75a..5c3d052ac30b 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.c
+++ b/drivers/infiniband/hw/mlx5/ib_rep.c
@@ -35,7 +35,7 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
int vport_index;
if (rep->vport == MLX5_VPORT_UPLINK)
- profile = &uplink_rep_profile;
+ profile = &raw_eth_profile;
else
return mlx5_ib_set_vport_rep(dev, rep);
diff --git a/drivers/infiniband/hw/mlx5/ib_rep.h b/drivers/infiniband/hw/mlx5/ib_rep.h
index de43b423bafc..3b6750cba796 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.h
+++ b/drivers/infiniband/hw/mlx5/ib_rep.h
@@ -10,7 +10,7 @@
#include "mlx5_ib.h"
#ifdef CONFIG_MLX5_ESWITCH
-extern const struct mlx5_ib_profile uplink_rep_profile;
+extern const struct mlx5_ib_profile raw_eth_profile;
u8 mlx5_ib_eswitch_mode(struct mlx5_eswitch *esw);
struct mlx5_ib_dev *mlx5_ib_get_rep_ibdev(struct mlx5_eswitch *esw,
diff --git a/drivers/infiniband/hw/mlx5/ib_virt.c b/drivers/infiniband/hw/mlx5/ib_virt.c
index 649a3364f838..b61165359954 100644
--- a/drivers/infiniband/hw/mlx5/ib_virt.c
+++ b/drivers/infiniband/hw/mlx5/ib_virt.c
@@ -164,8 +164,10 @@ static int set_vf_node_guid(struct ib_device *device, int vf, u8 port, u64 guid)
in->field_select = MLX5_HCA_VPORT_SEL_NODE_GUID;
in->node_guid = guid;
err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in);
- if (!err)
+ if (!err) {
vfs_ctx[vf].node_guid = guid;
+ vfs_ctx[vf].node_guid_valid = 1;
+ }
kfree(in);
return err;
}
@@ -185,8 +187,10 @@ static int set_vf_port_guid(struct ib_device *device, int vf, u8 port, u64 guid)
in->field_select = MLX5_HCA_VPORT_SEL_PORT_GUID;
in->port_guid = guid;
err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in);
- if (!err)
+ if (!err) {
vfs_ctx[vf].port_guid = guid;
+ vfs_ctx[vf].port_guid_valid = 1;
+ }
kfree(in);
return err;
}
@@ -201,3 +205,19 @@ int mlx5_ib_set_vf_guid(struct ib_device *device, int vf, u8 port,
return -EINVAL;
}
+
+int mlx5_ib_get_vf_guid(struct ib_device *device, int vf, u8 port,
+ struct ifla_vf_guid *node_guid,
+ struct ifla_vf_guid *port_guid)
+{
+ struct mlx5_ib_dev *dev = to_mdev(device);
+ struct mlx5_core_dev *mdev = dev->mdev;
+ struct mlx5_vf_context *vfs_ctx = mdev->priv.sriov.vfs_ctx;
+
+ node_guid->guid =
+ vfs_ctx[vf].node_guid_valid ? vfs_ctx[vf].node_guid : 0;
+ port_guid->guid =
+ vfs_ctx[vf].port_guid_valid ? vfs_ctx[vf].port_guid : 0;
+
+ return 0;
+}
diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c
index 348c1df69cdc..14e0c17de6a9 100644
--- a/drivers/infiniband/hw/mlx5/mad.c
+++ b/drivers/infiniband/hw/mlx5/mad.c
@@ -74,58 +74,6 @@ static int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey,
port);
}
-static int process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
- const struct ib_wc *in_wc, const struct ib_grh *in_grh,
- const struct ib_mad *in_mad, struct ib_mad *out_mad)
-{
- u16 slid;
- int err;
-
- slid = in_wc ? ib_lid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE);
-
- if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0)
- return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
-
- if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
- in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
- if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET &&
- in_mad->mad_hdr.method != IB_MGMT_METHOD_SET &&
- in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS)
- return IB_MAD_RESULT_SUCCESS;
-
- /* Don't process SMInfo queries -- the SMA can't handle them.
- */
- if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO)
- return IB_MAD_RESULT_SUCCESS;
- } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT ||
- in_mad->mad_hdr.mgmt_class == MLX5_IB_VENDOR_CLASS1 ||
- in_mad->mad_hdr.mgmt_class == MLX5_IB_VENDOR_CLASS2 ||
- in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CONG_MGMT) {
- if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET &&
- in_mad->mad_hdr.method != IB_MGMT_METHOD_SET)
- return IB_MAD_RESULT_SUCCESS;
- } else {
- return IB_MAD_RESULT_SUCCESS;
- }
-
- err = mlx5_MAD_IFC(to_mdev(ibdev),
- mad_flags & IB_MAD_IGNORE_MKEY,
- mad_flags & IB_MAD_IGNORE_BKEY,
- port_num, in_wc, in_grh, in_mad, out_mad);
- if (err)
- return IB_MAD_RESULT_FAILURE;
-
- /* set return bit in status of directed route responses */
- if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
- out_mad->mad_hdr.status |= cpu_to_be16(1 << 15);
-
- if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS)
- /* no response for trap repress */
- return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
-
- return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
-}
-
static void pma_cnt_ext_assign(struct ib_pma_portcounters_ext *pma_cnt_ext,
void *out)
{
@@ -271,30 +219,66 @@ done:
int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
const struct ib_wc *in_wc, const struct ib_grh *in_grh,
- const struct ib_mad_hdr *in, size_t in_mad_size,
- struct ib_mad_hdr *out, size_t *out_mad_size,
- u16 *out_mad_pkey_index)
+ const struct ib_mad *in, struct ib_mad *out,
+ size_t *out_mad_size, u16 *out_mad_pkey_index)
{
struct mlx5_ib_dev *dev = to_mdev(ibdev);
- const struct ib_mad *in_mad = (const struct ib_mad *)in;
- struct ib_mad *out_mad = (struct ib_mad *)out;
- int ret;
+ u8 mgmt_class = in->mad_hdr.mgmt_class;
+ u8 method = in->mad_hdr.method;
+ u16 slid;
+ int err;
- if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) ||
- *out_mad_size != sizeof(*out_mad)))
- return IB_MAD_RESULT_FAILURE;
+ slid = in_wc ? ib_lid_cpu16(in_wc->slid) :
+ be16_to_cpu(IB_LID_PERMISSIVE);
- memset(out_mad->data, 0, sizeof(out_mad->data));
+ if (method == IB_MGMT_METHOD_TRAP && !slid)
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
- if (MLX5_CAP_GEN(dev->mdev, vport_counters) &&
- in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT &&
- in_mad->mad_hdr.method == IB_MGMT_METHOD_GET) {
- ret = process_pma_cmd(dev, port_num, in_mad, out_mad);
- } else {
- ret = process_mad(ibdev, mad_flags, port_num, in_wc, in_grh,
- in_mad, out_mad);
+ switch (mgmt_class) {
+ case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+ case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: {
+ if (method != IB_MGMT_METHOD_GET &&
+ method != IB_MGMT_METHOD_SET &&
+ method != IB_MGMT_METHOD_TRAP_REPRESS)
+ return IB_MAD_RESULT_SUCCESS;
+
+ /* Don't process SMInfo queries -- the SMA can't handle them.
+ */
+ if (in->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO)
+ return IB_MAD_RESULT_SUCCESS;
+ } break;
+ case IB_MGMT_CLASS_PERF_MGMT:
+ if (MLX5_CAP_GEN(dev->mdev, vport_counters) &&
+ method == IB_MGMT_METHOD_GET)
+ return process_pma_cmd(dev, port_num, in, out);
+ /* fallthrough */
+ case MLX5_IB_VENDOR_CLASS1:
+ /* fallthrough */
+ case MLX5_IB_VENDOR_CLASS2:
+ case IB_MGMT_CLASS_CONG_MGMT: {
+ if (method != IB_MGMT_METHOD_GET &&
+ method != IB_MGMT_METHOD_SET)
+ return IB_MAD_RESULT_SUCCESS;
+ } break;
+ default:
+ return IB_MAD_RESULT_SUCCESS;
}
- return ret;
+
+ err = mlx5_MAD_IFC(to_mdev(ibdev), mad_flags & IB_MAD_IGNORE_MKEY,
+ mad_flags & IB_MAD_IGNORE_BKEY, port_num, in_wc,
+ in_grh, in, out);
+ if (err)
+ return IB_MAD_RESULT_FAILURE;
+
+ /* set return bit in status of directed route responses */
+ if (mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ out->mad_hdr.status |= cpu_to_be16(1 << 15);
+
+ if (method == IB_MGMT_METHOD_TRAP_REPRESS)
+ /* no response for trap repress */
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
}
int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port)
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index e12a4404096b..e4bcfa81b70a 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -40,7 +40,7 @@
#include <linux/slab.h>
#include <linux/bitmap.h>
#if defined(CONFIG_X86)
-#include <asm/pat.h>
+#include <asm/memtype.h>
#endif
#include <linux/sched.h>
#include <linux/sched/mm.h>
@@ -67,6 +67,7 @@
#include <rdma/uverbs_std_types.h>
#include <rdma/mlx5_user_ioctl_verbs.h>
#include <rdma/mlx5_user_ioctl_cmds.h>
+#include <rdma/ib_umem_odp.h>
#define UVERBS_MODULE_NAME mlx5_ib
#include <rdma/uverbs_named_ioctl.h>
@@ -535,7 +536,7 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
props->max_msg_sz = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg);
props->pkey_tbl_len = 1;
props->state = IB_PORT_DOWN;
- props->phys_state = 3;
+ props->phys_state = IB_PORT_PHYS_STATE_DISABLED;
mlx5_query_nic_vport_qkey_viol_cntr(mdev, &qkey_viol_cntr);
props->qkey_viol_cntr = qkey_viol_cntr;
@@ -561,7 +562,7 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
if (netif_running(ndev) && netif_carrier_ok(ndev)) {
props->state = IB_PORT_ACTIVE;
- props->phys_state = 5;
+ props->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
}
ndev_ib_mtu = iboe_get_mtu(ndev->mtu);
@@ -693,21 +694,6 @@ static void get_atomic_caps_qp(struct mlx5_ib_dev *dev,
get_atomic_caps(dev, atomic_size_qp, props);
}
-static void get_atomic_caps_dc(struct mlx5_ib_dev *dev,
- struct ib_device_attr *props)
-{
- u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc);
-
- get_atomic_caps(dev, atomic_size_qp, props);
-}
-
-bool mlx5_ib_dc_atomic_is_supported(struct mlx5_ib_dev *dev)
-{
- struct ib_device_attr props = {};
-
- get_atomic_caps_dc(dev, &props);
- return (props.atomic_cap == IB_ATOMIC_HCA) ? true : false;
-}
static int mlx5_query_system_image_guid(struct ib_device *ibdev,
__be64 *sys_image_guid)
{
@@ -829,6 +815,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
struct ib_device_attr *props,
struct ib_udata *uhw)
{
+ size_t uhw_outlen = (uhw) ? uhw->outlen : 0;
struct mlx5_ib_dev *dev = to_mdev(ibdev);
struct mlx5_core_dev *mdev = dev->mdev;
int err = -ENOMEM;
@@ -842,12 +829,12 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
u64 max_tso;
resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length);
- if (uhw->outlen && uhw->outlen < resp_len)
+ if (uhw_outlen && uhw_outlen < resp_len)
return -EINVAL;
- else
- resp.response_length = resp_len;
- if (uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen))
+ resp.response_length = resp_len;
+
+ if (uhw && uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen))
return -EINVAL;
memset(props, 0, sizeof(*props));
@@ -911,7 +898,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
props->raw_packet_caps |=
IB_RAW_PACKET_CAP_CVLAN_STRIPPING;
- if (field_avail(typeof(resp), tso_caps, uhw->outlen)) {
+ if (field_avail(typeof(resp), tso_caps, uhw_outlen)) {
max_tso = MLX5_CAP_ETH(mdev, max_lso_cap);
if (max_tso) {
resp.tso_caps.max_tso = 1 << max_tso;
@@ -921,7 +908,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
}
}
- if (field_avail(typeof(resp), rss_caps, uhw->outlen)) {
+ if (field_avail(typeof(resp), rss_caps, uhw_outlen)) {
resp.rss_caps.rx_hash_function =
MLX5_RX_HASH_FUNC_TOEPLITZ;
resp.rss_caps.rx_hash_fields_mask =
@@ -941,9 +928,9 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
resp.response_length += sizeof(resp.rss_caps);
}
} else {
- if (field_avail(typeof(resp), tso_caps, uhw->outlen))
+ if (field_avail(typeof(resp), tso_caps, uhw_outlen))
resp.response_length += sizeof(resp.tso_caps);
- if (field_avail(typeof(resp), rss_caps, uhw->outlen))
+ if (field_avail(typeof(resp), rss_caps, uhw_outlen))
resp.response_length += sizeof(resp.rss_caps);
}
@@ -1011,6 +998,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size);
props->max_pi_fast_reg_page_list_len =
props->max_fast_reg_page_list_len / 2;
+ props->max_sgl_rd =
+ MLX5_CAP_GEN(mdev, max_sgl_for_optimized_performance);
get_atomic_caps_qp(dev, props);
props->masked_atomic_cap = IB_ATOMIC_NONE;
props->max_mcast_grp = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
@@ -1023,15 +1012,32 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;
if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
- if (MLX5_CAP_GEN(mdev, pg))
+ if (dev->odp_caps.general_caps & IB_ODP_SUPPORT)
props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
props->odp_caps = dev->odp_caps;
+ if (!uhw) {
+ /* ODP for kernel QPs is not implemented for receive
+ * WQEs and SRQ WQEs
+ */
+ props->odp_caps.per_transport_caps.rc_odp_caps &=
+ ~(IB_ODP_SUPPORT_READ |
+ IB_ODP_SUPPORT_SRQ_RECV);
+ props->odp_caps.per_transport_caps.uc_odp_caps &=
+ ~(IB_ODP_SUPPORT_READ |
+ IB_ODP_SUPPORT_SRQ_RECV);
+ props->odp_caps.per_transport_caps.ud_odp_caps &=
+ ~(IB_ODP_SUPPORT_READ |
+ IB_ODP_SUPPORT_SRQ_RECV);
+ props->odp_caps.per_transport_caps.xrc_odp_caps &=
+ ~(IB_ODP_SUPPORT_READ |
+ IB_ODP_SUPPORT_SRQ_RECV);
+ }
}
if (MLX5_CAP_GEN(mdev, cd))
props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL;
- if (!mlx5_core_is_pf(mdev))
+ if (mlx5_core_is_vf(mdev))
props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION;
if (mlx5_ib_port_link_layer(ibdev, 1) ==
@@ -1066,7 +1072,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
MLX5_MAX_CQ_PERIOD;
}
- if (field_avail(typeof(resp), cqe_comp_caps, uhw->outlen)) {
+ if (field_avail(typeof(resp), cqe_comp_caps, uhw_outlen)) {
resp.response_length += sizeof(resp.cqe_comp_caps);
if (MLX5_CAP_GEN(dev->mdev, cqe_compression)) {
@@ -1084,7 +1090,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
}
}
- if (field_avail(typeof(resp), packet_pacing_caps, uhw->outlen) &&
+ if (field_avail(typeof(resp), packet_pacing_caps, uhw_outlen) &&
raw_support) {
if (MLX5_CAP_QOS(mdev, packet_pacing) &&
MLX5_CAP_GEN(mdev, qos)) {
@@ -1103,7 +1109,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
}
if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes,
- uhw->outlen)) {
+ uhw_outlen)) {
if (MLX5_CAP_ETH(mdev, multi_pkt_send_wqe))
resp.mlx5_ib_support_multi_pkt_send_wqes =
MLX5_IB_ALLOW_MPW;
@@ -1116,7 +1122,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes);
}
- if (field_avail(typeof(resp), flags, uhw->outlen)) {
+ if (field_avail(typeof(resp), flags, uhw_outlen)) {
resp.response_length += sizeof(resp.flags);
if (MLX5_CAP_GEN(mdev, cqe_compression_128))
@@ -1132,8 +1138,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT;
}
- if (field_avail(typeof(resp), sw_parsing_caps,
- uhw->outlen)) {
+ if (field_avail(typeof(resp), sw_parsing_caps, uhw_outlen)) {
resp.response_length += sizeof(resp.sw_parsing_caps);
if (MLX5_CAP_ETH(mdev, swp)) {
resp.sw_parsing_caps.sw_parsing_offloads |=
@@ -1153,7 +1158,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
}
}
- if (field_avail(typeof(resp), striding_rq_caps, uhw->outlen) &&
+ if (field_avail(typeof(resp), striding_rq_caps, uhw_outlen) &&
raw_support) {
resp.response_length += sizeof(resp.striding_rq_caps);
if (MLX5_CAP_GEN(mdev, striding_rq)) {
@@ -1161,8 +1166,14 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES;
resp.striding_rq_caps.max_single_stride_log_num_of_bytes =
MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES;
- resp.striding_rq_caps.min_single_wqe_log_num_of_strides =
- MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES;
+ if (MLX5_CAP_GEN(dev->mdev, ext_stride_num_range))
+ resp.striding_rq_caps
+ .min_single_wqe_log_num_of_strides =
+ MLX5_EXT_MIN_SINGLE_WQE_LOG_NUM_STRIDES;
+ else
+ resp.striding_rq_caps
+ .min_single_wqe_log_num_of_strides =
+ MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES;
resp.striding_rq_caps.max_single_wqe_log_num_of_strides =
MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES;
resp.striding_rq_caps.supported_qpts =
@@ -1170,8 +1181,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
}
}
- if (field_avail(typeof(resp), tunnel_offloads_caps,
- uhw->outlen)) {
+ if (field_avail(typeof(resp), tunnel_offloads_caps, uhw_outlen)) {
resp.response_length += sizeof(resp.tunnel_offloads_caps);
if (MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan))
resp.tunnel_offloads_caps |=
@@ -1192,7 +1202,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
MLX5_IB_TUNNELED_OFFLOADS_MPLS_UDP;
}
- if (uhw->outlen) {
+ if (uhw_outlen) {
err = ib_copy_to_udata(uhw, &resp, resp.response_length);
if (err)
@@ -1808,7 +1818,7 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
return -EINVAL;
resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
- if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf))
+ if (dev->wc_support)
resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size);
resp.cache_line_size = cache_line_size();
resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq);
@@ -1867,10 +1877,6 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
if (err)
goto out_sys_pages;
- if (ibdev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)
- context->ibucontext.invalidate_range =
- &mlx5_ib_invalidate_range;
-
if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) {
err = mlx5_ib_devx_create(dev, true);
if (err < 0)
@@ -1999,11 +2005,6 @@ static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
struct mlx5_bfreg_info *bfregi;
- /* All umem's must be destroyed before destroying the ucontext. */
- mutex_lock(&ibcontext->per_mm_list_lock);
- WARN_ON(!list_empty(&ibcontext->per_mm_list));
- mutex_unlock(&ibcontext->per_mm_list_lock);
-
bfregi = &context->bfregi;
mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
@@ -2089,6 +2090,31 @@ static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev,
virt_to_page(dev->mdev->clock_info));
}
+static void mlx5_ib_mmap_free(struct rdma_user_mmap_entry *entry)
+{
+ struct mlx5_user_mmap_entry *mentry = to_mmmap(entry);
+ struct mlx5_ib_dev *dev = to_mdev(entry->ucontext->device);
+ struct mlx5_var_table *var_table = &dev->var_table;
+ struct mlx5_ib_dm *mdm;
+
+ switch (mentry->mmap_flag) {
+ case MLX5_IB_MMAP_TYPE_MEMIC:
+ mdm = container_of(mentry, struct mlx5_ib_dm, mentry);
+ mlx5_cmd_dealloc_memic(&dev->dm, mdm->dev_addr,
+ mdm->size);
+ kfree(mdm);
+ break;
+ case MLX5_IB_MMAP_TYPE_VAR:
+ mutex_lock(&var_table->bitmap_lock);
+ clear_bit(mentry->page_idx, var_table->bitmap);
+ mutex_unlock(&var_table->bitmap_lock);
+ kfree(mentry);
+ break;
+ default:
+ WARN_ON(true);
+ }
+}
+
static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
struct vm_area_struct *vma,
struct mlx5_ib_ucontext *context)
@@ -2177,7 +2203,7 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn);
err = rdma_user_mmap_io(&context->ibucontext, vma, pfn, PAGE_SIZE,
- prot);
+ prot, NULL);
if (err) {
mlx5_ib_err(dev,
"rdma_user_mmap_io failed with error=%d, mmap_cmd=%s\n",
@@ -2201,25 +2227,67 @@ free_bfreg:
return err;
}
-static int dm_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+static int add_dm_mmap_entry(struct ib_ucontext *context,
+ struct mlx5_ib_dm *mdm,
+ u64 address)
+{
+ mdm->mentry.mmap_flag = MLX5_IB_MMAP_TYPE_MEMIC;
+ mdm->mentry.address = address;
+ return rdma_user_mmap_entry_insert_range(
+ context, &mdm->mentry.rdma_entry,
+ mdm->size,
+ MLX5_IB_MMAP_DEVICE_MEM << 16,
+ (MLX5_IB_MMAP_DEVICE_MEM << 16) + (1UL << 16) - 1);
+}
+
+static unsigned long mlx5_vma_to_pgoff(struct vm_area_struct *vma)
+{
+ unsigned long idx;
+ u8 command;
+
+ command = get_command(vma->vm_pgoff);
+ idx = get_extended_index(vma->vm_pgoff);
+
+ return (command << 16 | idx);
+}
+
+static int mlx5_ib_mmap_offset(struct mlx5_ib_dev *dev,
+ struct vm_area_struct *vma,
+ struct ib_ucontext *ucontext)
{
- struct mlx5_ib_ucontext *mctx = to_mucontext(context);
- struct mlx5_ib_dev *dev = to_mdev(context->device);
- u16 page_idx = get_extended_index(vma->vm_pgoff);
- size_t map_size = vma->vm_end - vma->vm_start;
- u32 npages = map_size >> PAGE_SHIFT;
+ struct mlx5_user_mmap_entry *mentry;
+ struct rdma_user_mmap_entry *entry;
+ unsigned long pgoff;
+ pgprot_t prot;
phys_addr_t pfn;
+ int ret;
- if (find_next_zero_bit(mctx->dm_pages, page_idx + npages, page_idx) !=
- page_idx + npages)
+ pgoff = mlx5_vma_to_pgoff(vma);
+ entry = rdma_user_mmap_entry_get_pgoff(ucontext, pgoff);
+ if (!entry)
return -EINVAL;
- pfn = ((dev->mdev->bar_addr +
- MLX5_CAP64_DEV_MEM(dev->mdev, memic_bar_start_addr)) >>
- PAGE_SHIFT) +
- page_idx;
- return rdma_user_mmap_io(context, vma, pfn, map_size,
- pgprot_writecombine(vma->vm_page_prot));
+ mentry = to_mmmap(entry);
+ pfn = (mentry->address >> PAGE_SHIFT);
+ if (mentry->mmap_flag == MLX5_IB_MMAP_TYPE_VAR)
+ prot = pgprot_noncached(vma->vm_page_prot);
+ else
+ prot = pgprot_writecombine(vma->vm_page_prot);
+ ret = rdma_user_mmap_io(ucontext, vma, pfn,
+ entry->npages * PAGE_SIZE,
+ prot,
+ entry);
+ rdma_user_mmap_entry_put(&mentry->rdma_entry);
+ return ret;
+}
+
+static u64 mlx5_entry_to_mmap_offset(struct mlx5_user_mmap_entry *entry)
+{
+ u64 cmd = (entry->rdma_entry.start_pgoff >> 16) & 0xFFFF;
+ u64 index = entry->rdma_entry.start_pgoff & 0xFFFF;
+
+ return (((index >> 8) << 16) | (cmd << MLX5_IB_MMAP_CMD_SHIFT) |
+ (index & 0xFF)) << PAGE_SHIFT;
}
static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
@@ -2257,15 +2325,13 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm
PAGE_SHIFT;
return rdma_user_mmap_io(&context->ibucontext, vma, pfn,
PAGE_SIZE,
- pgprot_noncached(vma->vm_page_prot));
+ pgprot_noncached(vma->vm_page_prot),
+ NULL);
case MLX5_IB_MMAP_CLOCK_INFO:
return mlx5_ib_mmap_clock_info_page(dev, vma, context);
- case MLX5_IB_MMAP_DEVICE_MEM:
- return dm_mmap(ibcontext, vma);
-
default:
- return -EINVAL;
+ return mlx5_ib_mmap_offset(dev, vma, ibcontext);
}
return 0;
@@ -2280,6 +2346,7 @@ static inline int check_dm_type_support(struct mlx5_ib_dev *dev,
return -EOPNOTSUPP;
break;
case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
+ case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
if (!capable(CAP_SYS_RAWIO) ||
!capable(CAP_NET_RAW))
return -EPERM;
@@ -2300,8 +2367,9 @@ static int handle_alloc_dm_memic(struct ib_ucontext *ctx,
{
struct mlx5_dm *dm_db = &to_mdev(ctx->device)->dm;
u64 start_offset;
- u32 page_idx;
+ u16 page_idx;
int err;
+ u64 address;
dm->size = roundup(attr->length, MLX5_MEMIC_BASE_SIZE);
@@ -2310,28 +2378,30 @@ static int handle_alloc_dm_memic(struct ib_ucontext *ctx,
if (err)
return err;
- page_idx = (dm->dev_addr - pci_resource_start(dm_db->dev->pdev, 0) -
- MLX5_CAP64_DEV_MEM(dm_db->dev, memic_bar_start_addr)) >>
- PAGE_SHIFT;
+ address = dm->dev_addr & PAGE_MASK;
+ err = add_dm_mmap_entry(ctx, dm, address);
+ if (err)
+ goto err_dealloc;
+ page_idx = dm->mentry.rdma_entry.start_pgoff & 0xFFFF;
err = uverbs_copy_to(attrs,
MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX,
- &page_idx, sizeof(page_idx));
+ &page_idx,
+ sizeof(page_idx));
if (err)
- goto err_dealloc;
+ goto err_copy;
start_offset = dm->dev_addr & ~PAGE_MASK;
err = uverbs_copy_to(attrs,
MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
&start_offset, sizeof(start_offset));
if (err)
- goto err_dealloc;
-
- bitmap_set(to_mucontext(ctx)->dm_pages, page_idx,
- DIV_ROUND_UP(dm->size, PAGE_SIZE));
+ goto err_copy;
return 0;
+err_copy:
+ rdma_user_mmap_entry_remove(&dm->mentry.rdma_entry);
err_dealloc:
mlx5_cmd_dealloc_memic(dm_db, dm->dev_addr, dm->size);
@@ -2344,20 +2414,20 @@ static int handle_alloc_dm_sw_icm(struct ib_ucontext *ctx,
struct uverbs_attr_bundle *attrs,
int type)
{
- struct mlx5_dm *dm_db = &to_mdev(ctx->device)->dm;
+ struct mlx5_core_dev *dev = to_mdev(ctx->device)->mdev;
u64 act_size;
int err;
/* Allocation size must a multiple of the basic block size
* and a power of 2.
*/
- act_size = round_up(attr->length, MLX5_SW_ICM_BLOCK_SIZE(dm_db->dev));
+ act_size = round_up(attr->length, MLX5_SW_ICM_BLOCK_SIZE(dev));
act_size = roundup_pow_of_two(act_size);
dm->size = act_size;
- err = mlx5_cmd_alloc_sw_icm(dm_db, type, act_size,
- to_mucontext(ctx)->devx_uid, &dm->dev_addr,
- &dm->icm_dm.obj_id);
+ err = mlx5_dm_sw_icm_alloc(dev, type, act_size,
+ to_mucontext(ctx)->devx_uid, &dm->dev_addr,
+ &dm->icm_dm.obj_id);
if (err)
return err;
@@ -2365,9 +2435,9 @@ static int handle_alloc_dm_sw_icm(struct ib_ucontext *ctx,
MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
&dm->dev_addr, sizeof(dm->dev_addr));
if (err)
- mlx5_cmd_dealloc_sw_icm(dm_db, type, dm->size,
- to_mucontext(ctx)->devx_uid,
- dm->dev_addr, dm->icm_dm.obj_id);
+ mlx5_dm_sw_icm_dealloc(dev, type, dm->size,
+ to_mucontext(ctx)->devx_uid, dm->dev_addr,
+ dm->icm_dm.obj_id);
return err;
}
@@ -2407,8 +2477,14 @@ struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev,
attrs);
break;
case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
+ err = handle_alloc_dm_sw_icm(context, dm,
+ attr, attrs,
+ MLX5_SW_ICM_TYPE_STEERING);
+ break;
case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
- err = handle_alloc_dm_sw_icm(context, dm, attr, attrs, type);
+ err = handle_alloc_dm_sw_icm(context, dm,
+ attr, attrs,
+ MLX5_SW_ICM_TYPE_HEADER_MODIFY);
break;
default:
err = -EOPNOTSUPP;
@@ -2428,30 +2504,25 @@ int mlx5_ib_dealloc_dm(struct ib_dm *ibdm, struct uverbs_attr_bundle *attrs)
{
struct mlx5_ib_ucontext *ctx = rdma_udata_to_drv_context(
&attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
- struct mlx5_dm *dm_db = &to_mdev(ibdm->device)->dm;
+ struct mlx5_core_dev *dev = to_mdev(ibdm->device)->mdev;
struct mlx5_ib_dm *dm = to_mdm(ibdm);
- u32 page_idx;
int ret;
switch (dm->type) {
case MLX5_IB_UAPI_DM_TYPE_MEMIC:
- ret = mlx5_cmd_dealloc_memic(dm_db, dm->dev_addr, dm->size);
+ rdma_user_mmap_entry_remove(&dm->mentry.rdma_entry);
+ return 0;
+ case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
+ ret = mlx5_dm_sw_icm_dealloc(dev, MLX5_SW_ICM_TYPE_STEERING,
+ dm->size, ctx->devx_uid, dm->dev_addr,
+ dm->icm_dm.obj_id);
if (ret)
return ret;
-
- page_idx = (dm->dev_addr -
- pci_resource_start(dm_db->dev->pdev, 0) -
- MLX5_CAP64_DEV_MEM(dm_db->dev,
- memic_bar_start_addr)) >>
- PAGE_SHIFT;
- bitmap_clear(ctx->dm_pages, page_idx,
- DIV_ROUND_UP(dm->size, PAGE_SIZE));
break;
- case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
- ret = mlx5_cmd_dealloc_sw_icm(dm_db, dm->type, dm->size,
- ctx->devx_uid, dm->dev_addr,
- dm->icm_dm.obj_id);
+ ret = mlx5_dm_sw_icm_dealloc(dev, MLX5_SW_ICM_TYPE_HEADER_MODIFY,
+ dm->size, ctx->devx_uid, dm->dev_addr,
+ dm->icm_dm.obj_id);
if (ret)
return ret;
break;
@@ -2646,7 +2717,8 @@ int parse_flow_flow_action(struct mlx5_ib_flow_action *maction,
if (action->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
return -EINVAL;
action->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
- action->modify_id = maction->flow_action_raw.action_id;
+ action->modify_hdr =
+ maction->flow_action_raw.modify_hdr;
return 0;
}
if (maction->flow_action_raw.sub_type ==
@@ -2663,8 +2735,8 @@ int parse_flow_flow_action(struct mlx5_ib_flow_action *maction,
return -EINVAL;
action->action |=
MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT;
- action->reformat_id =
- maction->flow_action_raw.action_id;
+ action->pkt_reformat =
+ maction->flow_action_raw.pkt_reformat;
return 0;
}
/* fall through */
@@ -3239,12 +3311,14 @@ static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns,
int num_entries, int num_groups,
u32 flags)
{
+ struct mlx5_flow_table_attr ft_attr = {};
struct mlx5_flow_table *ft;
- ft = mlx5_create_auto_grouped_flow_table(ns, priority,
- num_entries,
- num_groups,
- 0, flags);
+ ft_attr.prio = priority;
+ ft_attr.max_fte = num_entries;
+ ft_attr.flags = flags;
+ ft_attr.autogroup.max_num_groups = num_groups;
+ ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
if (IS_ERR(ft))
return ERR_CAST(ft);
@@ -3544,10 +3618,6 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
}
INIT_LIST_HEAD(&handler->list);
- if (dst) {
- memcpy(&dest_arr[0], dst, sizeof(*dst));
- dest_num++;
- }
for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
err = parse_flow_attr(dev->mdev, spec,
@@ -3560,6 +3630,11 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
ib_flow += ((union ib_flow_spec *)ib_flow)->size;
}
+ if (dst && !(flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DROP)) {
+ memcpy(&dest_arr[0], dst, sizeof(*dst));
+ dest_num++;
+ }
+
if (!flow_is_multicast_only(flow_attr))
set_underlay_qp(dev, spec, underlay_qpn);
@@ -3600,10 +3675,8 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
}
if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DROP) {
- if (!(flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT)) {
+ if (!dest_num)
rule_dst = NULL;
- dest_num = 0;
- }
} else {
if (is_egress)
flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW;
@@ -3967,6 +4040,11 @@ _get_flow_table(struct mlx5_ib_dev *dev,
esw_encap)
flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
priority = FDB_BYPASS_PATH;
+ } else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) {
+ max_table_size =
+ BIT(MLX5_CAP_FLOWTABLE_RDMA_RX(dev->mdev,
+ log_max_ft_size));
+ priority = fs_matcher->priority;
}
max_table_size = min_t(int, max_table_size, MLX5_FS_MAX_ENTRIES);
@@ -3981,6 +4059,8 @@ _get_flow_table(struct mlx5_ib_dev *dev,
prio = &dev->flow_db->egress_prios[priority];
else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB)
prio = &dev->flow_db->fdb;
+ else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX)
+ prio = &dev->flow_db->rdma_rx[priority];
if (!prio)
return ERR_PTR(-EINVAL);
@@ -4728,7 +4808,6 @@ static int __get_port_caps(struct mlx5_ib_dev *dev, u8 port)
struct ib_device_attr *dprops = NULL;
struct ib_port_attr *pprops = NULL;
int err = -ENOMEM;
- struct ib_udata uhw = {.inlen = 0, .outlen = 0};
pprops = kzalloc(sizeof(*pprops), GFP_KERNEL);
if (!pprops)
@@ -4738,7 +4817,7 @@ static int __get_port_caps(struct mlx5_ib_dev *dev, u8 port)
if (!dprops)
goto out;
- err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw);
+ err = mlx5_ib_query_device(&dev->ib_dev, dprops, NULL);
if (err) {
mlx5_ib_warn(dev, "query_device failed %d\n", err);
goto out;
@@ -5134,8 +5213,7 @@ static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
immutable->pkey_tbl_len = attr.pkey_tbl_len;
immutable->gid_tbl_len = attr.gid_tbl_len;
immutable->core_cap_flags = get_core_cap_flags(ibdev, &rep);
- if ((ll == IB_LINK_LAYER_INFINIBAND) || MLX5_CAP_GEN(dev->mdev, roce))
- immutable->max_mad_size = IB_MGMT_MAD_SIZE;
+ immutable->max_mad_size = IB_MGMT_MAD_SIZE;
return 0;
}
@@ -5238,11 +5316,9 @@ static int mlx5_enable_eth(struct mlx5_ib_dev *dev)
{
int err;
- if (MLX5_CAP_GEN(dev->mdev, roce)) {
- err = mlx5_nic_vport_enable_roce(dev->mdev);
- if (err)
- return err;
- }
+ err = mlx5_nic_vport_enable_roce(dev->mdev);
+ if (err)
+ return err;
err = mlx5_eth_lag_init(dev);
if (err)
@@ -5251,8 +5327,7 @@ static int mlx5_enable_eth(struct mlx5_ib_dev *dev)
return 0;
err_disable_roce:
- if (MLX5_CAP_GEN(dev->mdev, roce))
- mlx5_nic_vport_disable_roce(dev->mdev);
+ mlx5_nic_vport_disable_roce(dev->mdev);
return err;
}
@@ -5260,8 +5335,7 @@ err_disable_roce:
static void mlx5_disable_eth(struct mlx5_ib_dev *dev)
{
mlx5_eth_lag_cleanup(dev);
- if (MLX5_CAP_GEN(dev->mdev, roce))
- mlx5_nic_vport_disable_roce(dev->mdev);
+ mlx5_nic_vport_disable_roce(dev->mdev);
}
struct mlx5_ib_counter {
@@ -5313,6 +5387,14 @@ static const struct mlx5_ib_counter extended_err_cnts[] = {
INIT_Q_COUNTER(req_cqe_flush_error),
};
+static const struct mlx5_ib_counter roce_accl_cnts[] = {
+ INIT_Q_COUNTER(roce_adp_retrans),
+ INIT_Q_COUNTER(roce_adp_retrans_to),
+ INIT_Q_COUNTER(roce_slow_restart),
+ INIT_Q_COUNTER(roce_slow_restart_cnps),
+ INIT_Q_COUNTER(roce_slow_restart_trans),
+};
+
#define INIT_EXT_PPCNT_COUNTER(_name) \
{ .name = #_name, .offset = \
MLX5_BYTE_OFF(ppcnt_reg, \
@@ -5322,11 +5404,21 @@ static const struct mlx5_ib_counter ext_ppcnt_cnts[] = {
INIT_EXT_PPCNT_COUNTER(rx_icrc_encapsulated),
};
+static bool is_mdev_switchdev_mode(const struct mlx5_core_dev *mdev)
+{
+ return MLX5_ESWITCH_MANAGER(mdev) &&
+ mlx5_ib_eswitch_mode(mdev->priv.eswitch) ==
+ MLX5_ESWITCH_OFFLOADS;
+}
+
static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev)
{
+ int num_cnt_ports;
int i;
- for (i = 0; i < dev->num_ports; i++) {
+ num_cnt_ports = is_mdev_switchdev_mode(dev->mdev) ? 1 : dev->num_ports;
+
+ for (i = 0; i < num_cnt_ports; i++) {
if (dev->port[i].cnts.set_id_valid)
mlx5_core_dealloc_q_counter(dev->mdev,
dev->port[i].cnts.set_id);
@@ -5351,6 +5443,9 @@ static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev,
if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters))
num_counters += ARRAY_SIZE(extended_err_cnts);
+ if (MLX5_CAP_GEN(dev->mdev, roce_accl))
+ num_counters += ARRAY_SIZE(roce_accl_cnts);
+
cnts->num_q_counters = num_counters;
if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
@@ -5411,6 +5506,13 @@ static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev,
}
}
+ if (MLX5_CAP_GEN(dev->mdev, roce_accl)) {
+ for (i = 0; i < ARRAY_SIZE(roce_accl_cnts); i++, j++) {
+ names[j] = roce_accl_cnts[i].name;
+ offsets[j] = roce_accl_cnts[i].offset;
+ }
+ }
+
if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
for (i = 0; i < ARRAY_SIZE(cong_cnts); i++, j++) {
names[j] = cong_cnts[i].name;
@@ -5428,13 +5530,15 @@ static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev,
static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev)
{
+ int num_cnt_ports;
int err = 0;
int i;
bool is_shared;
is_shared = MLX5_CAP_GEN(dev->mdev, log_max_uctx) != 0;
+ num_cnt_ports = is_mdev_switchdev_mode(dev->mdev) ? 1 : dev->num_ports;
- for (i = 0; i < dev->num_ports; i++) {
+ for (i = 0; i < num_cnt_ports; i++) {
err = __mlx5_ib_alloc_counters(dev, &dev->port[i].cnts);
if (err)
goto err_alloc;
@@ -5454,7 +5558,6 @@ static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev)
}
dev->port[i].cnts.set_id_valid = true;
}
-
return 0;
err_alloc:
@@ -5462,25 +5565,50 @@ err_alloc:
return err;
}
+static const struct mlx5_ib_counters *get_counters(struct mlx5_ib_dev *dev,
+ u8 port_num)
+{
+ return is_mdev_switchdev_mode(dev->mdev) ? &dev->port[0].cnts :
+ &dev->port[port_num].cnts;
+}
+
+/**
+ * mlx5_ib_get_counters_id - Returns counters id to use for device+port
+ * @dev: Pointer to mlx5 IB device
+ * @port_num: Zero based port number
+ *
+ * mlx5_ib_get_counters_id() Returns counters set id to use for given
+ * device port combination in switchdev and non switchdev mode of the
+ * parent device.
+ */
+u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u8 port_num)
+{
+ const struct mlx5_ib_counters *cnts = get_counters(dev, port_num);
+
+ return cnts->set_id;
+}
+
static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev,
u8 port_num)
{
struct mlx5_ib_dev *dev = to_mdev(ibdev);
- struct mlx5_ib_port *port = &dev->port[port_num - 1];
+ const struct mlx5_ib_counters *cnts;
+ bool is_switchdev = is_mdev_switchdev_mode(dev->mdev);
- /* We support only per port stats */
- if (port_num == 0)
+ if ((is_switchdev && port_num) || (!is_switchdev && !port_num))
return NULL;
- return rdma_alloc_hw_stats_struct(port->cnts.names,
- port->cnts.num_q_counters +
- port->cnts.num_cong_counters +
- port->cnts.num_ext_ppcnt_counters,
+ cnts = get_counters(dev, port_num - 1);
+
+ return rdma_alloc_hw_stats_struct(cnts->names,
+ cnts->num_q_counters +
+ cnts->num_cong_counters +
+ cnts->num_ext_ppcnt_counters,
RDMA_HW_STATS_DEFAULT_LIFESPAN);
}
static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev,
- struct mlx5_ib_port *port,
+ const struct mlx5_ib_counters *cnts,
struct rdma_hw_stats *stats,
u16 set_id)
{
@@ -5497,8 +5625,8 @@ static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev,
if (ret)
goto free;
- for (i = 0; i < port->cnts.num_q_counters; i++) {
- val = *(__be32 *)(out + port->cnts.offsets[i]);
+ for (i = 0; i < cnts->num_q_counters; i++) {
+ val = *(__be32 *)(out + cnts->offsets[i]);
stats->value[i] = (u64)be32_to_cpu(val);
}
@@ -5508,10 +5636,10 @@ free:
}
static int mlx5_ib_query_ext_ppcnt_counters(struct mlx5_ib_dev *dev,
- struct mlx5_ib_port *port,
- struct rdma_hw_stats *stats)
+ const struct mlx5_ib_counters *cnts,
+ struct rdma_hw_stats *stats)
{
- int offset = port->cnts.num_q_counters + port->cnts.num_cong_counters;
+ int offset = cnts->num_q_counters + cnts->num_cong_counters;
int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
int ret, i;
void *out;
@@ -5524,12 +5652,10 @@ static int mlx5_ib_query_ext_ppcnt_counters(struct mlx5_ib_dev *dev,
if (ret)
goto free;
- for (i = 0; i < port->cnts.num_ext_ppcnt_counters; i++) {
+ for (i = 0; i < cnts->num_ext_ppcnt_counters; i++)
stats->value[i + offset] =
be64_to_cpup((__be64 *)(out +
- port->cnts.offsets[i + offset]));
- }
-
+ cnts->offsets[i + offset]));
free:
kvfree(out);
return ret;
@@ -5540,7 +5666,7 @@ static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
u8 port_num, int index)
{
struct mlx5_ib_dev *dev = to_mdev(ibdev);
- struct mlx5_ib_port *port = &dev->port[port_num - 1];
+ const struct mlx5_ib_counters *cnts = get_counters(dev, port_num - 1);
struct mlx5_core_dev *mdev;
int ret, num_counters;
u8 mdev_port_num;
@@ -5548,18 +5674,17 @@ static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
if (!stats)
return -EINVAL;
- num_counters = port->cnts.num_q_counters +
- port->cnts.num_cong_counters +
- port->cnts.num_ext_ppcnt_counters;
+ num_counters = cnts->num_q_counters +
+ cnts->num_cong_counters +
+ cnts->num_ext_ppcnt_counters;
/* q_counters are per IB device, query the master mdev */
- ret = mlx5_ib_query_q_counters(dev->mdev, port, stats,
- port->cnts.set_id);
+ ret = mlx5_ib_query_q_counters(dev->mdev, cnts, stats, cnts->set_id);
if (ret)
return ret;
if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
- ret = mlx5_ib_query_ext_ppcnt_counters(dev, port, stats);
+ ret = mlx5_ib_query_ext_ppcnt_counters(dev, cnts, stats);
if (ret)
return ret;
}
@@ -5576,10 +5701,10 @@ static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
}
ret = mlx5_lag_query_cong_counters(dev->mdev,
stats->value +
- port->cnts.num_q_counters,
- port->cnts.num_cong_counters,
- port->cnts.offsets +
- port->cnts.num_q_counters);
+ cnts->num_q_counters,
+ cnts->num_cong_counters,
+ cnts->offsets +
+ cnts->num_q_counters);
mlx5_ib_put_native_port_mdev(dev, port_num);
if (ret)
@@ -5594,20 +5719,22 @@ static struct rdma_hw_stats *
mlx5_ib_counter_alloc_stats(struct rdma_counter *counter)
{
struct mlx5_ib_dev *dev = to_mdev(counter->device);
- struct mlx5_ib_port *port = &dev->port[counter->port - 1];
+ const struct mlx5_ib_counters *cnts =
+ get_counters(dev, counter->port - 1);
/* Q counters are in the beginning of all counters */
- return rdma_alloc_hw_stats_struct(port->cnts.names,
- port->cnts.num_q_counters,
+ return rdma_alloc_hw_stats_struct(cnts->names,
+ cnts->num_q_counters,
RDMA_HW_STATS_DEFAULT_LIFESPAN);
}
static int mlx5_ib_counter_update_stats(struct rdma_counter *counter)
{
struct mlx5_ib_dev *dev = to_mdev(counter->device);
- struct mlx5_ib_port *port = &dev->port[counter->port - 1];
+ const struct mlx5_ib_counters *cnts =
+ get_counters(dev, counter->port - 1);
- return mlx5_ib_query_q_counters(dev->mdev, port,
+ return mlx5_ib_query_q_counters(dev->mdev, cnts,
counter->stats, counter->id);
}
@@ -5664,11 +5791,10 @@ static int mlx5_ib_rn_get_params(struct ib_device *device, u8 port_num,
static void delay_drop_debugfs_cleanup(struct mlx5_ib_dev *dev)
{
- if (!dev->delay_drop.dbg)
+ if (!dev->delay_drop.dir_debugfs)
return;
- debugfs_remove_recursive(dev->delay_drop.dbg->dir_debugfs);
- kfree(dev->delay_drop.dbg);
- dev->delay_drop.dbg = NULL;
+ debugfs_remove_recursive(dev->delay_drop.dir_debugfs);
+ dev->delay_drop.dir_debugfs = NULL;
}
static void cancel_delay_drop(struct mlx5_ib_dev *dev)
@@ -5719,52 +5845,22 @@ static const struct file_operations fops_delay_drop_timeout = {
.read = delay_drop_timeout_read,
};
-static int delay_drop_debugfs_init(struct mlx5_ib_dev *dev)
+static void delay_drop_debugfs_init(struct mlx5_ib_dev *dev)
{
- struct mlx5_ib_dbg_delay_drop *dbg;
+ struct dentry *root;
if (!mlx5_debugfs_root)
- return 0;
-
- dbg = kzalloc(sizeof(*dbg), GFP_KERNEL);
- if (!dbg)
- return -ENOMEM;
-
- dev->delay_drop.dbg = dbg;
-
- dbg->dir_debugfs =
- debugfs_create_dir("delay_drop",
- dev->mdev->priv.dbg_root);
- if (!dbg->dir_debugfs)
- goto out_debugfs;
-
- dbg->events_cnt_debugfs =
- debugfs_create_atomic_t("num_timeout_events", 0400,
- dbg->dir_debugfs,
- &dev->delay_drop.events_cnt);
- if (!dbg->events_cnt_debugfs)
- goto out_debugfs;
-
- dbg->rqs_cnt_debugfs =
- debugfs_create_atomic_t("num_rqs", 0400,
- dbg->dir_debugfs,
- &dev->delay_drop.rqs_cnt);
- if (!dbg->rqs_cnt_debugfs)
- goto out_debugfs;
-
- dbg->timeout_debugfs =
- debugfs_create_file("timeout", 0600,
- dbg->dir_debugfs,
- &dev->delay_drop,
- &fops_delay_drop_timeout);
- if (!dbg->timeout_debugfs)
- goto out_debugfs;
+ return;
- return 0;
+ root = debugfs_create_dir("delay_drop", dev->mdev->priv.dbg_root);
+ dev->delay_drop.dir_debugfs = root;
-out_debugfs:
- delay_drop_debugfs_cleanup(dev);
- return -ENOMEM;
+ debugfs_create_atomic_t("num_timeout_events", 0400, root,
+ &dev->delay_drop.events_cnt);
+ debugfs_create_atomic_t("num_rqs", 0400, root,
+ &dev->delay_drop.rqs_cnt);
+ debugfs_create_file("timeout", 0600, root, &dev->delay_drop,
+ &fops_delay_drop_timeout);
}
static void init_delay_drop(struct mlx5_ib_dev *dev)
@@ -5780,11 +5876,9 @@ static void init_delay_drop(struct mlx5_ib_dev *dev)
atomic_set(&dev->delay_drop.rqs_cnt, 0);
atomic_set(&dev->delay_drop.events_cnt, 0);
- if (delay_drop_debugfs_init(dev))
- mlx5_ib_warn(dev, "Failed to init delay drop debugfs\n");
+ delay_drop_debugfs_init(dev);
}
-/* The mlx5_ib_multiport_mutex should be held when calling this function */
static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev,
struct mlx5_ib_multiport_info *mpi)
{
@@ -5794,6 +5888,8 @@ static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev,
int err;
int i;
+ lockdep_assert_held(&mlx5_ib_multiport_mutex);
+
mlx5_ib_cleanup_cong_debugfs(ibdev, port_num);
spin_lock(&port->mp.mpi_lock);
@@ -5843,13 +5939,14 @@ static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev,
ibdev->port[port_num].roce.last_port_state = IB_PORT_DOWN;
}
-/* The mlx5_ib_multiport_mutex should be held when calling this function */
static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev,
struct mlx5_ib_multiport_info *mpi)
{
u8 port_num = mlx5_core_native_port_num(mpi->mdev) - 1;
int err;
+ lockdep_assert_held(&mlx5_ib_multiport_mutex);
+
spin_lock(&ibdev->port[port_num].mp.mpi_lock);
if (ibdev->port[port_num].mp.mpi) {
mlx5_ib_dbg(ibdev, "port %d already affiliated.\n",
@@ -5991,6 +6088,145 @@ static void mlx5_ib_cleanup_multiport_master(struct mlx5_ib_dev *dev)
mlx5_nic_vport_disable_roce(dev->mdev);
}
+static int var_obj_cleanup(struct ib_uobject *uobject,
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct mlx5_user_mmap_entry *obj = uobject->object;
+
+ rdma_user_mmap_entry_remove(&obj->rdma_entry);
+ return 0;
+}
+
+static struct mlx5_user_mmap_entry *
+alloc_var_entry(struct mlx5_ib_ucontext *c)
+{
+ struct mlx5_user_mmap_entry *entry;
+ struct mlx5_var_table *var_table;
+ u32 page_idx;
+ int err;
+
+ var_table = &to_mdev(c->ibucontext.device)->var_table;
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_lock(&var_table->bitmap_lock);
+ page_idx = find_first_zero_bit(var_table->bitmap,
+ var_table->num_var_hw_entries);
+ if (page_idx >= var_table->num_var_hw_entries) {
+ err = -ENOSPC;
+ mutex_unlock(&var_table->bitmap_lock);
+ goto end;
+ }
+
+ set_bit(page_idx, var_table->bitmap);
+ mutex_unlock(&var_table->bitmap_lock);
+
+ entry->address = var_table->hw_start_addr +
+ (page_idx * var_table->stride_size);
+ entry->page_idx = page_idx;
+ entry->mmap_flag = MLX5_IB_MMAP_TYPE_VAR;
+
+ err = rdma_user_mmap_entry_insert_range(
+ &c->ibucontext, &entry->rdma_entry, var_table->stride_size,
+ MLX5_IB_MMAP_OFFSET_START << 16,
+ (MLX5_IB_MMAP_OFFSET_END << 16) + (1UL << 16) - 1);
+ if (err)
+ goto err_insert;
+
+ return entry;
+
+err_insert:
+ mutex_lock(&var_table->bitmap_lock);
+ clear_bit(page_idx, var_table->bitmap);
+ mutex_unlock(&var_table->bitmap_lock);
+end:
+ kfree(entry);
+ return ERR_PTR(err);
+}
+
+static int UVERBS_HANDLER(MLX5_IB_METHOD_VAR_OBJ_ALLOC)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj = uverbs_attr_get_uobject(
+ attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE);
+ struct mlx5_ib_ucontext *c;
+ struct mlx5_user_mmap_entry *entry;
+ u64 mmap_offset;
+ u32 length;
+ int err;
+
+ c = to_mucontext(ib_uverbs_get_ucontext(attrs));
+ if (IS_ERR(c))
+ return PTR_ERR(c);
+
+ entry = alloc_var_entry(c);
+ if (IS_ERR(entry))
+ return PTR_ERR(entry);
+
+ mmap_offset = mlx5_entry_to_mmap_offset(entry);
+ length = entry->rdma_entry.npages * PAGE_SIZE;
+ uobj->object = entry;
+
+ err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_OFFSET,
+ &mmap_offset, sizeof(mmap_offset));
+ if (err)
+ goto err;
+
+ err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_PAGE_ID,
+ &entry->page_idx, sizeof(entry->page_idx));
+ if (err)
+ goto err;
+
+ err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_LENGTH,
+ &length, sizeof(length));
+ if (err)
+ goto err;
+
+ return 0;
+
+err:
+ rdma_user_mmap_entry_remove(&entry->rdma_entry);
+ return err;
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+ MLX5_IB_METHOD_VAR_OBJ_ALLOC,
+ UVERBS_ATTR_IDR(MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE,
+ MLX5_IB_OBJECT_VAR,
+ UVERBS_ACCESS_NEW,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_PAGE_ID,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_LENGTH,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_OFFSET,
+ UVERBS_ATTR_TYPE(u64),
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+ MLX5_IB_METHOD_VAR_OBJ_DESTROY,
+ UVERBS_ATTR_IDR(MLX5_IB_ATTR_VAR_OBJ_DESTROY_HANDLE,
+ MLX5_IB_OBJECT_VAR,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_VAR,
+ UVERBS_TYPE_ALLOC_IDR(var_obj_cleanup),
+ &UVERBS_METHOD(MLX5_IB_METHOD_VAR_OBJ_ALLOC),
+ &UVERBS_METHOD(MLX5_IB_METHOD_VAR_OBJ_DESTROY));
+
+static bool var_is_supported(struct ib_device *device)
+{
+ struct mlx5_ib_dev *dev = to_mdev(device);
+
+ return (MLX5_CAP_GEN_64(dev->mdev, general_obj_types) &
+ MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q);
+}
+
ADD_UVERBS_ATTRIBUTES_SIMPLE(
mlx5_ib_dm,
UVERBS_OBJECT_DM,
@@ -6013,14 +6249,14 @@ ADD_UVERBS_ATTRIBUTES_SIMPLE(
enum mlx5_ib_uapi_flow_action_flags));
static const struct uapi_definition mlx5_ib_defs[] = {
-#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
UAPI_DEF_CHAIN(mlx5_ib_devx_defs),
UAPI_DEF_CHAIN(mlx5_ib_flow_defs),
-#endif
UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION,
&mlx5_ib_flow_action),
UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DM, &mlx5_ib_dm),
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_VAR,
+ UAPI_DEF_IS_OBJ_SUPPORTED(var_is_supported)),
{}
};
@@ -6096,38 +6332,17 @@ static struct ib_counters *mlx5_ib_create_counters(struct ib_device *device,
static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
{
- struct mlx5_core_dev *mdev = dev->mdev;
-
mlx5_ib_cleanup_multiport_master(dev);
- if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
- srcu_barrier(&dev->mr_srcu);
- cleanup_srcu_struct(&dev->mr_srcu);
- }
+ WARN_ON(!xa_empty(&dev->odp_mkeys));
+ cleanup_srcu_struct(&dev->odp_srcu);
+ WARN_ON(!xa_empty(&dev->sig_mrs));
WARN_ON(!bitmap_empty(dev->dm.memic_alloc_pages, MLX5_MAX_MEMIC_PAGES));
-
- WARN_ON(dev->dm.steering_sw_icm_alloc_blocks &&
- !bitmap_empty(
- dev->dm.steering_sw_icm_alloc_blocks,
- BIT(MLX5_CAP_DEV_MEM(mdev, log_steering_sw_icm_size) -
- MLX5_LOG_SW_ICM_BLOCK_SIZE(mdev))));
-
- kfree(dev->dm.steering_sw_icm_alloc_blocks);
-
- WARN_ON(dev->dm.header_modify_sw_icm_alloc_blocks &&
- !bitmap_empty(dev->dm.header_modify_sw_icm_alloc_blocks,
- BIT(MLX5_CAP_DEV_MEM(
- mdev, log_header_modify_sw_icm_size) -
- MLX5_LOG_SW_ICM_BLOCK_SIZE(mdev))));
-
- kfree(dev->dm.header_modify_sw_icm_alloc_blocks);
}
static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
{
struct mlx5_core_dev *mdev = dev->mdev;
- u64 header_modify_icm_blocks = 0;
- u64 steering_icm_blocks = 0;
int err;
int i;
@@ -6139,6 +6354,8 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
dev->port[i].roce.last_port_state = IB_PORT_DOWN;
}
+ mlx5_ib_internal_fill_odp_caps(dev);
+
err = mlx5_ib_init_multiport_master(dev);
if (err)
return err;
@@ -6171,52 +6388,18 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
mutex_init(&dev->cap_mask_mutex);
INIT_LIST_HEAD(&dev->qp_list);
spin_lock_init(&dev->reset_flow_resource_lock);
-
- if (MLX5_CAP_GEN_64(mdev, general_obj_types) &
- MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM) {
- if (MLX5_CAP64_DEV_MEM(mdev, steering_sw_icm_start_address)) {
- steering_icm_blocks =
- BIT(MLX5_CAP_DEV_MEM(mdev,
- log_steering_sw_icm_size) -
- MLX5_LOG_SW_ICM_BLOCK_SIZE(mdev));
-
- dev->dm.steering_sw_icm_alloc_blocks =
- kcalloc(BITS_TO_LONGS(steering_icm_blocks),
- sizeof(unsigned long), GFP_KERNEL);
- if (!dev->dm.steering_sw_icm_alloc_blocks)
- goto err_mp;
- }
-
- if (MLX5_CAP64_DEV_MEM(mdev,
- header_modify_sw_icm_start_address)) {
- header_modify_icm_blocks = BIT(
- MLX5_CAP_DEV_MEM(
- mdev, log_header_modify_sw_icm_size) -
- MLX5_LOG_SW_ICM_BLOCK_SIZE(mdev));
-
- dev->dm.header_modify_sw_icm_alloc_blocks =
- kcalloc(BITS_TO_LONGS(header_modify_icm_blocks),
- sizeof(unsigned long), GFP_KERNEL);
- if (!dev->dm.header_modify_sw_icm_alloc_blocks)
- goto err_dm;
- }
- }
+ xa_init(&dev->odp_mkeys);
+ xa_init(&dev->sig_mrs);
spin_lock_init(&dev->dm.lock);
dev->dm.dev = mdev;
- if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
- err = init_srcu_struct(&dev->mr_srcu);
- if (err)
- goto err_dm;
- }
+ err = init_srcu_struct(&dev->odp_srcu);
+ if (err)
+ goto err_mp;
return 0;
-err_dm:
- kfree(dev->dm.steering_sw_icm_alloc_blocks);
- kfree(dev->dm.header_modify_sw_icm_alloc_blocks);
-
err_mp:
mlx5_ib_cleanup_multiport_master(dev);
@@ -6273,12 +6456,16 @@ static const struct ib_device_ops mlx5_ib_dev_ops = {
.disassociate_ucontext = mlx5_ib_disassociate_ucontext,
.drain_rq = mlx5_ib_drain_rq,
.drain_sq = mlx5_ib_drain_sq,
+ .enable_driver = mlx5_ib_enable_driver,
+ .fill_res_entry = mlx5_ib_fill_res_entry,
+ .fill_stat_entry = mlx5_ib_fill_stat_entry,
.get_dev_fw_str = get_dev_fw_str,
.get_dma_mr = mlx5_ib_get_dma_mr,
.get_link_layer = mlx5_ib_port_link_layer,
.map_mr_sg = mlx5_ib_map_mr_sg,
.map_mr_sg_pi = mlx5_ib_map_mr_sg_pi,
.mmap = mlx5_ib_mmap,
+ .mmap_free = mlx5_ib_mmap_free,
.modify_cq = mlx5_ib_modify_cq,
.modify_device = mlx5_ib_modify_device,
.modify_port = mlx5_ib_modify_port,
@@ -6319,6 +6506,7 @@ static const struct ib_device_ops mlx5_ib_dev_ipoib_enhanced_ops = {
static const struct ib_device_ops mlx5_ib_dev_sriov_ops = {
.get_vf_config = mlx5_ib_get_vf_config,
+ .get_vf_guid = mlx5_ib_get_vf_guid,
.get_vf_stats = mlx5_ib_get_vf_stats,
.set_vf_guid = mlx5_ib_set_vf_guid,
.set_vf_link_state = mlx5_ib_set_vf_link_state,
@@ -6340,6 +6528,35 @@ static const struct ib_device_ops mlx5_ib_dev_dm_ops = {
.reg_dm_mr = mlx5_ib_reg_dm_mr,
};
+static int mlx5_ib_init_var_table(struct mlx5_ib_dev *dev)
+{
+ struct mlx5_core_dev *mdev = dev->mdev;
+ struct mlx5_var_table *var_table = &dev->var_table;
+ u8 log_doorbell_bar_size;
+ u8 log_doorbell_stride;
+ u64 bar_size;
+
+ log_doorbell_bar_size = MLX5_CAP_DEV_VDPA_EMULATION(mdev,
+ log_doorbell_bar_size);
+ log_doorbell_stride = MLX5_CAP_DEV_VDPA_EMULATION(mdev,
+ log_doorbell_stride);
+ var_table->hw_start_addr = dev->mdev->bar_addr +
+ MLX5_CAP64_DEV_VDPA_EMULATION(mdev,
+ doorbell_bar_offset);
+ bar_size = (1ULL << log_doorbell_bar_size) * 4096;
+ var_table->stride_size = 1ULL << log_doorbell_stride;
+ var_table->num_var_hw_entries = div64_u64(bar_size, var_table->stride_size);
+ mutex_init(&var_table->bitmap_lock);
+ var_table->bitmap = bitmap_zalloc(var_table->num_var_hw_entries,
+ GFP_KERNEL);
+ return (var_table->bitmap) ? 0 : -ENOMEM;
+}
+
+static void mlx5_ib_stage_caps_cleanup(struct mlx5_ib_dev *dev)
+{
+ bitmap_free(dev->var_table.bitmap);
+}
+
static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
{
struct mlx5_core_dev *mdev = dev->mdev;
@@ -6427,6 +6644,13 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
mutex_init(&dev->lb.mutex);
+ if (MLX5_CAP_GEN_64(dev->mdev, general_obj_types) &
+ MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q) {
+ err = mlx5_ib_init_var_table(dev);
+ if (err)
+ return err;
+ }
+
dev->ib_dev.use_cq_dim = true;
return 0;
@@ -6448,7 +6672,7 @@ static const struct ib_device_ops mlx5_ib_dev_port_rep_ops = {
.query_port = mlx5_ib_rep_query_port,
};
-static int mlx5_ib_stage_rep_non_default_cb(struct mlx5_ib_dev *dev)
+static int mlx5_ib_stage_raw_eth_non_default_cb(struct mlx5_ib_dev *dev)
{
ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_port_rep_ops);
return 0;
@@ -6488,7 +6712,7 @@ static void mlx5_ib_stage_common_roce_cleanup(struct mlx5_ib_dev *dev)
mlx5_remove_netdev_notifier(dev, port_num);
}
-static int mlx5_ib_stage_rep_roce_init(struct mlx5_ib_dev *dev)
+static int mlx5_ib_stage_raw_eth_roce_init(struct mlx5_ib_dev *dev)
{
struct mlx5_core_dev *mdev = dev->mdev;
enum rdma_link_layer ll;
@@ -6504,7 +6728,7 @@ static int mlx5_ib_stage_rep_roce_init(struct mlx5_ib_dev *dev)
return err;
}
-static void mlx5_ib_stage_rep_roce_cleanup(struct mlx5_ib_dev *dev)
+static void mlx5_ib_stage_raw_eth_roce_cleanup(struct mlx5_ib_dev *dev)
{
mlx5_ib_stage_common_roce_cleanup(dev);
}
@@ -6563,8 +6787,6 @@ static void mlx5_ib_stage_dev_res_cleanup(struct mlx5_ib_dev *dev)
static int mlx5_ib_stage_odp_init(struct mlx5_ib_dev *dev)
{
- mlx5_ib_internal_fill_odp_caps(dev);
-
return mlx5_ib_odp_init_one(dev);
}
@@ -6716,10 +6938,24 @@ static void mlx5_ib_stage_devx_cleanup(struct mlx5_ib_dev *dev)
}
}
+int mlx5_ib_enable_driver(struct ib_device *dev)
+{
+ struct mlx5_ib_dev *mdev = to_mdev(dev);
+ int ret;
+
+ ret = mlx5_ib_test_wc(mdev);
+ mlx5_ib_dbg(mdev, "Write-Combining %s",
+ mdev->wc_support ? "supported" : "not supported");
+
+ return ret;
+}
+
void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
const struct mlx5_ib_profile *profile,
int stage)
{
+ dev->ib_active = false;
+
/* Number of stages to cleanup */
while (stage) {
stage--;
@@ -6765,7 +7001,7 @@ static const struct mlx5_ib_profile pf_profile = {
mlx5_ib_stage_flow_db_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_CAPS,
mlx5_ib_stage_caps_init,
- NULL),
+ mlx5_ib_stage_caps_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB,
mlx5_ib_stage_non_default_cb,
NULL),
@@ -6813,7 +7049,7 @@ static const struct mlx5_ib_profile pf_profile = {
mlx5_ib_stage_delay_drop_cleanup),
};
-const struct mlx5_ib_profile uplink_rep_profile = {
+const struct mlx5_ib_profile raw_eth_profile = {
STAGE_CREATE(MLX5_IB_STAGE_INIT,
mlx5_ib_stage_init_init,
mlx5_ib_stage_init_cleanup),
@@ -6822,13 +7058,13 @@ const struct mlx5_ib_profile uplink_rep_profile = {
mlx5_ib_stage_flow_db_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_CAPS,
mlx5_ib_stage_caps_init,
- NULL),
+ mlx5_ib_stage_caps_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB,
- mlx5_ib_stage_rep_non_default_cb,
+ mlx5_ib_stage_raw_eth_non_default_cb,
NULL),
STAGE_CREATE(MLX5_IB_STAGE_ROCE,
- mlx5_ib_stage_rep_roce_init,
- mlx5_ib_stage_rep_roce_cleanup),
+ mlx5_ib_stage_raw_eth_roce_init,
+ mlx5_ib_stage_raw_eth_roce_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_SRQ,
mlx5_init_srq_table,
mlx5_cleanup_srq_table),
@@ -6904,6 +7140,7 @@ static void *mlx5_ib_add_slave_port(struct mlx5_core_dev *mdev)
static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
{
+ const struct mlx5_ib_profile *profile;
enum rdma_link_layer ll;
struct mlx5_ib_dev *dev;
int port_type_cap;
@@ -6932,14 +7169,19 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
dev->port = kcalloc(num_ports, sizeof(*dev->port),
GFP_KERNEL);
if (!dev->port) {
- ib_dealloc_device((struct ib_device *)dev);
+ ib_dealloc_device(&dev->ib_dev);
return NULL;
}
dev->mdev = mdev;
dev->num_ports = num_ports;
- return __mlx5_ib_add(dev, &pf_profile);
+ if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_is_roce_enabled(mdev))
+ profile = &raw_eth_profile;
+ else
+ profile = &pf_profile;
+
+ return __mlx5_ib_add(dev, profile);
}
static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
@@ -6959,6 +7201,7 @@ static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
mlx5_ib_unbind_slave_port(mpi->ibdev, mpi);
list_del(&mpi->list);
mutex_unlock(&mlx5_ib_multiport_mutex);
+ kfree(mpi);
return;
}
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
index fe1a76d8531c..b90a3649e7d1 100644
--- a/drivers/infiniband/hw/mlx5/mem.c
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -34,6 +34,7 @@
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>
#include "mlx5_ib.h"
+#include <linux/jiffies.h>
/* @umem: umem object to scan
* @addr: ib virtual address requested by the user
@@ -56,18 +57,6 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr,
struct scatterlist *sg;
int entry;
- if (umem->is_odp) {
- unsigned int page_shift = to_ib_umem_odp(umem)->page_shift;
-
- *ncont = ib_umem_page_count(umem);
- *count = *ncont << (page_shift - PAGE_SHIFT);
- *shift = page_shift;
- if (order)
- *order = ilog2(roundup_pow_of_two(*ncont));
-
- return;
- }
-
addr = addr >> PAGE_SHIFT;
tmp = (unsigned long)addr;
m = find_first_bit(&tmp, BITS_PER_LONG);
@@ -112,18 +101,6 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr,
*count = i;
}
-static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
-{
- u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
-
- if (umem_dma & ODP_READ_ALLOWED_BIT)
- mtt_entry |= MLX5_IB_MTT_READ;
- if (umem_dma & ODP_WRITE_ALLOWED_BIT)
- mtt_entry |= MLX5_IB_MTT_WRITE;
-
- return mtt_entry;
-}
-
/*
* Populate the given array with bus addresses from the umem.
*
@@ -150,19 +127,6 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
struct scatterlist *sg;
int entry;
- if (umem->is_odp) {
- WARN_ON(shift != 0);
- WARN_ON(access_flags != (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE));
-
- for (i = 0; i < num_pages; ++i) {
- dma_addr_t pa =
- to_ib_umem_odp(umem)->dma_list[offset + i];
-
- pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
- }
- return;
- }
-
i = 0;
for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
len = sg_dma_len(sg) >> PAGE_SHIFT;
@@ -228,3 +192,201 @@ int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset)
*offset = buf_off >> ilog2(off_size);
return 0;
}
+
+#define WR_ID_BF 0xBF
+#define WR_ID_END 0xBAD
+#define TEST_WC_NUM_WQES 255
+#define TEST_WC_POLLING_MAX_TIME_JIFFIES msecs_to_jiffies(100)
+static int post_send_nop(struct mlx5_ib_dev *dev, struct ib_qp *ibqp, u64 wr_id,
+ bool signaled)
+{
+ struct mlx5_ib_qp *qp = to_mqp(ibqp);
+ struct mlx5_wqe_ctrl_seg *ctrl;
+ struct mlx5_bf *bf = &qp->bf;
+ __be32 mmio_wqe[16] = {};
+ unsigned long flags;
+ unsigned int idx;
+ int i;
+
+ if (unlikely(dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR))
+ return -EIO;
+
+ spin_lock_irqsave(&qp->sq.lock, flags);
+
+ idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1);
+ ctrl = mlx5_frag_buf_get_wqe(&qp->sq.fbc, idx);
+
+ memset(ctrl, 0, sizeof(struct mlx5_wqe_ctrl_seg));
+ ctrl->fm_ce_se = signaled ? MLX5_WQE_CTRL_CQ_UPDATE : 0;
+ ctrl->opmod_idx_opcode =
+ cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | MLX5_OPCODE_NOP);
+ ctrl->qpn_ds = cpu_to_be32((sizeof(struct mlx5_wqe_ctrl_seg) / 16) |
+ (qp->trans_qp.base.mqp.qpn << 8));
+
+ qp->sq.wrid[idx] = wr_id;
+ qp->sq.w_list[idx].opcode = MLX5_OPCODE_NOP;
+ qp->sq.wqe_head[idx] = qp->sq.head + 1;
+ qp->sq.cur_post += DIV_ROUND_UP(sizeof(struct mlx5_wqe_ctrl_seg),
+ MLX5_SEND_WQE_BB);
+ qp->sq.w_list[idx].next = qp->sq.cur_post;
+ qp->sq.head++;
+
+ memcpy(mmio_wqe, ctrl, sizeof(*ctrl));
+ ((struct mlx5_wqe_ctrl_seg *)&mmio_wqe)->fm_ce_se |=
+ MLX5_WQE_CTRL_CQ_UPDATE;
+
+ /* Make sure that descriptors are written before
+ * updating doorbell record and ringing the doorbell
+ */
+ wmb();
+
+ qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post);
+
+ /* Make sure doorbell record is visible to the HCA before
+ * we hit doorbell
+ */
+ wmb();
+ for (i = 0; i < 8; i++)
+ mlx5_write64(&mmio_wqe[i * 2],
+ bf->bfreg->map + bf->offset + i * 8);
+
+ bf->offset ^= bf->buf_size;
+
+ spin_unlock_irqrestore(&qp->sq.lock, flags);
+
+ return 0;
+}
+
+static int test_wc_poll_cq_result(struct mlx5_ib_dev *dev, struct ib_cq *cq)
+{
+ int ret;
+ struct ib_wc wc = {};
+ unsigned long end = jiffies + TEST_WC_POLLING_MAX_TIME_JIFFIES;
+
+ do {
+ ret = ib_poll_cq(cq, 1, &wc);
+ if (ret < 0 || wc.status)
+ return ret < 0 ? ret : -EINVAL;
+ if (ret)
+ break;
+ } while (!time_after(jiffies, end));
+
+ if (!ret)
+ return -ETIMEDOUT;
+
+ if (wc.wr_id != WR_ID_BF)
+ ret = 0;
+
+ return ret;
+}
+
+static int test_wc_do_send(struct mlx5_ib_dev *dev, struct ib_qp *qp)
+{
+ int err, i;
+
+ for (i = 0; i < TEST_WC_NUM_WQES; i++) {
+ err = post_send_nop(dev, qp, WR_ID_BF, false);
+ if (err)
+ return err;
+ }
+
+ return post_send_nop(dev, qp, WR_ID_END, true);
+}
+
+int mlx5_ib_test_wc(struct mlx5_ib_dev *dev)
+{
+ struct ib_cq_init_attr cq_attr = { .cqe = TEST_WC_NUM_WQES + 1 };
+ int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type);
+ struct ib_qp_init_attr qp_init_attr = {
+ .cap = { .max_send_wr = TEST_WC_NUM_WQES },
+ .qp_type = IB_QPT_UD,
+ .sq_sig_type = IB_SIGNAL_REQ_WR,
+ .create_flags = MLX5_IB_QP_CREATE_WC_TEST,
+ };
+ struct ib_qp_attr qp_attr = { .port_num = 1 };
+ struct ib_device *ibdev = &dev->ib_dev;
+ struct ib_qp *qp;
+ struct ib_cq *cq;
+ struct ib_pd *pd;
+ int ret;
+
+ if (!MLX5_CAP_GEN(dev->mdev, bf))
+ return 0;
+
+ if (!dev->mdev->roce.roce_en &&
+ port_type_cap == MLX5_CAP_PORT_TYPE_ETH) {
+ if (mlx5_core_is_pf(dev->mdev))
+ dev->wc_support = true;
+ return 0;
+ }
+
+ ret = mlx5_alloc_bfreg(dev->mdev, &dev->wc_bfreg, true, false);
+ if (ret)
+ goto print_err;
+
+ if (!dev->wc_bfreg.wc)
+ goto out1;
+
+ pd = ib_alloc_pd(ibdev, 0);
+ if (IS_ERR(pd)) {
+ ret = PTR_ERR(pd);
+ goto out1;
+ }
+
+ cq = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_attr);
+ if (IS_ERR(cq)) {
+ ret = PTR_ERR(cq);
+ goto out2;
+ }
+
+ qp_init_attr.recv_cq = cq;
+ qp_init_attr.send_cq = cq;
+ qp = ib_create_qp(pd, &qp_init_attr);
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
+ goto out3;
+ }
+
+ qp_attr.qp_state = IB_QPS_INIT;
+ ret = ib_modify_qp(qp, &qp_attr,
+ IB_QP_STATE | IB_QP_PORT | IB_QP_PKEY_INDEX |
+ IB_QP_QKEY);
+ if (ret)
+ goto out4;
+
+ qp_attr.qp_state = IB_QPS_RTR;
+ ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+ if (ret)
+ goto out4;
+
+ qp_attr.qp_state = IB_QPS_RTS;
+ ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN);
+ if (ret)
+ goto out4;
+
+ ret = test_wc_do_send(dev, qp);
+ if (ret < 0)
+ goto out4;
+
+ ret = test_wc_poll_cq_result(dev, cq);
+ if (ret > 0) {
+ dev->wc_support = true;
+ ret = 0;
+ }
+
+out4:
+ ib_destroy_qp(qp);
+out3:
+ ib_destroy_cq(cq);
+out2:
+ ib_dealloc_pd(pd);
+out1:
+ mlx5_free_bfreg(dev->mdev, &dev->wc_bfreg);
+print_err:
+ if (ret)
+ mlx5_ib_err(
+ dev,
+ "Error %d while trying to test write-combining support\n",
+ ret);
+ return ret;
+}
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index f6a53455bf8b..d9bffcc93587 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -72,6 +72,11 @@
#define MLX5_MKEY_PAGE_SHIFT_MASK __mlx5_mask(mkc, log_page_size)
enum {
+ MLX5_IB_MMAP_OFFSET_START = 9,
+ MLX5_IB_MMAP_OFFSET_END = 255,
+};
+
+enum {
MLX5_IB_MMAP_CMD_SHIFT = 8,
MLX5_IB_MMAP_CMD_MASK = 0xff,
};
@@ -118,6 +123,11 @@ enum {
MLX5_MEMIC_BASE_SIZE = 1 << MLX5_MEMIC_BASE_ALIGN,
};
+enum mlx5_ib_mmap_type {
+ MLX5_IB_MMAP_TYPE_MEMIC = 1,
+ MLX5_IB_MMAP_TYPE_VAR = 2,
+};
+
#define MLX5_LOG_SW_ICM_BLOCK_SIZE(dev) \
(MLX5_CAP_DEV_MEM(dev, log_sw_icm_alloc_granularity))
#define MLX5_SW_ICM_BLOCK_SIZE(dev) (1 << MLX5_LOG_SW_ICM_BLOCK_SIZE(dev))
@@ -135,7 +145,6 @@ struct mlx5_ib_ucontext {
u32 tdn;
u64 lib_caps;
- DECLARE_BITMAP(dm_pages, MLX5_MAX_MEMIC_PAGES);
u16 devx_uid;
/* For RoCE LAG TX affinity */
atomic_t tx_port_affinity;
@@ -200,6 +209,7 @@ struct mlx5_ib_flow_db {
struct mlx5_ib_flow_prio sniffer[MLX5_IB_NUM_SNIFFER_FTS];
struct mlx5_ib_flow_prio egress[MLX5_IB_NUM_EGRESS_FTS];
struct mlx5_ib_flow_prio fdb;
+ struct mlx5_ib_flow_prio rdma_rx[MLX5_IB_NUM_FLOW_FT];
struct mlx5_flow_table *lag_demux_ft;
/* Protect flow steering bypass flow tables
* when add/del flow rules.
@@ -246,12 +256,8 @@ struct mlx5_ib_flow_db {
* These flags are intended for internal use by the mlx5_ib driver, and they
* rely on the range reserved for that use in the ib_qp_create_flags enum.
*/
-
-/* Create a UD QP whose source QP number is 1 */
-static inline enum ib_qp_create_flags mlx5_ib_create_qp_sqpn_qp1(void)
-{
- return IB_QP_CREATE_RESERVED_START;
-}
+#define MLX5_IB_QP_CREATE_SQPN_QP1 IB_QP_CREATE_RESERVED_START
+#define MLX5_IB_QP_CREATE_WC_TEST (IB_QP_CREATE_RESERVED_START << 1)
struct wr_list {
u16 opcode;
@@ -294,6 +300,7 @@ enum mlx5_ib_wq_flags {
#define MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES 16
#define MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES 6
#define MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES 13
+#define MLX5_EXT_MIN_SINGLE_WQE_LOG_NUM_STRIDES 3
struct mlx5_ib_rwq {
struct ib_wq ibwq;
@@ -558,6 +565,13 @@ enum mlx5_ib_mtt_access_flags {
MLX5_IB_MTT_WRITE = (1 << 1),
};
+struct mlx5_user_mmap_entry {
+ struct rdma_user_mmap_entry rdma_entry;
+ u8 mmap_flag;
+ u64 address;
+ u32 page_idx;
+};
+
struct mlx5_ib_dm {
struct ib_dm ibdm;
phys_addr_t dev_addr;
@@ -569,6 +583,7 @@ struct mlx5_ib_dm {
} icm_dm;
/* other dm types specific params should be added here */
};
+ struct mlx5_user_mmap_entry mentry;
};
#define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)
@@ -584,6 +599,9 @@ struct mlx5_ib_dm {
IB_ACCESS_REMOTE_READ |\
IB_ZERO_BASED)
+#define mlx5_update_odp_stats(mr, counter_name, value) \
+ atomic64_add(value, &((mr)->odp_stats.counter_name))
+
struct mlx5_ib_mr {
struct ib_mr ibmr;
void *descs;
@@ -605,7 +623,6 @@ struct mlx5_ib_mr {
struct mlx5_ib_dev *dev;
u32 out[MLX5_ST_SZ_DW(create_mkey_out)];
struct mlx5_core_sig_ctx *sig;
- int live;
void *descs_alloc;
int access_flags; /* Needed for rereg MR */
@@ -617,10 +634,18 @@ struct mlx5_ib_mr {
u64 data_iova;
u64 pi_iova;
- atomic_t num_leaf_free;
- wait_queue_head_t q_leaf_free;
+ /* For ODP and implicit */
+ atomic_t num_deferred_work;
+ struct xarray implicit_children;
+ union {
+ struct rcu_head rcu;
+ struct list_head elm;
+ struct work_struct work;
+ } odp_destroy;
+ struct ib_odp_counters odp_stats;
+ bool is_odp_implicit;
+
struct mlx5_async_work cb_work;
- atomic_t num_pending_prefetch;
};
static inline bool is_odp_mr(struct mlx5_ib_mr *mr)
@@ -638,7 +663,6 @@ struct mlx5_ib_mw {
struct mlx5_ib_devx_mr {
struct mlx5_core_mkey mmkey;
int ndescs;
- struct rcu_head rcu;
};
struct mlx5_ib_umr_context {
@@ -792,13 +816,6 @@ enum {
MLX5_MAX_DELAY_DROP_TIMEOUT_MS = 100,
};
-struct mlx5_ib_dbg_delay_drop {
- struct dentry *dir_debugfs;
- struct dentry *rqs_cnt_debugfs;
- struct dentry *events_cnt_debugfs;
- struct dentry *timeout_debugfs;
-};
-
struct mlx5_ib_delay_drop {
struct mlx5_ib_dev *dev;
struct work_struct delay_drop_work;
@@ -808,7 +825,7 @@ struct mlx5_ib_delay_drop {
bool activate;
atomic_t events_cnt;
atomic_t rqs_cnt;
- struct mlx5_ib_dbg_delay_drop *dbg;
+ struct dentry *dir_debugfs;
};
enum mlx5_ib_stages {
@@ -868,7 +885,10 @@ struct mlx5_ib_flow_action {
struct {
struct mlx5_ib_dev *dev;
u32 sub_type;
- u32 action_id;
+ union {
+ struct mlx5_modify_hdr *modify_hdr;
+ struct mlx5_pkt_reformat *pkt_reformat;
+ };
} flow_action_raw;
};
};
@@ -881,8 +901,6 @@ struct mlx5_dm {
*/
spinlock_t lock;
DECLARE_BITMAP(memic_alloc_pages, MLX5_MAX_MEMIC_PAGES);
- unsigned long *steering_sw_icm_alloc_blocks;
- unsigned long *header_modify_sw_icm_alloc_blocks;
};
struct mlx5_read_counters_attr {
@@ -948,6 +966,15 @@ struct mlx5_devx_event_table {
struct xarray event_xa;
};
+struct mlx5_var_table {
+ /* serialize updating the bitmap */
+ struct mutex bitmap_lock;
+ unsigned long *bitmap;
+ u64 hw_start_addr;
+ u32 stride_size;
+ u64 num_var_hw_entries;
+};
+
struct mlx5_ib_dev {
struct ib_device ib_dev;
struct mlx5_core_dev *mdev;
@@ -956,7 +983,11 @@ struct mlx5_ib_dev {
/* serialize update of capability mask
*/
struct mutex cap_mask_mutex;
- bool ib_active;
+ u8 ib_active:1;
+ u8 fill_delay:1;
+ u8 is_rep:1;
+ u8 lag_active:1;
+ u8 wc_support:1;
struct umr_common umrc;
/* sync used page count stats
*/
@@ -965,7 +996,6 @@ struct mlx5_ib_dev {
struct timer_list delay_timer;
/* Prevents soft lock on massive reg MRs */
struct mutex slow_path_mutex;
- int fill_delay;
struct ib_odp_caps odp_caps;
u64 odp_max_size;
struct mlx5_ib_pf_eq odp_pf_eq;
@@ -974,7 +1004,9 @@ struct mlx5_ib_dev {
* Sleepable RCU that prevents destruction of MRs while they are still
* being used by a page fault handler.
*/
- struct srcu_struct mr_srcu;
+ struct srcu_struct odp_srcu;
+ struct xarray odp_mkeys;
+
u32 null_mkey;
struct mlx5_ib_flow_db *flow_db;
/* protect resources needed as part of reset flow */
@@ -983,11 +1015,10 @@ struct mlx5_ib_dev {
/* Array with num_ports elements */
struct mlx5_ib_port *port;
struct mlx5_sq_bfreg bfreg;
+ struct mlx5_sq_bfreg wc_bfreg;
struct mlx5_sq_bfreg fp_bfreg;
struct mlx5_ib_delay_drop delay_drop;
const struct mlx5_ib_profile *profile;
- bool is_rep;
- int lag_active;
struct mlx5_ib_lb_state lb;
u8 umr_fence;
@@ -998,6 +1029,9 @@ struct mlx5_ib_dev {
struct mlx5_srq_table srq_table;
struct mlx5_async_ctx async_ctx;
struct mlx5_devx_event_table devx_event_table;
+ struct mlx5_var_table var_table;
+
+ struct xarray sig_mrs;
};
static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
@@ -1094,6 +1128,13 @@ to_mflow_act(struct ib_flow_action *ibact)
return container_of(ibact, struct mlx5_ib_flow_action, ib_action);
}
+static inline struct mlx5_user_mmap_entry *
+to_mmmap(struct rdma_user_mmap_entry *rdma_entry)
+{
+ return container_of(rdma_entry,
+ struct mlx5_user_mmap_entry, rdma_entry);
+}
+
int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context,
struct ib_udata *udata, unsigned long virt,
struct mlx5_db *db);
@@ -1129,12 +1170,12 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
const struct ib_send_wr **bad_wr);
int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
const struct ib_recv_wr **bad_wr);
-int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
- int buflen, size_t *bc);
-int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
- int buflen, size_t *bc);
-int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index,
- void *buffer, int buflen, size_t *bc);
+int mlx5_ib_read_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
+ size_t buflen, size_t *bc);
+int mlx5_ib_read_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
+ size_t buflen, size_t *bc);
+int mlx5_ib_read_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer,
+ size_t buflen, size_t *bc);
int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
struct ib_udata *udata);
void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
@@ -1161,6 +1202,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
struct ib_udata *udata,
int access_flags);
void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr);
+void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr);
int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
u64 length, u64 virt_addr, int access_flags,
struct ib_pd *pd, struct ib_udata *udata);
@@ -1178,9 +1220,8 @@ int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
unsigned int *meta_sg_offset);
int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
const struct ib_wc *in_wc, const struct ib_grh *in_grh,
- const struct ib_mad_hdr *in, size_t in_mad_size,
- struct ib_mad_hdr *out, size_t *out_mad_size,
- u16 *out_mad_pkey_index);
+ const struct ib_mad *in, struct ib_mad *out,
+ size_t *out_mad_size, u16 *out_mad_pkey_index);
struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev,
struct ib_udata *udata);
int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata);
@@ -1222,6 +1263,8 @@ int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev);
struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry);
void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
+int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr);
+
int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
struct ib_mr_status *mr_status);
struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd,
@@ -1234,7 +1277,6 @@ struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device,
struct ib_rwq_ind_table_init_attr *init_attr,
struct ib_udata *udata);
int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table);
-bool mlx5_ib_dc_atomic_is_supported(struct mlx5_ib_dev *dev);
struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev,
struct ib_ucontext *context,
struct ib_dm_alloc_attr *attr,
@@ -1250,11 +1292,9 @@ int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
int __init mlx5_ib_odp_init(void);
void mlx5_ib_odp_cleanup(void);
-void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
- unsigned long end);
void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent);
-void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
- size_t nentries, struct mlx5_ib_mr *mr, int flags);
+void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+ struct mlx5_ib_mr *mr, int flags);
int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
enum ib_uverbs_advise_mr_advice advice,
@@ -1270,9 +1310,8 @@ static inline void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev) {}
static inline int mlx5_ib_odp_init(void) { return 0; }
static inline void mlx5_ib_odp_cleanup(void) {}
static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {}
-static inline void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
- size_t nentries, struct mlx5_ib_mr *mr,
- int flags) {}
+static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+ struct mlx5_ib_mr *mr, int flags) {}
static inline int
mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
@@ -1281,11 +1320,10 @@ mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
{
return -EOPNOTSUPP;
}
-static inline void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp,
- unsigned long start,
- unsigned long end){};
#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
+extern const struct mmu_interval_notifier_ops mlx5_mn_ops;
+
/* Needed for rep profile */
void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
const struct mlx5_ib_profile *profile,
@@ -1299,6 +1337,9 @@ int mlx5_ib_set_vf_link_state(struct ib_device *device, int vf,
u8 port, int state);
int mlx5_ib_get_vf_stats(struct ib_device *device, int vf,
u8 port, struct ifla_vf_stats *stats);
+int mlx5_ib_get_vf_guid(struct ib_device *device, int vf, u8 port,
+ struct ifla_vf_guid *node_guid,
+ struct ifla_vf_guid *port_guid);
int mlx5_ib_set_vf_guid(struct ib_device *device, int vf, u8 port,
u64 guid, int type);
@@ -1333,23 +1374,26 @@ struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *dev,
u8 *native_port_num);
void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *dev,
u8 port_num);
+int mlx5_ib_fill_res_entry(struct sk_buff *msg,
+ struct rdma_restrack_entry *res);
+int mlx5_ib_fill_stat_entry(struct sk_buff *msg,
+ struct rdma_restrack_entry *res);
+
+extern const struct uapi_definition mlx5_ib_devx_defs[];
+extern const struct uapi_definition mlx5_ib_flow_defs[];
#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user);
void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid);
void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev);
void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev);
-const struct uverbs_object_tree_def *mlx5_ib_get_devx_tree(void);
-extern const struct uapi_definition mlx5_ib_devx_defs[];
-extern const struct uapi_definition mlx5_ib_flow_defs[];
struct mlx5_ib_flow_handler *mlx5_ib_raw_fs_rule_add(
struct mlx5_ib_dev *dev, struct mlx5_ib_flow_matcher *fs_matcher,
struct mlx5_flow_context *flow_context,
struct mlx5_flow_act *flow_act, u32 counter_id,
void *cmd_in, int inlen, int dest_id, int dest_type);
bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type);
-bool mlx5_ib_devx_is_flow_counter(void *obj, u32 *counter_id);
-int mlx5_ib_get_flow_trees(const struct uverbs_object_tree_def **root);
+bool mlx5_ib_devx_is_flow_counter(void *obj, u32 offset, u32 *counter_id);
void mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction);
#else
static inline int
@@ -1475,4 +1519,25 @@ int bfregn_to_uar_index(struct mlx5_ib_dev *dev,
bool dyn_bfreg);
int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter);
+u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u8 port_num);
+
+static inline bool mlx5_ib_can_use_umr(struct mlx5_ib_dev *dev,
+ bool do_modify_atomic, int access_flags)
+{
+ if (MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled))
+ return false;
+
+ if (do_modify_atomic &&
+ MLX5_CAP_GEN(dev->mdev, atomic) &&
+ MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
+ return false;
+
+ if (access_flags & IB_ACCESS_RELAXED_ORDERING)
+ return false;
+
+ return true;
+}
+
+int mlx5_ib_enable_driver(struct ib_device *dev);
+int mlx5_ib_test_wc(struct mlx5_ib_dev *dev);
#endif /* MLX5_IB_H */
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index b74fad08412f..6fa0a83c19de 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -50,7 +50,6 @@ enum {
static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
static int mr_cache_max_order(struct mlx5_ib_dev *dev);
-static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
{
@@ -59,13 +58,9 @@ static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
- int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
+ WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));
- if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
- /* Wait until all page fault handlers using the mr complete. */
- synchronize_srcu(&dev->mr_srcu);
-
- return err;
+ return mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
}
static int order2idx(struct mlx5_ib_dev *dev, int order)
@@ -84,32 +79,6 @@ static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 start, u64 length)
length + (start & (MLX5_ADAPTER_PAGE_SIZE - 1));
}
-static void update_odp_mr(struct mlx5_ib_mr *mr)
-{
- if (is_odp_mr(mr)) {
- /*
- * This barrier prevents the compiler from moving the
- * setting of umem->odp_data->private to point to our
- * MR, before reg_umr finished, to ensure that the MR
- * initialization have finished before starting to
- * handle invalidations.
- */
- smp_wmb();
- to_ib_umem_odp(mr->umem)->private = mr;
- /*
- * Make sure we will see the new
- * umem->odp_data->private value in the invalidation
- * routines, before we can get page faults on the
- * MR. Page faults can happen once we put the MR in
- * the tree, below this line. Without the barrier,
- * there can be a fault handling and an invalidation
- * before umem->odp_data->private == mr is visible to
- * the invalidation handler.
- */
- smp_wmb();
- }
-}
-
static void reg_mr_callback(int status, struct mlx5_async_work *context)
{
struct mlx5_ib_mr *mr =
@@ -120,8 +89,6 @@ static void reg_mr_callback(int status, struct mlx5_async_work *context)
struct mlx5_cache_ent *ent = &cache->ent[c];
u8 key;
unsigned long flags;
- struct xarray *mkeys = &dev->mdev->priv.mkey_table;
- int err;
spin_lock_irqsave(&ent->lock, flags);
ent->pending--;
@@ -148,13 +115,6 @@ static void reg_mr_callback(int status, struct mlx5_async_work *context)
ent->size++;
spin_unlock_irqrestore(&ent->lock, flags);
- xa_lock_irqsave(mkeys, flags);
- err = xa_err(__xa_store(mkeys, mlx5_base_mkey(mr->mmkey.key),
- &mr->mmkey, GFP_ATOMIC));
- xa_unlock_irqrestore(mkeys, flags);
- if (err)
- pr_err("Error inserting to mkey tree. 0x%x\n", -err);
-
if (!completion_done(&ent->compl))
complete(&ent->compl);
}
@@ -187,7 +147,7 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
break;
}
mr->order = ent->order;
- mr->allocated_from_cache = 1;
+ mr->allocated_from_cache = true;
mr->dev = dev;
MLX5_SET(mkc, mkc, free, 1);
@@ -244,9 +204,6 @@ static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
}
- if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
- synchronize_srcu(&dev->mr_srcu);
-
list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
list_del(&mr->list);
kfree(mr);
@@ -454,7 +411,7 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry)
if (entry < 0 || entry >= MAX_MR_CACHE_ENTRIES) {
mlx5_ib_err(dev, "cache entry %d is out of range\n", entry);
- return NULL;
+ return ERR_PTR(-EINVAL);
}
ent = &cache->ent[entry];
@@ -537,7 +494,7 @@ void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
c = order2idx(dev, mr->order);
WARN_ON(c < 0 || c >= MAX_MR_CACHE_ENTRIES);
- if (unreg_umr(dev, mr)) {
+ if (mlx5_mr_cache_invalidate(mr)) {
mr->allocated_from_cache = false;
destroy_mkey(dev, mr);
ent = &cache->ent[c];
@@ -581,10 +538,6 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c)
mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
}
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
- synchronize_srcu(&dev->mr_srcu);
-#endif
-
list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
list_del(&mr->list);
kfree(mr);
@@ -705,6 +658,29 @@ int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
return 0;
}
+static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
+ struct ib_pd *pd)
+{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+
+ MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
+ MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
+ MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
+ MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
+ MLX5_SET(mkc, mkc, lr, 1);
+
+ if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
+ MLX5_SET(mkc, mkc, relaxed_ordering_write,
+ !!(acc & IB_ACCESS_RELAXED_ORDERING));
+ if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read))
+ MLX5_SET(mkc, mkc, relaxed_ordering_read,
+ !!(acc & IB_ACCESS_RELAXED_ORDERING));
+
+ MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
+ MLX5_SET(mkc, mkc, qpn, 0xffffff);
+ MLX5_SET64(mkc, mkc, start_addr, start_addr);
+}
+
struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
{
struct mlx5_ib_dev *dev = to_mdev(pd->device);
@@ -728,16 +704,8 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
- MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
- MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
- MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
- MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
- MLX5_SET(mkc, mkc, lr, 1);
-
MLX5_SET(mkc, mkc, length64, 1);
- MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
- MLX5_SET(mkc, mkc, qpn, 0xffffff);
- MLX5_SET64(mkc, mkc, start_addr, 0);
+ set_mkc_access_pd_addr_fields(mkc, acc, 0, pd);
err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, inlen);
if (err)
@@ -778,25 +746,43 @@ static int mr_cache_max_order(struct mlx5_ib_dev *dev)
return MLX5_MAX_UMR_SHIFT;
}
-static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
- u64 start, u64 length, int access_flags,
- struct ib_umem **umem, int *npages, int *page_shift,
- int *ncont, int *order)
+static int mr_umem_get(struct mlx5_ib_dev *dev, u64 start, u64 length,
+ int access_flags, struct ib_umem **umem, int *npages,
+ int *page_shift, int *ncont, int *order)
{
struct ib_umem *u;
- int err;
*umem = NULL;
- u = ib_umem_get(udata, start, length, access_flags, 0);
- err = PTR_ERR_OR_ZERO(u);
- if (err) {
- mlx5_ib_dbg(dev, "umem get failed (%d)\n", err);
- return err;
+ if (access_flags & IB_ACCESS_ON_DEMAND) {
+ struct ib_umem_odp *odp;
+
+ odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
+ &mlx5_mn_ops);
+ if (IS_ERR(odp)) {
+ mlx5_ib_dbg(dev, "umem get failed (%ld)\n",
+ PTR_ERR(odp));
+ return PTR_ERR(odp);
+ }
+
+ u = &odp->umem;
+
+ *page_shift = odp->page_shift;
+ *ncont = ib_umem_odp_num_pages(odp);
+ *npages = *ncont << (*page_shift - PAGE_SHIFT);
+ if (order)
+ *order = ilog2(roundup_pow_of_two(*ncont));
+ } else {
+ u = ib_umem_get(&dev->ib_dev, start, length, access_flags);
+ if (IS_ERR(u)) {
+ mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(u));
+ return PTR_ERR(u);
+ }
+
+ mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages,
+ page_shift, ncont, order);
}
- mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages,
- page_shift, ncont, order);
if (!*npages) {
mlx5_ib_warn(dev, "avoid zero region\n");
ib_umem_release(u);
@@ -890,36 +876,6 @@ static struct mlx5_ib_mr *alloc_mr_from_cache(
return mr;
}
-static inline int populate_xlt(struct mlx5_ib_mr *mr, int idx, int npages,
- void *xlt, int page_shift, size_t size,
- int flags)
-{
- struct mlx5_ib_dev *dev = mr->dev;
- struct ib_umem *umem = mr->umem;
-
- if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
- if (!umr_can_use_indirect_mkey(dev))
- return -EPERM;
- mlx5_odp_populate_klm(xlt, idx, npages, mr, flags);
- return npages;
- }
-
- npages = min_t(size_t, npages, ib_umem_num_pages(umem) - idx);
-
- if (!(flags & MLX5_IB_UPD_XLT_ZAP)) {
- __mlx5_ib_populate_pas(dev, umem, page_shift,
- idx, npages, xlt,
- MLX5_IB_MTT_PRESENT);
- /* Clear padding after the pages
- * brought from the umem.
- */
- memset(xlt + (npages * sizeof(struct mlx5_mtt)), 0,
- size - npages * sizeof(struct mlx5_mtt));
- }
-
- return npages;
-}
-
#define MLX5_MAX_UMR_CHUNK ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - \
MLX5_UMR_MTT_ALIGNMENT)
#define MLX5_SPARE_UMR_CHUNK 0x10000
@@ -943,6 +899,7 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
size_t pages_mapped = 0;
size_t pages_to_map = 0;
size_t pages_iter = 0;
+ size_t size_to_map = 0;
gfp_t gfp;
bool use_emergency_page = false;
@@ -989,6 +946,15 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
goto free_xlt;
}
+ if (mr->umem->is_odp) {
+ if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
+ struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+ size_t max_pages = ib_umem_odp_num_pages(odp) - idx;
+
+ pages_to_map = min_t(size_t, pages_to_map, max_pages);
+ }
+ }
+
sg.addr = dma;
sg.lkey = dev->umrc.pd->local_dma_lkey;
@@ -1011,14 +977,22 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
pages_mapped < pages_to_map && !err;
pages_mapped += pages_iter, idx += pages_iter) {
npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
+ size_to_map = npages * desc_size;
dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE);
- npages = populate_xlt(mr, idx, npages, xlt,
- page_shift, size, flags);
-
+ if (mr->umem->is_odp) {
+ mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
+ } else {
+ __mlx5_ib_populate_pas(dev, mr->umem, page_shift, idx,
+ npages, xlt,
+ MLX5_IB_MTT_PRESENT);
+ /* Clear padding after the pages
+ * brought from the umem.
+ */
+ memset(xlt + size_to_map, 0, size - size_to_map);
+ }
dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE);
- sg.length = ALIGN(npages * desc_size,
- MLX5_UMR_MTT_ALIGNMENT);
+ sg.length = ALIGN(size_to_map, MLX5_UMR_MTT_ALIGNMENT);
if (pages_mapped + pages_iter >= pages_to_map) {
if (flags & MLX5_IB_UPD_XLT_ENABLE)
@@ -1097,6 +1071,12 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
MLX5_SET(mkc, mkc, free, !populate);
MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
+ if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
+ MLX5_SET(mkc, mkc, relaxed_ordering_write,
+ !!(access_flags & IB_ACCESS_RELAXED_ORDERING));
+ if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read))
+ MLX5_SET(mkc, mkc, relaxed_ordering_read,
+ !!(access_flags & IB_ACCESS_RELAXED_ORDERING));
MLX5_SET(mkc, mkc, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC));
MLX5_SET(mkc, mkc, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE));
MLX5_SET(mkc, mkc, rr, !!(access_flags & IB_ACCESS_REMOTE_READ));
@@ -1177,16 +1157,8 @@ static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
- MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
- MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
- MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
- MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
- MLX5_SET(mkc, mkc, lr, 1);
-
MLX5_SET64(mkc, mkc, len, length);
- MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
- MLX5_SET(mkc, mkc, qpn, 0xffffff);
- MLX5_SET64(mkc, mkc, start_addr, start_addr);
+ set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);
err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, inlen);
if (err)
@@ -1277,6 +1249,8 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && !start &&
length == U64_MAX) {
+ if (virt_addr != start)
+ return ERR_PTR(-EINVAL);
if (!(access_flags & IB_ACCESS_ON_DEMAND) ||
!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
return ERR_PTR(-EINVAL);
@@ -1287,15 +1261,13 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
return &mr->ibmr;
}
- err = mr_umem_get(dev, udata, start, length, access_flags, &umem,
+ err = mr_umem_get(dev, start, length, access_flags, &umem,
&npages, &page_shift, &ncont, &order);
if (err < 0)
return ERR_PTR(err);
- use_umr = !MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled) &&
- (!MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled) ||
- !MLX5_CAP_GEN(dev->mdev, atomic));
+ use_umr = mlx5_ib_can_use_umr(dev, true, access_flags);
if (order <= mr_cache_max_order(dev) && use_umr) {
mr = alloc_mr_from_cache(pd, umem, virt_addr, length, ncont,
@@ -1330,8 +1302,6 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
mr->umem = umem;
set_mr_fields(dev, mr, npages, length, access_flags);
- update_odp_mr(mr);
-
if (use_umr) {
int update_xlt_flags = MLX5_IB_UPD_XLT_ENABLE;
@@ -1347,9 +1317,16 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
}
}
- if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
- mr->live = 1;
- atomic_set(&mr->num_pending_prefetch, 0);
+ if (is_odp_mr(mr)) {
+ to_ib_umem_odp(mr->umem)->private = mr;
+ atomic_set(&mr->num_deferred_work, 0);
+ err = xa_err(xa_store(&dev->odp_mkeys,
+ mlx5_base_mkey(mr->mmkey.key), &mr->mmkey,
+ GFP_KERNEL));
+ if (err) {
+ dereg_mr(dev, mr);
+ return ERR_PTR(err);
+ }
}
return &mr->ibmr;
@@ -1358,22 +1335,29 @@ error:
return ERR_PTR(err);
}
-static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
+/**
+ * mlx5_mr_cache_invalidate - Fence all DMA on the MR
+ * @mr: The MR to fence
+ *
+ * Upon return the NIC will not be doing any DMA to the pages under the MR,
+ * and any DMA inprogress will be completed. Failure of this function
+ * indicates the HW has failed catastrophically.
+ */
+int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr)
{
- struct mlx5_core_dev *mdev = dev->mdev;
struct mlx5_umr_wr umrwr = {};
- if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
+ if (mr->dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
return 0;
umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR |
MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
umrwr.wr.opcode = MLX5_IB_WR_UMR;
- umrwr.pd = dev->umrc.pd;
+ umrwr.pd = mr->dev->umrc.pd;
umrwr.mkey = mr->mmkey.key;
umrwr.ignore_free_state = 1;
- return mlx5_ib_post_send_wait(dev, &umrwr);
+ return mlx5_ib_post_send_wait(mr->dev, &umrwr);
}
static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr,
@@ -1425,6 +1409,9 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
if (!mr->umem)
return -EINVAL;
+ if (is_odp_mr(mr))
+ return -EOPNOTSUPP;
+
if (flags & IB_MR_REREG_TRANS) {
addr = virt_addr;
len = length;
@@ -1441,19 +1428,19 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
flags |= IB_MR_REREG_TRANS;
ib_umem_release(mr->umem);
mr->umem = NULL;
- err = mr_umem_get(dev, udata, addr, len, access_flags,
- &mr->umem, &npages, &page_shift, &ncont,
- &order);
+ err = mr_umem_get(dev, addr, len, access_flags, &mr->umem,
+ &npages, &page_shift, &ncont, &order);
if (err)
goto err;
}
- if (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len)) {
+ if (!mlx5_ib_can_use_umr(dev, true, access_flags) ||
+ (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len))) {
/*
* UMR can't be used - MKey needs to be replaced.
*/
if (mr->allocated_from_cache)
- err = unreg_umr(dev, mr);
+ err = mlx5_mr_cache_invalidate(mr);
else
err = destroy_mkey(dev, mr);
if (err)
@@ -1468,9 +1455,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
goto err;
}
- mr->allocated_from_cache = 0;
- if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
- mr->live = 1;
+ mr->allocated_from_cache = false;
} else {
/*
* Send a UMR WQE
@@ -1499,7 +1484,6 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
set_mr_fields(dev, mr, npages, len, access_flags);
- update_odp_mr(mr);
return 0;
err:
@@ -1569,6 +1553,7 @@ static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
mr->sig->psv_wire.psv_idx))
mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
mr->sig->psv_wire.psv_idx);
+ xa_erase(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key));
kfree(mr->sig);
mr->sig = NULL;
}
@@ -1584,53 +1569,20 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
int npages = mr->npages;
struct ib_umem *umem = mr->umem;
- if (is_odp_mr(mr)) {
- struct ib_umem_odp *umem_odp = to_ib_umem_odp(umem);
-
- /* Prevent new page faults and
- * prefetch requests from succeeding
- */
- mr->live = 0;
-
- /* dequeue pending prefetch requests for the mr */
- if (atomic_read(&mr->num_pending_prefetch))
- flush_workqueue(system_unbound_wq);
- WARN_ON(atomic_read(&mr->num_pending_prefetch));
-
- /* Wait for all running page-fault handlers to finish. */
- synchronize_srcu(&dev->mr_srcu);
- /* Destroy all page mappings */
- if (umem_odp->page_list)
- mlx5_ib_invalidate_range(umem_odp,
- ib_umem_start(umem_odp),
- ib_umem_end(umem_odp));
- else
- mlx5_ib_free_implicit_mr(mr);
- /*
- * We kill the umem before the MR for ODP,
- * so that there will not be any invalidations in
- * flight, looking at the *mr struct.
- */
- ib_umem_release(umem);
- atomic_sub(npages, &dev->mdev->priv.reg_pages);
-
- /* Avoid double-freeing the umem. */
- umem = NULL;
- }
+ /* Stop all DMA */
+ if (is_odp_mr(mr))
+ mlx5_ib_fence_odp_mr(mr);
+ else
+ clean_mr(dev, mr);
- clean_mr(dev, mr);
+ if (mr->allocated_from_cache)
+ mlx5_mr_cache_free(dev, mr);
+ else
+ kfree(mr);
- /*
- * We should unregister the DMA address from the HCA before
- * remove the DMA mapping.
- */
- mlx5_mr_cache_free(dev, mr);
ib_umem_release(umem);
- if (umem)
- atomic_sub(npages, &dev->mdev->priv.reg_pages);
+ atomic_sub(npages, &dev->mdev->priv.reg_pages);
- if (!mr->allocated_from_cache)
- kfree(mr);
}
int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
@@ -1642,6 +1594,11 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
dereg_mr(to_mdev(mmr->klm_mr->ibmr.device), mmr->klm_mr);
}
+ if (is_odp_mr(mmr) && to_ib_umem_odp(mmr->umem)->is_implicit_odp) {
+ mlx5_ib_free_implicit_mr(mmr);
+ return 0;
+ }
+
dereg_mr(to_mdev(ibmr->device), mmr);
return 0;
@@ -1805,8 +1762,15 @@ static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
if (err)
goto err_free_mtt_mr;
+ err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
+ mr->sig, GFP_KERNEL));
+ if (err)
+ goto err_free_descs;
return 0;
+err_free_descs:
+ destroy_mkey(dev, mr);
+ mlx5_free_priv_descs(mr);
err_free_mtt_mr:
dereg_mr(to_mdev(mr->mtt_mr->ibmr.device), mr->mtt_mr);
mr->mtt_mr = NULL;
@@ -1959,9 +1923,19 @@ struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
}
}
+ if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
+ err = xa_err(xa_store(&dev->odp_mkeys,
+ mlx5_base_mkey(mw->mmkey.key), &mw->mmkey,
+ GFP_KERNEL));
+ if (err)
+ goto free_mkey;
+ }
+
kfree(in);
return &mw->ibmw;
+free_mkey:
+ mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey);
free:
kfree(mw);
kfree(in);
@@ -1970,14 +1944,24 @@ free:
int mlx5_ib_dealloc_mw(struct ib_mw *mw)
{
+ struct mlx5_ib_dev *dev = to_mdev(mw->device);
struct mlx5_ib_mw *mmw = to_mmw(mw);
int err;
- err = mlx5_core_destroy_mkey((to_mdev(mw->device))->mdev,
- &mmw->mmkey);
- if (!err)
- kfree(mmw);
- return err;
+ if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
+ xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key));
+ /*
+ * pagefault_single_data_segment() may be accessing mmw under
+ * SRCU if the user bound an ODP MR to this MW.
+ */
+ synchronize_srcu(&dev->odp_srcu);
+ }
+
+ err = mlx5_core_destroy_mkey(dev->mdev, &mmw->mmkey);
+ if (err)
+ return err;
+ kfree(mmw);
+ return 0;
}
int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 1d257d1b3b0d..4216814ba871 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -93,152 +93,223 @@ struct mlx5_pagefault {
static u64 mlx5_imr_ksm_entries;
-static int check_parent(struct ib_umem_odp *odp,
- struct mlx5_ib_mr *parent)
+static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
+ struct mlx5_ib_mr *imr, int flags)
{
- struct mlx5_ib_mr *mr = odp->private;
+ struct mlx5_klm *end = pklm + nentries;
- return mr && mr->parent == parent && !odp->dying;
+ if (flags & MLX5_IB_UPD_XLT_ZAP) {
+ for (; pklm != end; pklm++, idx++) {
+ pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
+ pklm->key = cpu_to_be32(imr->dev->null_mkey);
+ pklm->va = 0;
+ }
+ return;
+ }
+
+ /*
+ * The locking here is pretty subtle. Ideally the implicit_children
+ * xarray would be protected by the umem_mutex, however that is not
+ * possible. Instead this uses a weaker update-then-lock pattern:
+ *
+ * srcu_read_lock()
+ * xa_store()
+ * mutex_lock(umem_mutex)
+ * mlx5_ib_update_xlt()
+ * mutex_unlock(umem_mutex)
+ * destroy lkey
+ *
+ * ie any change the xarray must be followed by the locked update_xlt
+ * before destroying.
+ *
+ * The umem_mutex provides the acquire/release semantic needed to make
+ * the xa_store() visible to a racing thread. While SRCU is not
+ * technically required, using it gives consistent use of the SRCU
+ * locking around the xarray.
+ */
+ lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex);
+ lockdep_assert_held(&imr->dev->odp_srcu);
+
+ for (; pklm != end; pklm++, idx++) {
+ struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);
+
+ pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
+ if (mtt) {
+ pklm->key = cpu_to_be32(mtt->ibmr.lkey);
+ pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE);
+ } else {
+ pklm->key = cpu_to_be32(imr->dev->null_mkey);
+ pklm->va = 0;
+ }
+ }
}
-static struct ib_ucontext_per_mm *mr_to_per_mm(struct mlx5_ib_mr *mr)
+static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
{
- if (WARN_ON(!mr || !is_odp_mr(mr)))
- return NULL;
+ u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
+
+ if (umem_dma & ODP_READ_ALLOWED_BIT)
+ mtt_entry |= MLX5_IB_MTT_READ;
+ if (umem_dma & ODP_WRITE_ALLOWED_BIT)
+ mtt_entry |= MLX5_IB_MTT_WRITE;
- return to_ib_umem_odp(mr->umem)->per_mm;
+ return mtt_entry;
}
-static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp)
+static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
+ struct mlx5_ib_mr *mr, int flags)
{
- struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent;
- struct ib_ucontext_per_mm *per_mm = odp->per_mm;
- struct rb_node *rb;
-
- down_read(&per_mm->umem_rwsem);
- while (1) {
- rb = rb_next(&odp->interval_tree.rb);
- if (!rb)
- goto not_found;
- odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
- if (check_parent(odp, parent))
- goto end;
+ struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+ dma_addr_t pa;
+ size_t i;
+
+ if (flags & MLX5_IB_UPD_XLT_ZAP)
+ return;
+
+ for (i = 0; i < nentries; i++) {
+ pa = odp->dma_list[idx + i];
+ pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
}
-not_found:
- odp = NULL;
-end:
- up_read(&per_mm->umem_rwsem);
- return odp;
}
-static struct ib_umem_odp *odp_lookup(u64 start, u64 length,
- struct mlx5_ib_mr *parent)
+void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+ struct mlx5_ib_mr *mr, int flags)
{
- struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(parent);
- struct ib_umem_odp *odp;
- struct rb_node *rb;
-
- down_read(&per_mm->umem_rwsem);
- odp = rbt_ib_umem_lookup(&per_mm->umem_tree, start, length);
- if (!odp)
- goto end;
-
- while (1) {
- if (check_parent(odp, parent))
- goto end;
- rb = rb_next(&odp->interval_tree.rb);
- if (!rb)
- goto not_found;
- odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
- if (ib_umem_start(odp) > start + length)
- goto not_found;
+ if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
+ populate_klm(xlt, idx, nentries, mr, flags);
+ } else {
+ populate_mtt(xlt, idx, nentries, mr, flags);
}
-not_found:
- odp = NULL;
-end:
- up_read(&per_mm->umem_rwsem);
- return odp;
}
-void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
- size_t nentries, struct mlx5_ib_mr *mr, int flags)
+static void dma_fence_odp_mr(struct mlx5_ib_mr *mr)
{
- struct ib_pd *pd = mr->ibmr.pd;
- struct mlx5_ib_dev *dev = to_mdev(pd->device);
- struct ib_umem_odp *odp;
- unsigned long va;
- int i;
+ struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
- if (flags & MLX5_IB_UPD_XLT_ZAP) {
- for (i = 0; i < nentries; i++, pklm++) {
- pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
- pklm->key = cpu_to_be32(dev->null_mkey);
- pklm->va = 0;
- }
- return;
+ /* Ensure mlx5_ib_invalidate_range() will not touch the MR any more */
+ mutex_lock(&odp->umem_mutex);
+ if (odp->npages) {
+ mlx5_mr_cache_invalidate(mr);
+ ib_umem_odp_unmap_dma_pages(odp, ib_umem_start(odp),
+ ib_umem_end(odp));
+ WARN_ON(odp->npages);
}
+ odp->private = NULL;
+ mutex_unlock(&odp->umem_mutex);
- odp = odp_lookup(offset * MLX5_IMR_MTT_SIZE,
- nentries * MLX5_IMR_MTT_SIZE, mr);
-
- for (i = 0; i < nentries; i++, pklm++) {
- pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
- va = (offset + i) * MLX5_IMR_MTT_SIZE;
- if (odp && odp->umem.address == va) {
- struct mlx5_ib_mr *mtt = odp->private;
-
- pklm->key = cpu_to_be32(mtt->ibmr.lkey);
- odp = odp_next(odp);
- } else {
- pklm->key = cpu_to_be32(dev->null_mkey);
- }
- mlx5_ib_dbg(dev, "[%d] va %lx key %x\n",
- i, va, be32_to_cpu(pklm->key));
+ if (!mr->allocated_from_cache) {
+ mlx5_core_destroy_mkey(mr->dev->mdev, &mr->mmkey);
+ WARN_ON(mr->descs);
}
}
-static void mr_leaf_free_action(struct work_struct *work)
+/*
+ * This must be called after the mr has been removed from implicit_children
+ * and the SRCU synchronized. NOTE: The MR does not necessarily have to be
+ * empty here, parallel page faults could have raced with the free process and
+ * added pages to it.
+ */
+static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt)
{
- struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
- int idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
- struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;
+ struct mlx5_ib_mr *imr = mr->parent;
+ struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
+ struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+ unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
+ int srcu_key;
- mr->parent = NULL;
- synchronize_srcu(&mr->dev->mr_srcu);
+ /* implicit_child_mr's are not allowed to have deferred work */
+ WARN_ON(atomic_read(&mr->num_deferred_work));
- ib_umem_release(&odp->umem);
- if (imr->live)
- mlx5_ib_update_xlt(imr, idx, 1, 0,
+ if (need_imr_xlt) {
+ srcu_key = srcu_read_lock(&mr->dev->odp_srcu);
+ mutex_lock(&odp_imr->umem_mutex);
+ mlx5_ib_update_xlt(mr->parent, idx, 1, 0,
MLX5_IB_UPD_XLT_INDIRECT |
MLX5_IB_UPD_XLT_ATOMIC);
+ mutex_unlock(&odp_imr->umem_mutex);
+ srcu_read_unlock(&mr->dev->odp_srcu, srcu_key);
+ }
+
+ dma_fence_odp_mr(mr);
+
+ mr->parent = NULL;
mlx5_mr_cache_free(mr->dev, mr);
+ ib_umem_odp_release(odp);
+ atomic_dec(&imr->num_deferred_work);
+}
+
+static void free_implicit_child_mr_work(struct work_struct *work)
+{
+ struct mlx5_ib_mr *mr =
+ container_of(work, struct mlx5_ib_mr, odp_destroy.work);
+
+ free_implicit_child_mr(mr, true);
+}
+
+static void free_implicit_child_mr_rcu(struct rcu_head *head)
+{
+ struct mlx5_ib_mr *mr =
+ container_of(head, struct mlx5_ib_mr, odp_destroy.rcu);
+
+ /* Freeing a MR is a sleeping operation, so bounce to a work queue */
+ INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
+ queue_work(system_unbound_wq, &mr->odp_destroy.work);
+}
+
+static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
+{
+ struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+ unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
+ struct mlx5_ib_mr *imr = mr->parent;
+
+ xa_lock(&imr->implicit_children);
+ /*
+ * This can race with mlx5_ib_free_implicit_mr(), the first one to
+ * reach the xa lock wins the race and destroys the MR.
+ */
+ if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_ATOMIC) !=
+ mr)
+ goto out_unlock;
- if (atomic_dec_and_test(&imr->num_leaf_free))
- wake_up(&imr->q_leaf_free);
+ atomic_inc(&imr->num_deferred_work);
+ call_srcu(&mr->dev->odp_srcu, &mr->odp_destroy.rcu,
+ free_implicit_child_mr_rcu);
+
+out_unlock:
+ xa_unlock(&imr->implicit_children);
}
-void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
- unsigned long end)
+static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
+ const struct mmu_notifier_range *range,
+ unsigned long cur_seq)
{
+ struct ib_umem_odp *umem_odp =
+ container_of(mni, struct ib_umem_odp, notifier);
struct mlx5_ib_mr *mr;
const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
sizeof(struct mlx5_mtt)) - 1;
u64 idx = 0, blk_start_idx = 0;
+ u64 invalidations = 0;
+ unsigned long start;
+ unsigned long end;
int in_block = 0;
u64 addr;
- if (!umem_odp) {
- pr_err("invalidation called on NULL umem or non-ODP umem\n");
- return;
- }
+ if (!mmu_notifier_range_blockable(range))
+ return false;
+ mutex_lock(&umem_odp->umem_mutex);
+ mmu_interval_set_seq(mni, cur_seq);
+ /*
+ * If npages is zero then umem_odp->private may not be setup yet. This
+ * does not complete until after the first page is mapped for DMA.
+ */
+ if (!umem_odp->npages)
+ goto out;
mr = umem_odp->private;
- if (!mr || !mr->ibmr.pd)
- return;
-
- start = max_t(u64, ib_umem_start(umem_odp), start);
- end = min_t(u64, ib_umem_end(umem_odp), end);
+ start = max_t(u64, ib_umem_start(umem_odp), range->start);
+ end = min_t(u64, ib_umem_end(umem_odp), range->end);
/*
* Iteration one - zap the HW's MTTs. The notifiers_count ensures that
@@ -246,7 +317,6 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
* overwrite the same MTTs. Concurent invalidations might race us,
* but they will write 0s as well, so no difference in the end result.
*/
- mutex_lock(&umem_odp->umem_mutex);
for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) {
idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
/*
@@ -261,6 +331,9 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
blk_start_idx = idx;
in_block = 1;
}
+
+ /* Count page invalidations */
+ invalidations += idx - blk_start_idx + 1;
} else {
u64 umr_offset = idx & umr_block_mask;
@@ -278,7 +351,9 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
idx - blk_start_idx + 1, 0,
MLX5_IB_UPD_XLT_ZAP |
MLX5_IB_UPD_XLT_ATOMIC);
- mutex_unlock(&umem_odp->umem_mutex);
+
+ mlx5_update_odp_stats(mr, invalidations, invalidations);
+
/*
* We are now sure that the device will not access the
* memory. We can safely unmap it, and mark it as dirty if
@@ -287,21 +362,25 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
- if (unlikely(!umem_odp->npages && mr->parent &&
- !umem_odp->dying)) {
- WRITE_ONCE(umem_odp->dying, 1);
- atomic_inc(&mr->parent->num_leaf_free);
- schedule_work(&umem_odp->work);
- }
+ if (unlikely(!umem_odp->npages && mr->parent))
+ destroy_unused_implicit_child_mr(mr);
+out:
+ mutex_unlock(&umem_odp->umem_mutex);
+ return true;
}
+const struct mmu_interval_notifier_ops mlx5_mn_ops = {
+ .invalidate = mlx5_ib_invalidate_range,
+};
+
void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
{
struct ib_odp_caps *caps = &dev->odp_caps;
memset(caps, 0, sizeof(*caps));
- if (!MLX5_CAP_GEN(dev->mdev, pg))
+ if (!MLX5_CAP_GEN(dev->mdev, pg) ||
+ !mlx5_ib_can_use_umr(dev, true, 0))
return;
caps->general_caps = IB_ODP_SUPPORT;
@@ -355,10 +434,9 @@ void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
MLX5_CAP_GEN(dev->mdev, null_mkey) &&
- MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
+ MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) &&
+ !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled))
caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
-
- return;
}
static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
@@ -383,257 +461,225 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
wq_num, err);
}
-static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
- struct ib_umem *umem,
- bool ksm, int access_flags)
+static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
+ unsigned long idx)
{
- struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ struct ib_umem_odp *odp;
struct mlx5_ib_mr *mr;
+ struct mlx5_ib_mr *ret;
int err;
- mr = mlx5_mr_cache_alloc(dev, ksm ? MLX5_IMR_KSM_CACHE_ENTRY :
- MLX5_IMR_MTT_CACHE_ENTRY);
+ odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem),
+ idx * MLX5_IMR_MTT_SIZE,
+ MLX5_IMR_MTT_SIZE, &mlx5_mn_ops);
+ if (IS_ERR(odp))
+ return ERR_CAST(odp);
+ ret = mr = mlx5_mr_cache_alloc(imr->dev, MLX5_IMR_MTT_CACHE_ENTRY);
if (IS_ERR(mr))
- return mr;
-
- mr->ibmr.pd = pd;
-
- mr->dev = dev;
- mr->access_flags = access_flags;
- mr->mmkey.iova = 0;
- mr->umem = umem;
-
- if (ksm) {
- err = mlx5_ib_update_xlt(mr, 0,
- mlx5_imr_ksm_entries,
- MLX5_KSM_PAGE_SHIFT,
- MLX5_IB_UPD_XLT_INDIRECT |
- MLX5_IB_UPD_XLT_ZAP |
- MLX5_IB_UPD_XLT_ENABLE);
-
- } else {
- err = mlx5_ib_update_xlt(mr, 0,
- MLX5_IMR_MTT_ENTRIES,
- PAGE_SHIFT,
- MLX5_IB_UPD_XLT_ZAP |
- MLX5_IB_UPD_XLT_ENABLE |
- MLX5_IB_UPD_XLT_ATOMIC);
- }
-
- if (err)
- goto fail;
+ goto out_umem;
+ mr->ibmr.pd = imr->ibmr.pd;
+ mr->access_flags = imr->access_flags;
+ mr->umem = &odp->umem;
mr->ibmr.lkey = mr->mmkey.key;
mr->ibmr.rkey = mr->mmkey.key;
-
- mr->live = 1;
-
- mlx5_ib_dbg(dev, "key %x dev %p mr %p\n",
- mr->mmkey.key, dev->mdev, mr);
-
- return mr;
-
-fail:
- mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
- mlx5_mr_cache_free(dev, mr);
-
- return ERR_PTR(err);
-}
-
-static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr,
- u64 io_virt, size_t bcnt)
-{
- struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device);
- struct ib_umem_odp *odp, *result = NULL;
- struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem);
- u64 addr = io_virt & MLX5_IMR_MTT_MASK;
- int nentries = 0, start_idx = 0, ret;
- struct mlx5_ib_mr *mtt;
-
- mutex_lock(&odp_mr->umem_mutex);
- odp = odp_lookup(addr, 1, mr);
-
- mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n",
- io_virt, bcnt, addr, odp);
-
-next_mr:
- if (likely(odp)) {
- if (nentries)
- nentries++;
- } else {
- odp = ib_alloc_odp_umem(odp_mr, addr,
- MLX5_IMR_MTT_SIZE);
- if (IS_ERR(odp)) {
- mutex_unlock(&odp_mr->umem_mutex);
- return ERR_CAST(odp);
- }
-
- mtt = implicit_mr_alloc(mr->ibmr.pd, &odp->umem, 0,
- mr->access_flags);
- if (IS_ERR(mtt)) {
- mutex_unlock(&odp_mr->umem_mutex);
- ib_umem_release(&odp->umem);
- return ERR_CAST(mtt);
- }
-
- odp->private = mtt;
- mtt->umem = &odp->umem;
- mtt->mmkey.iova = addr;
- mtt->parent = mr;
- INIT_WORK(&odp->work, mr_leaf_free_action);
-
- if (!nentries)
- start_idx = addr >> MLX5_IMR_MTT_SHIFT;
- nentries++;
- }
-
- /* Return first odp if region not covered by single one */
- if (likely(!result))
- result = odp;
-
- addr += MLX5_IMR_MTT_SIZE;
- if (unlikely(addr < io_virt + bcnt)) {
- odp = odp_next(odp);
- if (odp && odp->umem.address != addr)
- odp = NULL;
- goto next_mr;
+ mr->mmkey.iova = idx * MLX5_IMR_MTT_SIZE;
+ mr->parent = imr;
+ odp->private = mr;
+
+ err = mlx5_ib_update_xlt(mr, 0,
+ MLX5_IMR_MTT_ENTRIES,
+ PAGE_SHIFT,
+ MLX5_IB_UPD_XLT_ZAP |
+ MLX5_IB_UPD_XLT_ENABLE);
+ if (err) {
+ ret = ERR_PTR(err);
+ goto out_mr;
}
- if (unlikely(nentries)) {
- ret = mlx5_ib_update_xlt(mr, start_idx, nentries, 0,
- MLX5_IB_UPD_XLT_INDIRECT |
- MLX5_IB_UPD_XLT_ATOMIC);
- if (ret) {
- mlx5_ib_err(dev, "Failed to update PAS\n");
- result = ERR_PTR(ret);
+ /*
+ * Once the store to either xarray completes any error unwind has to
+ * use synchronize_srcu(). Avoid this with xa_reserve()
+ */
+ ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr,
+ GFP_KERNEL);
+ if (unlikely(ret)) {
+ if (xa_is_err(ret)) {
+ ret = ERR_PTR(xa_err(ret));
+ goto out_mr;
}
+ /*
+ * Another thread beat us to creating the child mr, use
+ * theirs.
+ */
+ goto out_mr;
}
- mutex_unlock(&odp_mr->umem_mutex);
- return result;
+ mlx5_ib_dbg(imr->dev, "key %x mr %p\n", mr->mmkey.key, mr);
+ return mr;
+
+out_mr:
+ mlx5_mr_cache_free(imr->dev, mr);
+out_umem:
+ ib_umem_odp_release(odp);
+ return ret;
}
struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
struct ib_udata *udata,
int access_flags)
{
+ struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device);
+ struct ib_umem_odp *umem_odp;
struct mlx5_ib_mr *imr;
- struct ib_umem *umem;
+ int err;
- umem = ib_umem_get(udata, 0, 0, access_flags, 0);
- if (IS_ERR(umem))
- return ERR_CAST(umem);
+ umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags);
+ if (IS_ERR(umem_odp))
+ return ERR_CAST(umem_odp);
- imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags);
+ imr = mlx5_mr_cache_alloc(dev, MLX5_IMR_KSM_CACHE_ENTRY);
if (IS_ERR(imr)) {
- ib_umem_release(umem);
- return ERR_CAST(imr);
+ err = PTR_ERR(imr);
+ goto out_umem;
}
- imr->umem = umem;
- init_waitqueue_head(&imr->q_leaf_free);
- atomic_set(&imr->num_leaf_free, 0);
- atomic_set(&imr->num_pending_prefetch, 0);
+ imr->ibmr.pd = &pd->ibpd;
+ imr->access_flags = access_flags;
+ imr->mmkey.iova = 0;
+ imr->umem = &umem_odp->umem;
+ imr->ibmr.lkey = imr->mmkey.key;
+ imr->ibmr.rkey = imr->mmkey.key;
+ imr->umem = &umem_odp->umem;
+ imr->is_odp_implicit = true;
+ atomic_set(&imr->num_deferred_work, 0);
+ xa_init(&imr->implicit_children);
+
+ err = mlx5_ib_update_xlt(imr, 0,
+ mlx5_imr_ksm_entries,
+ MLX5_KSM_PAGE_SHIFT,
+ MLX5_IB_UPD_XLT_INDIRECT |
+ MLX5_IB_UPD_XLT_ZAP |
+ MLX5_IB_UPD_XLT_ENABLE);
+ if (err)
+ goto out_mr;
+
+ err = xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key),
+ &imr->mmkey, GFP_KERNEL));
+ if (err)
+ goto out_mr;
+ mlx5_ib_dbg(dev, "key %x mr %p\n", imr->mmkey.key, imr);
return imr;
+out_mr:
+ mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
+ mlx5_mr_cache_free(dev, imr);
+out_umem:
+ ib_umem_odp_release(umem_odp);
+ return ERR_PTR(err);
}
-static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end,
- void *cookie)
+void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
{
- struct mlx5_ib_mr *mr = umem_odp->private, *imr = cookie;
+ struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
+ struct mlx5_ib_dev *dev = imr->dev;
+ struct list_head destroy_list;
+ struct mlx5_ib_mr *mtt;
+ struct mlx5_ib_mr *tmp;
+ unsigned long idx;
- if (mr->parent != imr)
- return 0;
+ INIT_LIST_HEAD(&destroy_list);
+
+ xa_erase(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key));
+ /*
+ * This stops the SRCU protected page fault path from touching either
+ * the imr or any children. The page fault path can only reach the
+ * children xarray via the imr.
+ */
+ synchronize_srcu(&dev->odp_srcu);
+
+ xa_lock(&imr->implicit_children);
+ xa_for_each (&imr->implicit_children, idx, mtt) {
+ __xa_erase(&imr->implicit_children, idx);
+ list_add(&mtt->odp_destroy.elm, &destroy_list);
+ }
+ xa_unlock(&imr->implicit_children);
- ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
- ib_umem_end(umem_odp));
+ /*
+ * num_deferred_work can only be incremented inside the odp_srcu, or
+ * under xa_lock while the child is in the xarray. Thus at this point
+ * it is only decreasing, and all work holding it is now on the wq.
+ */
+ if (atomic_read(&imr->num_deferred_work)) {
+ flush_workqueue(system_unbound_wq);
+ WARN_ON(atomic_read(&imr->num_deferred_work));
+ }
- if (umem_odp->dying)
- return 0;
+ /*
+ * Fence the imr before we destroy the children. This allows us to
+ * skip updating the XLT of the imr during destroy of the child mkey
+ * the imr points to.
+ */
+ mlx5_mr_cache_invalidate(imr);
- WRITE_ONCE(umem_odp->dying, 1);
- atomic_inc(&imr->num_leaf_free);
- schedule_work(&umem_odp->work);
+ list_for_each_entry_safe (mtt, tmp, &destroy_list, odp_destroy.elm)
+ free_implicit_child_mr(mtt, false);
- return 0;
+ mlx5_mr_cache_free(dev, imr);
+ ib_umem_odp_release(odp_imr);
}
-void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
+/**
+ * mlx5_ib_fence_odp_mr - Stop all access to the ODP MR
+ * @mr: to fence
+ *
+ * On return no parallel threads will be touching this MR and no DMA will be
+ * active.
+ */
+void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr)
{
- struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr);
+ /* Prevent new page faults and prefetch requests from succeeding */
+ xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
- down_read(&per_mm->umem_rwsem);
- rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, ULLONG_MAX,
- mr_leaf_free, true, imr);
- up_read(&per_mm->umem_rwsem);
+ /* Wait for all running page-fault handlers to finish. */
+ synchronize_srcu(&mr->dev->odp_srcu);
- wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
+ if (atomic_read(&mr->num_deferred_work)) {
+ flush_workqueue(system_unbound_wq);
+ WARN_ON(atomic_read(&mr->num_deferred_work));
+ }
+
+ dma_fence_odp_mr(mr);
}
-#define MLX5_PF_FLAGS_PREFETCH BIT(0)
#define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
-static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
- u64 io_virt, size_t bcnt, u32 *bytes_mapped,
- u32 flags)
+static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
+ u64 user_va, size_t bcnt, u32 *bytes_mapped,
+ u32 flags)
{
- int npages = 0, current_seq, page_shift, ret, np;
- struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem);
+ int page_shift, ret, np;
bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
- bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
+ unsigned long current_seq;
u64 access_mask;
- u64 start_idx, page_mask;
- struct ib_umem_odp *odp;
- size_t size;
-
- if (!odp_mr->page_list) {
- odp = implicit_mr_get_data(mr, io_virt, bcnt);
-
- if (IS_ERR(odp))
- return PTR_ERR(odp);
- mr = odp->private;
- } else {
- odp = odp_mr;
- }
-
-next_mr:
- size = min_t(size_t, bcnt, ib_umem_end(odp) - io_virt);
+ u64 start_idx;
page_shift = odp->page_shift;
- page_mask = ~(BIT(page_shift) - 1);
- start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
+ start_idx = (user_va - ib_umem_start(odp)) >> page_shift;
access_mask = ODP_READ_ALLOWED_BIT;
- if (prefetch && !downgrade && !mr->umem->writable) {
- /* prefetch with write-access must
- * be supported by the MR
- */
- ret = -EINVAL;
- goto out;
- }
-
- if (mr->umem->writable && !downgrade)
+ if (odp->umem.writable && !downgrade)
access_mask |= ODP_WRITE_ALLOWED_BIT;
- current_seq = READ_ONCE(odp->notifiers_seq);
- /*
- * Ensure the sequence number is valid for some time before we call
- * gup.
- */
- smp_rmb();
+ current_seq = mmu_interval_read_begin(&odp->notifier);
- ret = ib_umem_odp_map_dma_pages(to_ib_umem_odp(mr->umem), io_virt, size,
- access_mask, current_seq);
-
- if (ret < 0)
- goto out;
-
- np = ret;
+ np = ib_umem_odp_map_dma_pages(odp, user_va, bcnt, access_mask,
+ current_seq);
+ if (np < 0)
+ return np;
mutex_lock(&odp->umem_mutex);
- if (!ib_umem_mmu_notifier_retry(to_ib_umem_odp(mr->umem),
- current_seq)) {
+ if (!mmu_interval_read_retry(&odp->notifier, current_seq)) {
/*
* No need to check whether the MTTs really belong to
* this MR, since ib_umem_odp_map_dma_pages already
@@ -648,53 +694,135 @@ next_mr:
if (ret < 0) {
if (ret != -EAGAIN)
- mlx5_ib_err(dev, "Failed to update mkey page tables\n");
+ mlx5_ib_err(mr->dev,
+ "Failed to update mkey page tables\n");
goto out;
}
if (bytes_mapped) {
u32 new_mappings = (np << page_shift) -
- (io_virt - round_down(io_virt, 1 << page_shift));
- *bytes_mapped += min_t(u32, new_mappings, size);
+ (user_va - round_down(user_va, 1 << page_shift));
+
+ *bytes_mapped += min_t(u32, new_mappings, bcnt);
}
- npages += np << (page_shift - PAGE_SHIFT);
- bcnt -= size;
+ return np << (page_shift - PAGE_SHIFT);
- if (unlikely(bcnt)) {
- struct ib_umem_odp *next;
+out:
+ return ret;
+}
+
+static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
+ struct ib_umem_odp *odp_imr, u64 user_va,
+ size_t bcnt, u32 *bytes_mapped, u32 flags)
+{
+ unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT;
+ unsigned long upd_start_idx = end_idx + 1;
+ unsigned long upd_len = 0;
+ unsigned long npages = 0;
+ int err;
+ int ret;
- io_virt += size;
- next = odp_next(odp);
- if (unlikely(!next || next->umem.address != io_virt)) {
- mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n",
- io_virt, next);
- return -EAGAIN;
+ if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE ||
+ mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt))
+ return -EFAULT;
+
+ /* Fault each child mr that intersects with our interval. */
+ while (bcnt) {
+ unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT;
+ struct ib_umem_odp *umem_odp;
+ struct mlx5_ib_mr *mtt;
+ u64 len;
+
+ mtt = xa_load(&imr->implicit_children, idx);
+ if (unlikely(!mtt)) {
+ mtt = implicit_get_child_mr(imr, idx);
+ if (IS_ERR(mtt)) {
+ ret = PTR_ERR(mtt);
+ goto out;
+ }
+ upd_start_idx = min(upd_start_idx, idx);
+ upd_len = idx - upd_start_idx + 1;
}
- odp = next;
- mr = odp->private;
- goto next_mr;
+
+ umem_odp = to_ib_umem_odp(mtt->umem);
+ len = min_t(u64, user_va + bcnt, ib_umem_end(umem_odp)) -
+ user_va;
+
+ ret = pagefault_real_mr(mtt, umem_odp, user_va, len,
+ bytes_mapped, flags);
+ if (ret < 0)
+ goto out;
+ user_va += len;
+ bcnt -= len;
+ npages += ret;
}
- return npages;
+ ret = npages;
+ /*
+ * Any time the implicit_children are changed we must perform an
+ * update of the xlt before exiting to ensure the HW and the
+ * implicit_children remains synchronized.
+ */
out:
- if (ret == -EAGAIN) {
- unsigned long timeout = msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
-
- if (!wait_for_completion_timeout(&odp->notifier_completion,
- timeout)) {
- mlx5_ib_warn(
- dev,
- "timeout waiting for mmu notifier. seq %d against %d. notifiers_count=%d\n",
- current_seq, odp->notifiers_seq,
- odp->notifiers_count);
- }
- }
+ if (likely(!upd_len))
+ return ret;
+ /*
+ * Notice this is not strictly ordered right, the KSM is updated after
+ * the implicit_children is updated, so a parallel page fault could
+ * see a MR that is not yet visible in the KSM. This is similar to a
+ * parallel page fault seeing a MR that is being concurrently removed
+ * from the KSM. Both of these improbable situations are resolved
+ * safely by resuming the HW and then taking another page fault. The
+ * next pagefault handler will see the new information.
+ */
+ mutex_lock(&odp_imr->umem_mutex);
+ err = mlx5_ib_update_xlt(imr, upd_start_idx, upd_len, 0,
+ MLX5_IB_UPD_XLT_INDIRECT |
+ MLX5_IB_UPD_XLT_ATOMIC);
+ mutex_unlock(&odp_imr->umem_mutex);
+ if (err) {
+ mlx5_ib_err(imr->dev, "Failed to update PAS\n");
+ return err;
+ }
return ret;
}
+/*
+ * Returns:
+ * -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are
+ * not accessible, or the MR is no longer valid.
+ * -EAGAIN/-ENOMEM: The operation should be retried
+ *
+ * -EINVAL/others: General internal malfunction
+ * >0: Number of pages mapped
+ */
+static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
+ u32 *bytes_mapped, u32 flags)
+{
+ struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+
+ if (unlikely(io_virt < mr->mmkey.iova))
+ return -EFAULT;
+
+ if (!odp->is_implicit_odp) {
+ u64 user_va;
+
+ if (check_add_overflow(io_virt - mr->mmkey.iova,
+ (u64)odp->umem.address, &user_va))
+ return -EFAULT;
+ if (unlikely(user_va >= ib_umem_end(odp) ||
+ ib_umem_end(odp) - user_va < bcnt))
+ return -EFAULT;
+ return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped,
+ flags);
+ }
+ return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped,
+ flags);
+}
+
struct pf_frame {
struct pf_frame *next;
u32 key;
@@ -742,10 +870,9 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
struct ib_pd *pd, u32 key,
u64 io_virt, size_t bcnt,
u32 *bytes_committed,
- u32 *bytes_mapped, u32 flags)
+ u32 *bytes_mapped)
{
int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0;
- bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
struct pf_frame *head = NULL, *frame;
struct mlx5_core_mkey *mmkey;
struct mlx5_ib_mr *mr;
@@ -754,58 +881,49 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
size_t offset;
int ndescs;
- srcu_key = srcu_read_lock(&dev->mr_srcu);
+ srcu_key = srcu_read_lock(&dev->odp_srcu);
io_virt += *bytes_committed;
bcnt -= *bytes_committed;
next_mr:
- mmkey = xa_load(&dev->mdev->priv.mkey_table, mlx5_base_mkey(key));
+ mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key));
+ if (!mmkey) {
+ mlx5_ib_dbg(
+ dev,
+ "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
+ key);
+ if (bytes_mapped)
+ *bytes_mapped += bcnt;
+ /*
+ * The user could specify a SGL with multiple lkeys and only
+ * some of them are ODP. Treat the non-ODP ones as fully
+ * faulted.
+ */
+ ret = 0;
+ goto srcu_unlock;
+ }
if (!mkey_is_eq(mmkey, key)) {
mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
ret = -EFAULT;
goto srcu_unlock;
}
- if (prefetch && mmkey->type != MLX5_MKEY_MR) {
- mlx5_ib_dbg(dev, "prefetch is allowed only for MR\n");
- ret = -EINVAL;
- goto srcu_unlock;
- }
-
switch (mmkey->type) {
case MLX5_MKEY_MR:
mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
- if (!mr->live || !mr->ibmr.pd) {
- mlx5_ib_dbg(dev, "got dead MR\n");
- ret = -EFAULT;
- goto srcu_unlock;
- }
-
- if (prefetch) {
- if (!is_odp_mr(mr) ||
- mr->ibmr.pd != pd) {
- mlx5_ib_dbg(dev, "Invalid prefetch request: %s\n",
- is_odp_mr(mr) ? "MR is not ODP" :
- "PD is not of the MR");
- ret = -EINVAL;
- goto srcu_unlock;
- }
- }
- if (!is_odp_mr(mr)) {
- mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
- key);
- if (bytes_mapped)
- *bytes_mapped += bcnt;
- ret = 0;
- goto srcu_unlock;
- }
-
- ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped, flags);
+ ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0);
if (ret < 0)
goto srcu_unlock;
+ /*
+ * When prefetching a page, page fault is generated
+ * in order to bring the page to the main memory.
+ * In the current flow, page faults are being counted.
+ */
+ mlx5_update_odp_stats(mr, faults, ret);
+
npages += ret;
ret = 0;
break;
@@ -895,7 +1013,7 @@ srcu_unlock:
}
kfree(out);
- srcu_read_unlock(&dev->mr_srcu, srcu_key);
+ srcu_read_unlock(&dev->odp_srcu, srcu_key);
*bytes_committed = 0;
return ret ? ret : npages;
}
@@ -976,7 +1094,7 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev,
ret = pagefault_single_data_segment(dev, NULL, key,
io_virt, bcnt,
&pfault->bytes_committed,
- bytes_mapped, 0);
+ bytes_mapped);
if (ret < 0)
break;
npages += ret;
@@ -985,17 +1103,6 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev,
return ret < 0 ? ret : npages;
}
-static const u32 mlx5_ib_odp_opcode_cap[] = {
- [MLX5_OPCODE_SEND] = IB_ODP_SUPPORT_SEND,
- [MLX5_OPCODE_SEND_IMM] = IB_ODP_SUPPORT_SEND,
- [MLX5_OPCODE_SEND_INVAL] = IB_ODP_SUPPORT_SEND,
- [MLX5_OPCODE_RDMA_WRITE] = IB_ODP_SUPPORT_WRITE,
- [MLX5_OPCODE_RDMA_WRITE_IMM] = IB_ODP_SUPPORT_WRITE,
- [MLX5_OPCODE_RDMA_READ] = IB_ODP_SUPPORT_READ,
- [MLX5_OPCODE_ATOMIC_CS] = IB_ODP_SUPPORT_ATOMIC,
- [MLX5_OPCODE_ATOMIC_FA] = IB_ODP_SUPPORT_ATOMIC,
-};
-
/*
* Parse initiator WQE. Advances the wqe pointer to point at the
* scatter-gather list, and set wqe_end to the end of the WQE.
@@ -1006,12 +1113,8 @@ static int mlx5_ib_mr_initiator_pfault_handler(
{
struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
u16 wqe_index = pfault->wqe.wqe_index;
- u32 transport_caps;
struct mlx5_base_av *av;
unsigned ds, opcode;
-#if defined(DEBUG)
- u32 ctrl_wqe_index, ctrl_qpn;
-#endif
u32 qpn = qp->trans_qp.base.mqp.qpn;
ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
@@ -1027,58 +1130,17 @@ static int mlx5_ib_mr_initiator_pfault_handler(
return -EFAULT;
}
-#if defined(DEBUG)
- ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) &
- MLX5_WQE_CTRL_WQE_INDEX_MASK) >>
- MLX5_WQE_CTRL_WQE_INDEX_SHIFT;
- if (wqe_index != ctrl_wqe_index) {
- mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n",
- wqe_index, qpn,
- ctrl_wqe_index);
- return -EFAULT;
- }
-
- ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >>
- MLX5_WQE_CTRL_QPN_SHIFT;
- if (qpn != ctrl_qpn) {
- mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n",
- wqe_index, qpn,
- ctrl_qpn);
- return -EFAULT;
- }
-#endif /* DEBUG */
-
*wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
*wqe += sizeof(*ctrl);
opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
MLX5_WQE_CTRL_OPCODE_MASK;
- switch (qp->ibqp.qp_type) {
- case IB_QPT_XRC_INI:
+ if (qp->ibqp.qp_type == IB_QPT_XRC_INI)
*wqe += sizeof(struct mlx5_wqe_xrc_seg);
- transport_caps = dev->odp_caps.per_transport_caps.xrc_odp_caps;
- break;
- case IB_QPT_RC:
- transport_caps = dev->odp_caps.per_transport_caps.rc_odp_caps;
- break;
- case IB_QPT_UD:
- transport_caps = dev->odp_caps.per_transport_caps.ud_odp_caps;
- break;
- default:
- mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport 0x%x\n",
- qp->ibqp.qp_type);
- return -EFAULT;
- }
- if (unlikely(opcode >= ARRAY_SIZE(mlx5_ib_odp_opcode_cap) ||
- !(transport_caps & mlx5_ib_odp_opcode_cap[opcode]))) {
- mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode 0x%x\n",
- opcode);
- return -EFAULT;
- }
-
- if (qp->ibqp.qp_type == IB_QPT_UD) {
+ if (qp->ibqp.qp_type == IB_QPT_UD ||
+ qp->qp_sub_type == MLX5_IB_QPT_DCI) {
av = *wqe;
if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV))
*wqe += sizeof(struct mlx5_av);
@@ -1141,19 +1203,6 @@ static int mlx5_ib_mr_responder_pfault_handler_rq(struct mlx5_ib_dev *dev,
return -EFAULT;
}
- switch (qp->ibqp.qp_type) {
- case IB_QPT_RC:
- if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
- IB_ODP_SUPPORT_RECV))
- goto invalid_transport_or_opcode;
- break;
- default:
-invalid_transport_or_opcode:
- mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n",
- qp->ibqp.qp_type);
- return -EFAULT;
- }
-
*wqe_end = wqe + wqe_size;
return 0;
@@ -1203,7 +1252,7 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
{
bool sq = pfault->type & MLX5_PFAULT_REQUESTOR;
u16 wqe_index = pfault->wqe.wqe_index;
- void *wqe = NULL, *wqe_end = NULL;
+ void *wqe, *wqe_start = NULL, *wqe_end = NULL;
u32 bytes_mapped, total_wqe_bytes;
struct mlx5_core_rsc_common *res;
int resume_with_error = 1;
@@ -1224,23 +1273,24 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
goto resolve_page_fault;
}
- wqe = (void *)__get_free_page(GFP_KERNEL);
- if (!wqe) {
+ wqe_start = (void *)__get_free_page(GFP_KERNEL);
+ if (!wqe_start) {
mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
goto resolve_page_fault;
}
+ wqe = wqe_start;
qp = (res->res == MLX5_RES_QP) ? res_to_qp(res) : NULL;
if (qp && sq) {
- ret = mlx5_ib_read_user_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE,
- &bytes_copied);
+ ret = mlx5_ib_read_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE,
+ &bytes_copied);
if (ret)
goto read_user;
ret = mlx5_ib_mr_initiator_pfault_handler(
dev, pfault, qp, &wqe, &wqe_end, bytes_copied);
} else if (qp && !sq) {
- ret = mlx5_ib_read_user_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE,
- &bytes_copied);
+ ret = mlx5_ib_read_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE,
+ &bytes_copied);
if (ret)
goto read_user;
ret = mlx5_ib_mr_responder_pfault_handler_rq(
@@ -1248,8 +1298,8 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
} else if (!qp) {
struct mlx5_ib_srq *srq = res_to_srq(res);
- ret = mlx5_ib_read_user_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE,
- &bytes_copied);
+ ret = mlx5_ib_read_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE,
+ &bytes_copied);
if (ret)
goto read_user;
ret = mlx5_ib_mr_responder_pfault_handler_srq(
@@ -1284,7 +1334,7 @@ resolve_page_fault:
pfault->wqe.wq_num, resume_with_error,
pfault->type);
mlx5_core_res_put(res);
- free_page((unsigned long)wqe);
+ free_page((unsigned long)wqe_start);
}
static int pages_in_range(u64 address, u32 length)
@@ -1327,8 +1377,7 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
}
ret = pagefault_single_data_segment(dev, NULL, rkey, address, length,
- &pfault->bytes_committed, NULL,
- 0);
+ &pfault->bytes_committed, NULL);
if (ret == -EAGAIN) {
/* We're racing with an invalidation, don't prefetch */
prefetch_activated = 0;
@@ -1355,8 +1404,7 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
ret = pagefault_single_data_segment(dev, NULL, rkey, address,
prefetch_len,
- &bytes_committed, NULL,
- 0);
+ &bytes_committed, NULL);
if (ret < 0 && ret != -EAGAIN) {
mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
ret, pfault->token, address, prefetch_len);
@@ -1622,8 +1670,10 @@ int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
{
int ret = 0;
- if (dev->odp_caps.general_caps & IB_ODP_SUPPORT)
- ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);
+ if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
+ return ret;
+
+ ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);
if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
@@ -1633,9 +1683,6 @@ int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
}
}
- if (!MLX5_CAP_GEN(dev->mdev, pg))
- return ret;
-
ret = mlx5_ib_create_pf_eq(dev, &dev->odp_pf_eq);
return ret;
@@ -1643,7 +1690,7 @@ int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev)
{
- if (!MLX5_CAP_GEN(dev->mdev, pg))
+ if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
return;
mlx5_ib_destroy_pf_eq(dev, &dev->odp_pf_eq);
@@ -1659,114 +1706,128 @@ int mlx5_ib_odp_init(void)
struct prefetch_mr_work {
struct work_struct work;
- struct ib_pd *pd;
u32 pf_flags;
u32 num_sge;
- struct ib_sge sg_list[0];
+ struct {
+ u64 io_virt;
+ struct mlx5_ib_mr *mr;
+ size_t length;
+ } frags[];
};
-static void num_pending_prefetch_dec(struct mlx5_ib_dev *dev,
- struct ib_sge *sg_list, u32 num_sge,
- u32 from)
+static void destroy_prefetch_work(struct prefetch_mr_work *work)
{
u32 i;
- int srcu_key;
-
- srcu_key = srcu_read_lock(&dev->mr_srcu);
-
- for (i = from; i < num_sge; ++i) {
- struct mlx5_core_mkey *mmkey;
- struct mlx5_ib_mr *mr;
- mmkey = xa_load(&dev->mdev->priv.mkey_table,
- mlx5_base_mkey(sg_list[i].lkey));
- mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
- atomic_dec(&mr->num_pending_prefetch);
- }
-
- srcu_read_unlock(&dev->mr_srcu, srcu_key);
+ for (i = 0; i < work->num_sge; ++i)
+ atomic_dec(&work->frags[i].mr->num_deferred_work);
+ kvfree(work);
}
-static bool num_pending_prefetch_inc(struct ib_pd *pd,
- struct ib_sge *sg_list, u32 num_sge)
+static struct mlx5_ib_mr *
+get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
+ u32 lkey)
{
struct mlx5_ib_dev *dev = to_mdev(pd->device);
- bool ret = true;
- u32 i;
+ struct mlx5_core_mkey *mmkey;
+ struct ib_umem_odp *odp;
+ struct mlx5_ib_mr *mr;
- for (i = 0; i < num_sge; ++i) {
- struct mlx5_core_mkey *mmkey;
- struct mlx5_ib_mr *mr;
+ lockdep_assert_held(&dev->odp_srcu);
- mmkey = xa_load(&dev->mdev->priv.mkey_table,
- mlx5_base_mkey(sg_list[i].lkey));
- if (!mmkey || mmkey->key != sg_list[i].lkey) {
- ret = false;
- break;
- }
+ mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(lkey));
+ if (!mmkey || mmkey->key != lkey || mmkey->type != MLX5_MKEY_MR)
+ return NULL;
- if (mmkey->type != MLX5_MKEY_MR) {
- ret = false;
- break;
- }
+ mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
- mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
+ if (mr->ibmr.pd != pd)
+ return NULL;
- if (mr->ibmr.pd != pd) {
- ret = false;
- break;
- }
+ odp = to_ib_umem_odp(mr->umem);
- if (!mr->live) {
- ret = false;
- break;
- }
+ /* prefetch with write-access must be supported by the MR */
+ if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
+ !odp->umem.writable)
+ return NULL;
- atomic_inc(&mr->num_pending_prefetch);
- }
+ return mr;
+}
- if (!ret)
- num_pending_prefetch_dec(dev, sg_list, i, 0);
+static void mlx5_ib_prefetch_mr_work(struct work_struct *w)
+{
+ struct prefetch_mr_work *work =
+ container_of(w, struct prefetch_mr_work, work);
+ u32 bytes_mapped = 0;
+ u32 i;
- return ret;
+ for (i = 0; i < work->num_sge; ++i)
+ pagefault_mr(work->frags[i].mr, work->frags[i].io_virt,
+ work->frags[i].length, &bytes_mapped,
+ work->pf_flags);
+
+ destroy_prefetch_work(work);
}
-static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd, u32 pf_flags,
- struct ib_sge *sg_list, u32 num_sge)
+static bool init_prefetch_work(struct ib_pd *pd,
+ enum ib_uverbs_advise_mr_advice advice,
+ u32 pf_flags, struct prefetch_mr_work *work,
+ struct ib_sge *sg_list, u32 num_sge)
{
u32 i;
- int ret = 0;
- struct mlx5_ib_dev *dev = to_mdev(pd->device);
+
+ INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
+ work->pf_flags = pf_flags;
for (i = 0; i < num_sge; ++i) {
- struct ib_sge *sg = &sg_list[i];
- int bytes_committed = 0;
+ work->frags[i].io_virt = sg_list[i].addr;
+ work->frags[i].length = sg_list[i].length;
+ work->frags[i].mr =
+ get_prefetchable_mr(pd, advice, sg_list[i].lkey);
+ if (!work->frags[i].mr) {
+ work->num_sge = i - 1;
+ if (i)
+ destroy_prefetch_work(work);
+ return false;
+ }
- ret = pagefault_single_data_segment(dev, pd, sg->lkey, sg->addr,
- sg->length,
- &bytes_committed, NULL,
- pf_flags);
- if (ret < 0)
- break;
+ /* Keep the MR pointer will valid outside the SRCU */
+ atomic_inc(&work->frags[i].mr->num_deferred_work);
}
-
- return ret < 0 ? ret : 0;
+ work->num_sge = num_sge;
+ return true;
}
-static void mlx5_ib_prefetch_mr_work(struct work_struct *work)
+static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd,
+ enum ib_uverbs_advise_mr_advice advice,
+ u32 pf_flags, struct ib_sge *sg_list,
+ u32 num_sge)
{
- struct prefetch_mr_work *w =
- container_of(work, struct prefetch_mr_work, work);
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ u32 bytes_mapped = 0;
+ int srcu_key;
+ int ret = 0;
+ u32 i;
+
+ srcu_key = srcu_read_lock(&dev->odp_srcu);
+ for (i = 0; i < num_sge; ++i) {
+ struct mlx5_ib_mr *mr;
- if (ib_device_try_get(w->pd->device)) {
- mlx5_ib_prefetch_sg_list(w->pd, w->pf_flags, w->sg_list,
- w->num_sge);
- ib_device_put(w->pd->device);
+ mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey);
+ if (!mr) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length,
+ &bytes_mapped, pf_flags);
+ if (ret < 0)
+ goto out;
}
+ ret = 0;
- num_pending_prefetch_dec(to_mdev(w->pd->device), w->sg_list,
- w->num_sge, 0);
- kvfree(w);
+out:
+ srcu_read_unlock(&dev->odp_srcu, srcu_key);
+ return ret;
}
int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
@@ -1774,43 +1835,27 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
u32 flags, struct ib_sge *sg_list, u32 num_sge)
{
struct mlx5_ib_dev *dev = to_mdev(pd->device);
- u32 pf_flags = MLX5_PF_FLAGS_PREFETCH;
+ u32 pf_flags = 0;
struct prefetch_mr_work *work;
- bool valid_req;
int srcu_key;
if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;
if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
- return mlx5_ib_prefetch_sg_list(pd, pf_flags, sg_list,
+ return mlx5_ib_prefetch_sg_list(pd, advice, pf_flags, sg_list,
num_sge);
- work = kvzalloc(struct_size(work, sg_list, num_sge), GFP_KERNEL);
+ work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL);
if (!work)
return -ENOMEM;
- memcpy(work->sg_list, sg_list, num_sge * sizeof(struct ib_sge));
-
- /* It is guaranteed that the pd when work is executed is the pd when
- * work was queued since pd can't be destroyed while it holds MRs and
- * destroying a MR leads to flushing the workquque
- */
- work->pd = pd;
- work->pf_flags = pf_flags;
- work->num_sge = num_sge;
-
- INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
-
- srcu_key = srcu_read_lock(&dev->mr_srcu);
-
- valid_req = num_pending_prefetch_inc(pd, sg_list, num_sge);
- if (valid_req)
- queue_work(system_unbound_wq, &work->work);
- else
- kvfree(work);
-
- srcu_read_unlock(&dev->mr_srcu, srcu_key);
-
- return valid_req ? 0 : -EINVAL;
+ srcu_key = srcu_read_lock(&dev->odp_srcu);
+ if (!init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge)) {
+ srcu_read_unlock(&dev->odp_srcu, srcu_key);
+ return -EINVAL;
+ }
+ queue_work(system_unbound_wq, &work->work);
+ srcu_read_unlock(&dev->odp_srcu, srcu_key);
+ return 0;
}
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 379328b2598f..957f3a52589b 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -129,14 +129,10 @@ static int is_sqp(enum ib_qp_type qp_type)
*
* Return: zero on success, or an error code.
*/
-static int mlx5_ib_read_user_wqe_common(struct ib_umem *umem,
- void *buffer,
- u32 buflen,
- int wqe_index,
- int wq_offset,
- int wq_wqe_cnt,
- int wq_wqe_shift,
- int bcnt,
+static int mlx5_ib_read_user_wqe_common(struct ib_umem *umem, void *buffer,
+ size_t buflen, int wqe_index,
+ int wq_offset, int wq_wqe_cnt,
+ int wq_wqe_shift, int bcnt,
size_t *bytes_copied)
{
size_t offset = wq_offset + ((wqe_index % wq_wqe_cnt) << wq_wqe_shift);
@@ -160,11 +156,43 @@ static int mlx5_ib_read_user_wqe_common(struct ib_umem *umem,
return 0;
}
-int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp,
- int wqe_index,
- void *buffer,
- int buflen,
- size_t *bc)
+static int mlx5_ib_read_kernel_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index,
+ void *buffer, size_t buflen, size_t *bc)
+{
+ struct mlx5_wqe_ctrl_seg *ctrl;
+ size_t bytes_copied = 0;
+ size_t wqe_length;
+ void *p;
+ int ds;
+
+ wqe_index = wqe_index & qp->sq.fbc.sz_m1;
+
+ /* read the control segment first */
+ p = mlx5_frag_buf_get_wqe(&qp->sq.fbc, wqe_index);
+ ctrl = p;
+ ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
+ wqe_length = ds * MLX5_WQE_DS_UNITS;
+
+ /* read rest of WQE if it spreads over more than one stride */
+ while (bytes_copied < wqe_length) {
+ size_t copy_length =
+ min_t(size_t, buflen - bytes_copied, MLX5_SEND_WQE_BB);
+
+ if (!copy_length)
+ break;
+
+ memcpy(buffer + bytes_copied, p, copy_length);
+ bytes_copied += copy_length;
+
+ wqe_index = (wqe_index + 1) & qp->sq.fbc.sz_m1;
+ p = mlx5_frag_buf_get_wqe(&qp->sq.fbc, wqe_index);
+ }
+ *bc = bytes_copied;
+ return 0;
+}
+
+static int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index,
+ void *buffer, size_t buflen, size_t *bc)
{
struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
struct ib_umem *umem = base->ubuffer.umem;
@@ -176,18 +204,10 @@ int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp,
int ret;
int ds;
- if (buflen < sizeof(*ctrl))
- return -EINVAL;
-
/* at first read as much as possible */
- ret = mlx5_ib_read_user_wqe_common(umem,
- buffer,
- buflen,
- wqe_index,
- wq->offset,
- wq->wqe_cnt,
- wq->wqe_shift,
- buflen,
+ ret = mlx5_ib_read_user_wqe_common(umem, buffer, buflen, wqe_index,
+ wq->offset, wq->wqe_cnt,
+ wq->wqe_shift, buflen,
&bytes_copied);
if (ret)
return ret;
@@ -210,13 +230,9 @@ int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp,
* so read the remaining bytes starting
* from wqe_index 0
*/
- ret = mlx5_ib_read_user_wqe_common(umem,
- buffer + bytes_copied,
- buflen - bytes_copied,
- 0,
- wq->offset,
- wq->wqe_cnt,
- wq->wqe_shift,
+ ret = mlx5_ib_read_user_wqe_common(umem, buffer + bytes_copied,
+ buflen - bytes_copied, 0, wq->offset,
+ wq->wqe_cnt, wq->wqe_shift,
wqe_length - bytes_copied,
&bytes_copied2);
@@ -226,11 +242,24 @@ int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp,
return 0;
}
-int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp,
- int wqe_index,
- void *buffer,
- int buflen,
- size_t *bc)
+int mlx5_ib_read_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
+ size_t buflen, size_t *bc)
+{
+ struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
+ struct ib_umem *umem = base->ubuffer.umem;
+
+ if (buflen < sizeof(struct mlx5_wqe_ctrl_seg))
+ return -EINVAL;
+
+ if (!umem)
+ return mlx5_ib_read_kernel_wqe_sq(qp, wqe_index, buffer,
+ buflen, bc);
+
+ return mlx5_ib_read_user_wqe_sq(qp, wqe_index, buffer, buflen, bc);
+}
+
+static int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index,
+ void *buffer, size_t buflen, size_t *bc)
{
struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
struct ib_umem *umem = base->ubuffer.umem;
@@ -238,14 +267,9 @@ int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp,
size_t bytes_copied;
int ret;
- ret = mlx5_ib_read_user_wqe_common(umem,
- buffer,
- buflen,
- wqe_index,
- wq->offset,
- wq->wqe_cnt,
- wq->wqe_shift,
- buflen,
+ ret = mlx5_ib_read_user_wqe_common(umem, buffer, buflen, wqe_index,
+ wq->offset, wq->wqe_cnt,
+ wq->wqe_shift, buflen,
&bytes_copied);
if (ret)
@@ -254,25 +278,33 @@ int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp,
return 0;
}
-int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq,
- int wqe_index,
- void *buffer,
- int buflen,
- size_t *bc)
+int mlx5_ib_read_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
+ size_t buflen, size_t *bc)
+{
+ struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
+ struct ib_umem *umem = base->ubuffer.umem;
+ struct mlx5_ib_wq *wq = &qp->rq;
+ size_t wqe_size = 1 << wq->wqe_shift;
+
+ if (buflen < wqe_size)
+ return -EINVAL;
+
+ if (!umem)
+ return -EOPNOTSUPP;
+
+ return mlx5_ib_read_user_wqe_rq(qp, wqe_index, buffer, buflen, bc);
+}
+
+static int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index,
+ void *buffer, size_t buflen, size_t *bc)
{
struct ib_umem *umem = srq->umem;
size_t bytes_copied;
int ret;
- ret = mlx5_ib_read_user_wqe_common(umem,
- buffer,
- buflen,
- wqe_index,
- 0,
- srq->msrq.max,
- srq->msrq.wqe_shift,
- buflen,
- &bytes_copied);
+ ret = mlx5_ib_read_user_wqe_common(umem, buffer, buflen, wqe_index, 0,
+ srq->msrq.max, srq->msrq.wqe_shift,
+ buflen, &bytes_copied);
if (ret)
return ret;
@@ -280,6 +312,21 @@ int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq,
return 0;
}
+int mlx5_ib_read_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer,
+ size_t buflen, size_t *bc)
+{
+ struct ib_umem *umem = srq->umem;
+ size_t wqe_size = 1 << srq->msrq.wqe_shift;
+
+ if (buflen < wqe_size)
+ return -EINVAL;
+
+ if (!umem)
+ return -EOPNOTSUPP;
+
+ return mlx5_ib_read_user_wqe_srq(srq, wqe_index, buffer, buflen, bc);
+}
+
static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type)
{
struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
@@ -749,7 +796,7 @@ static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
{
int err;
- *umem = ib_umem_get(udata, addr, size, 0, 0);
+ *umem = ib_umem_get(&dev->ib_dev, addr, size, 0);
if (IS_ERR(*umem)) {
mlx5_ib_dbg(dev, "umem_get failed\n");
return PTR_ERR(*umem);
@@ -806,7 +853,7 @@ static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
if (!ucmd->buf_addr)
return -EINVAL;
- rwq->umem = ib_umem_get(udata, ucmd->buf_addr, rwq->buf_size, 0, 0);
+ rwq->umem = ib_umem_get(&dev->ib_dev, ucmd->buf_addr, rwq->buf_size, 0);
if (IS_ERR(rwq->umem)) {
mlx5_ib_dbg(dev, "umem_get failed\n");
err = PTR_ERR(rwq->umem);
@@ -1041,11 +1088,14 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev,
IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK |
IB_QP_CREATE_IPOIB_UD_LSO |
IB_QP_CREATE_NETIF_QP |
- mlx5_ib_create_qp_sqpn_qp1()))
+ MLX5_IB_QP_CREATE_SQPN_QP1 |
+ MLX5_IB_QP_CREATE_WC_TEST))
return -EINVAL;
if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR)
qp->bf.bfreg = &dev->fp_bfreg;
+ else if (init_attr->create_flags & MLX5_IB_QP_CREATE_WC_TEST)
+ qp->bf.bfreg = &dev->wc_bfreg;
else
qp->bf.bfreg = &dev->bfreg;
@@ -1104,7 +1154,7 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev,
MLX5_SET(qpc, qpc, fre, 1);
MLX5_SET(qpc, qpc, rlky, 1);
- if (init_attr->create_flags & mlx5_ib_create_qp_sqpn_qp1()) {
+ if (init_attr->create_flags & MLX5_IB_QP_CREATE_SQPN_QP1) {
MLX5_SET(qpc, qpc, deth_sqpn, 1);
qp->flags |= MLX5_IB_QP_SQPN_QP1;
}
@@ -1868,7 +1918,7 @@ static void configure_requester_scat_cqe(struct mlx5_ib_dev *dev,
{
enum ib_qp_type qpt = init_attr->qp_type;
int scqe_sz;
- bool allow_scat_cqe = 0;
+ bool allow_scat_cqe = false;
if (qpt == IB_QPT_UC || qpt == IB_QPT_UD)
return;
@@ -2140,7 +2190,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
return -EINVAL;
}
if (init_attr->create_flags &
- mlx5_ib_create_qp_sqpn_qp1()) {
+ MLX5_IB_QP_CREATE_SQPN_QP1) {
mlx5_ib_dbg(dev, "user-space is not allowed to create UD QPs spoofing as QP1\n");
return -EINVAL;
}
@@ -3249,10 +3299,12 @@ static int modify_raw_packet_qp_sq(
}
/* Only remove the old rate after new rate was set */
- if ((old_rl.rate &&
- !mlx5_rl_are_equal(&old_rl, &new_rl)) ||
- (new_state != MLX5_SQC_STATE_RDY))
+ if ((old_rl.rate && !mlx5_rl_are_equal(&old_rl, &new_rl)) ||
+ (new_state != MLX5_SQC_STATE_RDY)) {
mlx5_rl_remove_rate(dev, &old_rl);
+ if (new_state != MLX5_SQC_STATE_RDY)
+ memset(&new_rl, 0, sizeof(new_rl));
+ }
ibqp->rl = new_rl;
sq->state = new_state;
@@ -3386,19 +3438,13 @@ static int __mlx5_ib_qp_set_counter(struct ib_qp *qp,
struct mlx5_ib_dev *dev = to_mdev(qp->device);
struct mlx5_ib_qp *mqp = to_mqp(qp);
struct mlx5_qp_context context = {};
- struct mlx5_ib_port *mibport = NULL;
struct mlx5_ib_qp_base *base;
u32 set_id;
- if (!MLX5_CAP_GEN(dev->mdev, rts2rts_qp_counters_set_id))
- return 0;
-
- if (counter) {
+ if (counter)
set_id = counter->id;
- } else {
- mibport = &dev->port[mqp->port - 1];
- set_id = mibport->cnts.set_id;
- }
+ else
+ set_id = mlx5_ib_get_counters_id(dev, mqp->port - 1);
base = &mqp->trans_qp.base;
context.qp_counter_set_usr_page &= cpu_to_be32(0xffffff);
@@ -3459,7 +3505,6 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
struct mlx5_ib_cq *send_cq, *recv_cq;
struct mlx5_qp_context *context;
struct mlx5_ib_pd *pd;
- struct mlx5_ib_port *mibport = NULL;
enum mlx5_qp_state mlx5_cur, mlx5_new;
enum mlx5_qp_optpar optpar;
u32 set_id = 0;
@@ -3624,11 +3669,10 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
if (qp->flags & MLX5_IB_QP_UNDERLAY)
port_num = 0;
- mibport = &dev->port[port_num];
if (ibqp->counter)
set_id = ibqp->counter->id;
else
- set_id = mibport->cnts.set_id;
+ set_id = mlx5_ib_get_counters_id(dev, port_num);
context->qp_counter_set_usr_page |=
cpu_to_be32(set_id << 24);
}
@@ -3817,6 +3861,8 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr,
dctc = MLX5_ADDR_OF(create_dct_in, qp->dct.in, dct_context_entry);
if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
+ u16 set_id;
+
required |= IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT;
if (!is_valid_mask(attr_mask, required, 0))
return -EINVAL;
@@ -3843,7 +3889,9 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr,
}
MLX5_SET(dctc, dctc, pkey_index, attr->pkey_index);
MLX5_SET(dctc, dctc, port, attr->port_num);
- MLX5_SET(dctc, dctc, counter_set_id, dev->port[attr->port_num - 1].cnts.set_id);
+
+ set_id = mlx5_ib_get_counters_id(dev, attr->port_num - 1);
+ MLX5_SET(dctc, dctc, counter_set_id, set_id);
} else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
struct mlx5_ib_modify_qp_resp resp = {};
@@ -4162,7 +4210,7 @@ static u64 get_xlt_octo(u64 bytes)
MLX5_IB_UMR_OCTOWORD;
}
-static __be64 frwr_mkey_mask(void)
+static __be64 frwr_mkey_mask(bool atomic)
{
u64 result;
@@ -4175,10 +4223,12 @@ static __be64 frwr_mkey_mask(void)
MLX5_MKEY_MASK_LW |
MLX5_MKEY_MASK_RR |
MLX5_MKEY_MASK_RW |
- MLX5_MKEY_MASK_A |
MLX5_MKEY_MASK_SMALL_FENCE |
MLX5_MKEY_MASK_FREE;
+ if (atomic)
+ result |= MLX5_MKEY_MASK_A;
+
return cpu_to_be64(result);
}
@@ -4204,7 +4254,7 @@ static __be64 sig_mkey_mask(void)
}
static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr,
- struct mlx5_ib_mr *mr, u8 flags)
+ struct mlx5_ib_mr *mr, u8 flags, bool atomic)
{
int size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size;
@@ -4212,7 +4262,7 @@ static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr,
umr->flags = flags;
umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size));
- umr->mkey_mask = frwr_mkey_mask();
+ umr->mkey_mask = frwr_mkey_mask(atomic);
}
static void set_linv_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr)
@@ -4811,10 +4861,22 @@ static int set_reg_wr(struct mlx5_ib_qp *qp,
{
struct mlx5_ib_mr *mr = to_mmr(wr->mr);
struct mlx5_ib_pd *pd = to_mpd(qp->ibqp.pd);
+ struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device);
int mr_list_size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size;
bool umr_inline = mr_list_size <= MLX5_IB_SQ_UMR_INLINE_THRESHOLD;
+ bool atomic = wr->access & IB_ACCESS_REMOTE_ATOMIC;
u8 flags = 0;
+ if (!mlx5_ib_can_use_umr(dev, atomic, wr->access)) {
+ mlx5_ib_warn(to_mdev(qp->ibqp.device),
+ "Fast update of %s for MR is disabled\n",
+ (MLX5_CAP_GEN(dev->mdev,
+ umr_modify_entity_size_disabled)) ?
+ "entity size" :
+ "atomic access");
+ return -EINVAL;
+ }
+
if (unlikely(wr->wr.send_flags & IB_SEND_INLINE)) {
mlx5_ib_warn(to_mdev(qp->ibqp.device),
"Invalid IB_SEND_INLINE send flag\n");
@@ -4826,7 +4888,7 @@ static int set_reg_wr(struct mlx5_ib_qp *qp,
if (umr_inline)
flags |= MLX5_UMR_INLINE;
- set_reg_umr_seg(*seg, mr, flags);
+ set_reg_umr_seg(*seg, mr, flags, atomic);
*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
@@ -5315,7 +5377,6 @@ out:
* we hit doorbell */
wmb();
- /* currently we support only regular doorbells */
mlx5_write64((__be32 *)ctrl, bf->bfreg->map + bf->offset);
/* Make sure doorbells don't leak out of SQ spinlock
* and reach the HCA out of order.
@@ -5810,7 +5871,7 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
if (qp->flags & MLX5_IB_QP_MANAGED_RECV)
qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_RECV;
if (qp->flags & MLX5_IB_QP_SQPN_QP1)
- qp_init_attr->create_flags |= mlx5_ib_create_qp_sqpn_qp1();
+ qp_init_attr->create_flags |= MLX5_IB_QP_CREATE_SQPN_QP1;
qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ?
IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
@@ -5942,12 +6003,21 @@ static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd,
}
MLX5_SET(wq, wq, log_wq_stride, rwq->log_rq_stride);
if (rwq->create_flags & MLX5_IB_WQ_FLAGS_STRIDING_RQ) {
+ /*
+ * In Firmware number of strides in each WQE is:
+ * "512 * 2^single_wqe_log_num_of_strides"
+ * Values 3 to 8 are accepted as 10 to 15, 9 to 18 are
+ * accepted as 0 to 9
+ */
+ static const u8 fw_map[] = { 10, 11, 12, 13, 14, 15, 0, 1,
+ 2, 3, 4, 5, 6, 7, 8, 9 };
MLX5_SET(wq, wq, two_byte_shift_en, rwq->two_byte_shift_en);
MLX5_SET(wq, wq, log_wqe_stride_size,
rwq->single_stride_log_num_of_bytes -
MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES);
- MLX5_SET(wq, wq, log_wqe_num_of_strides, rwq->log_num_strides -
- MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES);
+ MLX5_SET(wq, wq, log_wqe_num_of_strides,
+ fw_map[rwq->log_num_strides -
+ MLX5_EXT_MIN_SINGLE_WQE_LOG_NUM_STRIDES]);
}
MLX5_SET(wq, wq, log_wq_sz, rwq->log_rq_size);
MLX5_SET(wq, wq, pd, to_mpd(pd)->pdn);
@@ -6022,6 +6092,19 @@ static int set_user_rq_size(struct mlx5_ib_dev *dev,
return 0;
}
+static bool log_of_strides_valid(struct mlx5_ib_dev *dev, u32 log_num_strides)
+{
+ if ((log_num_strides > MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES) ||
+ (log_num_strides < MLX5_EXT_MIN_SINGLE_WQE_LOG_NUM_STRIDES))
+ return false;
+
+ if (!MLX5_CAP_GEN(dev->mdev, ext_stride_num_range) &&
+ (log_num_strides < MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES))
+ return false;
+
+ return true;
+}
+
static int prepare_user_rq(struct ib_pd *pd,
struct ib_wq_init_attr *init_attr,
struct ib_udata *udata,
@@ -6069,14 +6152,16 @@ static int prepare_user_rq(struct ib_pd *pd,
MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES);
return -EINVAL;
}
- if ((ucmd.single_wqe_log_num_of_strides >
- MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES) ||
- (ucmd.single_wqe_log_num_of_strides <
- MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES)) {
- mlx5_ib_dbg(dev, "Invalid log num strides (%u. Range is %u - %u)\n",
- ucmd.single_wqe_log_num_of_strides,
- MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES,
- MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES);
+ if (!log_of_strides_valid(dev,
+ ucmd.single_wqe_log_num_of_strides)) {
+ mlx5_ib_dbg(
+ dev,
+ "Invalid log num strides (%u. Range is %u - %u)\n",
+ ucmd.single_wqe_log_num_of_strides,
+ MLX5_CAP_GEN(dev->mdev, ext_stride_num_range) ?
+ MLX5_EXT_MIN_SINGLE_WQE_LOG_NUM_STRIDES :
+ MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES,
+ MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES);
return -EINVAL;
}
rwq->single_stride_log_num_of_bytes =
@@ -6331,11 +6416,13 @@ int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
}
if (curr_wq_state == IB_WQS_RESET && wq_state == IB_WQS_RDY) {
+ u16 set_id;
+
+ set_id = mlx5_ib_get_counters_id(dev, 0);
if (MLX5_CAP_GEN(dev->mdev, modify_rq_counter_set_id)) {
MLX5_SET64(modify_rq_in, in, modify_bitmask,
MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_RQ_COUNTER_SET_ID);
- MLX5_SET(rqc, rqc, counter_set_id,
- dev->port->cnts.set_id);
+ MLX5_SET(rqc, rqc, counter_set_id, set_id);
} else
dev_info_once(
&dev->ib_dev.dev,
@@ -6486,6 +6573,7 @@ void mlx5_ib_drain_rq(struct ib_qp *qp)
*/
int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter)
{
+ struct mlx5_ib_dev *dev = to_mdev(qp->device);
struct mlx5_ib_qp *mqp = to_mqp(qp);
int err = 0;
@@ -6495,6 +6583,11 @@ int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter)
goto out;
}
+ if (!MLX5_CAP_GEN(dev->mdev, rts2rts_qp_counters_set_id)) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
if (mqp->state == IB_QPS_RTS) {
err = __mlx5_ib_qp_set_counter(qp, counter);
if (!err)
diff --git a/drivers/infiniband/hw/mlx5/restrack.c b/drivers/infiniband/hw/mlx5/restrack.c
new file mode 100644
index 000000000000..8f6c04f12531
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/restrack.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved.
+ */
+
+#include <uapi/rdma/rdma_netlink.h>
+#include <rdma/ib_umem_odp.h>
+#include <rdma/restrack.h>
+#include "mlx5_ib.h"
+
+static int fill_stat_mr_entry(struct sk_buff *msg,
+ struct rdma_restrack_entry *res)
+{
+ struct ib_mr *ibmr = container_of(res, struct ib_mr, res);
+ struct mlx5_ib_mr *mr = to_mmr(ibmr);
+ struct nlattr *table_attr;
+
+ if (!(mr->access_flags & IB_ACCESS_ON_DEMAND))
+ return 0;
+
+ table_attr = nla_nest_start(msg,
+ RDMA_NLDEV_ATTR_STAT_HWCOUNTERS);
+
+ if (!table_attr)
+ goto err;
+
+ if (rdma_nl_stat_hwcounter_entry(msg, "page_faults",
+ atomic64_read(&mr->odp_stats.faults)))
+ goto err_table;
+ if (rdma_nl_stat_hwcounter_entry(
+ msg, "page_invalidations",
+ atomic64_read(&mr->odp_stats.invalidations)))
+ goto err_table;
+
+ nla_nest_end(msg, table_attr);
+ return 0;
+
+err_table:
+ nla_nest_cancel(msg, table_attr);
+err:
+ return -EMSGSIZE;
+}
+
+static int fill_res_mr_entry(struct sk_buff *msg,
+ struct rdma_restrack_entry *res)
+{
+ struct ib_mr *ibmr = container_of(res, struct ib_mr, res);
+ struct mlx5_ib_mr *mr = to_mmr(ibmr);
+ struct nlattr *table_attr;
+
+ if (!(mr->access_flags & IB_ACCESS_ON_DEMAND))
+ return 0;
+
+ table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER);
+ if (!table_attr)
+ goto err;
+
+ if (mr->is_odp_implicit) {
+ if (rdma_nl_put_driver_string(msg, "odp", "implicit"))
+ goto err;
+ } else {
+ if (rdma_nl_put_driver_string(msg, "odp", "explicit"))
+ goto err;
+ }
+
+ nla_nest_end(msg, table_attr);
+ return 0;
+
+err:
+ nla_nest_cancel(msg, table_attr);
+ return -EMSGSIZE;
+}
+
+int mlx5_ib_fill_res_entry(struct sk_buff *msg,
+ struct rdma_restrack_entry *res)
+{
+ if (res->type == RDMA_RESTRACK_MR)
+ return fill_res_mr_entry(msg, res);
+
+ return 0;
+}
+
+int mlx5_ib_fill_stat_entry(struct sk_buff *msg,
+ struct rdma_restrack_entry *res)
+{
+ if (res->type == RDMA_RESTRACK_MR)
+ return fill_stat_mr_entry(msg, res);
+
+ return 0;
+}
diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
index 4e7fde86c96b..b1a8a9175040 100644
--- a/drivers/infiniband/hw/mlx5/srq.c
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -80,7 +80,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE);
- srq->umem = ib_umem_get(udata, ucmd.buf_addr, buf_size, 0, 0);
+ srq->umem = ib_umem_get(pd->device, ucmd.buf_addr, buf_size, 0);
if (IS_ERR(srq->umem)) {
mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size);
err = PTR_ERR(srq->umem);
diff --git a/drivers/infiniband/hw/mlx5/srq_cmd.c b/drivers/infiniband/hw/mlx5/srq_cmd.c
index b0d0687c7a68..8fc3630a9d4c 100644
--- a/drivers/infiniband/hw/mlx5/srq_cmd.c
+++ b/drivers/infiniband/hw/mlx5/srq_cmd.c
@@ -86,7 +86,7 @@ struct mlx5_core_srq *mlx5_cmd_get_srq(struct mlx5_ib_dev *dev, u32 srqn)
xa_lock(&table->array);
srq = xa_load(&table->array, srqn);
if (srq)
- atomic_inc(&srq->common.refcount);
+ refcount_inc(&srq->common.refcount);
xa_unlock(&table->array);
return srq;
@@ -592,7 +592,7 @@ int mlx5_cmd_create_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
if (err)
return err;
- atomic_set(&srq->common.refcount, 1);
+ refcount_set(&srq->common.refcount, 1);
init_completion(&srq->common.free);
err = xa_err(xa_store_irq(&table->array, srq->srqn, srq, GFP_KERNEL));
@@ -675,7 +675,7 @@ static int srq_event_notifier(struct notifier_block *nb,
xa_lock(&table->array);
srq = xa_load(&table->array, srqn);
if (srq)
- atomic_inc(&srq->common.refcount);
+ refcount_inc(&srq->common.refcount);
xa_unlock(&table->array);
if (!srq)
OpenPOWER on IntegriCloud